18 files changed, 1280 insertions, 9 deletions
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m
new file mode 100644
index 0000000..0e69275
--- a/dev/null
+++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
+function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
+%   RefineAlignments
+%       Uses an interative algorithm to improve the multiple alignments of
+%       an input sequence.  It will iteratively remove a fraction of the
+%       sequences, compress any gaps, and then re-align.  This will be done
+%       for NUM_REPS.
+%
+%   Written by Will Dampier.  Contact at wnd22@drexel.edu
+%   Version 1.0 on 10/23/09
+%   
+%   [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
+%
+%   IN_ALIGN        A char-array representing a multiple alignment to be
+%                   refined.
+%
+%   OUT_ALIGN       The refined alignment.
+%   SCORE           The new alignment score.
+%
+%
+%   Optional Arguements:
+%
+%       'Alphabet'      Either 'AA' or 'NT' to indicate whether this is a
+%                       nucleotide or amino acid alignment.
+%
+%       'GapPenalty'    The penalty for gaps.  This is set at -8 by
+%                       default.
+%
+%       'DistMat'       The distance matrix to use for the alignment 
+%                       scoring.  Default is BLOSUM50 for 'AA' and NUC44
+%                       for 'NT'
+%
+%       'NumReps'       The number of refining iterations to do.
+%
+%       'NumTry'        The number of tries to perform at each iteration.
+%
+%       'RefineType'    Which type of refinement to perform.  This can
+%                       be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
+%
+%       'Display'       A boolean indicating whether to display the output
+%                       at each iteration.
+%
+%
+RM_FRAC = 0.1;
+NUM_TRIES = 10;
+NUM_REPS = 100;
+alpha = 'aa';
+dist_mat = [];
+gap_penalty = -8;
+TYPE = 'mixed';
+disp_flag = true;
+for i = 1:2:length(varargin)
+    switch lower(varargin{i})
+        
+        case 'alphabet'
+            if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
+                alpha = varargin{i+1};
+            else
+                error('RefineAlignments:BADALPHA', ...
+                    'Arguement to "alphabet" must be "NT" or "AA"')
+            end
+        case 'gappenalty'
+            if isnumeric(varargin{i+1}) && varargin{i+1} < 0
+                gap_penalty = varargin{i+1};
+            else
+                error('RefineAlignments:BADGAP', ...
+                    'Arguement to "GapPenalty" must be negative numeric')
+            end
+        case 'distmat'
+            dist_mat = varargin{i+1};
+            
+        case 'numreps'
+            if isnumeric(varargin{i+1}) && varargin{i+1} > 1
+                NUM_REPS = varargin{i+1};
+            else
+                error('RefineAlignments:BADREPS', ...
+                        'Arguement to NumReps must be a positive numeric')
+            end
+            
+        case 'refinetype'
+            TYPE = varargin{i+1};
+            
+        case 'display'
+            if islogical(varargin{i+1})
+                disp_flag = varargin{i+1};
+            else
+                error('RefineAlignments:DISPLAY', ...
+                        'Arguement to Display must be a boolean.')
+            end
+       
+        otherwise
+            error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
+                    lower(varargin{i}))
+        
+    end
+end
+if isempty(dist_mat)
+    if strcmpi(alpha, 'nt')
+        dist_mat = nuc44;
+        toint = @nt2int;
+    else
+        dist_mat = blosum50;
+        toint = @aa2int;
+    end
+end
+dist_mat(end+1,:) = gap_penalty;
+dist_mat(:,end+1) = gap_penalty;
+dist_mat(end,end) = 0;
+switch upper(TYPE)
+    case 'MIXED'
+        is_stoch = true;
+        is_mix = true;
+    case 'STOCASTIC'
+        is_stoch = true;
+        is_mix = false;
+    case 'DETERMINISTIC'
+        is_stoch = false;
+        is_mix = false;
+end
+OUT_ALIGN = IN_ALIGN;
+cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
+for i = 1:NUM_REPS
+    
+    %determine whether to switch based on the "mix" factor
+    if is_mix
+        is_stoch = ~is_stoch;
+    end
+    
+    
+    rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
+    counter = 1;
+    while counter < NUM_TRIES
+        new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
+        
+        new_score = CalculateScore(new_mat, dist_mat, toint);
+        if new_score > cur_score
+            if disp_flag
+                disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
+                    num2str(abs(new_score-cur_score)), ' and by ', ...
+                    num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
+                    ' columns', ' Using stoch:', num2str(is_stoch)};
+                display([disp_cell{:}])
+            end
+            cur_score = new_score;
+            OUT_ALIGN = new_mat;
+            break
+        end
+        counter = counter + 1;
+    end
+end
+function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
+%   GetRows
+%       A helper function which determines the rows to split the alignment
+%       with.
+%
+%   MAT         A char-array of the multiple alignment.
+%   STOCH       A boolean indicating whether to use a stochastic approach.
+%   RM_FRAC     The fraction of rows to remove.
+%   NUM_TRIES   The number of tries to return.
+%
+%   rows_mat    A matrix indicating which rows to remove.
+%
+if STOCH
+    rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
+    if any(all(rows_mat,1)) || any(all(~rows_mat,1))
+        cols = any(all(rows_mat,1)) || any(all(~rows_mat,1));
+        rows_mat(1,cols) = ~rows_mat(1,cols);
+    end
+        
+else
+    gap_mask = MAT =='-';
+    runs = FindRuns(gap_mask);
+    score = mode(runs.*(runs~=0))-min(runs);
+    [~, order] = sort(score, 'descend');
+    rm_rows = order(1:NUM_TRIES);
+    [~, norder] = sort(runs(:,rm_rows),'descend');
+    rows_mat = false(size(MAT,1), NUM_TRIES);
+    num_remove = ceil(RM_FRAC*size(MAT,1));
+    for i = 1:NUM_TRIES
+        rows_mat(norder(1:num_remove,i),i) = true;
+    end
+end
+function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
+%   SplitAndReAlign
+%       This will split the alignment matix based on the provided rows and
+%       then create two profiles and re-align them.
+%
+%   MAT         A char-array of the alignment.
+%   ROWS        A boolean-array describing the switch between rows
+%   alpha       Either 'NT' or 'AA' to indicate which sequence profile to
+%               create.
+%
+%   NEW_MAT     The ReAlinged alignment
+%
+mat1 = MAT(ROWS, :);
+mat2 = MAT(~ROWS, :);
+mat1(:,all(mat1 == '-',1)) = [];
+mat2(:,all(mat2 == '-',1)) = [];
+pmat1 = seqprofile(mat1, 'alphabet', alpha);
+pmat2 = seqprofile(mat2, 'alphabet', alpha);
+try
+    [~, ind1, ind2] = profalign(pmat1, pmat2);
+catch
+    ind1 = 1:size(pmat1,2);
+    ind2 = 1:size(pmat2,2);
+end
+    
+NEW_MAT = char(0);
+NEW_MAT(ROWS, ind1) = mat1;
+NEW_MAT(~ROWS, ind2) = mat2;
+NEW_MAT(~isletter(NEW_MAT)) = '-';
+NEW_MAT(all(NEW_MAT == '-',1)) = [];
+function s = CalculateScore(MAT, DIST, FUN)
+%   CalculateScore
+%       A function which calculates the score of the alignment.  This score
+%       is based on the agreement between the consensus and each individual
+%       alignment.
+%
+%   MAT         A char-array of a multiple alignment.
+%   DIST        A distance matrix which indicates the penalty for
+%               mismatches.
+%   FUN         A function which converts letter to numbers such at nt2int
+%               or aa2int.
+%   
+%   s           The score for this alingment.
+%
+%
+cons = seqconsensus(MAT);
+num_cons = FUN(cons);
+num_align = FUN(MAT);
+ncons = repmat(num_cons(:), [size(num_align,1), 1]);
+nalign = num_align(:);
+lookupinds = sub2ind(size(DIST), ncons, nalign);
+s = sum(DIST(lookupinds));
+function reverse_looking = FindRuns(input)
+%   FindRuns
+%       Finds consecutive runs of 1's along the rows of a boolean array.
+%
+%   INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
+%            0 1 1 1 0 1 1 0 0 1 1 1];
+%   RL =    [1 2 3 0 0 0 1 0 1 0 1 2;
+%           [0 3 2 1 0 1 2 0 0 1 2 3];
+%
+[m,n] = size(input);
+reverse_looking = [zeros(1,m);input.'];
+reverse_looking = reverse_looking(:);
+p = find(~reverse_looking);
+reverse_looking(p) = [0;1-diff(p)];
+reverse_looking = reshape(cumsum(reverse_looking),[],m).';
+reverse_looking(:,1) = [];

diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m new file mode 100644 index 0000000..0e69275 --- a/dev/null +++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
	1	function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
	2	% RefineAlignments
	3	% Uses an interative algorithm to improve the multiple alignments of
	4	% an input sequence. It will iteratively remove a fraction of the
	5	% sequences, compress any gaps, and then re-align. This will be done
	6	% for NUM_REPS.
	7	%
	8	% Written by Will Dampier. Contact at wnd22@drexel.edu
	9	% Version 1.0 on 10/23/09
	10	%
	11	% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
	12	%
	13	% IN_ALIGN A char-array representing a multiple alignment to be
	14	% refined.
	15	%
	16	% OUT_ALIGN The refined alignment.
	17	% SCORE The new alignment score.
	18	%
	19	%
	20	% Optional Arguements:
	21	%
	22	% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a
	23	% nucleotide or amino acid alignment.
	24	%
	25	% 'GapPenalty' The penalty for gaps. This is set at -8 by
	26	% default.
	27	%
	28	% 'DistMat' The distance matrix to use for the alignment
	29	% scoring. Default is BLOSUM50 for 'AA' and NUC44
	30	% for 'NT'
	31	%
	32	% 'NumReps' The number of refining iterations to do.
	33	%
	34	% 'NumTry' The number of tries to perform at each iteration.
	35	%
	36	% 'RefineType' Which type of refinement to perform. This can
	37	% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
	38	%
	39	% 'Display' A boolean indicating whether to display the output
	40	% at each iteration.
	41	%
	42	%
	43
	44
	45
	46
	47	RM_FRAC = 0.1;
	48	NUM_TRIES = 10;
	49	NUM_REPS = 100;
	50	alpha = 'aa';
	51	dist_mat = [];
	52	gap_penalty = -8;
	53	TYPE = 'mixed';
	54	disp_flag = true;
	55
	56	for i = 1:2:length(varargin)
	57	switch lower(varargin{i})
	58
	59	case 'alphabet'
	60	if strcmpi('nt', varargin{i+1}) \|\| strcmpi('aa', varargin{i+1})
	61	alpha = varargin{i+1};
	62	else
	63	error('RefineAlignments:BADALPHA', ...
	64	'Arguement to "alphabet" must be "NT" or "AA"')
	65	end
	66	case 'gappenalty'
	67	if isnumeric(varargin{i+1}) && varargin{i+1} < 0
	68	gap_penalty = varargin{i+1};
	69	else
	70	error('RefineAlignments:BADGAP', ...
	71	'Arguement to "GapPenalty" must be negative numeric')
	72	end
	73	case 'distmat'
	74	dist_mat = varargin{i+1};
	75
	76	case 'numreps'
	77	if isnumeric(varargin{i+1}) && varargin{i+1} > 1
	78	NUM_REPS = varargin{i+1};
	79	else
	80	error('RefineAlignments:BADREPS', ...
	81	'Arguement to NumReps must be a positive numeric')
	82	end
	83
	84	case 'refinetype'
	85	TYPE = varargin{i+1};
	86
	87	case 'display'
	88	if islogical(varargin{i+1})
	89	disp_flag = varargin{i+1};
	90	else
	91	error('RefineAlignments:DISPLAY', ...
	92	'Arguement to Display must be a boolean.')
	93	end
	94
	95	otherwise
	96	error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
	97	lower(varargin{i}))
	98
	99	end
	100	end
	101
	102	if isempty(dist_mat)
	103	if strcmpi(alpha, 'nt')
	104	dist_mat = nuc44;
	105	toint = @nt2int;
	106	else
	107	dist_mat = blosum50;
	108	toint = @aa2int;
	109	end
	110	end
	111
	112	dist_mat(end+1,:) = gap_penalty;
	113	dist_mat(:,end+1) = gap_penalty;
	114	dist_mat(end,end) = 0;
	115
	116	switch upper(TYPE)
	117	case 'MIXED'
	118	is_stoch = true;
	119	is_mix = true;
	120	case 'STOCASTIC'
	121	is_stoch = true;
	122	is_mix = false;
	123	case 'DETERMINISTIC'
	124	is_stoch = false;
	125	is_mix = false;
	126	end
	127
	128
	129	OUT_ALIGN = IN_ALIGN;
	130	cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
	131
	132	for i = 1:NUM_REPS
	133
	134	%determine whether to switch based on the "mix" factor
	135	if is_mix
	136	is_stoch = ~is_stoch;
	137	end
	138
	139
	140	rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
	141	counter = 1;
	142	while counter < NUM_TRIES
	143	new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
	144
	145	new_score = CalculateScore(new_mat, dist_mat, toint);
	146	if new_score > cur_score
	147	if disp_flag
	148	disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
	149	num2str(abs(new_score-cur_score)), ' and by ', ...
	150	num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
	151	' columns', ' Using stoch:', num2str(is_stoch)};
	152	display([disp_cell{:}])
	153	end
	154	cur_score = new_score;
	155	OUT_ALIGN = new_mat;
	156	break
	157	end
	158	counter = counter + 1;
	159	end
	160	end
	161
	162	function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
	163	% GetRows
	164	% A helper function which determines the rows to split the alignment
	165	% with.
	166	%
	167	% MAT A char-array of the multiple alignment.
	168	% STOCH A boolean indicating whether to use a stochastic approach.
	169	% RM_FRAC The fraction of rows to remove.
	170	% NUM_TRIES The number of tries to return.
	171	%
	172	% rows_mat A matrix indicating which rows to remove.
	173	%
	174
	175	if STOCH
	176	rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
	177	if any(all(rows_mat,1)) \|\| any(all(~rows_mat,1))
	178	cols = any(all(rows_mat,1)) \|\| any(all(~rows_mat,1));
	179	rows_mat(1,cols) = ~rows_mat(1,cols);
	180	end
	181
	182	else
	183	gap_mask = MAT =='-';
	184	runs = FindRuns(gap_mask);
	185	score = mode(runs.*(runs~=0))-min(runs);
	186	[~, order] = sort(score, 'descend');
	187	rm_rows = order(1:NUM_TRIES);
	188	[~, norder] = sort(runs(:,rm_rows),'descend');
	189	rows_mat = false(size(MAT,1), NUM_TRIES);
	190	num_remove = ceil(RM_FRAC*size(MAT,1));
	191	for i = 1:NUM_TRIES
	192	rows_mat(norder(1:num_remove,i),i) = true;
	193	end
	194	end
	195
	196	function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
	197	% SplitAndReAlign
	198	% This will split the alignment matix based on the provided rows and
	199	% then create two profiles and re-align them.
	200	%
	201	% MAT A char-array of the alignment.
	202	% ROWS A boolean-array describing the switch between rows
	203	% alpha Either 'NT' or 'AA' to indicate which sequence profile to
	204	% create.
	205	%
	206	% NEW_MAT The ReAlinged alignment
	207	%
	208
	209	mat1 = MAT(ROWS, :);
	210	mat2 = MAT(~ROWS, :);
	211
	212	mat1(:,all(mat1 == '-',1)) = [];
	213	mat2(:,all(mat2 == '-',1)) = [];
	214
	215	pmat1 = seqprofile(mat1, 'alphabet', alpha);
	216	pmat2 = seqprofile(mat2, 'alphabet', alpha);
	217	try
	218	[~, ind1, ind2] = profalign(pmat1, pmat2);
	219	catch
	220	ind1 = 1:size(pmat1,2);
	221	ind2 = 1:size(pmat2,2);
	222	end
	223
	224	NEW_MAT = char(0);
	225	NEW_MAT(ROWS, ind1) = mat1;
	226	NEW_MAT(~ROWS, ind2) = mat2;
	227	NEW_MAT(~isletter(NEW_MAT)) = '-';
	228	NEW_MAT(all(NEW_MAT == '-',1)) = [];
	229
	230
	231
	232
	233	function s = CalculateScore(MAT, DIST, FUN)
	234	% CalculateScore
	235	% A function which calculates the score of the alignment. This score
	236	% is based on the agreement between the consensus and each individual
	237	% alignment.
	238	%
	239	% MAT A char-array of a multiple alignment.
	240	% DIST A distance matrix which indicates the penalty for
	241	% mismatches.
	242	% FUN A function which converts letter to numbers such at nt2int
	243	% or aa2int.
	244	%
	245	% s The score for this alingment.
	246	%
	247	%
	248
	249	cons = seqconsensus(MAT);
	250	num_cons = FUN(cons);
	251	num_align = FUN(MAT);
	252	ncons = repmat(num_cons(:), [size(num_align,1), 1]);
	253	nalign = num_align(:);
	254	lookupinds = sub2ind(size(DIST), ncons, nalign);
	255	s = sum(DIST(lookupinds));
	256
	257	function reverse_looking = FindRuns(input)
	258	% FindRuns
	259	% Finds consecutive runs of 1's along the rows of a boolean array.
	260	%
	261	% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
	262	% 0 1 1 1 0 1 1 0 0 1 1 1];
	263	% RL = [1 2 3 0 0 0 1 0 1 0 1 2;
	264	% [0 3 2 1 0 1 2 0 0 1 2 3];
	265	%
	266
	267	[m,n] = size(input);
	268	reverse_looking = [zeros(1,m);input.'];
	269	reverse_looking = reverse_looking(:);
	270	p = find(~reverse_looking);
	271	reverse_looking(p) = [0;1-diff(p)];
	272	reverse_looking = reshape(cumsum(reverse_looking),[],m).';
	273	reverse_looking(:,1) = [];
	274
	275
	276