summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-08 17:20:16 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-08 17:20:16 (GMT)
commit170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf (patch) (side-by-side diff)
treebcfea699462d48906f11dfc5fb4d36725bc12710
parent15b4e1318130d4c49109746efecc1018dce54da9 (diff)
downloadexp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.zip
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.gz
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.bz2
Will Dampier's Matlab code for analyzing the entropy of the collection by sequence position and time.
-rw-r--r--test/entropy/Align2Ref.m24
-rw-r--r--test/entropy/CalculateEntropy.m15
-rw-r--r--test/entropy/CalculateProteinEntropy.m25
-rw-r--r--test/entropy/FastNWalign2.c94
-rw-r--r--test/entropy/GenomeAlignments.m31
-rw-r--r--test/entropy/GenomePairwiseDist.m98
-rw-r--r--test/entropy/RefineAlignments.m276
-rw-r--r--test/entropy/don_anal.m40
-rw-r--r--test/entropy/nwalign_mod.m637
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/Align2Ref.m b/test/entropy/Align2Ref.m
new file mode 100644
index 0000000..3021d4a
--- a/dev/null
+++ b/test/entropy/Align2Ref.m
@@ -0,0 +1,24 @@
+function NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS, ALPHA)
+% Align2Ref
+% Uses a profile alignment to change a set of indicices in the
+% "alignment space" to the "reference space".
+%
+%
+% NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS)
+%
+
+if nargin == 3
+ ALPHA = 'aa';
+end
+
+ref_prof = seqprofile(REF_SEQ, 'alphabet', ALPHA);
+align_prof = seqprofile(ALIGN, 'alphabet', ALPHA);
+
+[~, ind1, ind2] = profalign(ref_prof, align_prof);
+
+ninds = ind2(ALIGN_INDS);
+
+mask = sum(bsxfun(@le, ind1(:)', ninds(:)),2);
+mask(mask == 0) = 1;
+
+NEW_INDS = reshape(ind1(mask), size(ALIGN_INDS)); \ No newline at end of file
diff --git a/test/entropy/CalculateEntropy.m b/test/entropy/CalculateEntropy.m
new file mode 100644
index 0000000..18bfda5
--- a/dev/null
+++ b/test/entropy/CalculateEntropy.m
@@ -0,0 +1,15 @@
+function OUT_VEC = CalculateEntropy(ARRAY)
+% CalculateEntropy
+% Calculate the Informationational Entropy of each position in a
+% multialignment. In this function I remove all gap characters from
+% each column of the alignment before doing the calculation.
+%
+%
+%
+
+num_array = aa2int(ARRAY);
+bins = histc(num_array, 0:21, 1);
+
+probs=bsxfun(@rdivide, bins(2:end-1,:),sum(bins(2:end-1,:)));
+
+OUT_VEC = -nansum(log(probs).*double(probs~=0).*probs); \ No newline at end of file
diff --git a/test/entropy/CalculateProteinEntropy.m b/test/entropy/CalculateProteinEntropy.m
new file mode 100644
index 0000000..41f06f5
--- a/dev/null
+++ b/test/entropy/CalculateProteinEntropy.m
@@ -0,0 +1,25 @@
+function OUT_DATA=CalculateProteinEntropy(Ref, ALIGN_CELL)
+
+OUT_DATA = cell(size(ALIGN_CELL));
+for i = 1:size(ALIGN_CELL,1)
+
+ aligned_seqs = ALIGN_CELL{i};
+
+ ref_seq = Ref.Sequence;
+
+ ref_prof = seqprofile(ref_seq, 'alphabet', 'aa');
+ seqs_prof = seqprofile(aligned_seqs, 'alphabet', 'aa');
+
+ [mp, H1, H2] = profalign(ref_prof, seqs_prof);
+
+ merged_seqs = char(zeros(size(aligned_seqs,1)+1, size(mp,2)));
+ merged_seqs(2:end, H2) = aligned_seqs;
+ merged_seqs(1, H1) = ref_seq;
+
+ entropy_vals = CalculateEntropy(merged_seqs);
+
+ %only pull out the values that are in the reference positions
+ ref_entropy = entropy_vals(H1);
+
+ OUT_DATA{i} = ref_entropy;
+end
diff --git a/test/entropy/FastNWalign2.c b/test/entropy/FastNWalign2.c
new file mode 100644
index 0000000..33b286d
--- a/dev/null
+++ b/test/entropy/FastNWalign2.c
@@ -0,0 +1,94 @@
+#include "mex.h"
+#include "matrix.h"
+
+void simplegap(double scoredMatchMat[],
+const double gap, int n, int m, double output_F[], double output_P[])
+{
+ // Standard Needleman-Wunsch algorithm
+
+ double up, left, diagonal, best, pos;
+
+
+ int i,j;
+
+
+ for(i=0;i<m;i++)
+ {
+ output_F[i]=i*gap;
+ output_P[i]=2;
+ }
+
+
+ for(j=0; j<n; j++) //put initial values in
+ {
+ output_F[j*m]=j*gap;
+ output_P[j*m]=4;
+ }
+ output_P[0]=1;
+
+
+
+ for(j=1;j<n;j++) //cycle through columns
+ {
+ best=output_F[(j)*m];
+
+ for(i=1; i<m; i++) //cycle through the rows
+ {
+ up=best+gap;
+ left=output_F[i+(j-1)*m]+gap;
+ diagonal=output_F[i-1+(j-1)*m]+scoredMatchMat[i-1+(j-1)*m];
+
+ if (up>left)
+ {
+ best=up;
+ pos=2;
+ }
+ else
+ {
+ best=left;
+ pos=4;
+ }
+
+ if (diagonal>=best)
+ {
+ best=diagonal;
+ output_P[i+j*m]=1;
+ }
+ else
+ {
+ output_P[i+j*m]=pos;
+ }
+ output_F[i+j*m]=best;
+ }
+ }
+}
+
+void mexFunction(int nlhs, mxArray *plhs[],
+int nrhs, const mxArray *prhs[])
+{
+
+ double *gap, *output_F, *output_P;
+ double *scoredMatchMat;
+ int n, m;
+ mwSize i;
+
+ //double *F_col, *ptr_col;
+ m=mxGetM(prhs[0]);
+ n=mxGetN(prhs[0]);
+ gap=mxGetPr(prhs[1]);
+
+ plhs[0]=mxCreateDoubleMatrix(m,n,mxREAL);
+ plhs[1]=mxCreateDoubleMatrix(m,n,mxREAL);
+
+ output_F=mxGetPr(plhs[0]);
+ output_P=mxGetPr(plhs[1]);
+
+
+ scoredMatchMat=mxGetPr(prhs[0]);
+ //
+
+
+ simplegap(scoredMatchMat,*gap,n,m,output_F,output_P);
+
+}
+
diff --git a/test/entropy/GenomeAlignments.m b/test/entropy/GenomeAlignments.m
new file mode 100644
index 0000000..be56570
--- a/dev/null
+++ b/test/entropy/GenomeAlignments.m
@@ -0,0 +1,31 @@
+function [ALIGN inds] = GenomeAlignments(IN_GENOMES, DIST_MAT)
+% GenomeAlignments
+% This will align all of the protein segments in the genome structure
+% and return a cell-array. Each cell is the alignment of one
+% protein. If the protein is missing then the distance is NaN.
+%
+
+[tree inds] = MakeTree(DIST_MAT);
+ALIGN = malign(tree, inds);
+
+ function align=malign(tree, inds)
+ seqs = arrayfun(@(x)(x.Sequence), IN_GENOMES(inds), 'uniformoutput', false);
+ align = multialign(seqs, tree);
+ end
+
+
+
+ function [tree_obj inds]=MakeTree(dist_mat)
+ % MakeTree
+ % Takes a distance matrix and returns a tree object and the indicies
+ % to the rows that are in the tree.
+ %
+
+ inds = find(~all(isnan(dist_mat)|isinf(dist_mat)|dist_mat==0));
+ dvec = squareform(dist_mat(inds, inds));
+ dvec(dvec <= 0) = rand(nnz(dvec <= 0),1)*min(dvec(dvec>0));
+ tree_obj = seqlinkage(dvec);
+
+ end
+
+end \ No newline at end of file
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m
new file mode 100644
index 0000000..544db0a
--- a/dev/null
+++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
+function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
+% GenomePairwiseDist
+% This will align all of the protein segments in the genome structure
+% and return a cell-array. Each cell is the alignment of one
+% protein. If the protein is missing then the distance is NaN.
+%
+
+if nargin < 1
+ ALIGN_NAME = 'saved_alignments';
+end
+
+try
+ display('Loading Previous Alignments')
+ d = load(ALIGN_NAME);
+ saved_dists = d.DIST_CELL;
+ saved_headers = d.header_names;
+catch %#ok<CTCH>
+ display('No previous alignments found')
+ saved_headers = {};
+ saved_dists = zeros(numel(IN_PROTS));
+end
+
+NUM_LETTERS = 4000000;
+
+
+[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
+
+display('Starting Alignments')
+
+dist_mat = NaN(size(IN_PROTS,1));
+
+[tf loc]=ismember(header_names, saved_headers);
+
+dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
+[I J] = find(triu(isnan(dist_mat)));
+inds = sub2ind(size(dist_mat), I, J);
+
+sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
+
+%find all of the ones that are not worth aligning
+bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
+dist_mat(inds(bad_mask)) = Inf;
+I(bad_mask) = [];
+J(bad_mask) = [];
+
+p_sizes = sizes(I)+sizes(J);
+counter = 1;
+display('Aligning...')
+tic
+while counter < length(I)
+ cum_sizes = cumsum(p_sizes(counter+1:end));
+ top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
+
+ I_spots = I(counter:top);
+ J_spots = J(counter:top);
+ need_align = cell(length(I_spots),2);
+ [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
+ [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
+ dist = zeros(size(I_spots));
+
+ %parfor k = 1:size(need_align,1)
+ for k = 1:size(need_align,1)
+ %display([num2str(k) ' ' num2str(toc)])
+ try
+ dist(k)=nwalign_mod(need_align{k,:});
+ catch %#ok<CTCH>
+ dist(k) = Inf;
+ end
+ end
+ time = toc;
+ inds = sub2ind(size(dist_mat), I_spots,J_spots);
+ dist_mat(inds) = dist;
+ counter = top+1;
+ display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
+end
+
+DIST_CELL = dist_mat;
+save(ALIGN_NAME, 'DIST_CELL', 'header_names')
+DIST_OUT = dfun(DIST_CELL);
+
+
+
+
+
+function out_d = dfun(MAT)
+
+score12 = triu(MAT,1);
+self_d = diag(MAT);
+d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
+
+out_d = triu(d,1) + triu(d,1)';
+
+
+
+
+
+
+
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m
new file mode 100644
index 0000000..0e69275
--- a/dev/null
+++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
+function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
+% RefineAlignments
+% Uses an interative algorithm to improve the multiple alignments of
+% an input sequence. It will iteratively remove a fraction of the
+% sequences, compress any gaps, and then re-align. This will be done
+% for NUM_REPS.
+%
+% Written by Will Dampier. Contact at wnd22@drexel.edu
+% Version 1.0 on 10/23/09
+%
+% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
+%
+% IN_ALIGN A char-array representing a multiple alignment to be
+% refined.
+%
+% OUT_ALIGN The refined alignment.
+% SCORE The new alignment score.
+%
+%
+% Optional Arguements:
+%
+% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a
+% nucleotide or amino acid alignment.
+%
+% 'GapPenalty' The penalty for gaps. This is set at -8 by
+% default.
+%
+% 'DistMat' The distance matrix to use for the alignment
+% scoring. Default is BLOSUM50 for 'AA' and NUC44
+% for 'NT'
+%
+% 'NumReps' The number of refining iterations to do.
+%
+% 'NumTry' The number of tries to perform at each iteration.
+%
+% 'RefineType' Which type of refinement to perform. This can
+% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
+%
+% 'Display' A boolean indicating whether to display the output
+% at each iteration.
+%
+%
+
+
+
+
+RM_FRAC = 0.1;
+NUM_TRIES = 10;
+NUM_REPS = 100;
+alpha = 'aa';
+dist_mat = [];
+gap_penalty = -8;
+TYPE = 'mixed';
+disp_flag = true;
+
+for i = 1:2:length(varargin)
+ switch lower(varargin{i})
+
+ case 'alphabet'
+ if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
+ alpha = varargin{i+1};
+ else
+ error('RefineAlignments:BADALPHA', ...
+ 'Arguement to "alphabet" must be "NT" or "AA"')
+ end
+ case 'gappenalty'
+ if isnumeric(varargin{i+1}) && varargin{i+1} < 0
+ gap_penalty = varargin{i+1};
+ else
+ error('RefineAlignments:BADGAP', ...
+ 'Arguement to "GapPenalty" must be negative numeric')
+ end
+ case 'distmat'
+ dist_mat = varargin{i+1};
+
+ case 'numreps'
+ if isnumeric(varargin{i+1}) && varargin{i+1} > 1
+ NUM_REPS = varargin{i+1};
+ else
+ error('RefineAlignments:BADREPS', ...
+ 'Arguement to NumReps must be a positive numeric')
+ end
+
+ case 'refinetype'
+ TYPE = varargin{i+1};
+
+ case 'display'
+ if islogical(varargin{i+1})
+ disp_flag = varargin{i+1};
+ else
+ error('RefineAlignments:DISPLAY', ...
+ 'Arguement to Display must be a boolean.')
+ end
+
+ otherwise
+ error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
+ lower(varargin{i}))
+
+ end
+end
+
+if isempty(dist_mat)
+ if strcmpi(alpha, 'nt')
+ dist_mat = nuc44;
+ toint = @nt2int;
+ else
+ dist_mat = blosum50;
+ toint = @aa2int;
+ end
+end
+
+dist_mat(end+1,:) = gap_penalty;
+dist_mat(:,end+1) = gap_penalty;
+dist_mat(end,end) = 0;
+
+switch upper(TYPE)
+ case 'MIXED'
+ is_stoch = true;
+ is_mix = true;
+ case 'STOCASTIC'
+ is_stoch = true;
+ is_mix = false;
+ case 'DETERMINISTIC'
+ is_stoch = false;
+ is_mix = false;
+end
+
+
+OUT_ALIGN = IN_ALIGN;
+cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
+
+for i = 1:NUM_REPS
+
+ %determine whether to switch based on the "mix" factor
+ if is_mix
+ is_stoch = ~is_stoch;
+ end
+
+
+ rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
+ counter = 1;
+ while counter < NUM_TRIES
+ new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
+
+ new_score = CalculateScore(new_mat, dist_mat, toint);
+ if new_score > cur_score
+ if disp_flag
+ disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
+ num2str(abs(new_score-cur_score)), ' and by ', ...
+ num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
+ ' columns', ' Using stoch:', num2str(is_stoch)};
+ display([disp_cell{:}])
+ end
+ cur_score = new_score;
+ OUT_ALIGN = new_mat;
+ break
+ end
+ counter = counter + 1;
+ end
+end
+
+function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
+% GetRows
+% A helper function which determines the rows to split the alignment
+% with.
+%
+% MAT A char-array of the multiple alignment.
+% STOCH A boolean indicating whether to use a stochastic approach.
+% RM_FRAC The fraction of rows to remove.
+% NUM_TRIES The number of tries to return.
+%
+% rows_mat A matrix indicating which rows to remove.
+%
+
+if STOCH
+ rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
+ if any(all(rows_mat,1)) || any(all(~rows_mat,1))
+ cols = any(all(rows_mat,1)) || any(all(~rows_mat,1));
+ rows_mat(1,cols) = ~rows_mat(1,cols);
+ end
+
+else
+ gap_mask = MAT =='-';
+ runs = FindRuns(gap_mask);
+ score = mode(runs.*(runs~=0))-min(runs);
+ [~, order] = sort(score, 'descend');
+ rm_rows = order(1:NUM_TRIES);
+ [~, norder] = sort(runs(:,rm_rows),'descend');
+ rows_mat = false(size(MAT,1), NUM_TRIES);
+ num_remove = ceil(RM_FRAC*size(MAT,1));
+ for i = 1:NUM_TRIES
+ rows_mat(norder(1:num_remove,i),i) = true;
+ end
+end
+
+function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
+% SplitAndReAlign
+% This will split the alignment matix based on the provided rows and
+% then create two profiles and re-align them.
+%
+% MAT A char-array of the alignment.
+% ROWS A boolean-array describing the switch between rows
+% alpha Either 'NT' or 'AA' to indicate which sequence profile to
+% create.
+%
+% NEW_MAT The ReAlinged alignment
+%
+
+mat1 = MAT(ROWS, :);
+mat2 = MAT(~ROWS, :);
+
+mat1(:,all(mat1 == '-',1)) = [];
+mat2(:,all(mat2 == '-',1)) = [];
+
+pmat1 = seqprofile(mat1, 'alphabet', alpha);
+pmat2 = seqprofile(mat2, 'alphabet', alpha);
+try
+ [~, ind1, ind2] = profalign(pmat1, pmat2);
+catch
+ ind1 = 1:size(pmat1,2);
+ ind2 = 1:size(pmat2,2);
+end
+
+NEW_MAT = char(0);
+NEW_MAT(ROWS, ind1) = mat1;
+NEW_MAT(~ROWS, ind2) = mat2;
+NEW_MAT(~isletter(NEW_MAT)) = '-';
+NEW_MAT(all(NEW_MAT == '-',1)) = [];
+
+
+
+
+function s = CalculateScore(MAT, DIST, FUN)
+% CalculateScore
+% A function which calculates the score of the alignment. This score
+% is based on the agreement between the consensus and each individual
+% alignment.
+%
+% MAT A char-array of a multiple alignment.
+% DIST A distance matrix which indicates the penalty for
+% mismatches.
+% FUN A function which converts letter to numbers such at nt2int
+% or aa2int.
+%
+% s The score for this alingment.
+%
+%
+
+cons = seqconsensus(MAT);
+num_cons = FUN(cons);
+num_align = FUN(MAT);
+ncons = repmat(num_cons(:), [size(num_align,1), 1]);
+nalign = num_align(:);
+lookupinds = sub2ind(size(DIST), ncons, nalign);
+s = sum(DIST(lookupinds));
+
+function reverse_looking = FindRuns(input)
+% FindRuns
+% Finds consecutive runs of 1's along the rows of a boolean array.
+%
+% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
+% 0 1 1 1 0 1 1 0 0 1 1 1];
+% RL = [1 2 3 0 0 0 1 0 1 0 1 2;
+% [0 3 2 1 0 1 2 0 0 1 2 3];
+%
+
+[m,n] = size(input);
+reverse_looking = [zeros(1,m);input.'];
+reverse_looking = reverse_looking(:);
+p = find(~reverse_looking);
+reverse_looking(p) = [0;1-diff(p)];
+reverse_looking = reshape(cumsum(reverse_looking),[],m).';
+reverse_looking(:,1) = [];
+
+
+
diff --git a/test/entropy/don_anal.m b/test/entropy/don_anal.m
new file mode 100644
index 0000000..7d0a648
--- a/dev/null
+++ b/test/entropy/don_anal.m
@@ -0,0 +1,40 @@
+d = fastaread('C:\dondata\FASTA2.fa');
+lens = arrayfun(@(x)(numel(x.Sequence)), d);
+d(lens < 300) = [];
+rexp = '/Human/(\w*)/([\d\w]*)/(.*?)/(\d{4})/';
+headers = cell(length(d), 1);
+[headers{:}] = deal(d(:).Header);
+groups = regexp(headers, rexp, 'tokens', 'once');
+cgroups = cat(1,groups{:});
+dates = unique(cgroups(:,end));
+
+seqs = cell(length(d), 1);
+[seqs{:}] = deal(d(:).Sequence);
+
+ALIGN_CELL = cell(length(dates), 1);
+INDS_CELL = cell(length(dates), 1);
+PAIRWISE_CELL = cell(length(dates),1);
+for i = 1:length(dates)
+ display(dates{i})
+ tf = strcmp(cgroups(:,end), dates{i});
+ if isempty(PAIRWISE_CELL{i})
+ pairwise_dists = GenomePairwiseDist(d(tf), dates{i});
+ PAIRWISE_CELL{i} = pairwise_dists;
+ else
+ pairwise_dists = PAIRWISE_CELL{i};
+ end
+ if isempty(ALIGN_CELL{i}) || isempty(INDS_CELL{i})
+ [ALIGN_CELL{i}, INDS_CELL{i}] = GenomeAlignments(d(tf), ...
+ pairwise_dists);
+ save restore_data PAIRWISE_CELL ALIGN_CELL INDS_CELL
+ end
+end
+
+for i = 1:length(ALIGN_CELL)
+ [ALIGN_CELL{i}, ~]= RefineAlignments(ALIGN_CELL{i});
+end
+
+
+ents = CalculateProteinEntropy(d(1), ALIGN_CELL);
+
+dents = cat(1, ents{:});
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m
new file mode 100644
index 0000000..76aade6
--- a/dev/null
+++ b/test/entropy/nwalign_mod.m
@@ -0,0 +1,637 @@
+function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin)
+%NWALIGN performs Needleman-Wunsch global alignment of two sequences.
+%
+% NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal
+% alignment. Note: The scale factor used to calculate the score is
+% provided by the scoring matrix info (see below). If this is not
+% defined, then NWALIGN returns the raw score.
+%
+% [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an
+% optimal global alignment of amino acid (or nucleotide) sequences SEQ1
+% and SEQ2.
+%
+% [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2) returns a 2x1 vector
+% with the starting point indices indicating the starting point of the
+% alignment in the two sequences. Note: this output is for consistency
+% with SWALIGN and will always be [1;1] because this is a global
+% alignment.
+%
+% NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are
+% amino acids ('AA') or nucleotides ('NT'). The default is AA.
+%
+% NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be
+% used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT.
+%
+% NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring
+% matrix to return the score using arbitrary units. If the scoring matrix
+% Info also provides a scale factor, then both are used.
+%
+% NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap
+% in the alignment. The default gap open penalty is 8.
+%
+% NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a
+% gap in the alignment. If EXTENDGAP is not specified, then extensions to
+% gaps are scored with the same value as GAPOPEN.
+%
+% NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the
+% winning path.
+%
+%
+% Examples:
+%
+% % Return the score in bits and the global alignment using the
+% % default scoring matrix (BLOSUM50).
+% [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD')
+%
+% % Use user-specified scoring matrix and "gap open" penalty.
+% [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',...
+% 'scoringmatrix', @pam250, 'gapopen',5)
+%
+% % Return the score in nat units (nats).
+% [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2))
+%
+% % Display the scoring space and the winning path.
+% nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true)
+%
+% See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT,
+% SHOWALIGNMENT, SWALIGN.
+
+% References:
+% R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence
+% Analysis. Cambridge UP, 1998.
+% Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453
+
+% Copyright 2002-2006 The MathWorks, Inc.
+% $Revision: 1.22.6.10 $ $Date: 2006/05/17 20:48:28 $
+
+gapopen = -8;
+gapextend = -8;
+setGapExtend = false;
+showscore=false;
+isAminoAcid = true;
+scale=1;
+
+% If the input is a structure then extract the Sequence data.
+if isstruct(seq1)
+ try
+ seq1 = seqfromstruct(seq1);
+ catch
+ rethrow(lasterror);
+ end
+end
+if isstruct(seq2)
+ try
+ seq2 = seqfromstruct(seq2);
+ catch
+ rethrow(lasterror);
+ end
+end
+if nargin > 2
+ if rem(nargin,2) == 1
+ error('Bioinfo:IncorrectNumberOfArguments',...
+ 'Incorrect number of arguments to %s.',mfilename);
+ end
+ okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'};
+ for j=1:2:nargin-2
+ pname = varargin{j};
+ pval = varargin{j+1};
+ k = strmatch(lower(pname), okargs); %#ok
+ if isempty(k)
+ error('Bioinfo:UnknownParameterName',...
+ 'Unknown parameter name: %s.',pname);
+ elseif length(k)>1
+ error('Bioinfo:AmbiguousParameterName',...
+ 'Ambiguous parameter name: %s.',pname);
+ else
+ switch(k)
+ case 1 % scoring matrix
+ if isnumeric(pval)
+ ScoringMatrix = pval;
+ else
+ if ischar(pval)
+ pval = lower(pval);
+ end
+ try
+ [ScoringMatrix,ScoringMatrixInfo] = feval(pval);
+ catch
+ error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.');
+ end
+ end
+ case 2 %gap open penalty
+ gapopen = -pval;
+ case 3 %gap extend penalty
+ gapextend = -pval;
+ setGapExtend = true;
+ case 4 %if sequence is nucleotide
+ if strcmpi(pval,'nt')
+ isAminoAcid = false;
+ end
+ case 5 % scale
+ scale=pval;
+ case 6 % showscore
+ showscore = pval == true;
+ end
+ end
+ end
+end
+
+% setting the default scoring matrix
+if ~exist('ScoringMatrix','var')
+ if isAminoAcid
+ [ScoringMatrix,ScoringMatrixInfo] = blosum50;
+ else
+ [ScoringMatrix,ScoringMatrixInfo] = nuc44;
+ end
+end
+
+
+% getting the scale from ScoringMatrixInfo, if it exists
+if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale')
+ scale=scale*ScoringMatrixInfo.Scale;
+end
+
+% handle properly "?" characters typically found in pdb files
+if isAminoAcid
+ if ischar(seq1)
+ seq1 = strrep(seq1,'?','X');
+ else
+ seq1(seq1 == 26) = 23;
+ end
+ if ischar(seq2)
+ seq2 = strrep(seq2,'?','X');
+ else
+ seq2(seq2 == 26) = 23;
+ end
+end
+
+% check input sequences
+% if isAminoAcid && ~(isaa(seq1) && isaa(seq2))
+% error('Bioinfo:InvalidAminoAcidSequences',...
+% 'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.');
+% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2))
+% error('Bioinfo:InvalidNucleotideSequences',...
+% 'When ALPHABET = ''NT'', both sequences must be nucleotides.');
+% end
+
+% use numerical arrays for easy indexing
+if ischar(seq1)
+ seq1=upper(seq1); %the output alignment will be all uppercase
+ if isAminoAcid
+ intseq1 = aa2int(seq1);
+ else
+ intseq1 = nt2int(seq1);
+ end
+else
+ intseq1=seq1;
+ if isAminoAcid
+ seq1 = int2aa(intseq1);
+ else
+ seq1 = int2nt(intseq1);
+ end
+end
+if ischar(seq2)
+ seq2=upper(seq2); %the output alignment will be all uppercase
+ if isAminoAcid
+ intseq2 = aa2int(seq2);
+ else
+ intseq2 = nt2int(seq2);
+ end
+else
+ intseq2=seq2;
+ if isAminoAcid
+ seq2 = int2aa(intseq2);
+ else
+ seq2 = int2nt(intseq2);
+ end
+end
+
+
+m = length(seq1);
+n = length(seq2);
+if ~n||~m
+ error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0');
+end
+
+% If unknown, ambiguous or gaps appear in the sequence, we need to make
+% sure that ScoringMatrix can handle them.
+
+% possible values are
+% B Z X * - ?
+% 21 22 23 24 25 26
+
+scoringMatrixSize = size(ScoringMatrix,1);
+
+highestVal = max([intseq1, intseq2]);
+if highestVal > scoringMatrixSize
+ % if the matrix contains the 'Any' we map to that
+ if isAminoAcid
+ anyVal = aa2int('X');
+ else
+ anyVal = nt2int('N');
+ end
+ if scoringMatrixSize >= anyVal
+ intseq1(intseq1>scoringMatrixSize) = anyVal;
+ intseq2(intseq2>scoringMatrixSize) = anyVal;
+ else
+ error('Bioinfo:InvalidSymbolsInInputSequeces',...
+ 'Sequences contain symbols that cannot be handled by the given scoring matrix.');
+ end
+end
+
+if setGapExtend % call more complicated algorithm if we have
+ [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend);
+else
+% [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen);
+ scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)];
+ [F pointer]=FastNWalign2(scoredMatchMat,gapopen);
+end
+
+% trace back through the pointer matrix
+halfn=ceil((n+1)/2);
+halfm=ceil((m+1)/2);
+
+i = n+1; j = m+1;
+path = zeros(n+m,2);
+step = 1;
+
+score = max(F(n+1,m+1,:));
+
+if setGapExtend
+
+ if F(n+1,m+1,3)==score % favor with left-gap
+ laststate=3;
+ elseif F(n+1,m+1,2)==score % then with up-gap
+ laststate=2;
+ else % at last with match
+ laststate=1;
+ end
+
+ while (i>halfn && j>halfm) % in the rigth half favor gaps when several
+ % paths lead to the highest score
+ state=laststate;
+ if bitget(pointer(i,j,state),3)
+ laststate=3;
+ elseif bitget(pointer(i,j,state),2)
+ laststate=2;
+ else
+ laststate=1;
+ end
+
+ switch state
+ case 1 % is diagonal
+ j = j - 1;
+ i = i - 1;
+ path(step,:) = [j,i];
+ case 2 % is up
+ i = i - 1;
+ path(step,2) = i;
+ case 3 % is left
+ j = j - 1;
+ path(step,1) = j;
+ end
+ step = step +1;
+ end
+
+ while (i>1 || j>1) % in the rigth half favor matchs when several
+ % paths lead to the highest score
+ state=laststate;
+ if bitget(pointer(i,j,state),1)
+ laststate=1;
+ elseif bitget(pointer(i,j,state),2)
+ laststate=2;
+ else
+ laststate=3;
+ end
+
+ switch state
+ case 1 % is diagonal
+ j = j - 1;
+ i = i - 1;
+ path(step,:) = [j,i];
+ case 2 % is up
+ i = i - 1;
+ path(step,2) = i;
+ case 3 % is left
+ j = j - 1;
+ path(step,1) = j;
+ end
+ step = step +1;
+ end
+
+else % ~setGapExtend
+
+ while (i > 1 || j > 1)
+
+ switch pointer(i,j)
+ case 1 % diagonal only
+ j = j - 1;
+ i = i - 1;
+ path(step,:) = [j,i];
+ case 2 % up only
+ i = i - 1;
+ path(step,2) = i;
+ case 4 % left only
+ j = j - 1;
+ path(step,1) = j;
+ case 6 % up or left --> up (favors gaps in seq2)
+ j = j - 1;
+ path(step,1) = j;
+ otherwise %3 diagonal or up --> diagonal (favors no gaps)
+ %4 diagonal or left --> diagonal (favors no gaps)
+ %7 diagonal or left or up --> diagonal (favors no gaps)
+ j = j - 1;
+ i = i - 1;
+ path(step,:) = [j,i];
+ end
+ step = step +1;
+ end
+
+end % if setGapExtend
+
+% re-scaling the output score
+score = scale * score;
+
+if nargout<=1 && ~showscore
+ return
+end
+
+path(step:end,:) = []; % step-1 is the length of the alignment
+path = flipud(path);
+
+% setting the size of the alignment
+alignment = repmat(('- -')',1,step-1);
+
+% adding sequence to alignment
+alignment(1,path(:,1)>0) = seq1;
+alignment(3,path(:,2)>0) = seq2;
+
+% find locations where there are no gaps
+h=find(all(path>0,2));
+if isAminoAcid
+ noGaps1=aa2int(alignment(1,h));
+ noGaps2=aa2int(alignment(3,h));
+else
+ noGaps1=nt2int(alignment(1,h));
+ noGaps2=nt2int(alignment(3,h));
+end
+
+% erasing symbols that can not be scored
+htodel=max([noGaps1;noGaps2])>scoringMatrixSize;
+h(htodel)=[];
+noGaps1(htodel)=[];
+noGaps2(htodel)=[];
+% score pairs with no gap
+value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2)));
+
+% insert symbols of the match string into the alignment
+alignment(2,h(value>=0)) = ':';
+alignment(2,h(noGaps1==noGaps2)) = '|';
+
+startat = [1;1];
+
+% undocumented fourth output -- score and pointer matrices
+if nargout > 3
+ matrices.Scores = F;
+ matrices.Pointers = pointer;
+end
+
+if showscore
+ figure
+ F=scale.*max(F(2:end,2:end,:),[],3);
+ clim=max(max(max(abs(F(~isinf(F))))),eps);
+ imagesc(F,[-clim clim]);
+ colormap(privateColorMap(1));
+ set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])])
+ title('Score for best path')
+ xlabel('Sequence 1')
+ ylabel('Sequence 2')
+ hold on
+ plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.')
+end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap)
+% % Standard Needleman-Wunsch algorithm
+%
+% %LOOP IMPLEMENTED IN FastNWalign2.c
+%
+% % set up storage for dynamic programming matrix
+% F = zeros(n+1,m+1);
+% F(2:end,1) = gap * (1:n)';
+% F(1,2:end) = gap * (1:m);
+%
+%
+% %and for the back tracing matrix
+% pointer= repmat(uint8(4),n+1,m+1);
+% pointer(:,1) = 2; % up
+% pointer(1,1) = 1;
+%
+%
+% %initialize buffers to the first column
+% ptr = pointer(:,2); % ptr(1) is always 4
+% currentFColumn = F(:,1);
+%
+%
+% %main loop runs through the matrix looking for maximal scores
+%
+% for outer = 2:m+1
+%
+% %score current column
+% scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+% %grab the data from the matrices and initialize some values
+% lastFColumn = F(:,outer-1);
+% currentFColumn = F(:,outer);
+% best = currentFColumn(1);
+%
+% %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap);
+%
+% for inner = 2:n+1
+% % score the three options
+% up = best + gap;
+% left = lastFColumn(inner) + gap;
+% diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1);
+%
+% % max could be used here but it is quicker to use if statements
+% if up > left
+% best = up;
+% pos = 2;
+% else
+% best = left;
+% pos = 4;
+% end
+%
+% if diagonal >= best
+% best = diagonal;
+% ptr(inner) = 1;
+% else
+% ptr(inner) = pos;
+% end
+% currentFColumn(inner) = best;
+%
+% end % inner
+% %put back updated columns
+%
+% % save columns of pointers
+% F(:,outer) = currentFColumn;
+% % save columns of pointers
+% pointer(:,outer) = ptr;
+%
+% end % outer
+%
+% end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend)
+ % Needleman-Wunsch algorithm modified to handle affine gaps
+
+ % Set states
+ inAlign = 1;
+ inGapUp = 2;
+ inGapLeft = 3;
+ numStates = 3;
+
+ % Set up storage for dynamic programming matrix:
+ % for keeping the maximum scores for every state
+
+ F = zeros(n+1,m+1,numStates);
+ F(:,1,:) = -inf;
+ F(1,:,:) = -inf;
+ F(1,1,inAlign) = 0;
+
+ F(2:end,1,inGapUp) = gapopen + gapextend * (0:n-1)';
+ F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1);
+
+ % and for the back tracing pointers
+ pointer(n+1,m+1,numStates) = uint8(0);
+ pointer(2:end,1,inGapUp) = 2; % up
+ pointer(1,2:end,inGapLeft) = 4; % left
+
+ % initialize buffers to the first column
+ ptrA = pointer(:,1,inAlign);
+ ptrU = pointer(:,1,inGapLeft);
+ ptrL = pointer(:,1,inGapUp);
+
+ currentFColumnA = F(:,1,inAlign);
+ currentFColumnU = F(:,1,inGapUp);
+ currentFColumnL = F(:,1,inGapLeft);
+
+ % main loop runs through the matrix looking for maximal scores
+ for outer = 2:m+1
+ % score current column
+ scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+ % grab the data from the matrices and initialize some values for the
+ % first row the most orderly possible
+ lastFColumnA = currentFColumnA;
+ currentFColumnA = F(:,outer,inAlign);
+ bestA = currentFColumnA(1);
+ currentinA = lastFColumnA(1);
+
+ lastFColumnU = currentFColumnU;
+ currentFColumnU = F(:,outer,inGapUp);
+ bestU = currentFColumnU(1);
+
+ lastFColumnL = currentFColumnL;
+ currentFColumnL = F(:,outer,inGapLeft);
+ currentinGL = lastFColumnL(1);
+
+ for inner = 2:n+1
+
+ % grab the data from the columns the most orderly possible
+ upOpen = bestA + gapopen;
+ inA = currentinA;
+ currentinA = lastFColumnA(inner);
+ leftOpen = currentinA + gapopen;
+
+ inGL = currentinGL;
+ currentinGL = lastFColumnL(inner);
+ leftExtend = currentinGL + gapextend;
+
+ upExtend = bestU + gapextend;
+ inGU = lastFColumnU(inner-1);
+
+ % operate state 'inGapUp'
+
+ if upOpen > upExtend
+ bestU = upOpen; ptr = 1; % diagonal
+ elseif upOpen < upExtend
+ bestU = upExtend; ptr = 2; % up
+ else % upOpen == upExtend
+ bestU = upOpen; ptr = 3; % diagonal and up
+ end
+ currentFColumnU(inner)=bestU;
+ ptrU(inner)=ptr;
+
+ % operate state 'inGapLeft'
+
+ if leftOpen > leftExtend
+ bestL = leftOpen; ptr = 1; % diagonal
+ elseif leftOpen < leftExtend
+ bestL = leftExtend; ptr = 4; % left
+ else % leftOpen == leftExtend
+ bestL = leftOpen; ptr = 5; % diagonal and left
+ end
+ currentFColumnL(inner) = bestL;
+ ptrL(inner) = ptr;
+
+ % operate state 'inAlign'
+
+ if inA > inGU
+ if inA > inGL
+ bestA = inA; ptr = 1; % diagonal
+ elseif inGL > inA
+ bestA = inGL; ptr = 4; % left
+ else
+ bestA = inA; ptr = 5; % diagonal and left
+ end
+ elseif inGU > inA
+ if inGU > inGL
+ bestA = inGU; ptr = 2; % up
+ elseif inGL > inGU
+ bestA = inGL; ptr = 4; % left
+ else
+ bestA = inGU; ptr = 6; % up & left
+ end
+ else
+ if inA > inGL
+ bestA = inA; ptr = 3; % diagonal & up
+ elseif inGL > inA
+ bestA = inGL; ptr = 4; % left
+ else
+ bestA = inA; ptr = 7; % all
+ end
+ end
+
+ bestA = bestA + scoredMatchColumn(inner-1);
+ currentFColumnA(inner) = bestA;
+ ptrA(inner) = ptr;
+
+ end %inner
+
+ % put back updated columns
+ F(:,outer,inGapLeft) = currentFColumnL;
+ F(:,outer,inGapUp) = currentFColumnU;
+ F(:,outer,inAlign) = currentFColumnA;
+ % save columns of pointers
+ pointer(:,outer,inAlign) = ptrA;
+ pointer(:,outer,inGapUp) = ptrU;
+ pointer(:,outer,inGapLeft)= ptrL;
+ end %outer
+ end
+ function pcmap = privateColorMap(selection)
+ %PRIVATECOLORMAP returns a custom color map
+ switch selection
+ case 1, pts = [0 0 .3 20;
+ 0 .1 .8 25;
+ 0 .9 .5 15;
+ .9 1 .9 8;
+ 1 1 0 26;
+ 1 0 0 26;
+ .4 0 0 0];
+ otherwise, pts = [0 0 0 128; 1 1 1 0];
+ end
+ xcl=1;
+ for i=1:size(pts,1)-1
+ xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1];
+ end
+ pcmap = interp1(pts(:,1:3),xcl);
+ end
+end
+

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.