-rw-r--r-- | test/entropy/Align2Ref.m | 24 | ||||
-rw-r--r-- | test/entropy/CalculateEntropy.m | 15 | ||||
-rw-r--r-- | test/entropy/CalculateProteinEntropy.m | 25 | ||||
-rw-r--r-- | test/entropy/FastNWalign2.c | 94 | ||||
-rw-r--r-- | test/entropy/GenomeAlignments.m | 31 | ||||
-rw-r--r-- | test/entropy/GenomePairwiseDist.m | 98 | ||||
-rw-r--r-- | test/entropy/RefineAlignments.m | 276 | ||||
-rw-r--r-- | test/entropy/don_anal.m | 40 | ||||
-rw-r--r-- | test/entropy/nwalign_mod.m | 637 |
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m new file mode 100644 index 0000000..544db0a --- a/dev/null +++ b/test/entropy/GenomePairwiseDist.m @@ -0,0 +1,98 @@ +function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
+% GenomePairwiseDist
+% This will align all of the protein segments in the genome structure
+% and return a cell-array. Each cell is the alignment of one
+% protein. If the protein is missing then the distance is NaN.
+%
+
+if nargin < 1
+ ALIGN_NAME = 'saved_alignments';
+end
+
+try
+ display('Loading Previous Alignments')
+ d = load(ALIGN_NAME);
+ saved_dists = d.DIST_CELL;
+ saved_headers = d.header_names;
+catch %#ok<CTCH>
+ display('No previous alignments found')
+ saved_headers = {};
+ saved_dists = zeros(numel(IN_PROTS));
+end
+
+NUM_LETTERS = 4000000;
+
+
+[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
+
+display('Starting Alignments')
+
+dist_mat = NaN(size(IN_PROTS,1));
+
+[tf loc]=ismember(header_names, saved_headers);
+
+dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
+[I J] = find(triu(isnan(dist_mat)));
+inds = sub2ind(size(dist_mat), I, J);
+
+sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
+
+%find all of the ones that are not worth aligning
+bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
+dist_mat(inds(bad_mask)) = Inf;
+I(bad_mask) = [];
+J(bad_mask) = [];
+
+p_sizes = sizes(I)+sizes(J);
+counter = 1;
+display('Aligning...')
+tic
+while counter < length(I)
+ cum_sizes = cumsum(p_sizes(counter+1:end));
+ top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
+
+ I_spots = I(counter:top);
+ J_spots = J(counter:top);
+ need_align = cell(length(I_spots),2);
+ [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
+ [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
+ dist = zeros(size(I_spots));
+
+ %parfor k = 1:size(need_align,1)
+ for k = 1:size(need_align,1)
+ %display([num2str(k) ' ' num2str(toc)])
+ try
+ dist(k)=nwalign_mod(need_align{k,:});
+ catch %#ok<CTCH>
+ dist(k) = Inf;
+ end
+ end
+ time = toc;
+ inds = sub2ind(size(dist_mat), I_spots,J_spots);
+ dist_mat(inds) = dist;
+ counter = top+1;
+ display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
+end
+
+DIST_CELL = dist_mat;
+save(ALIGN_NAME, 'DIST_CELL', 'header_names')
+DIST_OUT = dfun(DIST_CELL);
+
+
+
+
+
+function out_d = dfun(MAT)
+
+score12 = triu(MAT,1);
+self_d = diag(MAT);
+d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
+
+out_d = triu(d,1) + triu(d,1)';
+
+
+
+
+
+
+
|