9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m
new file mode 100644
index 0000000..544db0a
--- a/dev/null
+++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
+function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
+%   GenomePairwiseDist
+%       This will align all of the protein segments in the genome structure
+%       and return a cell-array.  Each cell is the alignment of one
+%       protein.  If the protein is missing then the distance is NaN.
+%
+
+if nargin < 1
+	ALIGN_NAME = 'saved_alignments';
+end
+
+try
+    display('Loading Previous Alignments')
+    d = load(ALIGN_NAME);
+    saved_dists = d.DIST_CELL;
+    saved_headers = d.header_names;
+catch %#ok<CTCH>
+    display('No previous alignments found')
+    saved_headers = {};
+    saved_dists = zeros(numel(IN_PROTS));
+end
+            
+NUM_LETTERS = 4000000;
+
+
+[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
+
+display('Starting Alignments')
+
+dist_mat = NaN(size(IN_PROTS,1));
+
+[tf loc]=ismember(header_names, saved_headers);
+
+dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
+[I J] = find(triu(isnan(dist_mat)));
+inds = sub2ind(size(dist_mat), I, J);
+
+sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
+
+%find all of the ones that are not worth aligning
+bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
+dist_mat(inds(bad_mask)) = Inf;
+I(bad_mask) = [];
+J(bad_mask) = [];
+
+p_sizes = sizes(I)+sizes(J);
+counter = 1;
+display('Aligning...')
+tic
+while counter < length(I)
+    cum_sizes = cumsum(p_sizes(counter+1:end));
+    top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
+    
+    I_spots = I(counter:top);
+    J_spots = J(counter:top);
+    need_align = cell(length(I_spots),2);
+    [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
+    [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
+    dist = zeros(size(I_spots));
+    
+	%parfor k = 1:size(need_align,1)
+    for k = 1:size(need_align,1)
+        %display([num2str(k) ' ' num2str(toc)])
+        try
+            dist(k)=nwalign_mod(need_align{k,:});
+        catch %#ok<CTCH>
+            dist(k) = Inf;
+        end
+    end
+    time = toc;
+    inds = sub2ind(size(dist_mat), I_spots,J_spots);
+    dist_mat(inds) = dist;
+    counter = top+1;
+    display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
+end
+
+DIST_CELL = dist_mat;
+save(ALIGN_NAME, 'DIST_CELL', 'header_names')
+DIST_OUT = dfun(DIST_CELL);
+
+
+
+
+
+function out_d = dfun(MAT)
+
+score12 = triu(MAT,1);
+self_d = diag(MAT);
+d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
+
+out_d = triu(d,1) + triu(d,1)';
+
+
+
+
+
+
+