9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m
new file mode 100644
index 0000000..544db0a
--- a/dev/null
+++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
+function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
+%   GenomePairwiseDist
+%       This will align all of the protein segments in the genome structure
+%       and return a cell-array.  Each cell is the alignment of one
+%       protein.  If the protein is missing then the distance is NaN.
+%
+if nargin < 1
+        ALIGN_NAME = 'saved_alignments';
+end
+try
+    display('Loading Previous Alignments')
+    d = load(ALIGN_NAME);
+    saved_dists = d.DIST_CELL;
+    saved_headers = d.header_names;
+catch %#ok<CTCH>
+    display('No previous alignments found')
+    saved_headers = {};
+    saved_dists = zeros(numel(IN_PROTS));
+end
+            
+NUM_LETTERS = 4000000;
+[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
+display('Starting Alignments')
+dist_mat = NaN(size(IN_PROTS,1));
+[tf loc]=ismember(header_names, saved_headers);
+dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
+[I J] = find(triu(isnan(dist_mat)));
+inds = sub2ind(size(dist_mat), I, J);
+sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
+%find all of the ones that are not worth aligning
+bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
+dist_mat(inds(bad_mask)) = Inf;
+I(bad_mask) = [];
+J(bad_mask) = [];
+p_sizes = sizes(I)+sizes(J);
+counter = 1;
+display('Aligning...')
+tic
+while counter < length(I)
+    cum_sizes = cumsum(p_sizes(counter+1:end));
+    top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
+    
+    I_spots = I(counter:top);
+    J_spots = J(counter:top);
+    need_align = cell(length(I_spots),2);
+    [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
+    [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
+    dist = zeros(size(I_spots));
+    
+        %parfor k = 1:size(need_align,1)
+    for k = 1:size(need_align,1)
+        %display([num2str(k) ' ' num2str(toc)])
+        try
+            dist(k)=nwalign_mod(need_align{k,:});
+        catch %#ok<CTCH>
+            dist(k) = Inf;
+        end
+    end
+    time = toc;
+    inds = sub2ind(size(dist_mat), I_spots,J_spots);
+    dist_mat(inds) = dist;
+    counter = top+1;
+    display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
+end
+DIST_CELL = dist_mat;
+save(ALIGN_NAME, 'DIST_CELL', 'header_names')
+DIST_OUT = dfun(DIST_CELL);
+function out_d = dfun(MAT)
+score12 = triu(MAT,1);
+self_d = diag(MAT);
+d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
+out_d = triu(d,1) + triu(d,1)';

diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m new file mode 100644 index 0000000..544db0a --- a/dev/null +++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
	1	function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
	2	% GenomePairwiseDist
	3	% This will align all of the protein segments in the genome structure
	4	% and return a cell-array. Each cell is the alignment of one
	5	% protein. If the protein is missing then the distance is NaN.
	6	%
	7
	8	if nargin < 1
	9	ALIGN_NAME = 'saved_alignments';
	10	end
	11
	12	try
	13	display('Loading Previous Alignments')
	14	d = load(ALIGN_NAME);
	15	saved_dists = d.DIST_CELL;
	16	saved_headers = d.header_names;
	17	catch %#ok<CTCH>
	18	display('No previous alignments found')
	19	saved_headers = {};
	20	saved_dists = zeros(numel(IN_PROTS));
	21	end
	22
	23	NUM_LETTERS = 4000000;
	24
	25
	26	[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
	27
	28	display('Starting Alignments')
	29
	30	dist_mat = NaN(size(IN_PROTS,1));
	31
	32	[tf loc]=ismember(header_names, saved_headers);
	33
	34	dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
	35	[I J] = find(triu(isnan(dist_mat)));
	36	inds = sub2ind(size(dist_mat), I, J);
	37
	38	sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
	39
	40	%find all of the ones that are not worth aligning
	41	bad_mask = sizes(I) <= 1 \| sizes(J) <= 1;
	42	dist_mat(inds(bad_mask)) = Inf;
	43	I(bad_mask) = [];
	44	J(bad_mask) = [];
	45
	46	p_sizes = sizes(I)+sizes(J);
	47	counter = 1;
	48	display('Aligning...')
	49	tic
	50	while counter < length(I)
	51	cum_sizes = cumsum(p_sizes(counter+1:end));
	52	top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
	53
	54	I_spots = I(counter:top);
	55	J_spots = J(counter:top);
	56	need_align = cell(length(I_spots),2);
	57	[need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
	58	[need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
	59	dist = zeros(size(I_spots));
	60
	61	%parfor k = 1:size(need_align,1)
	62	for k = 1:size(need_align,1)
	63	%display([num2str(k) ' ' num2str(toc)])
	64	try
	65	dist(k)=nwalign_mod(need_align{k,:});
	66	catch %#ok<CTCH>
	67	dist(k) = Inf;
	68	end
	69	end
	70	time = toc;
	71	inds = sub2ind(size(dist_mat), I_spots,J_spots);
	72	dist_mat(inds) = dist;
	73	counter = top+1;
	74	display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
	75	end
	76
	77	DIST_CELL = dist_mat;
	78	save(ALIGN_NAME, 'DIST_CELL', 'header_names')
	79	DIST_OUT = dfun(DIST_CELL);
	80
	81
	82
	83
	84
	85	function out_d = dfun(MAT)
	86
	87	score12 = triu(MAT,1);
	88	self_d = diag(MAT);
	89	d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
	90
	91	out_d = triu(d,1) + triu(d,1)';
	92
	93
	94
	95
	96
	97
	98