-rw-r--r-- | test/entropy/Align2Ref.m | 24 | ||||
-rw-r--r-- | test/entropy/CalculateEntropy.m | 15 | ||||
-rw-r--r-- | test/entropy/CalculateProteinEntropy.m | 25 | ||||
-rw-r--r-- | test/entropy/FastNWalign2.c | 94 | ||||
-rw-r--r-- | test/entropy/GenomeAlignments.m | 31 | ||||
-rw-r--r-- | test/entropy/GenomePairwiseDist.m | 98 | ||||
-rw-r--r-- | test/entropy/RefineAlignments.m | 276 | ||||
-rw-r--r-- | test/entropy/don_anal.m | 40 | ||||
-rw-r--r-- | test/entropy/nwalign_mod.m | 637 |
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m new file mode 100644 index 0000000..544db0a --- a/dev/null +++ b/test/entropy/GenomePairwiseDist.m | |||
@@ -0,0 +1,98 @@ | |||
1 | function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME) | ||
2 | % GenomePairwiseDist | ||
3 | % This will align all of the protein segments in the genome structure | ||
4 | % and return a cell-array. Each cell is the alignment of one | ||
5 | % protein. If the protein is missing then the distance is NaN. | ||
6 | % | ||
7 | |||
8 | if nargin < 1 | ||
9 | ALIGN_NAME = 'saved_alignments'; | ||
10 | end | ||
11 | |||
12 | try | ||
13 | display('Loading Previous Alignments') | ||
14 | d = load(ALIGN_NAME); | ||
15 | saved_dists = d.DIST_CELL; | ||
16 | saved_headers = d.header_names; | ||
17 | catch %#ok<CTCH> | ||
18 | display('No previous alignments found') | ||
19 | saved_headers = {}; | ||
20 | saved_dists = zeros(numel(IN_PROTS)); | ||
21 | end | ||
22 | |||
23 | NUM_LETTERS = 4000000; | ||
24 | |||
25 | |||
26 | [header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header); | ||
27 | |||
28 | display('Starting Alignments') | ||
29 | |||
30 | dist_mat = NaN(size(IN_PROTS,1)); | ||
31 | |||
32 | [tf loc]=ismember(header_names, saved_headers); | ||
33 | |||
34 | dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf)); | ||
35 | [I J] = find(triu(isnan(dist_mat))); | ||
36 | inds = sub2ind(size(dist_mat), I, J); | ||
37 | |||
38 | sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS); | ||
39 | |||
40 | %find all of the ones that are not worth aligning | ||
41 | bad_mask = sizes(I) <= 1 | sizes(J) <= 1; | ||
42 | dist_mat(inds(bad_mask)) = Inf; | ||
43 | I(bad_mask) = []; | ||
44 | J(bad_mask) = []; | ||
45 | |||
46 | p_sizes = sizes(I)+sizes(J); | ||
47 | counter = 1; | ||
48 | display('Aligning...') | ||
49 | tic | ||
50 | while counter < length(I) | ||
51 | cum_sizes = cumsum(p_sizes(counter+1:end)); | ||
52 | top = counter+find(cum_sizes<NUM_LETTERS,1, 'last'); | ||
53 | |||
54 | I_spots = I(counter:top); | ||
55 | J_spots = J(counter:top); | ||
56 | need_align = cell(length(I_spots),2); | ||
57 | [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence); | ||
58 | [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence); | ||
59 | dist = zeros(size(I_spots)); | ||
60 | |||
61 | %parfor k = 1:size(need_align,1) | ||
62 | for k = 1:size(need_align,1) | ||
63 | %display([num2str(k) ' ' num2str(toc)]) | ||
64 | try | ||
65 | dist(k)=nwalign_mod(need_align{k,:}); | ||
66 | catch %#ok<CTCH> | ||
67 | dist(k) = Inf; | ||
68 | end | ||
69 | end | ||
70 | time = toc; | ||
71 | inds = sub2ind(size(dist_mat), I_spots,J_spots); | ||
72 | dist_mat(inds) = dist; | ||
73 | counter = top+1; | ||
74 | display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds']) | ||
75 | end | ||
76 | |||
77 | DIST_CELL = dist_mat; | ||
78 | save(ALIGN_NAME, 'DIST_CELL', 'header_names') | ||
79 | DIST_OUT = dfun(DIST_CELL); | ||
80 | |||
81 | |||
82 | |||
83 | |||
84 | |||
85 | function out_d = dfun(MAT) | ||
86 | |||
87 | score12 = triu(MAT,1); | ||
88 | self_d = diag(MAT); | ||
89 | d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)')); | ||
90 | |||
91 | out_d = triu(d,1) + triu(d,1)'; | ||
92 | |||
93 | |||
94 | |||
95 | |||
96 | |||
97 | |||
98 | |||