1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
% GenomePairwiseDist
% This will align all of the protein segments in the genome structure
% and return a cell-array. Each cell is the alignment of one
% protein. If the protein is missing then the distance is NaN.
%
if nargin < 1
ALIGN_NAME = 'saved_alignments';
end
try
display('Loading Previous Alignments')
d = load(ALIGN_NAME);
saved_dists = d.DIST_CELL;
saved_headers = d.header_names;
catch %#ok<CTCH>
display('No previous alignments found')
saved_headers = {};
saved_dists = zeros(numel(IN_PROTS));
end
NUM_LETTERS = 4000000;
[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
display('Starting Alignments')
dist_mat = NaN(size(IN_PROTS,1));
[tf loc]=ismember(header_names, saved_headers);
dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
[I J] = find(triu(isnan(dist_mat)));
inds = sub2ind(size(dist_mat), I, J);
sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
%find all of the ones that are not worth aligning
bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
dist_mat(inds(bad_mask)) = Inf;
I(bad_mask) = [];
J(bad_mask) = [];
p_sizes = sizes(I)+sizes(J);
counter = 1;
display('Aligning...')
tic
while counter < length(I)
cum_sizes = cumsum(p_sizes(counter+1:end));
top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
I_spots = I(counter:top);
J_spots = J(counter:top);
need_align = cell(length(I_spots),2);
[need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
[need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
dist = zeros(size(I_spots));
%parfor k = 1:size(need_align,1)
for k = 1:size(need_align,1)
%display([num2str(k) ' ' num2str(toc)])
try
dist(k)=nwalign_mod(need_align{k,:});
catch %#ok<CTCH>
dist(k) = Inf;
end
end
time = toc;
inds = sub2ind(size(dist_mat), I_spots,J_spots);
dist_mat(inds) = dist;
counter = top+1;
display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
end
DIST_CELL = dist_mat;
save(ALIGN_NAME, 'DIST_CELL', 'header_names')
DIST_OUT = dfun(DIST_CELL);
function out_d = dfun(MAT)
score12 = triu(MAT,1);
self_d = diag(MAT);
d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
out_d = triu(d,1) + triu(d,1)';
|