-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/controller/callbacks/reshape.c | 2 | ||||
-rw-r--r-- | src/db/dbconnect.sqc | 12 | ||||
-rw-r--r-- | src/model/data/base.sqc | 2 | ||||
-rw-r--r-- | src/model/state/state.h | 3 | ||||
-rw-r--r-- | src/util/check_error.c | 6 | ||||
-rw-r--r-- | src/util/check_error_db.c | 13 | ||||
-rw-r--r-- | src/util/check_error_db.h | 9 | ||||
-rw-r--r-- | test/distance_sanity_check/gi_227977170_78032581.png | bin | 0 -> 51161 bytes | |||
-rw-r--r-- | test/entropy/Align2Ref.m | 24 | ||||
-rw-r--r-- | test/entropy/CalculateEntropy.m | 15 | ||||
-rw-r--r-- | test/entropy/CalculateProteinEntropy.m | 25 | ||||
-rw-r--r-- | test/entropy/FastNWalign2.c | 94 | ||||
-rw-r--r-- | test/entropy/GenomeAlignments.m | 31 | ||||
-rw-r--r-- | test/entropy/GenomePairwiseDist.m | 98 | ||||
-rw-r--r-- | test/entropy/RefineAlignments.m | 276 | ||||
-rw-r--r-- | test/entropy/don_anal.m | 40 | ||||
-rw-r--r-- | test/entropy/nwalign_mod.m | 637 |
18 files changed, 1280 insertions, 9 deletions
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m new file mode 100644 index 0000000..0e69275 --- a/dev/null +++ b/test/entropy/RefineAlignments.m | |||
@@ -0,0 +1,276 @@ | |||
1 | function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin) | ||
2 | % RefineAlignments | ||
3 | % Uses an interative algorithm to improve the multiple alignments of | ||
4 | % an input sequence. It will iteratively remove a fraction of the | ||
5 | % sequences, compress any gaps, and then re-align. This will be done | ||
6 | % for NUM_REPS. | ||
7 | % | ||
8 | % Written by Will Dampier. Contact at wnd22@drexel.edu | ||
9 | % Version 1.0 on 10/23/09 | ||
10 | % | ||
11 | % [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN) | ||
12 | % | ||
13 | % IN_ALIGN A char-array representing a multiple alignment to be | ||
14 | % refined. | ||
15 | % | ||
16 | % OUT_ALIGN The refined alignment. | ||
17 | % SCORE The new alignment score. | ||
18 | % | ||
19 | % | ||
20 | % Optional Arguements: | ||
21 | % | ||
22 | % 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a | ||
23 | % nucleotide or amino acid alignment. | ||
24 | % | ||
25 | % 'GapPenalty' The penalty for gaps. This is set at -8 by | ||
26 | % default. | ||
27 | % | ||
28 | % 'DistMat' The distance matrix to use for the alignment | ||
29 | % scoring. Default is BLOSUM50 for 'AA' and NUC44 | ||
30 | % for 'NT' | ||
31 | % | ||
32 | % 'NumReps' The number of refining iterations to do. | ||
33 | % | ||
34 | % 'NumTry' The number of tries to perform at each iteration. | ||
35 | % | ||
36 | % 'RefineType' Which type of refinement to perform. This can | ||
37 | % be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'. | ||
38 | % | ||
39 | % 'Display' A boolean indicating whether to display the output | ||
40 | % at each iteration. | ||
41 | % | ||
42 | % | ||
43 | |||
44 | |||
45 | |||
46 | |||
47 | RM_FRAC = 0.1; | ||
48 | NUM_TRIES = 10; | ||
49 | NUM_REPS = 100; | ||
50 | alpha = 'aa'; | ||
51 | dist_mat = []; | ||
52 | gap_penalty = -8; | ||
53 | TYPE = 'mixed'; | ||
54 | disp_flag = true; | ||
55 | |||
56 | for i = 1:2:length(varargin) | ||
57 | switch lower(varargin{i}) | ||
58 | |||
59 | case 'alphabet' | ||
60 | if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1}) | ||
61 | alpha = varargin{i+1}; | ||
62 | else | ||
63 | error('RefineAlignments:BADALPHA', ... | ||
64 | 'Arguement to "alphabet" must be "NT" or "AA"') | ||
65 | end | ||
66 | case 'gappenalty' | ||
67 | if isnumeric(varargin{i+1}) && varargin{i+1} < 0 | ||
68 | gap_penalty = varargin{i+1}; | ||
69 | else | ||
70 | error('RefineAlignments:BADGAP', ... | ||
71 | 'Arguement to "GapPenalty" must be negative numeric') | ||
72 | end | ||
73 | case 'distmat' | ||
74 | dist_mat = varargin{i+1}; | ||
75 | |||
76 | case 'numreps' | ||
77 | if isnumeric(varargin{i+1}) && varargin{i+1} > 1 | ||
78 | NUM_REPS = varargin{i+1}; | ||
79 | else | ||
80 | error('RefineAlignments:BADREPS', ... | ||
81 | 'Arguement to NumReps must be a positive numeric') | ||
82 | end | ||
83 | |||
84 | case 'refinetype' | ||
85 | TYPE = varargin{i+1}; | ||
86 | |||
87 | case 'display' | ||
88 | if islogical(varargin{i+1}) | ||
89 | disp_flag = varargin{i+1}; | ||
90 | else | ||
91 | error('RefineAlignments:DISPLAY', ... | ||
92 | 'Arguement to Display must be a boolean.') | ||
93 | end | ||
94 | |||
95 | otherwise | ||
96 | error('RefineAlignments:BADARG', 'Unknown arguement: %s', ... | ||
97 | lower(varargin{i})) | ||
98 | |||
99 | end | ||
100 | end | ||
101 | |||
102 | if isempty(dist_mat) | ||
103 | if strcmpi(alpha, 'nt') | ||
104 | dist_mat = nuc44; | ||
105 | toint = @nt2int; | ||
106 | else | ||
107 | dist_mat = blosum50; | ||
108 | toint = @aa2int; | ||
109 | end | ||
110 | end | ||
111 | |||
112 | dist_mat(end+1,:) = gap_penalty; | ||
113 | dist_mat(:,end+1) = gap_penalty; | ||
114 | dist_mat(end,end) = 0; | ||
115 | |||
116 | switch upper(TYPE) | ||
117 | case 'MIXED' | ||
118 | is_stoch = true; | ||
119 | is_mix = true; | ||
120 | case 'STOCASTIC' | ||
121 | is_stoch = true; | ||
122 | is_mix = false; | ||
123 | case 'DETERMINISTIC' | ||
124 | is_stoch = false; | ||
125 | is_mix = false; | ||
126 | end | ||
127 | |||
128 | |||
129 | OUT_ALIGN = IN_ALIGN; | ||
130 | cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint); | ||
131 | |||
132 | for i = 1:NUM_REPS | ||
133 | |||
134 | %determine whether to switch based on the "mix" factor | ||
135 | if is_mix | ||
136 | is_stoch = ~is_stoch; | ||
137 | end | ||
138 | |||
139 | |||
140 | rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES); | ||
141 | counter = 1; | ||
142 | while counter < NUM_TRIES | ||
143 | new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha); | ||
144 | |||
145 | new_score = CalculateScore(new_mat, dist_mat, toint); | ||
146 | if new_score > cur_score | ||
147 | if disp_flag | ||
148 | disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ... | ||
149 | num2str(abs(new_score-cur_score)), ' and by ', ... | ||
150 | num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ... | ||
151 | ' columns', ' Using stoch:', num2str(is_stoch)}; | ||
152 | display([disp_cell{:}]) | ||
153 | end | ||
154 | cur_score = new_score; | ||
155 | OUT_ALIGN = new_mat; | ||
156 | break | ||
157 | end | ||
158 | counter = counter + 1; | ||
159 | end | ||
160 | end | ||
161 | |||
162 | function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES) | ||
163 | % GetRows | ||
164 | % A helper function which determines the rows to split the alignment | ||
165 | % with. | ||
166 | % | ||
167 | % MAT A char-array of the multiple alignment. | ||
168 | % STOCH A boolean indicating whether to use a stochastic approach. | ||
169 | % RM_FRAC The fraction of rows to remove. | ||
170 | % NUM_TRIES The number of tries to return. | ||
171 | % | ||
172 | % rows_mat A matrix indicating which rows to remove. | ||
173 | % | ||
174 | |||
175 | if STOCH | ||
176 | rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC; | ||
177 | if any(all(rows_mat,1)) || any(all(~rows_mat,1)) | ||
178 | cols = any(all(rows_mat,1)) || any(all(~rows_mat,1)); | ||
179 | rows_mat(1,cols) = ~rows_mat(1,cols); | ||
180 | end | ||
181 | |||
182 | else | ||
183 | gap_mask = MAT =='-'; | ||
184 | runs = FindRuns(gap_mask); | ||
185 | score = mode(runs.*(runs~=0))-min(runs); | ||
186 | [~, order] = sort(score, 'descend'); | ||
187 | rm_rows = order(1:NUM_TRIES); | ||
188 | [~, norder] = sort(runs(:,rm_rows),'descend'); | ||
189 | rows_mat = false(size(MAT,1), NUM_TRIES); | ||
190 | num_remove = ceil(RM_FRAC*size(MAT,1)); | ||
191 | for i = 1:NUM_TRIES | ||
192 | rows_mat(norder(1:num_remove,i),i) = true; | ||
193 | end | ||
194 | end | ||
195 | |||
196 | function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha) | ||
197 | % SplitAndReAlign | ||
198 | % This will split the alignment matix based on the provided rows and | ||
199 | % then create two profiles and re-align them. | ||
200 | % | ||
201 | % MAT A char-array of the alignment. | ||
202 | % ROWS A boolean-array describing the switch between rows | ||
203 | % alpha Either 'NT' or 'AA' to indicate which sequence profile to | ||
204 | % create. | ||
205 | % | ||
206 | % NEW_MAT The ReAlinged alignment | ||
207 | % | ||
208 | |||
209 | mat1 = MAT(ROWS, :); | ||
210 | mat2 = MAT(~ROWS, :); | ||
211 | |||
212 | mat1(:,all(mat1 == '-',1)) = []; | ||
213 | mat2(:,all(mat2 == '-',1)) = []; | ||
214 | |||
215 | pmat1 = seqprofile(mat1, 'alphabet', alpha); | ||
216 | pmat2 = seqprofile(mat2, 'alphabet', alpha); | ||
217 | try | ||
218 | [~, ind1, ind2] = profalign(pmat1, pmat2); | ||
219 | catch | ||
220 | ind1 = 1:size(pmat1,2); | ||
221 | ind2 = 1:size(pmat2,2); | ||
222 | end | ||
223 | |||
224 | NEW_MAT = char(0); | ||
225 | NEW_MAT(ROWS, ind1) = mat1; | ||
226 | NEW_MAT(~ROWS, ind2) = mat2; | ||
227 | NEW_MAT(~isletter(NEW_MAT)) = '-'; | ||
228 | NEW_MAT(all(NEW_MAT == '-',1)) = []; | ||
229 | |||
230 | |||
231 | |||
232 | |||
233 | function s = CalculateScore(MAT, DIST, FUN) | ||
234 | % CalculateScore | ||
235 | % A function which calculates the score of the alignment. This score | ||
236 | % is based on the agreement between the consensus and each individual | ||
237 | % alignment. | ||
238 | % | ||
239 | % MAT A char-array of a multiple alignment. | ||
240 | % DIST A distance matrix which indicates the penalty for | ||
241 | % mismatches. | ||
242 | % FUN A function which converts letter to numbers such at nt2int | ||
243 | % or aa2int. | ||
244 | % | ||
245 | % s The score for this alingment. | ||
246 | % | ||
247 | % | ||
248 | |||
249 | cons = seqconsensus(MAT); | ||
250 | num_cons = FUN(cons); | ||
251 | num_align = FUN(MAT); | ||
252 | ncons = repmat(num_cons(:), [size(num_align,1), 1]); | ||
253 | nalign = num_align(:); | ||
254 | lookupinds = sub2ind(size(DIST), ncons, nalign); | ||
255 | s = sum(DIST(lookupinds)); | ||
256 | |||
257 | function reverse_looking = FindRuns(input) | ||
258 | % FindRuns | ||
259 | % Finds consecutive runs of 1's along the rows of a boolean array. | ||
260 | % | ||
261 | % INPUT = [1 1 1 0 0 0 1 0 1 0 1 1; | ||
262 | % 0 1 1 1 0 1 1 0 0 1 1 1]; | ||
263 | % RL = [1 2 3 0 0 0 1 0 1 0 1 2; | ||
264 | % [0 3 2 1 0 1 2 0 0 1 2 3]; | ||
265 | % | ||
266 | |||
267 | [m,n] = size(input); | ||
268 | reverse_looking = [zeros(1,m);input.']; | ||
269 | reverse_looking = reverse_looking(:); | ||
270 | p = find(~reverse_looking); | ||
271 | reverse_looking(p) = [0;1-diff(p)]; | ||
272 | reverse_looking = reshape(cumsum(reverse_looking),[],m).'; | ||
273 | reverse_looking(:,1) = []; | ||
274 | |||
275 | |||
276 | |||