author | Don Pellegrino <don@drexel.edu> | 2010-03-30 03:53:18 (GMT) |
---|---|---|
committer | Don Pellegrino <don@drexel.edu> | 2010-03-30 03:53:18 (GMT) |
commit | 17fd5ba7712a14a0a99ed74556568c0f6ca7eaae (patch) (unidiff) | |
tree | a48a86b2e00a49452a4250c097ccc275b296f2d8 | |
parent | 19c12c898675844c8e8a99bc580be01a993472ad (diff) | |
parent | 552d5386b948b58dbbc838684e67bd89fcc281a8 (diff) | |
download | exp005-17fd5ba7712a14a0a99ed74556568c0f6ca7eaae.zip exp005-17fd5ba7712a14a0a99ed74556568c0f6ca7eaae.tar.gz exp005-17fd5ba7712a14a0a99ed74556568c0f6ca7eaae.tar.bz2 |
Merge branch 'master' of cluster.cis.drexel.edu:/home/chensgroup/exp-repo.git/exp005
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/controller/callbacks/reshape.c | 2 | ||||
-rw-r--r-- | src/db/dbconnect.sqc | 12 | ||||
-rw-r--r-- | src/model/data/base.sqc | 2 | ||||
-rw-r--r-- | src/model/state/state.h | 3 | ||||
-rw-r--r-- | src/util/check_error.c | 6 | ||||
-rw-r--r-- | src/util/check_error_db.c | 13 | ||||
-rw-r--r-- | src/util/check_error_db.h | 9 | ||||
-rw-r--r-- | test/distance_sanity_check/gi_227977170_78032581.png | bin | 0 -> 51161 bytes | |||
-rw-r--r-- | test/entropy/Align2Ref.m | 24 | ||||
-rw-r--r-- | test/entropy/CalculateEntropy.m | 15 | ||||
-rw-r--r-- | test/entropy/CalculateProteinEntropy.m | 25 | ||||
-rw-r--r-- | test/entropy/FastNWalign2.c | 94 | ||||
-rw-r--r-- | test/entropy/GenomeAlignments.m | 31 | ||||
-rw-r--r-- | test/entropy/GenomePairwiseDist.m | 98 | ||||
-rw-r--r-- | test/entropy/RefineAlignments.m | 276 | ||||
-rw-r--r-- | test/entropy/don_anal.m | 40 | ||||
-rw-r--r-- | test/entropy/nwalign_mod.m | 637 |
18 files changed, 1280 insertions, 9 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index cf16798..de6226d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am | |||
@@ -39,6 +39,7 @@ libflumap_la_SOURCES = \ | |||
39 | model/state/zoom_info_init.c \ | 39 | model/state/zoom_info_init.c \ |
40 | util/ati_meminfo.c \ | 40 | util/ati_meminfo.c \ |
41 | util/check_error.c \ | 41 | util/check_error.c \ |
42 | util/check_error_db.c \ | ||
42 | util/pick_convert.c \ | 43 | util/pick_convert.c \ |
43 | util/sqlinfoprint.c \ | 44 | util/sqlinfoprint.c \ |
44 | view/geometry.c \ | 45 | view/geometry.c \ |
@@ -78,6 +79,7 @@ noinst_HEADERS = \ | |||
78 | plugin/plugin.h \ | 79 | plugin/plugin.h \ |
79 | util/ati_meminfo.h \ | 80 | util/ati_meminfo.h \ |
80 | util/check_error.h \ | 81 | util/check_error.h \ |
82 | util/check_error_db.h \ | ||
81 | util/pick_convert.h \ | 83 | util/pick_convert.h \ |
82 | util/sqlinfoprint.h \ | 84 | util/sqlinfoprint.h \ |
83 | view/geometry.h \ | 85 | view/geometry.h \ |
diff --git a/src/controller/callbacks/reshape.c b/src/controller/callbacks/reshape.c index 99557b7..dedce5a 100644 --- a/src/controller/callbacks/reshape.c +++ b/src/controller/callbacks/reshape.c | |||
@@ -81,7 +81,7 @@ reshape (int w, int h) | |||
81 | glViewport (0, 0, (GLsizei) w, (GLsizei) h); | 81 | glViewport (0, 0, (GLsizei) w, (GLsizei) h); |
82 | 82 | ||
83 | density_legend_geometry (); | 83 | density_legend_geometry (); |
84 | datarose_geometry (); | 84 | // datarose_geometry (); |
85 | 85 | ||
86 | return; | 86 | return; |
87 | } | 87 | } |
diff --git a/src/db/dbconnect.sqc b/src/db/dbconnect.sqc index 1e41a12..aab7bcb 100644 --- a/src/db/dbconnect.sqc +++ b/src/db/dbconnect.sqc | |||
@@ -1,5 +1,5 @@ | |||
1 | #include "dbconnect.h" | 1 | #include "dbconnect.h" |
2 | #include "../util/check_error.h" | 2 | #include "../util/check_error_db.h" |
3 | 3 | ||
4 | EXEC SQL INCLUDE sqlca; | 4 | EXEC SQL INCLUDE sqlca; |
5 | 5 | ||
@@ -7,5 +7,13 @@ void | |||
7 | dbconnect (void) | 7 | dbconnect (void) |
8 | { | 8 | { |
9 | EXEC SQL CONNECT TO exp004; | 9 | EXEC SQL CONNECT TO exp004; |
10 | check_error (__FILE__, __LINE__); | 10 | |
11 | /* | ||
12 | * Only check for a database error rather than using check_error to | ||
13 | * test for OpenGL and database errors. OpenGL may not yet be | ||
14 | * prepared and checking it prematurely could cause odd behavior. | ||
15 | */ | ||
16 | check_error_db (__FILE__, __LINE__); | ||
17 | |||
18 | return; | ||
11 | } | 19 | } |
diff --git a/src/model/data/base.sqc b/src/model/data/base.sqc index 41a651c..4627bea 100644 --- a/src/model/data/base.sqc +++ b/src/model/data/base.sqc | |||
@@ -169,7 +169,7 @@ base (void) | |||
169 | v++; | 169 | v++; |
170 | } | 170 | } |
171 | 171 | ||
172 | glGenBuffers (2, S.buffers); | 172 | glGenBuffers (NUM_BUFFERS, S.buffers); |
173 | 173 | ||
174 | glBindBuffer (GL_ARRAY_BUFFER, S.buffers[BASE_VERTICES]); | 174 | glBindBuffer (GL_ARRAY_BUFFER, S.buffers[BASE_VERTICES]); |
175 | glVertexPointer (2, GL_FLOAT, 0, 0); | 175 | glVertexPointer (2, GL_FLOAT, 0, 0); |
diff --git a/src/model/state/state.h b/src/model/state/state.h index 6e4f62e..f432017 100644 --- a/src/model/state/state.h +++ b/src/model/state/state.h | |||
@@ -8,6 +8,7 @@ | |||
8 | /* | 8 | /* |
9 | * Buffer object identifiers. | 9 | * Buffer object identifiers. |
10 | */ | 10 | */ |
11 | #define NUM_BUFFERS 2 | ||
11 | #define BASE_VERTICES 0 | 12 | #define BASE_VERTICES 0 |
12 | #define BASE_COLORS 1 | 13 | #define BASE_COLORS 1 |
13 | 14 | ||
@@ -80,7 +81,7 @@ typedef struct | |||
80 | /* | 81 | /* |
81 | * Buffer objects. | 82 | * Buffer objects. |
82 | */ | 83 | */ |
83 | unsigned int buffers[1]; | 84 | unsigned int buffers[NUM_BUFFERS]; |
84 | 85 | ||
85 | /* | 86 | /* |
86 | * GI Identifiers indexed by row. Storage is [rows][20]. | 87 | * GI Identifiers indexed by row. Storage is [rows][20]. |
diff --git a/src/util/check_error.c b/src/util/check_error.c index c47ca43..7d116e9 100644 --- a/src/util/check_error.c +++ b/src/util/check_error.c | |||
@@ -2,8 +2,7 @@ | |||
2 | #include <error.h> | 2 | #include <error.h> |
3 | #include <errno.h> | 3 | #include <errno.h> |
4 | #include <stdlib.h> | 4 | #include <stdlib.h> |
5 | #include "sqlinfoprint.h" | 5 | #include "check_error_db.h" |
6 | extern struct sqlca sqlca; | ||
7 | 6 | ||
8 | void | 7 | void |
9 | check_error (const char *filename, const unsigned int linenum) | 8 | check_error (const char *filename, const unsigned int linenum) |
@@ -23,8 +22,7 @@ check_error (const char *filename, const unsigned int linenum) | |||
23 | /* | 22 | /* |
24 | * Check for an error from the Database API. | 23 | * Check for an error from the Database API. |
25 | */ | 24 | */ |
26 | if (sqlinfoprint ("DB Error", &sqlca, filename, linenum) == 1) | 25 | check_error_db (filename, linenum); |
27 | exit (EXIT_FAILURE); | ||
28 | 26 | ||
29 | return; | 27 | return; |
30 | } | 28 | } |
diff --git a/src/util/check_error_db.c b/src/util/check_error_db.c new file mode 100644 index 0000000..a6b7f51 --- a/dev/null +++ b/src/util/check_error_db.c | |||
@@ -0,0 +1,13 @@ | |||
1 | #include "check_error_db.h" | ||
2 | #include "sqlinfoprint.h" | ||
3 | #include <stdlib.h> | ||
4 | extern struct sqlca sqlca; | ||
5 | |||
6 | void | ||
7 | check_error_db (const char *filename, const unsigned int linenum) | ||
8 | { | ||
9 | if (sqlinfoprint ("DB Error", &sqlca, filename, linenum) == 1) | ||
10 | exit (EXIT_FAILURE); | ||
11 | |||
12 | return; | ||
13 | } | ||
diff --git a/src/util/check_error_db.h b/src/util/check_error_db.h new file mode 100644 index 0000000..cc8467b --- a/dev/null +++ b/src/util/check_error_db.h | |||
@@ -0,0 +1,9 @@ | |||
1 | #ifndef CHECK_ERROR_H | ||
2 | #define CHECK_ERROR_H | ||
3 | |||
4 | /* | ||
5 | * Check the database library to see if an error has occurred. | ||
6 | */ | ||
7 | void check_error_db (const char* filename, unsigned int linenum); | ||
8 | |||
9 | #endif // CHECK_ERROR_H | ||
diff --git a/test/distance_sanity_check/gi_227977170_78032581.png b/test/distance_sanity_check/gi_227977170_78032581.png new file mode 100644 index 0000000..69c4c6e --- a/dev/null +++ b/test/distance_sanity_check/gi_227977170_78032581.png | |||
Binary files differ | |||
diff --git a/test/entropy/Align2Ref.m b/test/entropy/Align2Ref.m new file mode 100644 index 0000000..3021d4a --- a/dev/null +++ b/test/entropy/Align2Ref.m | |||
@@ -0,0 +1,24 @@ | |||
1 | function NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS, ALPHA) | ||
2 | % Align2Ref | ||
3 | % Uses a profile alignment to change a set of indicices in the | ||
4 | % "alignment space" to the "reference space". | ||
5 | % | ||
6 | % | ||
7 | % NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS) | ||
8 | % | ||
9 | |||
10 | if nargin == 3 | ||
11 | ALPHA = 'aa'; | ||
12 | end | ||
13 | |||
14 | ref_prof = seqprofile(REF_SEQ, 'alphabet', ALPHA); | ||
15 | align_prof = seqprofile(ALIGN, 'alphabet', ALPHA); | ||
16 | |||
17 | [~, ind1, ind2] = profalign(ref_prof, align_prof); | ||
18 | |||
19 | ninds = ind2(ALIGN_INDS); | ||
20 | |||
21 | mask = sum(bsxfun(@le, ind1(:)', ninds(:)),2); | ||
22 | mask(mask == 0) = 1; | ||
23 | |||
24 | NEW_INDS = reshape(ind1(mask), size(ALIGN_INDS)); \ No newline at end of file | ||
diff --git a/test/entropy/CalculateEntropy.m b/test/entropy/CalculateEntropy.m new file mode 100644 index 0000000..18bfda5 --- a/dev/null +++ b/test/entropy/CalculateEntropy.m | |||
@@ -0,0 +1,15 @@ | |||
1 | function OUT_VEC = CalculateEntropy(ARRAY) | ||
2 | % CalculateEntropy | ||
3 | % Calculate the Informationational Entropy of each position in a | ||
4 | % multialignment. In this function I remove all gap characters from | ||
5 | % each column of the alignment before doing the calculation. | ||
6 | % | ||
7 | % | ||
8 | % | ||
9 | |||
10 | num_array = aa2int(ARRAY); | ||
11 | bins = histc(num_array, 0:21, 1); | ||
12 | |||
13 | probs=bsxfun(@rdivide, bins(2:end-1,:),sum(bins(2:end-1,:))); | ||
14 | |||
15 | OUT_VEC = -nansum(log(probs).*double(probs~=0).*probs); \ No newline at end of file | ||
diff --git a/test/entropy/CalculateProteinEntropy.m b/test/entropy/CalculateProteinEntropy.m new file mode 100644 index 0000000..41f06f5 --- a/dev/null +++ b/test/entropy/CalculateProteinEntropy.m | |||
@@ -0,0 +1,25 @@ | |||
1 | function OUT_DATA=CalculateProteinEntropy(Ref, ALIGN_CELL) | ||
2 | |||
3 | OUT_DATA = cell(size(ALIGN_CELL)); | ||
4 | for i = 1:size(ALIGN_CELL,1) | ||
5 | |||
6 | aligned_seqs = ALIGN_CELL{i}; | ||
7 | |||
8 | ref_seq = Ref.Sequence; | ||
9 | |||
10 | ref_prof = seqprofile(ref_seq, 'alphabet', 'aa'); | ||
11 | seqs_prof = seqprofile(aligned_seqs, 'alphabet', 'aa'); | ||
12 | |||
13 | [mp, H1, H2] = profalign(ref_prof, seqs_prof); | ||
14 | |||
15 | merged_seqs = char(zeros(size(aligned_seqs,1)+1, size(mp,2))); | ||
16 | merged_seqs(2:end, H2) = aligned_seqs; | ||
17 | merged_seqs(1, H1) = ref_seq; | ||
18 | |||
19 | entropy_vals = CalculateEntropy(merged_seqs); | ||
20 | |||
21 | %only pull out the values that are in the reference positions | ||
22 | ref_entropy = entropy_vals(H1); | ||
23 | |||
24 | OUT_DATA{i} = ref_entropy; | ||
25 | end | ||
diff --git a/test/entropy/FastNWalign2.c b/test/entropy/FastNWalign2.c new file mode 100644 index 0000000..33b286d --- a/dev/null +++ b/test/entropy/FastNWalign2.c | |||
@@ -0,0 +1,94 @@ | |||
1 | #include "mex.h" | ||
2 | #include "matrix.h" | ||
3 | |||
4 | void simplegap(double scoredMatchMat[], | ||
5 | const double gap, int n, int m, double output_F[], double output_P[]) | ||
6 | { | ||
7 | // Standard Needleman-Wunsch algorithm | ||
8 | |||
9 | double up, left, diagonal, best, pos; | ||
10 | |||
11 | |||
12 | int i,j; | ||
13 | |||
14 | |||
15 | for(i=0;i<m;i++) | ||
16 | { | ||
17 | output_F[i]=i*gap; | ||
18 | output_P[i]=2; | ||
19 | } | ||
20 | |||
21 | |||
22 | for(j=0; j<n; j++) //put initial values in | ||
23 | { | ||
24 | output_F[j*m]=j*gap; | ||
25 | output_P[j*m]=4; | ||
26 | } | ||
27 | output_P[0]=1; | ||
28 | |||
29 | |||
30 | |||
31 | for(j=1;j<n;j++) //cycle through columns | ||
32 | { | ||
33 | best=output_F[(j)*m]; | ||
34 | |||
35 | for(i=1; i<m; i++) //cycle through the rows | ||
36 | { | ||
37 | up=best+gap; | ||
38 | left=output_F[i+(j-1)*m]+gap; | ||
39 | diagonal=output_F[i-1+(j-1)*m]+scoredMatchMat[i-1+(j-1)*m]; | ||
40 | |||
41 | if (up>left) | ||
42 | { | ||
43 | best=up; | ||
44 | pos=2; | ||
45 | } | ||
46 | else | ||
47 | { | ||
48 | best=left; | ||
49 | pos=4; | ||
50 | } | ||
51 | |||
52 | if (diagonal>=best) | ||
53 | { | ||
54 | best=diagonal; | ||
55 | output_P[i+j*m]=1; | ||
56 | } | ||
57 | else | ||
58 | { | ||
59 | output_P[i+j*m]=pos; | ||
60 | } | ||
61 | output_F[i+j*m]=best; | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | |||
66 | void mexFunction(int nlhs, mxArray *plhs[], | ||
67 | int nrhs, const mxArray *prhs[]) | ||
68 | { | ||
69 | |||
70 | double *gap, *output_F, *output_P; | ||
71 | double *scoredMatchMat; | ||
72 | int n, m; | ||
73 | mwSize i; | ||
74 | |||
75 | //double *F_col, *ptr_col; | ||
76 | m=mxGetM(prhs[0]); | ||
77 | n=mxGetN(prhs[0]); | ||
78 | gap=mxGetPr(prhs[1]); | ||
79 | |||
80 | plhs[0]=mxCreateDoubleMatrix(m,n,mxREAL); | ||
81 | plhs[1]=mxCreateDoubleMatrix(m,n,mxREAL); | ||
82 | |||
83 | output_F=mxGetPr(plhs[0]); | ||
84 | output_P=mxGetPr(plhs[1]); | ||
85 | |||
86 | |||
87 | scoredMatchMat=mxGetPr(prhs[0]); | ||
88 | // | ||
89 | |||
90 | |||
91 | simplegap(scoredMatchMat,*gap,n,m,output_F,output_P); | ||
92 | |||
93 | } | ||
94 | |||
diff --git a/test/entropy/GenomeAlignments.m b/test/entropy/GenomeAlignments.m new file mode 100644 index 0000000..be56570 --- a/dev/null +++ b/test/entropy/GenomeAlignments.m | |||
@@ -0,0 +1,31 @@ | |||
1 | function [ALIGN inds] = GenomeAlignments(IN_GENOMES, DIST_MAT) | ||
2 | % GenomeAlignments | ||
3 | % This will align all of the protein segments in the genome structure | ||
4 | % and return a cell-array. Each cell is the alignment of one | ||
5 | % protein. If the protein is missing then the distance is NaN. | ||
6 | % | ||
7 | |||
8 | [tree inds] = MakeTree(DIST_MAT); | ||
9 | ALIGN = malign(tree, inds); | ||
10 | |||
11 | function align=malign(tree, inds) | ||
12 | seqs = arrayfun(@(x)(x.Sequence), IN_GENOMES(inds), 'uniformoutput', false); | ||
13 | align = multialign(seqs, tree); | ||
14 | end | ||
15 | |||
16 | |||
17 | |||
18 | function [tree_obj inds]=MakeTree(dist_mat) | ||
19 | % MakeTree | ||
20 | % Takes a distance matrix and returns a tree object and the indicies | ||
21 | % to the rows that are in the tree. | ||
22 | % | ||
23 | |||
24 | inds = find(~all(isnan(dist_mat)|isinf(dist_mat)|dist_mat==0)); | ||
25 | dvec = squareform(dist_mat(inds, inds)); | ||
26 | dvec(dvec <= 0) = rand(nnz(dvec <= 0),1)*min(dvec(dvec>0)); | ||
27 | tree_obj = seqlinkage(dvec); | ||
28 | |||
29 | end | ||
30 | |||
31 | end \ No newline at end of file | ||
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m new file mode 100644 index 0000000..544db0a --- a/dev/null +++ b/test/entropy/GenomePairwiseDist.m | |||
@@ -0,0 +1,98 @@ | |||
1 | function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME) | ||
2 | % GenomePairwiseDist | ||
3 | % This will align all of the protein segments in the genome structure | ||
4 | % and return a cell-array. Each cell is the alignment of one | ||
5 | % protein. If the protein is missing then the distance is NaN. | ||
6 | % | ||
7 | |||
8 | if nargin < 1 | ||
9 | ALIGN_NAME = 'saved_alignments'; | ||
10 | end | ||
11 | |||
12 | try | ||
13 | display('Loading Previous Alignments') | ||
14 | d = load(ALIGN_NAME); | ||
15 | saved_dists = d.DIST_CELL; | ||
16 | saved_headers = d.header_names; | ||
17 | catch %#ok<CTCH> | ||
18 | display('No previous alignments found') | ||
19 | saved_headers = {}; | ||
20 | saved_dists = zeros(numel(IN_PROTS)); | ||
21 | end | ||
22 | |||
23 | NUM_LETTERS = 4000000; | ||
24 | |||
25 | |||
26 | [header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header); | ||
27 | |||
28 | display('Starting Alignments') | ||
29 | |||
30 | dist_mat = NaN(size(IN_PROTS,1)); | ||
31 | |||
32 | [tf loc]=ismember(header_names, saved_headers); | ||
33 | |||
34 | dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf)); | ||
35 | [I J] = find(triu(isnan(dist_mat))); | ||
36 | inds = sub2ind(size(dist_mat), I, J); | ||
37 | |||
38 | sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS); | ||
39 | |||
40 | %find all of the ones that are not worth aligning | ||
41 | bad_mask = sizes(I) <= 1 | sizes(J) <= 1; | ||
42 | dist_mat(inds(bad_mask)) = Inf; | ||
43 | I(bad_mask) = []; | ||
44 | J(bad_mask) = []; | ||
45 | |||
46 | p_sizes = sizes(I)+sizes(J); | ||
47 | counter = 1; | ||
48 | display('Aligning...') | ||
49 | tic | ||
50 | while counter < length(I) | ||
51 | cum_sizes = cumsum(p_sizes(counter+1:end)); | ||
52 | top = counter+find(cum_sizes<NUM_LETTERS,1, 'last'); | ||
53 | |||
54 | I_spots = I(counter:top); | ||
55 | J_spots = J(counter:top); | ||
56 | need_align = cell(length(I_spots),2); | ||
57 | [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence); | ||
58 | [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence); | ||
59 | dist = zeros(size(I_spots)); | ||
60 | |||
61 | %parfor k = 1:size(need_align,1) | ||
62 | for k = 1:size(need_align,1) | ||
63 | %display([num2str(k) ' ' num2str(toc)]) | ||
64 | try | ||
65 | dist(k)=nwalign_mod(need_align{k,:}); | ||
66 | catch %#ok<CTCH> | ||
67 | dist(k) = Inf; | ||
68 | end | ||
69 | end | ||
70 | time = toc; | ||
71 | inds = sub2ind(size(dist_mat), I_spots,J_spots); | ||
72 | dist_mat(inds) = dist; | ||
73 | counter = top+1; | ||
74 | display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds']) | ||
75 | end | ||
76 | |||
77 | DIST_CELL = dist_mat; | ||
78 | save(ALIGN_NAME, 'DIST_CELL', 'header_names') | ||
79 | DIST_OUT = dfun(DIST_CELL); | ||
80 | |||
81 | |||
82 | |||
83 | |||
84 | |||
85 | function out_d = dfun(MAT) | ||
86 | |||
87 | score12 = triu(MAT,1); | ||
88 | self_d = diag(MAT); | ||
89 | d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)')); | ||
90 | |||
91 | out_d = triu(d,1) + triu(d,1)'; | ||
92 | |||
93 | |||
94 | |||
95 | |||
96 | |||
97 | |||
98 | |||
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m new file mode 100644 index 0000000..0e69275 --- a/dev/null +++ b/test/entropy/RefineAlignments.m | |||
@@ -0,0 +1,276 @@ | |||
1 | function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin) | ||
2 | % RefineAlignments | ||
3 | % Uses an interative algorithm to improve the multiple alignments of | ||
4 | % an input sequence. It will iteratively remove a fraction of the | ||
5 | % sequences, compress any gaps, and then re-align. This will be done | ||
6 | % for NUM_REPS. | ||
7 | % | ||
8 | % Written by Will Dampier. Contact at wnd22@drexel.edu | ||
9 | % Version 1.0 on 10/23/09 | ||
10 | % | ||
11 | % [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN) | ||
12 | % | ||
13 | % IN_ALIGN A char-array representing a multiple alignment to be | ||
14 | % refined. | ||
15 | % | ||
16 | % OUT_ALIGN The refined alignment. | ||
17 | % SCORE The new alignment score. | ||
18 | % | ||
19 | % | ||
20 | % Optional Arguements: | ||
21 | % | ||
22 | % 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a | ||
23 | % nucleotide or amino acid alignment. | ||
24 | % | ||
25 | % 'GapPenalty' The penalty for gaps. This is set at -8 by | ||
26 | % default. | ||
27 | % | ||
28 | % 'DistMat' The distance matrix to use for the alignment | ||
29 | % scoring. Default is BLOSUM50 for 'AA' and NUC44 | ||
30 | % for 'NT' | ||
31 | % | ||
32 | % 'NumReps' The number of refining iterations to do. | ||
33 | % | ||
34 | % 'NumTry' The number of tries to perform at each iteration. | ||
35 | % | ||
36 | % 'RefineType' Which type of refinement to perform. This can | ||
37 | % be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'. | ||
38 | % | ||
39 | % 'Display' A boolean indicating whether to display the output | ||
40 | % at each iteration. | ||
41 | % | ||
42 | % | ||
43 | |||
44 | |||
45 | |||
46 | |||
47 | RM_FRAC = 0.1; | ||
48 | NUM_TRIES = 10; | ||
49 | NUM_REPS = 100; | ||
50 | alpha = 'aa'; | ||
51 | dist_mat = []; | ||
52 | gap_penalty = -8; | ||
53 | TYPE = 'mixed'; | ||
54 | disp_flag = true; | ||
55 | |||
56 | for i = 1:2:length(varargin) | ||
57 | switch lower(varargin{i}) | ||
58 | |||
59 | case 'alphabet' | ||
60 | if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1}) | ||
61 | alpha = varargin{i+1}; | ||
62 | else | ||
63 | error('RefineAlignments:BADALPHA', ... | ||
64 | 'Arguement to "alphabet" must be "NT" or "AA"') | ||
65 | end | ||
66 | case 'gappenalty' | ||
67 | if isnumeric(varargin{i+1}) && varargin{i+1} < 0 | ||
68 | gap_penalty = varargin{i+1}; | ||
69 | else | ||
70 | error('RefineAlignments:BADGAP', ... | ||
71 | 'Arguement to "GapPenalty" must be negative numeric') | ||
72 | end | ||
73 | case 'distmat' | ||
74 | dist_mat = varargin{i+1}; | ||
75 | |||
76 | case 'numreps' | ||
77 | if isnumeric(varargin{i+1}) && varargin{i+1} > 1 | ||
78 | NUM_REPS = varargin{i+1}; | ||
79 | else | ||
80 | error('RefineAlignments:BADREPS', ... | ||
81 | 'Arguement to NumReps must be a positive numeric') | ||
82 | end | ||
83 | |||
84 | case 'refinetype' | ||
85 | TYPE = varargin{i+1}; | ||
86 | |||
87 | case 'display' | ||
88 | if islogical(varargin{i+1}) | ||
89 | disp_flag = varargin{i+1}; | ||
90 | else | ||
91 | error('RefineAlignments:DISPLAY', ... | ||
92 | 'Arguement to Display must be a boolean.') | ||
93 | end | ||
94 | |||
95 | otherwise | ||
96 | error('RefineAlignments:BADARG', 'Unknown arguement: %s', ... | ||
97 | lower(varargin{i})) | ||
98 | |||
99 | end | ||
100 | end | ||
101 | |||
102 | if isempty(dist_mat) | ||
103 | if strcmpi(alpha, 'nt') | ||
104 | dist_mat = nuc44; | ||
105 | toint = @nt2int; | ||
106 | else | ||
107 | dist_mat = blosum50; | ||
108 | toint = @aa2int; | ||
109 | end | ||
110 | end | ||
111 | |||
112 | dist_mat(end+1,:) = gap_penalty; | ||
113 | dist_mat(:,end+1) = gap_penalty; | ||
114 | dist_mat(end,end) = 0; | ||
115 | |||
116 | switch upper(TYPE) | ||
117 | case 'MIXED' | ||
118 | is_stoch = true; | ||
119 | is_mix = true; | ||
120 | case 'STOCASTIC' | ||
121 | is_stoch = true; | ||
122 | is_mix = false; | ||
123 | case 'DETERMINISTIC' | ||
124 | is_stoch = false; | ||
125 | is_mix = false; | ||
126 | end | ||
127 | |||
128 | |||
129 | OUT_ALIGN = IN_ALIGN; | ||
130 | cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint); | ||
131 | |||
132 | for i = 1:NUM_REPS | ||
133 | |||
134 | %determine whether to switch based on the "mix" factor | ||
135 | if is_mix | ||
136 | is_stoch = ~is_stoch; | ||
137 | end | ||
138 | |||
139 | |||
140 | rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES); | ||
141 | counter = 1; | ||
142 | while counter < NUM_TRIES | ||
143 | new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha); | ||
144 | |||
145 | new_score = CalculateScore(new_mat, dist_mat, toint); | ||
146 | if new_score > cur_score | ||
147 | if disp_flag | ||
148 | disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ... | ||
149 | num2str(abs(new_score-cur_score)), ' and by ', ... | ||
150 | num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ... | ||
151 | ' columns', ' Using stoch:', num2str(is_stoch)}; | ||
152 | display([disp_cell{:}]) | ||
153 | end | ||
154 | cur_score = new_score; | ||
155 | OUT_ALIGN = new_mat; | ||
156 | break | ||
157 | end | ||
158 | counter = counter + 1; | ||
159 | end | ||
160 | end | ||
161 | |||
162 | function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES) | ||
163 | % GetRows | ||
164 | % A helper function which determines the rows to split the alignment | ||
165 | % with. | ||
166 | % | ||
167 | % MAT A char-array of the multiple alignment. | ||
168 | % STOCH A boolean indicating whether to use a stochastic approach. | ||
169 | % RM_FRAC The fraction of rows to remove. | ||
170 | % NUM_TRIES The number of tries to return. | ||
171 | % | ||
172 | % rows_mat A matrix indicating which rows to remove. | ||
173 | % | ||
174 | |||
175 | if STOCH | ||
176 | rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC; | ||
177 | if any(all(rows_mat,1)) || any(all(~rows_mat,1)) | ||
178 | cols = any(all(rows_mat,1)) || any(all(~rows_mat,1)); | ||
179 | rows_mat(1,cols) = ~rows_mat(1,cols); | ||
180 | end | ||
181 | |||
182 | else | ||
183 | gap_mask = MAT =='-'; | ||
184 | runs = FindRuns(gap_mask); | ||
185 | score = mode(runs.*(runs~=0))-min(runs); | ||
186 | [~, order] = sort(score, 'descend'); | ||
187 | rm_rows = order(1:NUM_TRIES); | ||
188 | [~, norder] = sort(runs(:,rm_rows),'descend'); | ||
189 | rows_mat = false(size(MAT,1), NUM_TRIES); | ||
190 | num_remove = ceil(RM_FRAC*size(MAT,1)); | ||
191 | for i = 1:NUM_TRIES | ||
192 | rows_mat(norder(1:num_remove,i),i) = true; | ||
193 | end | ||
194 | end | ||
195 | |||
196 | function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha) | ||
197 | % SplitAndReAlign | ||
198 | % This will split the alignment matix based on the provided rows and | ||
199 | % then create two profiles and re-align them. | ||
200 | % | ||
201 | % MAT A char-array of the alignment. | ||
202 | % ROWS A boolean-array describing the switch between rows | ||
203 | % alpha Either 'NT' or 'AA' to indicate which sequence profile to | ||
204 | % create. | ||
205 | % | ||
206 | % NEW_MAT The ReAlinged alignment | ||
207 | % | ||
208 | |||
209 | mat1 = MAT(ROWS, :); | ||
210 | mat2 = MAT(~ROWS, :); | ||
211 | |||
212 | mat1(:,all(mat1 == '-',1)) = []; | ||
213 | mat2(:,all(mat2 == '-',1)) = []; | ||
214 | |||
215 | pmat1 = seqprofile(mat1, 'alphabet', alpha); | ||
216 | pmat2 = seqprofile(mat2, 'alphabet', alpha); | ||
217 | try | ||
218 | [~, ind1, ind2] = profalign(pmat1, pmat2); | ||
219 | catch | ||
220 | ind1 = 1:size(pmat1,2); | ||
221 | ind2 = 1:size(pmat2,2); | ||
222 | end | ||
223 | |||
224 | NEW_MAT = char(0); | ||
225 | NEW_MAT(ROWS, ind1) = mat1; | ||
226 | NEW_MAT(~ROWS, ind2) = mat2; | ||
227 | NEW_MAT(~isletter(NEW_MAT)) = '-'; | ||
228 | NEW_MAT(all(NEW_MAT == '-',1)) = []; | ||
229 | |||
230 | |||
231 | |||
232 | |||
233 | function s = CalculateScore(MAT, DIST, FUN) | ||
234 | % CalculateScore | ||
235 | % A function which calculates the score of the alignment. This score | ||
236 | % is based on the agreement between the consensus and each individual | ||
237 | % alignment. | ||
238 | % | ||
239 | % MAT A char-array of a multiple alignment. | ||
240 | % DIST A distance matrix which indicates the penalty for | ||
241 | % mismatches. | ||
242 | % FUN A function which converts letter to numbers such at nt2int | ||
243 | % or aa2int. | ||
244 | % | ||
245 | % s The score for this alingment. | ||
246 | % | ||
247 | % | ||
248 | |||
249 | cons = seqconsensus(MAT); | ||
250 | num_cons = FUN(cons); | ||
251 | num_align = FUN(MAT); | ||
252 | ncons = repmat(num_cons(:), [size(num_align,1), 1]); | ||
253 | nalign = num_align(:); | ||
254 | lookupinds = sub2ind(size(DIST), ncons, nalign); | ||
255 | s = sum(DIST(lookupinds)); | ||
256 | |||
257 | function reverse_looking = FindRuns(input) | ||
258 | % FindRuns | ||
259 | % Finds consecutive runs of 1's along the rows of a boolean array. | ||
260 | % | ||
261 | % INPUT = [1 1 1 0 0 0 1 0 1 0 1 1; | ||
262 | % 0 1 1 1 0 1 1 0 0 1 1 1]; | ||
263 | % RL = [1 2 3 0 0 0 1 0 1 0 1 2; | ||
264 | % [0 3 2 1 0 1 2 0 0 1 2 3]; | ||
265 | % | ||
266 | |||
267 | [m,n] = size(input); | ||
268 | reverse_looking = [zeros(1,m);input.']; | ||
269 | reverse_looking = reverse_looking(:); | ||
270 | p = find(~reverse_looking); | ||
271 | reverse_looking(p) = [0;1-diff(p)]; | ||
272 | reverse_looking = reshape(cumsum(reverse_looking),[],m).'; | ||
273 | reverse_looking(:,1) = []; | ||
274 | |||
275 | |||
276 | |||
diff --git a/test/entropy/don_anal.m b/test/entropy/don_anal.m new file mode 100644 index 0000000..7d0a648 --- a/dev/null +++ b/test/entropy/don_anal.m | |||
@@ -0,0 +1,40 @@ | |||
1 | d = fastaread('C:\dondata\FASTA2.fa'); | ||
2 | lens = arrayfun(@(x)(numel(x.Sequence)), d); | ||
3 | d(lens < 300) = []; | ||
4 | rexp = '/Human/(\w*)/([\d\w]*)/(.*?)/(\d{4})/'; | ||
5 | headers = cell(length(d), 1); | ||
6 | [headers{:}] = deal(d(:).Header); | ||
7 | groups = regexp(headers, rexp, 'tokens', 'once'); | ||
8 | cgroups = cat(1,groups{:}); | ||
9 | dates = unique(cgroups(:,end)); | ||
10 | |||
11 | seqs = cell(length(d), 1); | ||
12 | [seqs{:}] = deal(d(:).Sequence); | ||
13 | |||
14 | ALIGN_CELL = cell(length(dates), 1); | ||
15 | INDS_CELL = cell(length(dates), 1); | ||
16 | PAIRWISE_CELL = cell(length(dates),1); | ||
17 | for i = 1:length(dates) | ||
18 | display(dates{i}) | ||
19 | tf = strcmp(cgroups(:,end), dates{i}); | ||
20 | if isempty(PAIRWISE_CELL{i}) | ||
21 | pairwise_dists = GenomePairwiseDist(d(tf), dates{i}); | ||
22 | PAIRWISE_CELL{i} = pairwise_dists; | ||
23 | else | ||
24 | pairwise_dists = PAIRWISE_CELL{i}; | ||
25 | end | ||
26 | if isempty(ALIGN_CELL{i}) || isempty(INDS_CELL{i}) | ||
27 | [ALIGN_CELL{i}, INDS_CELL{i}] = GenomeAlignments(d(tf), ... | ||
28 | pairwise_dists); | ||
29 | save restore_data PAIRWISE_CELL ALIGN_CELL INDS_CELL | ||
30 | end | ||
31 | end | ||
32 | |||
33 | for i = 1:length(ALIGN_CELL) | ||
34 | [ALIGN_CELL{i}, ~]= RefineAlignments(ALIGN_CELL{i}); | ||
35 | end | ||
36 | |||
37 | |||
38 | ents = CalculateProteinEntropy(d(1), ALIGN_CELL); | ||
39 | |||
40 | dents = cat(1, ents{:}); | ||
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m new file mode 100644 index 0000000..76aade6 --- a/dev/null +++ b/test/entropy/nwalign_mod.m | |||
@@ -0,0 +1,637 @@ | |||
1 | function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin) | ||
2 | %NWALIGN performs Needleman-Wunsch global alignment of two sequences. | ||
3 | % | ||
4 | % NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal | ||
5 | % alignment. Note: The scale factor used to calculate the score is | ||
6 | % provided by the scoring matrix info (see below). If this is not | ||
7 | % defined, then NWALIGN returns the raw score. | ||
8 | % | ||
9 | % [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an | ||
10 | % optimal global alignment of amino acid (or nucleotide) sequences SEQ1 | ||
11 | % and SEQ2. | ||
12 | % | ||
13 | % [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2) returns a 2x1 vector | ||
14 | % with the starting point indices indicating the starting point of the | ||
15 | % alignment in the two sequences. Note: this output is for consistency | ||
16 | % with SWALIGN and will always be [1;1] because this is a global | ||
17 | % alignment. | ||
18 | % | ||
19 | % NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are | ||
20 | % amino acids ('AA') or nucleotides ('NT'). The default is AA. | ||
21 | % | ||
22 | % NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be | ||
23 | % used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT. | ||
24 | % | ||
25 | % NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring | ||
26 | % matrix to return the score using arbitrary units. If the scoring matrix | ||
27 | % Info also provides a scale factor, then both are used. | ||
28 | % | ||
29 | % NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap | ||
30 | % in the alignment. The default gap open penalty is 8. | ||
31 | % | ||
32 | % NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a | ||
33 | % gap in the alignment. If EXTENDGAP is not specified, then extensions to | ||
34 | % gaps are scored with the same value as GAPOPEN. | ||
35 | % | ||
36 | % NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the | ||
37 | % winning path. | ||
38 | % | ||
39 | % | ||
40 | % Examples: | ||
41 | % | ||
42 | % % Return the score in bits and the global alignment using the | ||
43 | % % default scoring matrix (BLOSUM50). | ||
44 | % [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD') | ||
45 | % | ||
46 | % % Use user-specified scoring matrix and "gap open" penalty. | ||
47 | % [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',... | ||
48 | % 'scoringmatrix', @pam250, 'gapopen',5) | ||
49 | % | ||
50 | % % Return the score in nat units (nats). | ||
51 | % [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2)) | ||
52 | % | ||
53 | % % Display the scoring space and the winning path. | ||
54 | % nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true) | ||
55 | % | ||
56 | % See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT, | ||
57 | % SHOWALIGNMENT, SWALIGN. | ||
58 | |||
59 | % References: | ||
60 | % R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence | ||
61 | % Analysis. Cambridge UP, 1998. | ||
62 | % Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453 | ||
63 | |||
64 | % Copyright 2002-2006 The MathWorks, Inc. | ||
65 | % $Revision: 1.22.6.10 $ $Date: 2006/05/17 20:48:28 $ | ||
66 | |||
67 | gapopen = -8; | ||
68 | gapextend = -8; | ||
69 | setGapExtend = false; | ||
70 | showscore=false; | ||
71 | isAminoAcid = true; | ||
72 | scale=1; | ||
73 | |||
74 | % If the input is a structure then extract the Sequence data. | ||
75 | if isstruct(seq1) | ||
76 | try | ||
77 | seq1 = seqfromstruct(seq1); | ||
78 | catch | ||
79 | rethrow(lasterror); | ||
80 | end | ||
81 | end | ||
82 | if isstruct(seq2) | ||
83 | try | ||
84 | seq2 = seqfromstruct(seq2); | ||
85 | catch | ||
86 | rethrow(lasterror); | ||
87 | end | ||
88 | end | ||
89 | if nargin > 2 | ||
90 | if rem(nargin,2) == 1 | ||
91 | error('Bioinfo:IncorrectNumberOfArguments',... | ||
92 | 'Incorrect number of arguments to %s.',mfilename); | ||
93 | end | ||
94 | okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'}; | ||
95 | for j=1:2:nargin-2 | ||
96 | pname = varargin{j}; | ||
97 | pval = varargin{j+1}; | ||
98 | k = strmatch(lower(pname), okargs); %#ok | ||
99 | if isempty(k) | ||
100 | error('Bioinfo:UnknownParameterName',... | ||
101 | 'Unknown parameter name: %s.',pname); | ||
102 | elseif length(k)>1 | ||
103 | error('Bioinfo:AmbiguousParameterName',... | ||
104 | 'Ambiguous parameter name: %s.',pname); | ||
105 | else | ||
106 | switch(k) | ||
107 | case 1 % scoring matrix | ||
108 | if isnumeric(pval) | ||
109 | ScoringMatrix = pval; | ||
110 | else | ||
111 | if ischar(pval) | ||
112 | pval = lower(pval); | ||
113 | end | ||
114 | try | ||
115 | [ScoringMatrix,ScoringMatrixInfo] = feval(pval); | ||
116 | catch | ||
117 | error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.'); | ||
118 | end | ||
119 | end | ||
120 | case 2 %gap open penalty | ||
121 | gapopen = -pval; | ||
122 | case 3 %gap extend penalty | ||
123 | gapextend = -pval; | ||
124 | setGapExtend = true; | ||
125 | case 4 %if sequence is nucleotide | ||
126 | if strcmpi(pval,'nt') | ||
127 | isAminoAcid = false; | ||
128 | end | ||
129 | case 5 % scale | ||
130 | scale=pval; | ||
131 | case 6 % showscore | ||
132 | showscore = pval == true; | ||
133 | end | ||
134 | end | ||
135 | end | ||
136 | end | ||
137 | |||
138 | % setting the default scoring matrix | ||
139 | if ~exist('ScoringMatrix','var') | ||
140 | if isAminoAcid | ||
141 | [ScoringMatrix,ScoringMatrixInfo] = blosum50; | ||
142 | else | ||
143 | [ScoringMatrix,ScoringMatrixInfo] = nuc44; | ||
144 | end | ||
145 | end | ||
146 | |||
147 | |||
148 | % getting the scale from ScoringMatrixInfo, if it exists | ||
149 | if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale') | ||
150 | scale=scale*ScoringMatrixInfo.Scale; | ||
151 | end | ||
152 | |||
153 | % handle properly "?" characters typically found in pdb files | ||
154 | if isAminoAcid | ||
155 | if ischar(seq1) | ||
156 | seq1 = strrep(seq1,'?','X'); | ||
157 | else | ||
158 | seq1(seq1 == 26) = 23; | ||
159 | end | ||
160 | if ischar(seq2) | ||
161 | seq2 = strrep(seq2,'?','X'); | ||
162 | else | ||
163 | seq2(seq2 == 26) = 23; | ||
164 | end | ||
165 | end | ||
166 | |||
167 | % check input sequences | ||
168 | % if isAminoAcid && ~(isaa(seq1) && isaa(seq2)) | ||
169 | % error('Bioinfo:InvalidAminoAcidSequences',... | ||
170 | % 'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.'); | ||
171 | % elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2)) | ||
172 | % error('Bioinfo:InvalidNucleotideSequences',... | ||
173 | % 'When ALPHABET = ''NT'', both sequences must be nucleotides.'); | ||
174 | % end | ||
175 | |||
176 | % use numerical arrays for easy indexing | ||
177 | if ischar(seq1) | ||
178 | seq1=upper(seq1); %the output alignment will be all uppercase | ||
179 | if isAminoAcid | ||
180 | intseq1 = aa2int(seq1); | ||
181 | else | ||
182 | intseq1 = nt2int(seq1); | ||
183 | end | ||
184 | else | ||
185 | intseq1=seq1; | ||
186 | if isAminoAcid | ||
187 | seq1 = int2aa(intseq1); | ||
188 | else | ||
189 | seq1 = int2nt(intseq1); | ||
190 | end | ||
191 | end | ||
192 | if ischar(seq2) | ||
193 | seq2=upper(seq2); %the output alignment will be all uppercase | ||
194 | if isAminoAcid | ||
195 | intseq2 = aa2int(seq2); | ||
196 | else | ||
197 | intseq2 = nt2int(seq2); | ||
198 | end | ||
199 | else | ||
200 | intseq2=seq2; | ||
201 | if isAminoAcid | ||
202 | seq2 = int2aa(intseq2); | ||
203 | else | ||
204 | seq2 = int2nt(intseq2); | ||
205 | end | ||
206 | end | ||
207 | |||
208 | |||
209 | m = length(seq1); | ||
210 | n = length(seq2); | ||
211 | if ~n||~m | ||
212 | error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0'); | ||
213 | end | ||
214 | |||
215 | % If unknown, ambiguous or gaps appear in the sequence, we need to make | ||
216 | % sure that ScoringMatrix can handle them. | ||
217 | |||
218 | % possible values are | ||
219 | % B Z X * - ? | ||
220 | % 21 22 23 24 25 26 | ||
221 | |||
222 | scoringMatrixSize = size(ScoringMatrix,1); | ||
223 | |||
224 | highestVal = max([intseq1, intseq2]); | ||
225 | if highestVal > scoringMatrixSize | ||
226 | % if the matrix contains the 'Any' we map to that | ||
227 | if isAminoAcid | ||
228 | anyVal = aa2int('X'); | ||
229 | else | ||
230 | anyVal = nt2int('N'); | ||
231 | end | ||
232 | if scoringMatrixSize >= anyVal | ||
233 | intseq1(intseq1>scoringMatrixSize) = anyVal; | ||
234 | intseq2(intseq2>scoringMatrixSize) = anyVal; | ||
235 | else | ||
236 | error('Bioinfo:InvalidSymbolsInInputSequeces',... | ||
237 | 'Sequences contain symbols that cannot be handled by the given scoring matrix.'); | ||
238 | end | ||
239 | end | ||
240 | |||
241 | if setGapExtend % call more complicated algorithm if we have | ||
242 | [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend); | ||
243 | else | ||
244 | % [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen); | ||
245 | scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)]; | ||
246 | [F pointer]=FastNWalign2(scoredMatchMat,gapopen); | ||
247 | end | ||
248 | |||
249 | % trace back through the pointer matrix | ||
250 | halfn=ceil((n+1)/2); | ||
251 | halfm=ceil((m+1)/2); | ||
252 | |||
253 | i = n+1; j = m+1; | ||
254 | path = zeros(n+m,2); | ||
255 | step = 1; | ||
256 | |||
257 | score = max(F(n+1,m+1,:)); | ||
258 | |||
259 | if setGapExtend | ||
260 | |||
261 | if F(n+1,m+1,3)==score % favor with left-gap | ||
262 | laststate=3; | ||
263 | elseif F(n+1,m+1,2)==score % then with up-gap | ||
264 | laststate=2; | ||
265 | else % at last with match | ||
266 | laststate=1; | ||
267 | end | ||
268 | |||
269 | while (i>halfn && j>halfm) % in the rigth half favor gaps when several | ||
270 | % paths lead to the highest score | ||
271 | state=laststate; | ||
272 | if bitget(pointer(i,j,state),3) | ||
273 | laststate=3; | ||
274 | elseif bitget(pointer(i,j,state),2) | ||
275 | laststate=2; | ||
276 | else | ||
277 | laststate=1; | ||
278 | end | ||
279 | |||
280 | switch state | ||
281 | case 1 % is diagonal | ||
282 | j = j - 1; | ||
283 | i = i - 1; | ||
284 | path(step,:) = [j,i]; | ||
285 | case 2 % is up | ||
286 | i = i - 1; | ||
287 | path(step,2) = i; | ||
288 | case 3 % is left | ||
289 | j = j - 1; | ||
290 | path(step,1) = j; | ||
291 | end | ||
292 | step = step +1; | ||
293 | end | ||
294 | |||
295 | while (i>1 || j>1) % in the rigth half favor matchs when several | ||
296 | % paths lead to the highest score | ||
297 | state=laststate; | ||
298 | if bitget(pointer(i,j,state),1) | ||
299 | laststate=1; | ||
300 | elseif bitget(pointer(i,j,state),2) | ||
301 | laststate=2; | ||
302 | else | ||
303 | laststate=3; | ||
304 | end | ||
305 | |||
306 | switch state | ||
307 | case 1 % is diagonal | ||
308 | j = j - 1; | ||
309 | i = i - 1; | ||
310 | path(step,:) = [j,i]; | ||
311 | case 2 % is up | ||
312 | i = i - 1; | ||
313 | path(step,2) = i; | ||
314 | case 3 % is left | ||
315 | j = j - 1; | ||
316 | path(step,1) = j; | ||
317 | end | ||
318 | step = step +1; | ||
319 | end | ||
320 | |||
321 | else % ~setGapExtend | ||
322 | |||
323 | while (i > 1 || j > 1) | ||
324 | |||
325 | switch pointer(i,j) | ||
326 | case 1 % diagonal only | ||
327 | j = j - 1; | ||
328 | i = i - 1; | ||
329 | path(step,:) = [j,i]; | ||
330 | case 2 % up only | ||
331 | i = i - 1; | ||
332 | path(step,2) = i; | ||
333 | case 4 % left only | ||
334 | j = j - 1; | ||
335 | path(step,1) = j; | ||
336 | case 6 % up or left --> up (favors gaps in seq2) | ||
337 | j = j - 1; | ||
338 | path(step,1) = j; | ||
339 | otherwise %3 diagonal or up --> diagonal (favors no gaps) | ||
340 | %4 diagonal or left --> diagonal (favors no gaps) | ||
341 | %7 diagonal or left or up --> diagonal (favors no gaps) | ||
342 | j = j - 1; | ||
343 | i = i - 1; | ||
344 | path(step,:) = [j,i]; | ||
345 | end | ||
346 | step = step +1; | ||
347 | end | ||
348 | |||
349 | end % if setGapExtend | ||
350 | |||
351 | % re-scaling the output score | ||
352 | score = scale * score; | ||
353 | |||
354 | if nargout<=1 && ~showscore | ||
355 | return | ||
356 | end | ||
357 | |||
358 | path(step:end,:) = []; % step-1 is the length of the alignment | ||
359 | path = flipud(path); | ||
360 | |||
361 | % setting the size of the alignment | ||
362 | alignment = repmat(('- -')',1,step-1); | ||
363 | |||
364 | % adding sequence to alignment | ||
365 | alignment(1,path(:,1)>0) = seq1; | ||
366 | alignment(3,path(:,2)>0) = seq2; | ||
367 | |||
368 | % find locations where there are no gaps | ||
369 | h=find(all(path>0,2)); | ||
370 | if isAminoAcid | ||
371 | noGaps1=aa2int(alignment(1,h)); | ||
372 | noGaps2=aa2int(alignment(3,h)); | ||
373 | else | ||
374 | noGaps1=nt2int(alignment(1,h)); | ||
375 | noGaps2=nt2int(alignment(3,h)); | ||
376 | end | ||
377 | |||
378 | % erasing symbols that can not be scored | ||
379 | htodel=max([noGaps1;noGaps2])>scoringMatrixSize; | ||
380 | h(htodel)=[]; | ||
381 | noGaps1(htodel)=[]; | ||
382 | noGaps2(htodel)=[]; | ||
383 | % score pairs with no gap | ||
384 | value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2))); | ||
385 | |||
386 | % insert symbols of the match string into the alignment | ||
387 | alignment(2,h(value>=0)) = ':'; | ||
388 | alignment(2,h(noGaps1==noGaps2)) = '|'; | ||
389 | |||
390 | startat = [1;1]; | ||
391 | |||
392 | % undocumented fourth output -- score and pointer matrices | ||
393 | if nargout > 3 | ||
394 | matrices.Scores = F; | ||
395 | matrices.Pointers = pointer; | ||
396 | end | ||
397 | |||
398 | if showscore | ||
399 | figure | ||
400 | F=scale.*max(F(2:end,2:end,:),[],3); | ||
401 | clim=max(max(max(abs(F(~isinf(F))))),eps); | ||
402 | imagesc(F,[-clim clim]); | ||
403 | colormap(privateColorMap(1)); | ||
404 | set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])]) | ||
405 | title('Score for best path') | ||
406 | xlabel('Sequence 1') | ||
407 | ylabel('Sequence 2') | ||
408 | hold on | ||
409 | plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.') | ||
410 | end | ||
411 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
412 | % function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap) | ||
413 | % % Standard Needleman-Wunsch algorithm | ||
414 | % | ||
415 | % %LOOP IMPLEMENTED IN FastNWalign2.c | ||
416 | % | ||
417 | % % set up storage for dynamic programming matrix | ||
418 | % F = zeros(n+1,m+1); | ||
419 | % F(2:end,1) = gap * (1:n)'; | ||
420 | % F(1,2:end) = gap * (1:m); | ||
421 | % | ||
422 | % | ||
423 | % %and for the back tracing matrix | ||
424 | % pointer= repmat(uint8(4),n+1,m+1); | ||
425 | % pointer(:,1) = 2; % up | ||
426 | % pointer(1,1) = 1; | ||
427 | % | ||
428 | % | ||
429 | % %initialize buffers to the first column | ||
430 | % ptr = pointer(:,2); % ptr(1) is always 4 | ||
431 | % currentFColumn = F(:,1); | ||
432 | % | ||
433 | % | ||
434 | % %main loop runs through the matrix looking for maximal scores | ||
435 | % | ||
436 | % for outer = 2:m+1 | ||
437 | % | ||
438 | % %score current column | ||
439 | % scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1)); | ||
440 | % %grab the data from the matrices and initialize some values | ||
441 | % lastFColumn = F(:,outer-1); | ||
442 | % currentFColumn = F(:,outer); | ||
443 | % best = currentFColumn(1); | ||
444 | % | ||
445 | % %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap); | ||
446 | % | ||
447 | % for inner = 2:n+1 | ||
448 | % % score the three options | ||
449 | % up = best + gap; | ||
450 | % left = lastFColumn(inner) + gap; | ||
451 | % diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1); | ||
452 | % | ||
453 | % % max could be used here but it is quicker to use if statements | ||
454 | % if up > left | ||
455 | % best = up; | ||
456 | % pos = 2; | ||
457 | % else | ||
458 | % best = left; | ||
459 | % pos = 4; | ||
460 | % end | ||
461 | % | ||
462 | % if diagonal >= best | ||
463 | % best = diagonal; | ||
464 | % ptr(inner) = 1; | ||
465 | % else | ||
466 | % ptr(inner) = pos; | ||
467 | % end | ||
468 | % currentFColumn(inner) = best; | ||
469 | % | ||
470 | % end % inner | ||
471 | % %put back updated columns | ||
472 | % | ||
473 | % % save columns of pointers | ||
474 | % F(:,outer) = currentFColumn; | ||
475 | % % save columns of pointers | ||
476 | % pointer(:,outer) = ptr; | ||
477 | % | ||
478 | % end % outer | ||
479 | % | ||
480 | % end | ||
481 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
482 | function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend) | ||
483 | % Needleman-Wunsch algorithm modified to handle affine gaps | ||
484 | |||
485 | % Set states | ||
486 | inAlign = 1; | ||
487 | inGapUp = 2; | ||
488 | inGapLeft = 3; | ||
489 | numStates = 3; | ||
490 | |||
491 | % Set up storage for dynamic programming matrix: | ||
492 | % for keeping the maximum scores for every state | ||
493 | |||
494 | F = zeros(n+1,m+1,numStates); | ||
495 | F(:,1,:) = -inf; | ||
496 | F(1,:,:) = -inf; | ||
497 | F(1,1,inAlign) = 0; | ||
498 | |||
499 | F(2:end,1,inGapUp) = gapopen + gapextend * (0:n-1)'; | ||
500 | F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1); | ||
501 | |||
502 | % and for the back tracing pointers | ||
503 | pointer(n+1,m+1,numStates) = uint8(0); | ||
504 | pointer(2:end,1,inGapUp) = 2; % up | ||
505 | pointer(1,2:end,inGapLeft) = 4; % left | ||
506 | |||
507 | % initialize buffers to the first column | ||
508 | ptrA = pointer(:,1,inAlign); | ||
509 | ptrU = pointer(:,1,inGapLeft); | ||
510 | ptrL = pointer(:,1,inGapUp); | ||
511 | |||
512 | currentFColumnA = F(:,1,inAlign); | ||
513 | currentFColumnU = F(:,1,inGapUp); | ||
514 | currentFColumnL = F(:,1,inGapLeft); | ||
515 | |||
516 | % main loop runs through the matrix looking for maximal scores | ||
517 | for outer = 2:m+1 | ||
518 | % score current column | ||
519 | scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1)); | ||
520 | % grab the data from the matrices and initialize some values for the | ||
521 | % first row the most orderly possible | ||
522 | lastFColumnA = currentFColumnA; | ||
523 | currentFColumnA = F(:,outer,inAlign); | ||
524 | bestA = currentFColumnA(1); | ||
525 | currentinA = lastFColumnA(1); | ||
526 | |||
527 | lastFColumnU = currentFColumnU; | ||
528 | currentFColumnU = F(:,outer,inGapUp); | ||
529 | bestU = currentFColumnU(1); | ||
530 | |||
531 | lastFColumnL = currentFColumnL; | ||
532 | currentFColumnL = F(:,outer,inGapLeft); | ||
533 | currentinGL = lastFColumnL(1); | ||
534 | |||
535 | for inner = 2:n+1 | ||
536 | |||
537 | % grab the data from the columns the most orderly possible | ||
538 | upOpen = bestA + gapopen; | ||
539 | inA = currentinA; | ||
540 | currentinA = lastFColumnA(inner); | ||
541 | leftOpen = currentinA + gapopen; | ||
542 | |||
543 | inGL = currentinGL; | ||
544 | currentinGL = lastFColumnL(inner); | ||
545 | leftExtend = currentinGL + gapextend; | ||
546 | |||
547 | upExtend = bestU + gapextend; | ||
548 | inGU = lastFColumnU(inner-1); | ||
549 | |||
550 | % operate state 'inGapUp' | ||
551 | |||
552 | if upOpen > upExtend | ||
553 | bestU = upOpen; ptr = 1; % diagonal | ||
554 | elseif upOpen < upExtend | ||
555 | bestU = upExtend; ptr = 2; % up | ||
556 | else % upOpen == upExtend | ||
557 | bestU = upOpen; ptr = 3; % diagonal and up | ||
558 | end | ||
559 | currentFColumnU(inner)=bestU; | ||
560 | ptrU(inner)=ptr; | ||
561 | |||
562 | % operate state 'inGapLeft' | ||
563 | |||
564 | if leftOpen > leftExtend | ||
565 | bestL = leftOpen; ptr = 1; % diagonal | ||
566 | elseif leftOpen < leftExtend | ||
567 | bestL = leftExtend; ptr = 4; % left | ||
568 | else % leftOpen == leftExtend | ||
569 | bestL = leftOpen; ptr = 5; % diagonal and left | ||
570 | end | ||
571 | currentFColumnL(inner) = bestL; | ||
572 | ptrL(inner) = ptr; | ||
573 | |||
574 | % operate state 'inAlign' | ||
575 | |||
576 | if inA > inGU | ||
577 | if inA > inGL | ||
578 | bestA = inA; ptr = 1; % diagonal | ||
579 | elseif inGL > inA | ||
580 | bestA = inGL; ptr = 4; % left | ||
581 | else | ||
582 | bestA = inA; ptr = 5; % diagonal and left | ||
583 | end | ||
584 | elseif inGU > inA | ||
585 | if inGU > inGL | ||
586 | bestA = inGU; ptr = 2; % up | ||
587 | elseif inGL > inGU | ||
588 | bestA = inGL; ptr = 4; % left | ||
589 | else | ||
590 | bestA = inGU; ptr = 6; % up & left | ||
591 | end | ||
592 | else | ||
593 | if inA > inGL | ||
594 | bestA = inA; ptr = 3; % diagonal & up | ||
595 | elseif inGL > inA | ||
596 | bestA = inGL; ptr = 4; % left | ||
597 | else | ||
598 | bestA = inA; ptr = 7; % all | ||
599 | end | ||
600 | end | ||
601 | |||
602 | bestA = bestA + scoredMatchColumn(inner-1); | ||
603 | currentFColumnA(inner) = bestA; | ||
604 | ptrA(inner) = ptr; | ||
605 | |||
606 | end %inner | ||
607 | |||
608 | % put back updated columns | ||
609 | F(:,outer,inGapLeft) = currentFColumnL; | ||
610 | F(:,outer,inGapUp) = currentFColumnU; | ||
611 | F(:,outer,inAlign) = currentFColumnA; | ||
612 | % save columns of pointers | ||
613 | pointer(:,outer,inAlign) = ptrA; | ||
614 | pointer(:,outer,inGapUp) = ptrU; | ||
615 | pointer(:,outer,inGapLeft)= ptrL; | ||
616 | end %outer | ||
617 | end | ||
618 | function pcmap = privateColorMap(selection) | ||
619 | %PRIVATECOLORMAP returns a custom color map | ||
620 | switch selection | ||
621 | case 1, pts = [0 0 .3 20; | ||
622 | 0 .1 .8 25; | ||
623 | 0 .9 .5 15; | ||
624 | .9 1 .9 8; | ||
625 | 1 1 0 26; | ||
626 | 1 0 0 26; | ||
627 | .4 0 0 0]; | ||
628 | otherwise, pts = [0 0 0 128; 1 1 1 0]; | ||
629 | end | ||
630 | xcl=1; | ||
631 | for i=1:size(pts,1)-1 | ||
632 | xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1]; | ||
633 | end | ||
634 | pcmap = interp1(pts(:,1:3),xcl); | ||
635 | end | ||
636 | end | ||
637 | |||