-rw-r--r-- | analysis/year.R | 20 | ||||
-rw-r--r-- | src/Makefile.am | 39 | ||||
-rw-r--r-- | src/aggregator.c | 6 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c) | 87 | ||||
-rw-r--r-- | src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h) | 0 | ||||
-rw-r--r-- | src/error/check_error.c (renamed from src/check_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_error.h (renamed from src/check_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.c (renamed from src/check_h5_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.h (renamed from src/check_h5_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c) | 4 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c) | 10 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h) | 0 | ||||
-rw-r--r-- | src/model/gi_type_data.h | 21 | ||||
-rw-r--r-- | src/model/gi_type_data_init.c | 36 | ||||
-rw-r--r-- | src/model/gi_type_data_init.h | 14 | ||||
-rw-r--r-- | src/model/sequence_data.h (renamed from src/sequence_data.h) | 5 | ||||
-rw-r--r-- | src/model/sequence_data_init.c (renamed from src/sequence_data_init.c) | 6 | ||||
-rw-r--r-- | src/model/sequence_data_init.h (renamed from src/sequence_data_init.h) | 0 | ||||
-rw-r--r-- | src/updator.c | 4 |
22 files changed, 181 insertions, 71 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c new file mode 100644 index 0000000..0a11c23 --- a/dev/null +++ b/src/assign/assign_protein_type.c | |||
@@ -0,0 +1,253 @@ | |||
1 | #include "assign_protein_type.h" | ||
2 | #include "error/check_ncbi_error.h" | ||
3 | #include "error/check_h5_error.h" | ||
4 | #include "model/gi_type_data.h" | ||
5 | #include "model/gi_type_data_init.h" | ||
6 | #include "model/sequence_data.h" | ||
7 | #include "model/sequence_data_init.h" | ||
8 | #include <ncbi.h> | ||
9 | #include <readdb.h> | ||
10 | #include <blast.h> | ||
11 | #include <salpacc.h> | ||
12 | #include <stdbool.h> | ||
13 | #include <hdf5_hl.h> | ||
14 | #include <error.h> | ||
15 | |||
16 | /* | ||
17 | * todo: Write results to an independent table by GI so that they can | ||
18 | * be reused when the input tables influenza.faa and influenza_aa.dat | ||
19 | * are replaced with new data. Since GI values are unique and | ||
20 | * persistent the assignment of a protein type should stay valid | ||
21 | * unless the list of reference sequences is updated. | ||
22 | * | ||
23 | * todo: Save the species type as a new field { A, B, C }. | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * BLAST database containing all of the influenza protein sequences. | ||
28 | */ | ||
29 | #define SEQDB "/home/don/exp004/influenzadb/influenzadb" | ||
30 | |||
31 | /* | ||
32 | * BLAST reference database of prototypical protein types. | ||
33 | */ | ||
34 | #define REFDB "/home/don/exp004/influenzadb/proteinnames" | ||
35 | |||
36 | #define BUFFER_LEN 50 | ||
37 | |||
38 | void | ||
39 | assign_protein_type (hid_t file_id) | ||
40 | { | ||
41 | /* | ||
42 | * Iterate through the records for which no protein type has been | ||
43 | * assigned. Create a BioSeq Pointer to the data and then use this | ||
44 | * as input to the BLAST run against the reference database of | ||
45 | * prototypical sequence records. | ||
46 | */ | ||
47 | |||
48 | /* | ||
49 | * Make BLAST messages reportable. | ||
50 | */ | ||
51 | ErrSetMessageLevel (SEV_WARNING); | ||
52 | |||
53 | /* | ||
54 | * Open the BLAST sequence database. | ||
55 | */ | ||
56 | ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); | ||
57 | |||
58 | /* | ||
59 | * Get default BLAST options. | ||
60 | */ | ||
61 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); | ||
62 | |||
63 | /* | ||
64 | * Set to single hit mode. | ||
65 | */ | ||
66 | options->window_size = 0; | ||
67 | options->multiple_hits_only = false; | ||
68 | |||
69 | ValNodePtr error_returns = NULL; | ||
70 | |||
71 | /* | ||
72 | * Read the data from HDF5 influenza.faa. | ||
73 | */ | ||
74 | hsize_t faa_nfields; | ||
75 | hsize_t faa_nrecords; | ||
76 | herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields, | ||
77 | &faa_nrecords); | ||
78 | if (status < 0) | ||
79 | check_h5_error (status, __FILE__, __LINE__); | ||
80 | |||
81 | sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords); | ||
82 | |||
83 | size_t faa_size; | ||
84 | size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; | ||
85 | size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM]; | ||
86 | hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM]; | ||
87 | sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type); | ||
88 | |||
89 | status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset, | ||
90 | faa_sizes, faa_buf); | ||
91 | if (status < 0) | ||
92 | check_h5_error (status, __FILE__, __LINE__); | ||
93 | |||
94 | /* | ||
95 | * Read the data from HDF5 gi_type_data. | ||
96 | */ | ||
97 | hsize_t gi_nfields; | ||
98 | hsize_t gi_nrecords; | ||
99 | status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, | ||
100 | &gi_nrecords); | ||
101 | if (status < 0) | ||
102 | check_h5_error (status, __FILE__, __LINE__); | ||
103 | |||
104 | gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); | ||
105 | |||
106 | size_t gi_size; | ||
107 | size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; | ||
108 | size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; | ||
109 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; | ||
110 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); | ||
111 | |||
112 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, | ||
113 | gi_sizes, gi_buf); | ||
114 | if (status < 0) | ||
115 | check_h5_error (status, __FILE__, __LINE__); | ||
116 | |||
117 | /* | ||
118 | * Assign protein types to records for which the field is empty. | ||
119 | */ | ||
120 | printf ("Records to process: %i\n", (int)faa_nrecords); | ||
121 | bool updates_pending = false; | ||
122 | for (int i = 0; i < faa_nrecords; i++) | ||
123 | { | ||
124 | |||
125 | if (dst_buf[i].protein_type[0] == '\0') { | ||
126 | /* | ||
127 | * Read the sequence from the database by GI. | ||
128 | */ | ||
129 | Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); | ||
130 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | ||
131 | if (bsp == NULL) | ||
132 | { | ||
133 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | ||
134 | "Unable to find BLAST record for gi|%i. Ensure the BLAST " | ||
135 | "database is up-to-date with the HDF5 record set. See the " | ||
136 | "BLAST formatdb.log file for details.\n", | ||
137 | dst_buf[i].gi); | ||
138 | } | ||
139 | |||
140 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
141 | "blastp", | ||
142 | REFDB, | ||
143 | options, | ||
144 | NULL, | ||
145 | &error_returns, | ||
146 | NULL); | ||
147 | |||
148 | /* | ||
149 | * BLAST reported an error. Write it out and continue processing. | ||
150 | */ | ||
151 | if (error_returns != NULL) | ||
152 | { | ||
153 | printf ("Warning: An error has been reported by the NCBI Toolkit API " | ||
154 | "for sequence gi|%i: %s", | ||
155 | dst_buf[i].gi, BlastErrorToString (error_returns)); | ||
156 | |||
157 | } | ||
158 | |||
159 | /* | ||
160 | * A hit was found. Record the first hit as the protein type. | ||
161 | * Skip the first 6 characters and eat the "lcl|x_". | ||
162 | */ | ||
163 | else if (seqalign != NULL) | ||
164 | { | ||
165 | Char target_id_buf[BUFFER_LEN + 1]; | ||
166 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
167 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
168 | |||
169 | // Species Type | ||
170 | dst_buf[i].type[0] = &target_id_buf[4]; | ||
171 | dst_buf[i].type[1] = '\0'; | ||
172 | |||
173 | // Protein Type | ||
174 | strncpy (dst_buf[i].protein, &target_id_buf[6], | ||
175 | sizeof (dst_buf[i].protein)); | ||
176 | |||
177 | updates_pending = true; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * BLAST did not find any hits. | ||
182 | */ | ||
183 | else | ||
184 | { | ||
185 | printf ("Warning: Unable to identify protein type for sequence gi|%i\n", | ||
186 | dst_buf[i].gi); | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Clean up memory for the next ieration. | ||
191 | */ | ||
192 | seqalign = SeqAlignSetFree (seqalign); | ||
193 | bsp = BioseqFree (bsp); | ||
194 | |||
195 | } | ||
196 | |||
197 | /* | ||
198 | * Write the data out to the file. | ||
199 | */ | ||
200 | if ( (i % 1000 == 0) && (i > 0) && updates_pending) | ||
201 | { | ||
202 | status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, | ||
203 | dst_size, dst_offset, dst_sizes, | ||
204 | &dst_buf[i-1000]); | ||
205 | if (status < 0) | ||
206 | check_h5_error (status, __FILE__, __LINE__); | ||
207 | |||
208 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
209 | if (status < 0) | ||
210 | check_h5_error (status, __FILE__, __LINE__); | ||
211 | |||
212 | updates_pending = false; | ||
213 | |||
214 | printf ("Processed %i of %i records.\n", i, (int)nrecords); | ||
215 | } | ||
216 | |||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Write out records from the last bin if it was less than 1000 | ||
221 | * records in size. | ||
222 | */ | ||
223 | if (updates_pending) | ||
224 | { | ||
225 | if ((int)nrecords < 1000) | ||
226 | { | ||
227 | status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, | ||
228 | dst_size, dst_offset, dst_sizes, | ||
229 | dst_buf); | ||
230 | } | ||
231 | else | ||
232 | { | ||
233 | status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, | ||
234 | dst_size, dst_offset, dst_sizes, | ||
235 | &dst_buf[nrecords-1000]); | ||
236 | } | ||
237 | if (status < 0) | ||
238 | check_h5_error (status, __FILE__, __LINE__); | ||
239 | |||
240 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
241 | if (status < 0) | ||
242 | check_h5_error (status, __FILE__, __LINE__); | ||
243 | |||
244 | updates_pending = false; | ||
245 | } | ||
246 | |||
247 | free (faa_buf); | ||
248 | free (gi_buf); | ||
249 | |||
250 | options = BLASTOptionDelete (options); | ||
251 | |||
252 | return; | ||
253 | } | ||