-rw-r--r-- | src/aggregator.c | 3 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c | 110 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c | 13 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h | 2 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c | 30 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h | 2 | ||||
-rw-r--r-- | src/model/gi_type_data_init.c | 4 | ||||
-rw-r--r-- | src/model/gi_type_data_init.h | 4 | ||||
-rw-r--r-- | src/model/sequence_data_init.c | 18 | ||||
-rw-r--r-- | src/model/sequence_data_init.h | 6 |
10 files changed, 99 insertions, 93 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 9a0717b..1df4c8d 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c | |||
@@ -70,7 +70,7 @@ assign_protein_type (hid_t file_id) | |||
70 | if (status < 0) | 70 | if (status < 0) |
71 | check_h5_error (status, __FILE__, __LINE__); | 71 | check_h5_error (status, __FILE__, __LINE__); |
72 | 72 | ||
73 | sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords); | 73 | sequence_data *faa_buf = malloc (sizeof (sequence_data) * faa_nrecords); |
74 | 74 | ||
75 | size_t faa_size; | 75 | size_t faa_size; |
76 | size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; | 76 | size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; |
@@ -86,7 +86,7 @@ assign_protein_type (hid_t file_id) | |||
86 | /* | 86 | /* |
87 | * Allocate memory for the new table. | 87 | * Allocate memory for the new table. |
88 | */ | 88 | */ |
89 | gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords); | 89 | gi_type_data *new_buf = malloc (sizeof (gi_type_data) * faa_nrecords); |
90 | if (new_buf == NULL) | 90 | if (new_buf == NULL) |
91 | check_error (__FILE__, __LINE__); | 91 | check_error (__FILE__, __LINE__); |
92 | 92 | ||
@@ -101,7 +101,7 @@ assign_protein_type (hid_t file_id) | |||
101 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; | 101 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; |
102 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); | 102 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); |
103 | 103 | ||
104 | gi_type_data* old_buf = NULL; | 104 | gi_type_data *old_buf = NULL; |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * If the table is already present read the values into memory and | 107 | * If the table is already present read the values into memory and |
@@ -117,10 +117,11 @@ assign_protein_type (hid_t file_id) | |||
117 | if (status < 0) | 117 | if (status < 0) |
118 | check_h5_error (status, __FILE__, __LINE__); | 118 | check_h5_error (status, __FILE__, __LINE__); |
119 | 119 | ||
120 | printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords); | 120 | printf (" Using gi_type_data cache of %i records.\n", |
121 | 121 | (int) gi_nrecords); | |
122 | old_buf = malloc (sizeof(gi_type_data) * gi_nrecords); | 122 | |
123 | 123 | old_buf = malloc (sizeof (gi_type_data) * gi_nrecords); | |
124 | |||
124 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, | 125 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, |
125 | gi_sizes, old_buf); | 126 | gi_sizes, old_buf); |
126 | if (status < 0) | 127 | if (status < 0) |
@@ -129,18 +130,18 @@ assign_protein_type (hid_t file_id) | |||
129 | status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords); | 130 | status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords); |
130 | if (status < 0) | 131 | if (status < 0) |
131 | check_h5_error (status, __FILE__, __LINE__); | 132 | check_h5_error (status, __FILE__, __LINE__); |
132 | 133 | ||
133 | } | 134 | } |
134 | 135 | ||
135 | /* | 136 | /* |
136 | * If the table is not already present create it. | 137 | * If the table is not already present create it. |
137 | */ | 138 | */ |
138 | else | 139 | else |
139 | { | 140 | { |
140 | 141 | ||
141 | printf ("Creating gi_type_data.\n"); | 142 | printf ("Creating gi_type_data.\n"); |
142 | 143 | ||
143 | const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = | 144 | const char *gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = |
144 | GI_TYPE_DATA_FIELD_NAMES; | 145 | GI_TYPE_DATA_FIELD_NAMES; |
145 | 146 | ||
146 | hsize_t chunk_size = 10; | 147 | hsize_t chunk_size = 10; |
@@ -152,8 +153,7 @@ assign_protein_type (hid_t file_id) | |||
152 | GI_TYPE_DATA_FIELD_NUM, 0, | 153 | GI_TYPE_DATA_FIELD_NUM, 0, |
153 | gi_size, gi_type_data_field_names, | 154 | gi_size, gi_type_data_field_names, |
154 | gi_offset, gi_field_type, | 155 | gi_offset, gi_field_type, |
155 | chunk_size, fill_data, compress, | 156 | chunk_size, fill_data, compress, NULL); |
156 | NULL); | ||
157 | if (status < 0) | 157 | if (status < 0) |
158 | check_h5_error (status, __FILE__, __LINE__); | 158 | check_h5_error (status, __FILE__, __LINE__); |
159 | 159 | ||
@@ -169,7 +169,7 @@ assign_protein_type (hid_t file_id) | |||
169 | "Allocation of cache failed."); | 169 | "Allocation of cache failed."); |
170 | ENTRY e, *ep; | 170 | ENTRY e, *ep; |
171 | 171 | ||
172 | for (int i = 0; i < (int)gi_nrecords; i++) | 172 | for (int i = 0; i < (int) gi_nrecords; i++) |
173 | { | 173 | { |
174 | char gi_chr[25]; | 174 | char gi_chr[25]; |
175 | snprintf (gi_chr, 25, "%i", old_buf[i].gi); | 175 | snprintf (gi_chr, 25, "%i", old_buf[i].gi); |
@@ -183,14 +183,14 @@ assign_protein_type (hid_t file_id) | |||
183 | /* | 183 | /* |
184 | * Assign protein types to records for which the field is empty. | 184 | * Assign protein types to records for which the field is empty. |
185 | */ | 185 | */ |
186 | printf ("Records to process: %i\n", (int)faa_nrecords); | 186 | printf ("Records to process: %i\n", (int) faa_nrecords); |
187 | int written = 0; | 187 | int written = 0; |
188 | for (int i = 0; i < (int)faa_nrecords; i++) | 188 | for (int i = 0; i < (int) faa_nrecords; i++) |
189 | { | 189 | { |
190 | new_buf[i].gi = faa_buf[i].gi; | 190 | new_buf[i].gi = faa_buf[i].gi; |
191 | strncpy (new_buf[i].type, "", sizeof (new_buf[i].type)); | 191 | strncpy (new_buf[i].type, "", sizeof (new_buf[i].type)); |
192 | strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein)); | 192 | strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein)); |
193 | 193 | ||
194 | char gi_chr[25]; | 194 | char gi_chr[25]; |
195 | snprintf (gi_chr, 25, "%i", faa_buf[i].gi); | 195 | snprintf (gi_chr, 25, "%i", faa_buf[i].gi); |
196 | e.key = gi_chr; | 196 | e.key = gi_chr; |
@@ -199,24 +199,23 @@ assign_protein_type (hid_t file_id) | |||
199 | /* | 199 | /* |
200 | * A record was not found in the cache for this gi. | 200 | * A record was not found in the cache for this gi. |
201 | */ | 201 | */ |
202 | if (hsearch_r (e, FIND, &ep, &htab) == 0) | 202 | if (hsearch_r (e, FIND, &ep, &htab) == 0) |
203 | { | 203 | { |
204 | 204 | ||
205 | /* | 205 | /* |
206 | * Read the sequence from the database by GI. | 206 | * Read the sequence from the database by GI. |
207 | */ | 207 | */ |
208 | Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL); | 208 | Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL); |
209 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | 209 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); |
210 | if (bsp == NULL) | 210 | if (bsp == NULL) |
211 | { | 211 | { |
212 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | 212 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, |
213 | "Unable to find BLAST record for gi|%i. Ensure " | 213 | "Unable to find BLAST record for gi|%i. Ensure " |
214 | "the BLAST database is up-to-date with the HDF5 " | 214 | "the BLAST database is up-to-date with the HDF5 " |
215 | "record set. See the BLAST formatdb.log file " | 215 | "record set. See the BLAST formatdb.log file " |
216 | "for details.\n", | 216 | "for details.\n", faa_buf[i].gi); |
217 | faa_buf[i].gi); | ||
218 | } | 217 | } |
219 | 218 | ||
220 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | 219 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, |
221 | "blastp", | 220 | "blastp", |
222 | REFDB, | 221 | REFDB, |
@@ -224,19 +223,19 @@ assign_protein_type (hid_t file_id) | |||
224 | NULL, | 223 | NULL, |
225 | &error_returns, | 224 | &error_returns, |
226 | NULL); | 225 | NULL); |
227 | 226 | ||
228 | /* | 227 | /* |
229 | * BLAST reported an error. Write it out and continue processing. | 228 | * BLAST reported an error. Write it out and continue processing. |
230 | */ | 229 | */ |
231 | if (error_returns != NULL) | 230 | if (error_returns != NULL) |
232 | { | 231 | { |
233 | CharPtr msg = BlastErrorToString (error_returns); | 232 | CharPtr msg = BlastErrorToString (error_returns); |
234 | printf ("Warning: An error has been reported by the NCBI Toolkit " | 233 | printf |
235 | "API for sequence gi|%i: %s", | 234 | ("Warning: An error has been reported by the NCBI Toolkit " |
236 | faa_buf[i].gi, msg); | 235 | "API for sequence gi|%i: %s", faa_buf[i].gi, msg); |
237 | free (msg); | 236 | free (msg); |
238 | } | 237 | } |
239 | 238 | ||
240 | /* | 239 | /* |
241 | * A hit was found. Record the first hit as the protein type. | 240 | * A hit was found. Record the first hit as the protein type. |
242 | * Skip the first 4 characters and eat the "lcl|". | 241 | * Skip the first 4 characters and eat the "lcl|". |
@@ -245,18 +244,18 @@ assign_protein_type (hid_t file_id) | |||
245 | { | 244 | { |
246 | Char target_id_buf[BUFFER_LEN + 1]; | 245 | Char target_id_buf[BUFFER_LEN + 1]; |
247 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | 246 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); |
248 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, | 247 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, |
249 | BUFFER_LEN); | 248 | BUFFER_LEN); |
250 | 249 | ||
251 | // Species Type | 250 | // Species Type |
252 | new_buf[i].type[0] = target_id_buf[4]; | 251 | new_buf[i].type[0] = target_id_buf[4]; |
253 | new_buf[i].type[1] = '\0'; | 252 | new_buf[i].type[1] = '\0'; |
254 | 253 | ||
255 | // Protein Type (Skip the underscore in the string). | 254 | // Protein Type (Skip the underscore in the string). |
256 | strncpy (new_buf[i].protein, &target_id_buf[6], | 255 | strncpy (new_buf[i].protein, &target_id_buf[6], |
257 | sizeof (new_buf[i].protein)); | 256 | sizeof (new_buf[i].protein)); |
258 | } | 257 | } |
259 | 258 | ||
260 | /* | 259 | /* |
261 | * BLAST did not find any hits. | 260 | * BLAST did not find any hits. |
262 | */ | 261 | */ |
@@ -265,73 +264,74 @@ assign_protein_type (hid_t file_id) | |||
265 | printf ("Warning: Unable to identify protein type for sequence " | 264 | printf ("Warning: Unable to identify protein type for sequence " |
266 | "gi|%i\n", faa_buf[i].gi); | 265 | "gi|%i\n", faa_buf[i].gi); |
267 | } | 266 | } |
268 | 267 | ||
269 | /* | 268 | /* |
270 | * Clean up memory for the next ieration. | 269 | * Clean up memory for the next ieration. |
271 | */ | 270 | */ |
272 | seqalign = SeqAlignSetFree (seqalign); | 271 | seqalign = SeqAlignSetFree (seqalign); |
273 | bsp = BioseqFree (bsp); | 272 | bsp = BioseqFree (bsp); |
274 | 273 | ||
275 | } // End existing entry not found. | 274 | } // End existing entry not found. |
276 | 275 | ||
277 | /* | 276 | /* |
278 | * Hash table entry found. Keep the old value. | 277 | * Hash table entry found. Keep the old value. |
279 | */ | 278 | */ |
280 | else | 279 | else |
281 | { | 280 | { |
282 | gi_type_data* old_value = (gi_type_data*)ep->data; | 281 | gi_type_data *old_value = (gi_type_data *) ep->data; |
283 | new_buf[i].gi = old_value->gi; | 282 | new_buf[i].gi = old_value->gi; |
284 | strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type)); | 283 | strncpy (new_buf[i].type, old_value->type, |
285 | strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein)); | 284 | sizeof (new_buf[i].type)); |
285 | strncpy (new_buf[i].protein, old_value->protein, | ||
286 | sizeof (new_buf[i].protein)); | ||
286 | } | 287 | } |
287 | 288 | ||
288 | /* | 289 | /* |
289 | * Write the data out to the file. | 290 | * Write the data out to the file. |
290 | */ | 291 | */ |
291 | if ( (i % 1000 == 0) && (i > 0) ) | 292 | if ((i % 1000 == 0) && (i > 0)) |
292 | { | 293 | { |
293 | status = H5TBappend_records (file_id, "gi_type_data", 1000, | 294 | status = H5TBappend_records (file_id, "gi_type_data", 1000, |
294 | gi_size, gi_offset, gi_sizes, | 295 | gi_size, gi_offset, gi_sizes, |
295 | &new_buf[i-1000]); | 296 | &new_buf[i - 1000]); |
296 | if (status < 0) | 297 | if (status < 0) |
297 | check_h5_error (status, __FILE__, __LINE__); | 298 | check_h5_error (status, __FILE__, __LINE__); |
298 | 299 | ||
299 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | 300 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); |
300 | if (status < 0) | 301 | if (status < 0) |
301 | check_h5_error (status, __FILE__, __LINE__); | 302 | check_h5_error (status, __FILE__, __LINE__); |
302 | 303 | ||
303 | written = i; | 304 | written = i; |
304 | 305 | ||
305 | printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); | 306 | printf ("Processed %i of %i records.\n", i, (int) faa_nrecords); |
306 | } | 307 | } |
307 | 308 | ||
308 | } | 309 | } |
309 | 310 | ||
310 | /* | 311 | /* |
311 | * Write out records from the last bin if it was less than 1000 | 312 | * Write out records from the last bin if it was less than 1000 |
312 | * records in size. | 313 | * records in size. |
313 | */ | 314 | */ |
314 | if ((int)faa_nrecords < 1000) | 315 | if ((int) faa_nrecords < 1000) |
315 | { | 316 | { |
316 | status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords, | 317 | status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords, |
317 | gi_size, gi_offset, gi_sizes, | 318 | gi_size, gi_offset, gi_sizes, new_buf); |
318 | new_buf); | ||
319 | } | 319 | } |
320 | 320 | ||
321 | else | 321 | else |
322 | { | 322 | { |
323 | status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written, | 323 | status = |
324 | gi_size, gi_offset, gi_sizes, | 324 | H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written, |
325 | &new_buf[written]); | 325 | gi_size, gi_offset, gi_sizes, &new_buf[written]); |
326 | } | 326 | } |
327 | 327 | ||
328 | if (status < 0) | 328 | if (status < 0) |
329 | check_h5_error (status, __FILE__, __LINE__); | 329 | check_h5_error (status, __FILE__, __LINE__); |
330 | 330 | ||
331 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | 331 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); |
332 | if (status < 0) | 332 | if (status < 0) |
333 | check_h5_error (status, __FILE__, __LINE__); | 333 | check_h5_error (status, __FILE__, __LINE__); |
334 | 334 | ||
335 | free (faa_buf); | 335 | free (faa_buf); |
336 | free (old_buf); | 336 | free (old_buf); |
337 | free (new_buf); | 337 | free (new_buf); |
@@ -339,6 +339,6 @@ assign_protein_type (hid_t file_id) | |||
339 | 339 | ||
340 | options = BLASTOptionDelete (options); | 340 | options = BLASTOptionDelete (options); |
341 | readdb_destruct (seqdb); | 341 | readdb_destruct (seqdb); |
342 | 342 | ||
343 | return; | 343 | return; |
344 | } | 344 | } |