-rw-r--r-- | src/aggregator.c | 34 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c | 141 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h | 2 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h | 2 | ||||
-rw-r--r-- | src/updator.c | 2 |
7 files changed, 201 insertions, 86 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 73685bb..3947800 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #define _GNU_SOURCE | 1 | #define _GNU_SOURCE |
2 | #include "assign_protein_type.h" | 2 | #include "assign_protein_type.h" |
3 | #include "error/check_error.h" | ||
3 | #include "error/check_h5_error.h" | 4 | #include "error/check_h5_error.h" |
4 | #include "error/check_ncbi_error.h" | 5 | #include "error/check_ncbi_error.h" |
5 | #include "model/gi_type_data.h" | 6 | #include "model/gi_type_data.h" |
@@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id) | |||
84 | check_h5_error (status, __FILE__, __LINE__); | 85 | check_h5_error (status, __FILE__, __LINE__); |
85 | 86 | ||
86 | /* | 87 | /* |
88 | * Allocate memory for the new table. | ||
89 | */ | ||
90 | gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords); | ||
91 | if (new_buf == NULL) | ||
92 | check_error (__FILE__, __LINE__); | ||
93 | |||
94 | /* | ||
87 | * Read the data from HDF5 gi_type_data. | 95 | * Read the data from HDF5 gi_type_data. |
88 | */ | 96 | */ |
89 | hsize_t gi_nfields = 0; | 97 | hsize_t gi_nfields = 0; |
@@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id) | |||
94 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; | 102 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; |
95 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); | 103 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); |
96 | 104 | ||
97 | gi_type_data* gi_buf = NULL; | 105 | gi_type_data* old_buf = NULL; |
98 | 106 | ||
107 | /* | ||
108 | * If the table is already present read the values into memory and | ||
109 | * then clear the table. | ||
110 | */ | ||
99 | if (H5LTfind_dataset (file_id, "gi_type_data") == 1) | 111 | if (H5LTfind_dataset (file_id, "gi_type_data") == 1) |
100 | { | 112 | { |
101 | 113 | ||
@@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id) | |||
105 | &gi_nrecords); | 117 | &gi_nrecords); |
106 | if (status < 0) | 118 | if (status < 0) |
107 | check_h5_error (status, __FILE__, __LINE__); | 119 | check_h5_error (status, __FILE__, __LINE__); |
120 | |||
121 | printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords); | ||
108 | 122 | ||
109 | gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); | 123 | old_buf = malloc (sizeof(gi_type_data) * gi_nrecords); |
110 | 124 | ||
111 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, | 125 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, |
112 | gi_sizes, gi_buf); | 126 | gi_sizes, old_buf); |
127 | if (status < 0) | ||
128 | check_h5_error (status, __FILE__, __LINE__); | ||
129 | |||
130 | status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords); | ||
113 | if (status < 0) | 131 | if (status < 0) |
114 | check_h5_error (status, __FILE__, __LINE__); | 132 | check_h5_error (status, __FILE__, __LINE__); |
115 | 133 | ||
116 | } | 134 | } |
135 | |||
136 | /* | ||
137 | * If the table is not already present create it. | ||
138 | */ | ||
117 | else | 139 | else |
118 | { | 140 | { |
119 | 141 | ||
120 | printf ("Creating gi_type_data.\n"); | 142 | printf ("Creating gi_type_data.\n"); |
121 | 143 | ||
122 | gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); | ||
123 | |||
124 | const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = | 144 | const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = |
125 | GI_TYPE_DATA_FIELD_NAMES; | 145 | GI_TYPE_DATA_FIELD_NAMES; |
126 | 146 | ||
@@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id) | |||
130 | 150 | ||
131 | status = H5TBmake_table ("gi_type_data", file_id, | 151 | status = H5TBmake_table ("gi_type_data", file_id, |
132 | "gi_type_data", | 152 | "gi_type_data", |
133 | GI_TYPE_DATA_FIELD_NUM, faa_nrecords, | 153 | GI_TYPE_DATA_FIELD_NUM, 0, |
134 | gi_size, gi_type_data_field_names, | 154 | gi_size, gi_type_data_field_names, |
135 | gi_offset, gi_field_type, | 155 | gi_offset, gi_field_type, |
136 | chunk_size, fill_data, compress, | 156 | chunk_size, fill_data, compress, |
@@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id) | |||
140 | 160 | ||
141 | } | 161 | } |
142 | 162 | ||
163 | /* | ||
164 | * Copy the contents of the old table into a hash. | ||
165 | */ | ||
143 | struct hsearch_data htab; | 166 | struct hsearch_data htab; |
144 | bzero (&htab, sizeof (htab)); | 167 | bzero (&htab, sizeof (htab)); |
145 | hcreate_r (gi_nrecords * 2, &htab); | 168 | if (hcreate_r (gi_nrecords * 2, &htab) == 0) |
169 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | ||
170 | "Allocation of cache failed."); | ||
146 | ENTRY e, *ep; | 171 | ENTRY e, *ep; |
147 | 172 | ||
148 | for (int i = 0; i < gi_nrecords; i++) | 173 | for (int i = 0; i < (int)gi_nrecords; i++) |
149 | { | 174 | { |
150 | char gi_chr[25]; | 175 | char gi_chr[25]; |
151 | snprintf (gi_chr, 25, "%i", gi_buf[i].gi); | 176 | snprintf (gi_chr, 25, "%i", old_buf[i].gi); |
152 | e.key = gi_chr; | 177 | e.key = strdup (gi_chr); |
153 | e.data = &gi_buf[i]; | 178 | e.data = &old_buf[i]; |
154 | if (hsearch_r (e, ENTER, &ep, &htab) == 0) | 179 | if (hsearch_r (e, ENTER, &ep, &htab) == 0) |
155 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | 180 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, |
156 | "Allocation failed."); | 181 | "Allocation failed."); |
@@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id) | |||
160 | * Assign protein types to records for which the field is empty. | 185 | * Assign protein types to records for which the field is empty. |
161 | */ | 186 | */ |
162 | printf ("Records to process: %i\n", (int)faa_nrecords); | 187 | printf ("Records to process: %i\n", (int)faa_nrecords); |
163 | bool updates_pending = false; | 188 | int written = 0; |
164 | for (int i = 0; i < faa_nrecords; i++) | 189 | for (int i = 0; i < (int)faa_nrecords; i++) |
165 | { | 190 | { |
191 | new_buf[i].gi = faa_buf[i].gi; | ||
192 | strncpy (new_buf[i].type, "", sizeof (new_buf[i].type)); | ||
193 | strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein)); | ||
166 | 194 | ||
167 | char gi_chr[25]; | 195 | char gi_chr[25]; |
168 | snprintf (gi_chr, 25, "%i", faa_buf[i].gi); | 196 | snprintf (gi_chr, 25, "%i", faa_buf[i].gi); |
169 | e.key = gi_chr; | 197 | e.key = gi_chr; |
198 | e.data = NULL; | ||
199 | |||
200 | /* | ||
201 | * A record was not found in the cache for this gi. | ||
202 | */ | ||
170 | if (hsearch_r (e, FIND, &ep, &htab) == 0) | 203 | if (hsearch_r (e, FIND, &ep, &htab) == 0) |
171 | { | 204 | { |
172 | |||
173 | gi_buf[i].gi = faa_buf[i].gi; | ||
174 | gi_buf[i].type[0] = '\0'; | ||
175 | gi_buf[i].protein[0] = '\0'; | ||
176 | 205 | ||
177 | /* | 206 | /* |
178 | * Read the sequence from the database by GI. | 207 | * Read the sequence from the database by GI. |
@@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id) | |||
202 | */ | 231 | */ |
203 | if (error_returns != NULL) | 232 | if (error_returns != NULL) |
204 | { | 233 | { |
205 | char *msg = BlastErrorToString (error_returns); | 234 | CharPtr msg = BlastErrorToString (error_returns); |
206 | printf ("Warning: An error has been reported by the NCBI Toolkit " | 235 | printf ("Warning: An error has been reported by the NCBI Toolkit " |
207 | "API for sequence gi|%i: %s", | 236 | "API for sequence gi|%i: %s", |
208 | faa_buf[i].gi, msg); | 237 | faa_buf[i].gi, msg); |
@@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id) | |||
221 | BUFFER_LEN); | 250 | BUFFER_LEN); |
222 | 251 | ||
223 | // Species Type | 252 | // Species Type |
224 | gi_buf[i].type[0] = target_id_buf[4]; | 253 | new_buf[i].type[0] = target_id_buf[4]; |
225 | gi_buf[i].type[1] = '\0'; | 254 | new_buf[i].type[1] = '\0'; |
226 | 255 | ||
227 | // Protein Type | 256 | // Protein Type |
228 | strncpy (gi_buf[i].protein, &target_id_buf[6], | 257 | strncpy (new_buf[i].protein, &target_id_buf[6], |
229 | sizeof (gi_buf[i].protein)); | 258 | sizeof (new_buf[i].protein)); |
230 | |||
231 | updates_pending = true; | ||
232 | } | 259 | } |
233 | 260 | ||
234 | /* | 261 | /* |
@@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id) | |||
246 | seqalign = SeqAlignSetFree (seqalign); | 273 | seqalign = SeqAlignSetFree (seqalign); |
247 | bsp = BioseqFree (bsp); | 274 | bsp = BioseqFree (bsp); |
248 | 275 | ||
276 | } // End existing entry not found. | ||
277 | |||
278 | /* | ||
279 | * Hash table entry found. Keep the old value. | ||
280 | */ | ||
281 | else | ||
282 | { | ||
283 | gi_type_data* old_value = (gi_type_data*)ep->data; | ||
284 | new_buf[i].gi = old_value->gi; | ||
285 | strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type)); | ||
286 | strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein)); | ||
249 | } | 287 | } |
250 | 288 | ||
251 | /* | 289 | /* |
252 | * Write the data out to the file. | 290 | * Write the data out to the file. |
253 | */ | 291 | */ |
254 | if ( (i % 1000 == 0) && (i > 0) && updates_pending) | 292 | if ( (i % 1000 == 0) && (i > 0) ) |
255 | { | 293 | { |
256 | status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, | 294 | status = H5TBappend_records (file_id, "gi_type_data", 1000, |
257 | gi_size, gi_offset, gi_sizes, | 295 | gi_size, gi_offset, gi_sizes, |
258 | &gi_buf[i-1000]); | 296 | &new_buf[i-1000]); |
259 | if (status < 0) | 297 | if (status < 0) |
260 | check_h5_error (status, __FILE__, __LINE__); | 298 | check_h5_error (status, __FILE__, __LINE__); |
261 | 299 | ||
@@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id) | |||
263 | if (status < 0) | 301 | if (status < 0) |
264 | check_h5_error (status, __FILE__, __LINE__); | 302 | check_h5_error (status, __FILE__, __LINE__); |
265 | 303 | ||
266 | updates_pending = false; | 304 | written = i; |
267 | 305 | ||
268 | printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); | 306 | printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); |
269 | } | 307 | } |
@@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id) | |||
274 | * Write out records from the last bin if it was less than 1000 | 312 | * Write out records from the last bin if it was less than 1000 |
275 | * records in size. | 313 | * records in size. |
276 | */ | 314 | */ |
277 | if (updates_pending) | 315 | if ((int)faa_nrecords < 1000) |
278 | { | 316 | { |
279 | /* | 317 | status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords, |
280 | if ((int)faa_nrecords < 1000) | 318 | gi_size, gi_offset, gi_sizes, |
281 | { | 319 | new_buf); |
282 | status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, | 320 | } |
283 | dst_size, dst_offset, dst_sizes, | 321 | |
284 | dst_buf); | 322 | else |
285 | } | 323 | { |
286 | else | 324 | status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written, |
287 | { | 325 | gi_size, gi_offset, gi_sizes, |
288 | status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, | 326 | &new_buf[written]); |
289 | dst_size, dst_offset, dst_sizes, | ||
290 | &dst_buf[nrecords-1000]); | ||
291 | } | ||
292 | if (status < 0) | ||
293 | check_h5_error (status, __FILE__, __LINE__); | ||
294 | |||
295 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
296 | if (status < 0) | ||
297 | check_h5_error (status, __FILE__, __LINE__); | ||
298 | |||
299 | updates_pending = false; | ||
300 | */ | ||
301 | } | 327 | } |
328 | |||
329 | if (status < 0) | ||
330 | check_h5_error (status, __FILE__, __LINE__); | ||
331 | |||
332 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
333 | if (status < 0) | ||
334 | check_h5_error (status, __FILE__, __LINE__); | ||
302 | 335 | ||
303 | free (faa_buf); | 336 | free (faa_buf); |
304 | free (gi_buf); | 337 | free (old_buf); |
338 | free (new_buf); | ||
305 | hdestroy_r (&htab); | 339 | hdestroy_r (&htab); |
306 | 340 | ||
307 | options = BLASTOptionDelete (options); | 341 | options = BLASTOptionDelete (options); |
342 | readdb_destruct (seqdb); | ||
308 | 343 | ||
309 | return; | 344 | return; |
310 | } | 345 | } |