-rw-r--r-- | src/aggregator.c | 34 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c | 141 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h | 2 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h | 2 | ||||
-rw-r--r-- | src/updator.c | 2 |
7 files changed, 201 insertions, 86 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 73685bb..3947800 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c @@ -1,5 +1,6 @@ #define _GNU_SOURCE #include "assign_protein_type.h" +#include "error/check_error.h" #include "error/check_h5_error.h" #include "error/check_ncbi_error.h" #include "model/gi_type_data.h" @@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id) check_h5_error (status, __FILE__, __LINE__); /* + * Allocate memory for the new table. + */ + gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords); + if (new_buf == NULL) + check_error (__FILE__, __LINE__); + + /* * Read the data from HDF5 gi_type_data. */ hsize_t gi_nfields = 0; @@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id) hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); - gi_type_data* gi_buf = NULL; + gi_type_data* old_buf = NULL; + /* + * If the table is already present read the values into memory and + * then clear the table. + */ if (H5LTfind_dataset (file_id, "gi_type_data") == 1) { @@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id) &gi_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); + + printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords); - gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); + old_buf = malloc (sizeof(gi_type_data) * gi_nrecords); status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, - gi_sizes, gi_buf); + gi_sizes, old_buf); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); } + + /* + * If the table is not already present create it. + */ else { printf ("Creating gi_type_data.\n"); - gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); - const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = GI_TYPE_DATA_FIELD_NAMES; @@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id) status = H5TBmake_table ("gi_type_data", file_id, "gi_type_data", - GI_TYPE_DATA_FIELD_NUM, faa_nrecords, + GI_TYPE_DATA_FIELD_NUM, 0, gi_size, gi_type_data_field_names, gi_offset, gi_field_type, chunk_size, fill_data, compress, @@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id) } + /* + * Copy the contents of the old table into a hash. + */ struct hsearch_data htab; bzero (&htab, sizeof (htab)); - hcreate_r (gi_nrecords * 2, &htab); + if (hcreate_r (gi_nrecords * 2, &htab) == 0) + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Allocation of cache failed."); ENTRY e, *ep; - - for (int i = 0; i < gi_nrecords; i++) + + for (int i = 0; i < (int)gi_nrecords; i++) { char gi_chr[25]; - snprintf (gi_chr, 25, "%i", gi_buf[i].gi); - e.key = gi_chr; - e.data = &gi_buf[i]; + snprintf (gi_chr, 25, "%i", old_buf[i].gi); + e.key = strdup (gi_chr); + e.data = &old_buf[i]; if (hsearch_r (e, ENTER, &ep, &htab) == 0) error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, "Allocation failed."); @@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id) * Assign protein types to records for which the field is empty. */ printf ("Records to process: %i\n", (int)faa_nrecords); - bool updates_pending = false; - for (int i = 0; i < faa_nrecords; i++) + int written = 0; + for (int i = 0; i < (int)faa_nrecords; i++) { + new_buf[i].gi = faa_buf[i].gi; + strncpy (new_buf[i].type, "", sizeof (new_buf[i].type)); + strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein)); char gi_chr[25]; snprintf (gi_chr, 25, "%i", faa_buf[i].gi); e.key = gi_chr; + e.data = NULL; + + /* + * A record was not found in the cache for this gi. + */ if (hsearch_r (e, FIND, &ep, &htab) == 0) { - - gi_buf[i].gi = faa_buf[i].gi; - gi_buf[i].type[0] = '\0'; - gi_buf[i].protein[0] = '\0'; /* * Read the sequence from the database by GI. @@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id) */ if (error_returns != NULL) { - char *msg = BlastErrorToString (error_returns); + CharPtr msg = BlastErrorToString (error_returns); printf ("Warning: An error has been reported by the NCBI Toolkit " "API for sequence gi|%i: %s", faa_buf[i].gi, msg); @@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id) BUFFER_LEN); // Species Type - gi_buf[i].type[0] = target_id_buf[4]; - gi_buf[i].type[1] = '\0'; + new_buf[i].type[0] = target_id_buf[4]; + new_buf[i].type[1] = '\0'; // Protein Type - strncpy (gi_buf[i].protein, &target_id_buf[6], - sizeof (gi_buf[i].protein)); - - updates_pending = true; + strncpy (new_buf[i].protein, &target_id_buf[6], + sizeof (new_buf[i].protein)); } /* @@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id) seqalign = SeqAlignSetFree (seqalign); bsp = BioseqFree (bsp); + } // End existing entry not found. + + /* + * Hash table entry found. Keep the old value. + */ + else + { + gi_type_data* old_value = (gi_type_data*)ep->data; + new_buf[i].gi = old_value->gi; + strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type)); + strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein)); } /* * Write the data out to the file. */ - if ( (i % 1000 == 0) && (i > 0) && updates_pending) + if ( (i % 1000 == 0) && (i > 0) ) { - status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, + status = H5TBappend_records (file_id, "gi_type_data", 1000, gi_size, gi_offset, gi_sizes, - &gi_buf[i-1000]); + &new_buf[i-1000]); if (status < 0) check_h5_error (status, __FILE__, __LINE__); @@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id) if (status < 0) check_h5_error (status, __FILE__, __LINE__); - updates_pending = false; + written = i; printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); } @@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id) * Write out records from the last bin if it was less than 1000 * records in size. */ - if (updates_pending) + if ((int)faa_nrecords < 1000) { - /* - if ((int)faa_nrecords < 1000) - { - status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, - dst_size, dst_offset, dst_sizes, - dst_buf); - } - else - { - status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, - dst_size, dst_offset, dst_sizes, - &dst_buf[nrecords-1000]); - } - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); - - status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); - - updates_pending = false; - */ + status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords, + gi_size, gi_offset, gi_sizes, + new_buf); + } + + else + { + status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written, + gi_size, gi_offset, gi_sizes, + &new_buf[written]); } + + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); free (faa_buf); - free (gi_buf); + free (old_buf); + free (new_buf); hdestroy_r (&htab); options = BLASTOptionDelete (options); + readdb_destruct (seqdb); return; } |