-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/assign_protein_type.c | 135 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 67 | ||||
-rw-r--r-- | src/sequence_data.h | 16 | ||||
-rw-r--r-- | src/sequence_data_init.c | 37 | ||||
-rw-r--r-- | src/sequence_data_init.h | 14 |
7 files changed, 188 insertions, 96 deletions
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c @@ -1,12 +1,15 @@ #include "assign_protein_type.h" #include "check_ncbi_error.h" #include "check_h5_error.h" +#include "sequence_data.h" +#include "sequence_data_init.h" #include <ncbi.h> #include <readdb.h> #include <blast.h> #include <salpacc.h> #include <stdbool.h> #include <hdf5_hl.h> +#include <error.h> /* * BLAST database containing all of the influenza protein sequences. @@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id) * Get default BLAST options. */ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); + + /* + * Set to single hit mode. + */ + options->window_size = 0; + options->multiple_hits_only = false; + ValNodePtr error_returns = NULL; /* @@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id) if (status < 0) check_h5_error (status, __FILE__, __LINE__); - /* - * todo: Allocate memory of nrecords for dst_buf. - * - * todo: Refactor code to share structres in read and write HDF5 - * calls. - */ + sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); + + size_t dst_size; + size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; + size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; + hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; + sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, dst_sizes, dst_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); - for (int i = 0; i < nrecords; i++) - { - - } - /* - * Read the sequence from the database by GI. + * Assign protein types to records for which the field is empty. */ - Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); - BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); - - SeqAlignPtr seqalign = BioseqBlastEngine (bsp, - "blastp", - REFDB, - options, - NULL, - &error_returns, - NULL); - - if (error_returns != NULL) - check_ncbi_error (error_returns, __FILE__, __LINE__); - - if (seqalign != NULL) + printf ("Records to process: %i\n", (int)nrecords); + for (int i = 0; i < nrecords; i++) { - Char target_id_buf[BUFFER_LEN + 1]; - SeqIdPtr target_id = SeqAlignId (seqalign, 1); - SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); - printf ("%s\n", target_id_buf); - } - - // Clean up memory for the next ieration. - seqalign = SeqAlignSetFree (seqalign); - bsp = BioseqFree (bsp); + if (dst_buf[i].protein_type[0] == '\0') { + /* + * Read the sequence from the database by GI. + */ + Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); + BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); + + SeqAlignPtr seqalign = BioseqBlastEngine (bsp, + "blastp", + REFDB, + options, + NULL, + &error_returns, + NULL); + + /* + * BLAST reported an error. Write it out and continue processing. + */ + if (error_returns != NULL) + { + printf ("Warning: An error has been reported by the NCBI Toolkit API " + "for sequence gi|%i: %s", + dst_buf[i].gi, BlastErrorToString (error_returns)); + + } + + /* + * A hit was found. Record the first hit as the protein type. + * Skip the first 6 characters and eat the "lcl|x_". + */ + else if (seqalign != NULL) + { + Char target_id_buf[BUFFER_LEN + 1]; + SeqIdPtr target_id = SeqAlignId (seqalign, 1); + SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); + strncpy (dst_buf[i].protein_type, &target_id_buf[6], + sizeof (dst_buf[i].protein_type)); + } + + /* + * BLAST did not find any hits. + */ + else + { + printf ("Warning: Unable to identify protein type for sequence gi|%i\n", + dst_buf[i].gi); + } + + /* + * Clean up memory for the next ieration. + */ + seqalign = SeqAlignSetFree (seqalign); + bsp = BioseqFree (bsp); + + /* + * Write the data out to the file. + */ + if ( (i % 1000 == 0) && (i > 0) ) + { + status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[i-1000]); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + printf ("Processed %i of %i records.\n", i, (int)nrecords); + } + } + + } + + free (dst_buf); + options = BLASTOptionDelete (options); - + return; } |