-rw-r--r-- | analysis/year.R | 20 | ||||
-rw-r--r-- | src/Makefile.am | 39 | ||||
-rw-r--r-- | src/aggregator.c | 6 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c) | 87 | ||||
-rw-r--r-- | src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h) | 0 | ||||
-rw-r--r-- | src/error/check_error.c (renamed from src/check_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_error.h (renamed from src/check_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.c (renamed from src/check_h5_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.h (renamed from src/check_h5_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c) | 4 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c) | 10 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h) | 0 | ||||
-rw-r--r-- | src/model/gi_type_data.h | 21 | ||||
-rw-r--r-- | src/model/gi_type_data_init.c | 36 | ||||
-rw-r--r-- | src/model/gi_type_data_init.h | 14 | ||||
-rw-r--r-- | src/model/sequence_data.h (renamed from src/sequence_data.h) | 5 | ||||
-rw-r--r-- | src/model/sequence_data_init.c (renamed from src/sequence_data_init.c) | 6 | ||||
-rw-r--r-- | src/model/sequence_data_init.h (renamed from src/sequence_data_init.h) | 0 | ||||
-rw-r--r-- | src/updator.c | 4 |
22 files changed, 181 insertions, 71 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c new file mode 100644 index 0000000..0a11c23 --- a/dev/null +++ b/src/assign/assign_protein_type.c @@ -0,0 +1,253 @@ +#include "assign_protein_type.h" +#include "error/check_ncbi_error.h" +#include "error/check_h5_error.h" +#include "model/gi_type_data.h" +#include "model/gi_type_data_init.h" +#include "model/sequence_data.h" +#include "model/sequence_data_init.h" +#include <ncbi.h> +#include <readdb.h> +#include <blast.h> +#include <salpacc.h> +#include <stdbool.h> +#include <hdf5_hl.h> +#include <error.h> + +/* + * todo: Write results to an independent table by GI so that they can + * be reused when the input tables influenza.faa and influenza_aa.dat + * are replaced with new data. Since GI values are unique and + * persistent the assignment of a protein type should stay valid + * unless the list of reference sequences is updated. + * + * todo: Save the species type as a new field { A, B, C }. + */ + +/* + * BLAST database containing all of the influenza protein sequences. + */ +#define SEQDB "/home/don/exp004/influenzadb/influenzadb" + +/* + * BLAST reference database of prototypical protein types. + */ +#define REFDB "/home/don/exp004/influenzadb/proteinnames" + +#define BUFFER_LEN 50 + +void +assign_protein_type (hid_t file_id) +{ + /* + * Iterate through the records for which no protein type has been + * assigned. Create a BioSeq Pointer to the data and then use this + * as input to the BLAST run against the reference database of + * prototypical sequence records. + */ + + /* + * Make BLAST messages reportable. + */ + ErrSetMessageLevel (SEV_WARNING); + + /* + * Open the BLAST sequence database. + */ + ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); + + /* + * Get default BLAST options. + */ + BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); + + /* + * Set to single hit mode. + */ + options->window_size = 0; + options->multiple_hits_only = false; + + ValNodePtr error_returns = NULL; + + /* + * Read the data from HDF5 influenza.faa. + */ + hsize_t faa_nfields; + hsize_t faa_nrecords; + herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields, + &faa_nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords); + + size_t faa_size; + size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; + size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM]; + hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM]; + sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type); + + status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset, + faa_sizes, faa_buf); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + /* + * Read the data from HDF5 gi_type_data. + */ + hsize_t gi_nfields; + hsize_t gi_nrecords; + status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, + &gi_nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); + + size_t gi_size; + size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; + size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; + hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; + gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); + + status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, + gi_sizes, gi_buf); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + /* + * Assign protein types to records for which the field is empty. + */ + printf ("Records to process: %i\n", (int)faa_nrecords); + bool updates_pending = false; + for (int i = 0; i < faa_nrecords; i++) + { + + if (dst_buf[i].protein_type[0] == '\0') { + /* + * Read the sequence from the database by GI. + */ + Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); + BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); + if (bsp == NULL) + { + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Unable to find BLAST record for gi|%i. Ensure the BLAST " + "database is up-to-date with the HDF5 record set. See the " + "BLAST formatdb.log file for details.\n", + dst_buf[i].gi); + } + + SeqAlignPtr seqalign = BioseqBlastEngine (bsp, + "blastp", + REFDB, + options, + NULL, + &error_returns, + NULL); + + /* + * BLAST reported an error. Write it out and continue processing. + */ + if (error_returns != NULL) + { + printf ("Warning: An error has been reported by the NCBI Toolkit API " + "for sequence gi|%i: %s", + dst_buf[i].gi, BlastErrorToString (error_returns)); + + } + + /* + * A hit was found. Record the first hit as the protein type. + * Skip the first 6 characters and eat the "lcl|x_". + */ + else if (seqalign != NULL) + { + Char target_id_buf[BUFFER_LEN + 1]; + SeqIdPtr target_id = SeqAlignId (seqalign, 1); + SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); + + // Species Type + dst_buf[i].type[0] = &target_id_buf[4]; + dst_buf[i].type[1] = '\0'; + + // Protein Type + strncpy (dst_buf[i].protein, &target_id_buf[6], + sizeof (dst_buf[i].protein)); + + updates_pending = true; + } + + /* + * BLAST did not find any hits. + */ + else + { + printf ("Warning: Unable to identify protein type for sequence gi|%i\n", + dst_buf[i].gi); + } + + /* + * Clean up memory for the next ieration. + */ + seqalign = SeqAlignSetFree (seqalign); + bsp = BioseqFree (bsp); + + } + + /* + * Write the data out to the file. + */ + if ( (i % 1000 == 0) && (i > 0) && updates_pending) + { + status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[i-1000]); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + updates_pending = false; + + printf ("Processed %i of %i records.\n", i, (int)nrecords); + } + + } + + /* + * Write out records from the last bin if it was less than 1000 + * records in size. + */ + if (updates_pending) + { + if ((int)nrecords < 1000) + { + status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, + dst_size, dst_offset, dst_sizes, + dst_buf); + } + else + { + status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[nrecords-1000]); + } + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + updates_pending = false; + } + + free (faa_buf); + free (gi_buf); + + options = BLASTOptionDelete (options); + + return; +} |