-rw-r--r-- | analysis/year.R | 20 | ||||
-rw-r--r-- | src/Makefile.am | 39 | ||||
-rw-r--r-- | src/aggregator.c | 6 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c) | 87 | ||||
-rw-r--r-- | src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h) | 0 | ||||
-rw-r--r-- | src/error/check_error.c (renamed from src/check_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_error.h (renamed from src/check_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.c (renamed from src/check_h5_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.h (renamed from src/check_h5_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c) | 4 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c) | 10 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h) | 0 | ||||
-rw-r--r-- | src/model/gi_type_data.h | 21 | ||||
-rw-r--r-- | src/model/gi_type_data_init.c | 36 | ||||
-rw-r--r-- | src/model/gi_type_data_init.h | 14 | ||||
-rw-r--r-- | src/model/sequence_data.h (renamed from src/sequence_data.h) | 5 | ||||
-rw-r--r-- | src/model/sequence_data_init.c (renamed from src/sequence_data_init.c) | 6 | ||||
-rw-r--r-- | src/model/sequence_data_init.h (renamed from src/sequence_data_init.h) | 0 | ||||
-rw-r--r-- | src/updator.c | 4 |
22 files changed, 181 insertions, 71 deletions
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c new file mode 100644 index 0000000..8bf47aa --- a/dev/null +++ b/src/load/load_influenza_aa_dat.c @@ -0,0 +1,254 @@ +/* + * Load the influnza_aa.dat tab delimited text file into an HDF5 + * binary table. + * + * todo: Handle NULL values occuring in numeric fields. + */ + +#include "load_influenza_aa_dat.h" +#include "error/check_error.h" +#include "error/check_h5_error.h" +#include <hdf5_hl.h> +#include <string.h> +#include <stdlib.h> + +#define NFIELDS (hsize_t) 11 +#define TABLE_NAME "influenza_aa.dat" + +void +load_influenza_aa_dat (hid_t file_id) +{ + /* + * Model the data using native types. + */ + typedef struct + { + char genbank_accession_number[9]; + char host[15]; + int genome_segment_number; + char subtype[7]; + char country[25]; + int year; + int sequence_length; + char virus_name[196]; + char age[17]; + char gender[6]; + char full_length_indicator[4]; + } supplementary_data; + + /* + * Use an HDF5 Table for storage. + * http://www.hdfgroup.org/HDF5/Tutor/h5table.html + */ + + /* + * "Calculate the size and the offsets of our struct members in + * memory." + */ + size_t dst_size = sizeof (supplementary_data); + size_t dst_offset[NFIELDS] = + { HOFFSET (supplementary_data, genbank_accession_number), + HOFFSET (supplementary_data, host), + HOFFSET (supplementary_data, genome_segment_number), + HOFFSET (supplementary_data, subtype), + HOFFSET (supplementary_data, country), + HOFFSET (supplementary_data, year), + HOFFSET (supplementary_data, sequence_length), + HOFFSET (supplementary_data, virus_name), + HOFFSET (supplementary_data, age), + HOFFSET (supplementary_data, gender), + HOFFSET (supplementary_data, full_length_indicator) + }; + + supplementary_data dst_buf[1]; + + size_t dst_sizes[NFIELDS] = { sizeof (dst_buf[0].genbank_accession_number), + sizeof (dst_buf[0].host), + sizeof (dst_buf[0].genome_segment_number), + sizeof (dst_buf[0].subtype), + sizeof (dst_buf[0].country), + sizeof (dst_buf[0].year), + sizeof (dst_buf[0].sequence_length), + sizeof (dst_buf[0].virus_name), + sizeof (dst_buf[0].age), + sizeof (dst_buf[0].gender), + sizeof (dst_buf[0].full_length_indicator) + }; + + /* + * Map the native types to HDF5 types for each field. + */ + hid_t field_type[NFIELDS]; + + hid_t genbank_accession_number_type = H5Tcopy (H5T_C_S1); + H5Tset_size (genbank_accession_number_type, 9); + field_type[0] = genbank_accession_number_type; + + hid_t host_type = H5Tcopy (H5T_C_S1); + H5Tset_size (host_type, 15); + field_type[1] = host_type; + + field_type[2] = H5T_NATIVE_INT; + + hid_t subtype_type = H5Tcopy (H5T_C_S1); + H5Tset_size (subtype_type, 7); + field_type[3] = subtype_type; + + hid_t country_type = H5Tcopy (H5T_C_S1); + H5Tset_size (country_type, 25); + field_type[4] = country_type; + + field_type[5] = H5T_NATIVE_INT; + + field_type[6] = H5T_NATIVE_INT; + + hid_t virus_name_type = H5Tcopy (H5T_C_S1); + H5Tset_size (virus_name_type, 196); + field_type[7] = virus_name_type; + + hid_t age_type = H5Tcopy (H5T_C_S1); + H5Tset_size (age_type, 17); + field_type[8] = age_type; + + hid_t gender_type = H5Tcopy (H5T_C_S1); + H5Tset_size (gender_type, 6); + field_type[9] = gender_type; + + hid_t full_length_indicator_type = H5Tcopy (H5T_C_S1); + H5Tset_size (full_length_indicator_type, 4); + field_type[10] = full_length_indicator_type; + + /* + * Labels used for the fields in the table. + */ + const char *field_names[NFIELDS] = { "GenBank accession number", + "Host", + "Genome segment number", + "Subtype", + "Country", + "Year", + "Sequence length", + "Virus name", + "Age", + "Gender", + "Full-length Indicator" + }; + + /* + * Table storage options. + */ + hsize_t chunk_size = 10; + int *fill_data = NULL; + int compress = 0; + + /* + * Insert the records. + */ + supplementary_data p_data; + FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", + "r"); + if (dat == NULL) + check_error (__FILE__, __LINE__); + char *line = NULL; + size_t len = 0; + int current_line = 0; + + while (getline (&line, &len, dat) != -1) + { + + current_line++; + char *running = strdup (line); + char *token = NULL; + + /* + * Parse the line, handling the case of empty fields represented + * by sequential delimiters. + */ + strncpy (p_data.genbank_accession_number, strsep (&running, "\t"), + sizeof (p_data.genbank_accession_number)); + + strncpy (p_data.host, strsep (&running, "\t"), sizeof (p_data.host)); + + token = strsep (&running, "\t"); + if (strcmp (token, "\0") == 0) + p_data.genome_segment_number = 0; + else + p_data.genome_segment_number = atoi (token); + + strncpy (p_data.subtype, strsep (&running, "\t"), + sizeof (p_data.subtype)); + + strncpy (p_data.country, strsep (&running, "\t"), + sizeof (p_data.country)); + + /* + * Convert the year field from text to numeric. Unknown and empty + * values are assigned a numeric value of zero. + */ + token = strsep (&running, "\t"); + if (strcmp (token, "\0") == 0) + p_data.year = 0; + else if (strcmp (token, "unknown") == 0) + p_data.year = 0; + else if (strcmp (token, "NON") == 0) + p_data.year = 0; + else + p_data.year = atoi (token); + + token = strsep (&running, "\t"); + if (strcmp (token, "\0") == 0) + p_data.sequence_length = 0; + else + p_data.sequence_length = atoi (token); + + strncpy (p_data.virus_name, strsep (&running, "\t"), + sizeof (p_data.virus_name)); + + strncpy (p_data.age, strsep (&running, "\t"), sizeof (p_data.age)); + + strncpy (p_data.gender, strsep (&running, "\t"), + sizeof (p_data.gender)); + + strncpy (p_data.full_length_indicator, strsep (&running, "\t"), + sizeof (p_data.full_length_indicator)); + + if (current_line == 1) + { + herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, + TABLE_NAME, NFIELDS, 1, dst_size, + field_names, dst_offset, field_type, + chunk_size, fill_data, compress, + &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } + else + { + herr_t status = + H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, + dst_offset, dst_sizes, &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } + + if (running) + free (running); + + } + + if (line) + free (line); + + fclose (dat); + + H5Tclose (genbank_accession_number_type); + H5Tclose (host_type); + H5Tclose (subtype_type); + H5Tclose (country_type); + H5Tclose (virus_name_type); + H5Tclose (age_type); + H5Tclose (gender_type); + H5Tclose (full_length_indicator_type); + + return; +} |