-rw-r--r-- | analysis/year.R | 20 | ||||
-rw-r--r-- | src/Makefile.am | 39 | ||||
-rw-r--r-- | src/aggregator.c | 6 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c) | 87 | ||||
-rw-r--r-- | src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h) | 0 | ||||
-rw-r--r-- | src/error/check_error.c (renamed from src/check_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_error.h (renamed from src/check_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.c (renamed from src/check_h5_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_h5_error.h (renamed from src/check_h5_error.h) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c) | 0 | ||||
-rw-r--r-- | src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c) | 4 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h) | 0 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c) | 10 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h) | 0 | ||||
-rw-r--r-- | src/model/gi_type_data.h | 21 | ||||
-rw-r--r-- | src/model/gi_type_data_init.c | 36 | ||||
-rw-r--r-- | src/model/gi_type_data_init.h | 14 | ||||
-rw-r--r-- | src/model/sequence_data.h (renamed from src/sequence_data.h) | 5 | ||||
-rw-r--r-- | src/model/sequence_data_init.c (renamed from src/sequence_data_init.c) | 6 | ||||
-rw-r--r-- | src/model/sequence_data_init.h (renamed from src/sequence_data_init.h) | 0 | ||||
-rw-r--r-- | src/updator.c | 4 |
22 files changed, 181 insertions, 71 deletions
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c new file mode 100644 index 0000000..8bf47aa --- a/dev/null +++ b/src/load/load_influenza_aa_dat.c | |||
@@ -0,0 +1,254 @@ | |||
1 | /* | ||
2 | * Load the influenza_aa.dat tab delimited text file into an HDF5 | ||
3 | * binary table. | ||
4 | * | ||
5 | * todo: Handle NULL values occurring in numeric fields. | ||
6 | */ | ||
7 | |||
8 | #include "load_influenza_aa_dat.h" | ||
9 | #include "error/check_error.h" | ||
10 | #include "error/check_h5_error.h" | ||
11 | #include <hdf5_hl.h> | ||
12 | #include <string.h> | ||
13 | #include <stdlib.h> | ||
14 | |||
/*
 * Number of columns in the table.  The expansion is fully
 * parenthesized so the cast cannot bind to neighbouring tokens at the
 * point of use.
 */
#define NFIELDS ((hsize_t) 11)
/* Name of the table dataset passed to the H5TB calls. */
#define TABLE_NAME "influenza_aa.dat"
17 | |||
18 | void | ||
19 | load_influenza_aa_dat (hid_t file_id) | ||
20 | { | ||
21 | /* | ||
22 | * Model the data using native types. | ||
23 | */ | ||
24 | typedef struct | ||
25 | { | ||
26 | char genbank_accession_number[9]; | ||
27 | char host[15]; | ||
28 | int genome_segment_number; | ||
29 | char subtype[7]; | ||
30 | char country[25]; | ||
31 | int year; | ||
32 | int sequence_length; | ||
33 | char virus_name[196]; | ||
34 | char age[17]; | ||
35 | char gender[6]; | ||
36 | char full_length_indicator[4]; | ||
37 | } supplementary_data; | ||
38 | |||
39 | /* | ||
40 | * Use an HDF5 Table for storage. | ||
41 | * http://www.hdfgroup.org/HDF5/Tutor/h5table.html | ||
42 | */ | ||
43 | |||
44 | /* | ||
45 | * "Calculate the size and the offsets of our struct members in | ||
46 | * memory." | ||
47 | */ | ||
48 | size_t dst_size = sizeof (supplementary_data); | ||
49 | size_t dst_offset[NFIELDS] = | ||
50 | { HOFFSET (supplementary_data, genbank_accession_number), | ||
51 | HOFFSET (supplementary_data, host), | ||
52 | HOFFSET (supplementary_data, genome_segment_number), | ||
53 | HOFFSET (supplementary_data, subtype), | ||
54 | HOFFSET (supplementary_data, country), | ||
55 | HOFFSET (supplementary_data, year), | ||
56 | HOFFSET (supplementary_data, sequence_length), | ||
57 | HOFFSET (supplementary_data, virus_name), | ||
58 | HOFFSET (supplementary_data, age), | ||
59 | HOFFSET (supplementary_data, gender), | ||
60 | HOFFSET (supplementary_data, full_length_indicator) | ||
61 | }; | ||
62 | |||
63 | supplementary_data dst_buf[1]; | ||
64 | |||
65 | size_t dst_sizes[NFIELDS] = { sizeof (dst_buf[0].genbank_accession_number), | ||
66 | sizeof (dst_buf[0].host), | ||
67 | sizeof (dst_buf[0].genome_segment_number), | ||
68 | sizeof (dst_buf[0].subtype), | ||
69 | sizeof (dst_buf[0].country), | ||
70 | sizeof (dst_buf[0].year), | ||
71 | sizeof (dst_buf[0].sequence_length), | ||
72 | sizeof (dst_buf[0].virus_name), | ||
73 | sizeof (dst_buf[0].age), | ||
74 | sizeof (dst_buf[0].gender), | ||
75 | sizeof (dst_buf[0].full_length_indicator) | ||
76 | }; | ||
77 | |||
78 | /* | ||
79 | * Map the native types to HDF5 types for each field. | ||
80 | */ | ||
81 | hid_t field_type[NFIELDS]; | ||
82 | |||
83 | hid_t genbank_accession_number_type = H5Tcopy (H5T_C_S1); | ||
84 | H5Tset_size (genbank_accession_number_type, 9); | ||
85 | field_type[0] = genbank_accession_number_type; | ||
86 | |||
87 | hid_t host_type = H5Tcopy (H5T_C_S1); | ||
88 | H5Tset_size (host_type, 15); | ||
89 | field_type[1] = host_type; | ||
90 | |||
91 | field_type[2] = H5T_NATIVE_INT; | ||
92 | |||
93 | hid_t subtype_type = H5Tcopy (H5T_C_S1); | ||
94 | H5Tset_size (subtype_type, 7); | ||
95 | field_type[3] = subtype_type; | ||
96 | |||
97 | hid_t country_type = H5Tcopy (H5T_C_S1); | ||
98 | H5Tset_size (country_type, 25); | ||
99 | field_type[4] = country_type; | ||
100 | |||
101 | field_type[5] = H5T_NATIVE_INT; | ||
102 | |||
103 | field_type[6] = H5T_NATIVE_INT; | ||
104 | |||
105 | hid_t virus_name_type = H5Tcopy (H5T_C_S1); | ||
106 | H5Tset_size (virus_name_type, 196); | ||
107 | field_type[7] = virus_name_type; | ||
108 | |||
109 | hid_t age_type = H5Tcopy (H5T_C_S1); | ||
110 | H5Tset_size (age_type, 17); | ||
111 | field_type[8] = age_type; | ||
112 | |||
113 | hid_t gender_type = H5Tcopy (H5T_C_S1); | ||
114 | H5Tset_size (gender_type, 6); | ||
115 | field_type[9] = gender_type; | ||
116 | |||
117 | hid_t full_length_indicator_type = H5Tcopy (H5T_C_S1); | ||
118 | H5Tset_size (full_length_indicator_type, 4); | ||
119 | field_type[10] = full_length_indicator_type; | ||
120 | |||
121 | /* | ||
122 | * Labels used for the fields in the table. | ||
123 | */ | ||
124 | const char *field_names[NFIELDS] = { "GenBank accession number", | ||
125 | "Host", | ||
126 | "Genome segment number", | ||
127 | "Subtype", | ||
128 | "Country", | ||
129 | "Year", | ||
130 | "Sequence length", | ||
131 | "Virus name", | ||
132 | "Age", | ||
133 | "Gender", | ||
134 | "Full-length Indicator" | ||
135 | }; | ||
136 | |||
137 | /* | ||
138 | * Table storage options. | ||
139 | */ | ||
140 | hsize_t chunk_size = 10; | ||
141 | int *fill_data = NULL; | ||
142 | int compress = 0; | ||
143 | |||
144 | /* | ||
145 | * Insert the records. | ||
146 | */ | ||
147 | supplementary_data p_data; | ||
148 | FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", | ||
149 | "r"); | ||
150 | if (dat == NULL) | ||
151 | check_error (__FILE__, __LINE__); | ||
152 | char *line = NULL; | ||
153 | size_t len = 0; | ||
154 | int current_line = 0; | ||
155 | |||
156 | while (getline (&line, &len, dat) != -1) | ||
157 | { | ||
158 | |||
159 | current_line++; | ||
160 | char *running = strdup (line); | ||
161 | char *token = NULL; | ||
162 | |||
163 | /* | ||
164 | * Parse the line, handling the case of empty fields represented | ||
165 | * by sequential delimiters. | ||
166 | */ | ||
167 | strncpy (p_data.genbank_accession_number, strsep (&running, "\t"), | ||
168 | sizeof (p_data.genbank_accession_number)); | ||
169 | |||
170 | strncpy (p_data.host, strsep (&running, "\t"), sizeof (p_data.host)); | ||
171 | |||
172 | token = strsep (&running, "\t"); | ||
173 | if (strcmp (token, "\0") == 0) | ||
174 | p_data.genome_segment_number = 0; | ||
175 | else | ||
176 | p_data.genome_segment_number = atoi (token); | ||
177 | |||
178 | strncpy (p_data.subtype, strsep (&running, "\t"), | ||
179 | sizeof (p_data.subtype)); | ||
180 | |||
181 | strncpy (p_data.country, strsep (&running, "\t"), | ||
182 | sizeof (p_data.country)); | ||
183 | |||
184 | /* | ||
185 | * Convert the year field from text to numeric. Unknown and empty | ||
186 | * values are assigned a numeric value of zero. | ||
187 | */ | ||
188 | token = strsep (&running, "\t"); | ||
189 | if (strcmp (token, "\0") == 0) | ||
190 | p_data.year = 0; | ||
191 | else if (strcmp (token, "unknown") == 0) | ||
192 | p_data.year = 0; | ||
193 | else if (strcmp (token, "NON") == 0) | ||
194 | p_data.year = 0; | ||
195 | else | ||
196 | p_data.year = atoi (token); | ||
197 | |||
198 | token = strsep (&running, "\t"); | ||
199 | if (strcmp (token, "\0") == 0) | ||
200 | p_data.sequence_length = 0; | ||
201 | else | ||
202 | p_data.sequence_length = atoi (token); | ||
203 | |||
204 | strncpy (p_data.virus_name, strsep (&running, "\t"), | ||
205 | sizeof (p_data.virus_name)); | ||
206 | |||
207 | strncpy (p_data.age, strsep (&running, "\t"), sizeof (p_data.age)); | ||
208 | |||
209 | strncpy (p_data.gender, strsep (&running, "\t"), | ||
210 | sizeof (p_data.gender)); | ||
211 | |||
212 | strncpy (p_data.full_length_indicator, strsep (&running, "\t"), | ||
213 | sizeof (p_data.full_length_indicator)); | ||
214 | |||
215 | if (current_line == 1) | ||
216 | { | ||
217 | herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, | ||
218 | TABLE_NAME, NFIELDS, 1, dst_size, | ||
219 | field_names, dst_offset, field_type, | ||
220 | chunk_size, fill_data, compress, | ||
221 | &p_data); | ||
222 | if (status < 0) | ||
223 | check_h5_error (status, __FILE__, __LINE__); | ||
224 | } | ||
225 | else | ||
226 | { | ||
227 | herr_t status = | ||
228 | H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, | ||
229 | dst_offset, dst_sizes, &p_data); | ||
230 | if (status < 0) | ||
231 | check_h5_error (status, __FILE__, __LINE__); | ||
232 | } | ||
233 | |||
234 | if (running) | ||
235 | free (running); | ||
236 | |||
237 | } | ||
238 | |||
239 | if (line) | ||
240 | free (line); | ||
241 | |||
242 | fclose (dat); | ||
243 | |||
244 | H5Tclose (genbank_accession_number_type); | ||
245 | H5Tclose (host_type); | ||
246 | H5Tclose (subtype_type); | ||
247 | H5Tclose (country_type); | ||
248 | H5Tclose (virus_name_type); | ||
249 | H5Tclose (age_type); | ||
250 | H5Tclose (gender_type); | ||
251 | H5Tclose (full_length_indicator_type); | ||
252 | |||
253 | return; | ||
254 | } | ||