-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/assign_protein_type.c | 135 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 67 | ||||
-rw-r--r-- | src/sequence_data.h | 16 | ||||
-rw-r--r-- | src/sequence_data_init.c | 37 | ||||
-rw-r--r-- | src/sequence_data_init.h | 14 |
7 files changed, 188 insertions, 96 deletions
@@ -14,7 +14,8 @@ configure | |||
14 | depcomp | 14 | depcomp |
15 | install-sh | 15 | install-sh |
16 | missing | 16 | missing |
17 | stamp-h1 | 17 | src/.deps |
18 | src/aggregator | 18 | src/aggregator |
19 | src/influenza.h5 | 19 | src/influenza.h5 |
20 | src/.deps | 20 | src/updator |
21 | stamp-h1 | ||
diff --git a/src/Makefile.am b/src/Makefile.am index a7e6852..6dd8e16 100644 --- a/src/Makefile.am +++ b/src/Makefile.am | |||
@@ -5,14 +5,16 @@ aggregator_SOURCES = \ | |||
5 | check_error.c \ | 5 | check_error.c \ |
6 | check_h5_error.c \ | 6 | check_h5_error.c \ |
7 | load_influenza_aa_dat.c \ | 7 | load_influenza_aa_dat.c \ |
8 | load_influenza_faa.c | 8 | load_influenza_faa.c \ |
9 | sequence_data_init.c | ||
9 | 10 | ||
10 | updator_SOURCES = \ | 11 | updator_SOURCES = \ |
11 | updator.c \ | 12 | updator.c \ |
12 | check_error.c \ | 13 | check_error.c \ |
13 | check_h5_error.c \ | 14 | check_h5_error.c \ |
14 | check_ncbi_error.c \ | 15 | check_ncbi_error.c \ |
15 | assign_protein_type.c | 16 | assign_protein_type.c \ |
17 | sequence_data_init.c | ||
16 | 18 | ||
17 | noinst_HEADERS = \ | 19 | noinst_HEADERS = \ |
18 | assign_protein_type.h \ | 20 | assign_protein_type.h \ |
@@ -20,6 +22,8 @@ noinst_HEADERS = \ | |||
20 | check_h5_error.h \ | 22 | check_h5_error.h \ |
21 | check_ncbi_error.h \ | 23 | check_ncbi_error.h \ |
22 | load_influenza_aa_dat.h \ | 24 | load_influenza_aa_dat.h \ |
23 | load_influenza_faa.h | 25 | load_influenza_faa.h \ |
26 | sequence_data.h \ | ||
27 | sequence_data_init.h | ||
24 | 28 | ||
25 | AM_CFLAGS = -Wall -std=gnu99 -ggdb | 29 | AM_CFLAGS = -Wall -std=gnu99 -ggdb |
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c | |||
@@ -1,12 +1,15 @@ | |||
1 | #include "assign_protein_type.h" | 1 | #include "assign_protein_type.h" |
2 | #include "check_ncbi_error.h" | 2 | #include "check_ncbi_error.h" |
3 | #include "check_h5_error.h" | 3 | #include "check_h5_error.h" |
4 | #include "sequence_data.h" | ||
5 | #include "sequence_data_init.h" | ||
4 | #include <ncbi.h> | 6 | #include <ncbi.h> |
5 | #include <readdb.h> | 7 | #include <readdb.h> |
6 | #include <blast.h> | 8 | #include <blast.h> |
7 | #include <salpacc.h> | 9 | #include <salpacc.h> |
8 | #include <stdbool.h> | 10 | #include <stdbool.h> |
9 | #include <hdf5_hl.h> | 11 | #include <hdf5_hl.h> |
12 | #include <error.h> | ||
10 | 13 | ||
11 | /* | 14 | /* |
12 | * BLAST database containing all of the influenza protein sequences. | 15 | * BLAST database containing all of the influenza protein sequences. |
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id) | |||
44 | * Get default BLAST options. | 47 | * Get default BLAST options. |
45 | */ | 48 | */ |
46 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); | 49 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); |
50 | |||
51 | /* | ||
52 | * Set to single hit mode. | ||
53 | */ | ||
54 | options->window_size = 0; | ||
55 | options->multiple_hits_only = false; | ||
56 | |||
47 | ValNodePtr error_returns = NULL; | 57 | ValNodePtr error_returns = NULL; |
48 | 58 | ||
49 | /* | 59 | /* |
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id) | |||
56 | if (status < 0) | 66 | if (status < 0) |
57 | check_h5_error (status, __FILE__, __LINE__); | 67 | check_h5_error (status, __FILE__, __LINE__); |
58 | 68 | ||
59 | /* | 69 | sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); |
60 | * todo: Allocate memory of nrecords for dst_buf. | 70 | |
61 | * | 71 | size_t dst_size; |
62 | * todo: Refactor code to share structres in read and write HDF5 | 72 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; |
63 | * calls. | 73 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; |
64 | */ | 74 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
75 | sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); | ||
65 | 76 | ||
66 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, | 77 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, |
67 | dst_sizes, dst_buf); | 78 | dst_sizes, dst_buf); |
68 | if (status < 0) | 79 | if (status < 0) |
69 | check_h5_error (status, __FILE__, __LINE__); | 80 | check_h5_error (status, __FILE__, __LINE__); |
70 | 81 | ||
71 | for (int i = 0; i < nrecords; i++) | ||
72 | { | ||
73 | |||
74 | } | ||
75 | |||
76 | /* | 82 | /* |
77 | * Read the sequence from the database by GI. | 83 | * Assign protein types to records for which the field is empty. |
78 | */ | 84 | */ |
79 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); | 85 | printf ("Records to process: %i\n", (int)nrecords); |
80 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | 86 | for (int i = 0; i < nrecords; i++) |
81 | |||
82 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
83 | "blastp", | ||
84 | REFDB, | ||
85 | options, | ||
86 | NULL, | ||
87 | &error_returns, | ||
88 | NULL); | ||
89 | |||
90 | if (error_returns != NULL) | ||
91 | check_ncbi_error (error_returns, __FILE__, __LINE__); | ||
92 | |||
93 | if (seqalign != NULL) | ||
94 | { | 87 | { |
95 | Char target_id_buf[BUFFER_LEN + 1]; | ||
96 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
97 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
98 | printf ("%s\n", target_id_buf); | ||
99 | } | ||
100 | |||
101 | // Clean up memory for the next ieration. | ||
102 | seqalign = SeqAlignSetFree (seqalign); | ||
103 | bsp = BioseqFree (bsp); | ||
104 | 88 | ||
89 | if (dst_buf[i].protein_type[0] == '\0') { | ||
90 | /* | ||
91 | * Read the sequence from the database by GI. | ||
92 | */ | ||
93 | Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); | ||
94 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | ||
95 | |||
96 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
97 | "blastp", | ||
98 | REFDB, | ||
99 | options, | ||
100 | NULL, | ||
101 | &error_returns, | ||
102 | NULL); | ||
103 | |||
104 | /* | ||
105 | * BLAST reported an error. Write it out and continue processing. | ||
106 | */ | ||
107 | if (error_returns != NULL) | ||
108 | { | ||
109 | printf ("Warning: An error has been reported by the NCBI Toolkit API " | ||
110 | "for sequence gi|%i: %s", | ||
111 | dst_buf[i].gi, BlastErrorToString (error_returns)); | ||
112 | |||
113 | } | ||
114 | |||
115 | /* | ||
116 | * A hit was found. Record the first hit as the protein type. | ||
117 | * Skip the first 6 characters and eat the "lcl|x_". | ||
118 | */ | ||
119 | else if (seqalign != NULL) | ||
120 | { | ||
121 | Char target_id_buf[BUFFER_LEN + 1]; | ||
122 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
123 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
124 | strncpy (dst_buf[i].protein_type, &target_id_buf[6], | ||
125 | sizeof (dst_buf[i].protein_type)); | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * BLAST did not find any hits. | ||
130 | */ | ||
131 | else | ||
132 | { | ||
133 | printf ("Warning: Unable to identify protein type for sequence gi|%i\n", | ||
134 | dst_buf[i].gi); | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * Clean up memory for the next ieration. | ||
139 | */ | ||
140 | seqalign = SeqAlignSetFree (seqalign); | ||
141 | bsp = BioseqFree (bsp); | ||
142 | |||
143 | /* | ||
144 | * Write the data out to the file. | ||
145 | */ | ||
146 | if ( (i % 1000 == 0) && (i > 0) ) | ||
147 | { | ||
148 | status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, | ||
149 | dst_size, dst_offset, dst_sizes, | ||
150 | &dst_buf[i-1000]); | ||
151 | if (status < 0) | ||
152 | check_h5_error (status, __FILE__, __LINE__); | ||
153 | |||
154 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
155 | if (status < 0) | ||
156 | check_h5_error (status, __FILE__, __LINE__); | ||
157 | |||
158 | printf ("Processed %i of %i records.\n", i, (int)nrecords); | ||
159 | } | ||
160 | } | ||
161 | |||
162 | } | ||
163 | |||
164 | free (dst_buf); | ||
165 | |||
105 | options = BLASTOptionDelete (options); | 166 | options = BLASTOptionDelete (options); |
106 | 167 | ||
107 | return; | 168 | return; |
108 | } | 169 | } |
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c index 749b7ad..fd35254 100644 --- a/src/load_influenza_faa.c +++ b/src/load_influenza_faa.c | |||
@@ -1,62 +1,22 @@ | |||
1 | #include "load_influenza_faa.h" | ||
2 | #include "check_error.h" | 1 | #include "check_error.h" |
3 | #include "check_h5_error.h" | 2 | #include "check_h5_error.h" |
4 | #include "hdf5_hl.h" | 3 | #include "load_influenza_faa.h" |
4 | #include "sequence_data.h" | ||
5 | #include "sequence_data_init.h" | ||
6 | #include <hdf5_hl.h> | ||
5 | #include <string.h> | 7 | #include <string.h> |
6 | #include <stdlib.h> | 8 | #include <stdlib.h> |
7 | 9 | ||
8 | #define SEQUENCE_DATA_FIELD_NUM 4 | ||
9 | |||
10 | void | 10 | void |
11 | load_influenza_faa (hid_t file_id) | 11 | load_influenza_faa (hid_t file_id) |
12 | { | 12 | { |
13 | typedef struct | 13 | size_t dst_size; |
14 | { | 14 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; |
15 | int gi; | 15 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; |
16 | char gb[9]; | ||
17 | char description[196]; | ||
18 | char protein_type[7]; | ||
19 | } sequence_data; | ||
20 | |||
21 | size_t dst_size = sizeof (sequence_data); | ||
22 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] = | ||
23 | { HOFFSET (sequence_data, gi), | ||
24 | HOFFSET (sequence_data, gb), | ||
25 | HOFFSET (sequence_data, description), | ||
26 | HOFFSET (sequence_data, protein_type) | ||
27 | }; | ||
28 | |||
29 | sequence_data dst_buf[1]; | ||
30 | |||
31 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = { | ||
32 | sizeof (dst_buf[0].gi), | ||
33 | sizeof (dst_buf[0].gb), | ||
34 | sizeof (dst_buf[0].description), | ||
35 | sizeof (dst_buf[0].protein_type) | ||
36 | }; | ||
37 | |||
38 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; | 16 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
39 | 17 | ||
40 | field_type[0] = H5T_NATIVE_INT; | 18 | sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); |
41 | 19 | ||
42 | hid_t gb_type = H5Tcopy (H5T_C_S1); | ||
43 | H5Tset_size (gb_type, 9); | ||
44 | field_type[1] = gb_type; | ||
45 | |||
46 | hid_t description_type = H5Tcopy (H5T_C_S1); | ||
47 | H5Tset_size (description_type, 196); | ||
48 | field_type[2] = description_type; | ||
49 | |||
50 | hid_t protein_type_type = H5Tcopy (H5T_C_S1); | ||
51 | H5Tset_size (protein_type_type, 7); | ||
52 | field_type[3] = protein_type_type; | ||
53 | |||
54 | const char *field_names[SEQUENCE_DATA_FIELD_NUM] = | ||
55 | { "GI", | ||
56 | "GB", | ||
57 | "Description", | ||
58 | "Protein Type" }; | ||
59 | |||
60 | hsize_t chunk_size = 10; | 20 | hsize_t chunk_size = 10; |
61 | int *fill_data = NULL; | 21 | int *fill_data = NULL; |
62 | int compress = 0; | 22 | int compress = 0; |
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id) | |||
99 | 59 | ||
100 | strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); | 60 | strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); |
101 | 61 | ||
62 | const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] = | ||
63 | SEQUENCE_DATA_FIELD_NAMES; | ||
64 | |||
102 | if (current_line == 1) | 65 | if (current_line == 1) |
103 | { | 66 | { |
104 | herr_t status = H5TBmake_table ("influenza.faa", file_id, | 67 | herr_t status = H5TBmake_table ("influenza.faa", file_id, |
105 | "influenza.faa", | 68 | "influenza.faa", |
106 | SEQUENCE_DATA_FIELD_NUM, 1, | 69 | SEQUENCE_DATA_FIELD_NUM, 1, |
107 | dst_size, field_names, | 70 | dst_size, sequence_data_field_names, |
108 | dst_offset, field_type, | 71 | dst_offset, field_type, |
109 | chunk_size, fill_data, compress, | 72 | chunk_size, fill_data, compress, |
110 | &p_data); | 73 | &p_data); |
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id) | |||
132 | 95 | ||
133 | fclose (dat); | 96 | fclose (dat); |
134 | 97 | ||
135 | H5Tclose (gb_type); | ||
136 | H5Tclose (description_type); | ||
137 | H5Tclose (protein_type_type); | ||
138 | |||
139 | return; | 98 | return; |
140 | } | 99 | } |
diff --git a/src/sequence_data.h b/src/sequence_data.h new file mode 100644 index 0000000..fb4ff78 --- a/dev/null +++ b/src/sequence_data.h | |||
@@ -0,0 +1,16 @@ | |||
1 | #ifndef SEQUENCE_DATA_H | ||
2 | #define SEQUENCE_DATA_H | ||
3 | |||
4 | #define SEQUENCE_DATA_FIELD_NUM 4 | ||
5 | |||
6 | #define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" } | ||
7 | |||
8 | typedef struct | ||
9 | { | ||
10 | int gi; | ||
11 | char gb[9]; | ||
12 | char description[196]; | ||
13 | char protein_type[7]; | ||
14 | } sequence_data; | ||
15 | |||
16 | #endif // SEQUENCE_DATA_H | ||
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c new file mode 100644 index 0000000..09ba189 --- a/dev/null +++ b/src/sequence_data_init.c | |||
@@ -0,0 +1,37 @@ | |||
1 | #include "sequence_data_init.h" | ||
2 | #include "sequence_data.h" | ||
3 | |||
4 | void | ||
5 | sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, | ||
6 | hid_t *field_type) | ||
7 | { | ||
8 | *dst_size = sizeof (sequence_data); | ||
9 | |||
10 | dst_offset[0] = HOFFSET (sequence_data, gi); | ||
11 | dst_offset[1] = HOFFSET (sequence_data, gb); | ||
12 | dst_offset[2] = HOFFSET (sequence_data, description); | ||
13 | dst_offset[3] = HOFFSET (sequence_data, protein_type); | ||
14 | |||
15 | sequence_data dst_buf[1]; | ||
16 | |||
17 | dst_sizes[0] = sizeof (dst_buf[0].gi); | ||
18 | dst_sizes[1] = sizeof (dst_buf[0].gb); | ||
19 | dst_sizes[2] = sizeof (dst_buf[0].description); | ||
20 | dst_sizes[3] = sizeof (dst_buf[0].protein_type); | ||
21 | |||
22 | field_type[0] = H5T_NATIVE_INT; | ||
23 | |||
24 | hid_t gb_type = H5Tcopy (H5T_C_S1); | ||
25 | H5Tset_size (gb_type, 9); | ||
26 | field_type[1] = gb_type; | ||
27 | |||
28 | hid_t description_type = H5Tcopy (H5T_C_S1); | ||
29 | H5Tset_size (description_type, 196); | ||
30 | field_type[2] = description_type; | ||
31 | |||
32 | hid_t protein_type_type = H5Tcopy (H5T_C_S1); | ||
33 | H5Tset_size (protein_type_type, 7); | ||
34 | field_type[3] = protein_type_type; | ||
35 | |||
36 | return; | ||
37 | } | ||
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h new file mode 100644 index 0000000..c87e7e6 --- a/dev/null +++ b/src/sequence_data_init.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef SEQUENCE_DATA_INIT_H | ||
2 | #define SEQUENCE_DATA_INIT_H | ||
3 | |||
4 | #include <hdf5.h> | ||
5 | |||
6 | /* | ||
7 | * Initialize the structures describing sequence_data. These | ||
8 | * descriptive structures are used by the HDF5 API. | ||
9 | */ | ||
10 | void | ||
11 | sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, | ||
12 | hid_t *field_type); | ||
13 | |||
14 | #endif // SEQUENCE_DATA_INIT_H | ||