author | Don Pellegrino <don@drexel.edu> | 2010-01-17 23:44:57 (GMT) |
---|---|---|
committer | Don Pellegrino <don@drexel.edu> | 2010-01-17 23:44:57 (GMT) |
commit | b2e4ace6b0265f4575733bfaa38519167074c477 (patch) (unidiff) | |
tree | 5d7dc32c19e6a33ac45dc1fc9b702919a4c7721b | |
parent | 22bc29abeb6c0c3354bbd857cac53af14153bb2f (diff) | |
download | exp007-b2e4ace6b0265f4575733bfaa38519167074c477.zip exp007-b2e4ace6b0265f4575733bfaa38519167074c477.tar.gz exp007-b2e4ace6b0265f4575733bfaa38519167074c477.tar.bz2 |
Added error checking routines. Added implementation of
assign_protein_type tested against a single hard-coded sequence input.
The next step is to iterate over all records in the HDF5 collection
that don't have a type assigned and to assign a type value.
-rw-r--r-- | configure.ac | 20 | ||||
-rw-r--r-- | src/Makefile.am | 6 | ||||
-rw-r--r-- | src/aggregator.c | 14 | ||||
-rw-r--r-- | src/assign_protein_type.c | 72 | ||||
-rw-r--r-- | src/assign_protein_type.h | 6 | ||||
-rw-r--r-- | src/check_error.c | 14 | ||||
-rw-r--r-- | src/check_error.h | 11 | ||||
-rw-r--r-- | src/check_h5_error.c | 12 | ||||
-rw-r--r-- | src/check_h5_error.h | 12 | ||||
-rw-r--r-- | src/check_ncbi_error.c | 13 | ||||
-rw-r--r-- | src/check_ncbi_error.h | 13 | ||||
-rw-r--r-- | src/load_influenza_aa_dat.c | 31 |
12 files changed, 211 insertions, 13 deletions
diff --git a/configure.ac b/configure.ac index 3104df8..f45958c 100644 --- a/configure.ac +++ b/configure.ac | |||
@@ -24,4 +24,24 @@ AC_CHECK_HEADERS([mpi.h],[], | |||
24 | AC_CHECK_HEADERS([hdf5.h],[], | 24 | AC_CHECK_HEADERS([hdf5.h],[], |
25 | [AC_MSG_ERROR(The HDF5 headers are needed to build the system.)]) | 25 | [AC_MSG_ERROR(The HDF5 headers are needed to build the system.)]) |
26 | 26 | ||
27 | ######################## | ||
28 | # MODULE: NCBI Toolkit # | ||
29 | ######################## | ||
30 | |||
31 | # Check for the NCBI ToolBox libraries. | ||
32 | AC_SEARCH_LIBS([log10],[m],[], | ||
33 | [AC_MSG_ERROR(The C Math Library is needed to build the system.)]) | ||
34 | |||
35 | AC_SEARCH_LIBS([NlmThreadsAvailable],[ncbi],[], | ||
36 | [AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) | ||
37 | |||
38 | AC_SEARCH_LIBS([SeqAlignNew],[ncbiobj],[], | ||
39 | [AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) | ||
40 | |||
41 | AC_SEARCH_LIBS([Blast_RedoOneMatch],[blastcompadj],[], | ||
42 | [AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) | ||
43 | |||
44 | AC_SEARCH_LIBS([BioseqBlastEngine],[ncbitool],[], | ||
45 | [AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) | ||
46 | |||
27 | AC_OUTPUT | 47 | AC_OUTPUT |
diff --git a/src/Makefile.am b/src/Makefile.am index 8098f33..fcbcdd5 100644 --- a/src/Makefile.am +++ b/src/Makefile.am | |||
@@ -3,6 +3,9 @@ bin_PROGRAMS = aggregator | |||
3 | aggregator_SOURCES = \ | 3 | aggregator_SOURCES = \ |
4 | aggregator.c \ | 4 | aggregator.c \ |
5 | assign_protein_type.c \ | 5 | assign_protein_type.c \ |
6 | check_error.c \ | ||
7 | check_h5_error.c \ | ||
8 | check_ncbi_error.c \ | ||
6 | load_influenza_aa_dat.c \ | 9 | load_influenza_aa_dat.c \ |
7 | load_influenza_faa.c | 10 | load_influenza_faa.c |
8 | 11 | ||
@@ -10,6 +13,9 @@ aggregator_LDADD = -lhdf5 | |||
10 | 13 | ||
11 | noinst_HEADERS = \ | 14 | noinst_HEADERS = \ |
12 | assign_protein_type.h \ | 15 | assign_protein_type.h \ |
16 | check_error.h \ | ||
17 | check_h5_error.h \ | ||
18 | check_ncbi_error.h \ | ||
13 | load_influenza_aa_dat.h \ | 19 | load_influenza_aa_dat.h \ |
14 | load_influenza_faa.h | 20 | load_influenza_faa.h |
15 | 21 | ||
diff --git a/src/aggregator.c b/src/aggregator.c index 228e5c6..20da6df 100644 --- a/src/aggregator.c +++ b/src/aggregator.c | |||
@@ -3,6 +3,8 @@ | |||
3 | * container. | 3 | * container. |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include "assign_protein_type.h" | ||
7 | #include "check_h5_error.h" | ||
6 | #include "load_influenza_aa_dat.h" | 8 | #include "load_influenza_aa_dat.h" |
7 | #include "load_influenza_faa.h" | 9 | #include "load_influenza_faa.h" |
8 | 10 | ||
@@ -14,22 +16,26 @@ main() | |||
14 | /* | 16 | /* |
15 | * Create the HDF5 file. | 17 | * Create the HDF5 file. |
16 | */ | 18 | */ |
17 | hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); | 19 | // hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); |
18 | 20 | ||
19 | /* | 21 | /* |
20 | * Load the supplementary protein data file. | 22 | * Load the supplementary protein data file. |
21 | */ | 23 | */ |
22 | load_influenza_aa_dat (file_id); | 24 | // load_influenza_aa_dat (file_id); |
23 | 25 | ||
24 | /* | 26 | /* |
25 | * Load the FASTA protein sequence data file. | 27 | * Load the FASTA protein sequence data file. |
26 | */ | 28 | */ |
27 | load_influenza_faa (file_id); | 29 | // load_influenza_faa (file_id); |
28 | 30 | ||
29 | /* | 31 | /* |
30 | * Close the HD5 file. | 32 | * Close the HD5 file. |
31 | */ | 33 | */ |
32 | H5Fclose (file_id); | 34 | // herr_t status = H5Fclose (file_id); |
35 | // if (status < 0) | ||
36 | // check_h5_error (status, __FILE__, __LINE__); | ||
37 | |||
38 | assign_protein_type (0); | ||
33 | 39 | ||
34 | return 0; | 40 | return 0; |
35 | } | 41 | } |
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 2ba59a7..643ea3f 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c | |||
@@ -1,8 +1,80 @@ | |||
1 | #include "assign_protein_type.h" | 1 | #include "assign_protein_type.h" |
2 | #include "check_ncbi_error.h" | ||
3 | #include <ncbi.h> | ||
4 | #include <readdb.h> | ||
5 | #include <blast.h> | ||
6 | #include <salpacc.h> | ||
7 | #include <stdbool.h> | ||
8 | |||
9 | /* | ||
10 | * BLAST database containing all of the influenza protein sequences. | ||
11 | */ | ||
12 | #define SEQDB "/home/don/exp004/influenzadb/influenzadb" | ||
13 | |||
14 | /* | ||
15 | * BLAST reference database of prototypical protein types. | ||
16 | */ | ||
17 | #define REFDB "/home/don/exp004/influenzadb/proteinnames" | ||
18 | |||
19 | #define BUFFER_LEN 50 | ||
2 | 20 | ||
3 | void | 21 | void |
4 | assign_protein_type (hid_t file_id) | 22 | assign_protein_type (hid_t file_id) |
5 | { | 23 | { |
24 | /* | ||
25 | * Iterate through the records for which no protein type has been | ||
26 | * assigned. Create a BioSeq Pointer to the data and then use this | ||
27 | * as input to the BLAST run against the reference database of | ||
28 | * prototypical sequence records. | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * Make BLAST messages reportable. | ||
33 | */ | ||
34 | ErrSetMessageLevel (SEV_WARNING); | ||
35 | |||
36 | /* | ||
37 | * Open the BLAST sequence database. | ||
38 | */ | ||
39 | ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); | ||
40 | |||
41 | /* | ||
42 | * Get default BLAST options. | ||
43 | */ | ||
44 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); | ||
45 | ValNodePtr error_returns = NULL; | ||
46 | |||
47 | /* | ||
48 | * Read the sequence from the database by GI. | ||
49 | */ | ||
50 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); | ||
51 | BioseqPtr bsp = readdb_get_bioseq(seqdb, sequence_number); | ||
52 | |||
53 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
54 | "blastp", | ||
55 | REFDB, | ||
56 | options, | ||
57 | NULL, | ||
58 | &error_returns, | ||
59 | NULL); | ||
60 | |||
61 | if (error_returns != NULL) | ||
62 | check_ncbi_error (error_returns, __FILE__, __LINE__); | ||
63 | |||
64 | if (seqalign != NULL) | ||
65 | { | ||
66 | Char target_id_buf[BUFFER_LEN + 1]; | ||
67 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
68 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
69 | printf ("%s\n", | ||
70 | target_id_buf); | ||
71 | } | ||
72 | |||
73 | // Clean up memory for the next ieration. | ||
74 | seqalign = SeqAlignSetFree (seqalign); | ||
75 | bsp = BioseqFree (bsp); | ||
76 | |||
77 | options = BLASTOptionDelete (options); | ||
6 | 78 | ||
7 | return; | 79 | return; |
8 | } | 80 | } |
diff --git a/src/assign_protein_type.h b/src/assign_protein_type.h index 312b774..1dfb8e6 100644 --- a/src/assign_protein_type.h +++ b/src/assign_protein_type.h | |||
@@ -4,7 +4,11 @@ | |||
4 | #include <hdf5.h> | 4 | #include <hdf5.h> |
5 | 5 | ||
6 | /* | 6 | /* |
7 | * Determine the protein type for each protein sequence record. | 7 | * Determine the protein type for each protein sequence record. The |
8 | * technique used by NCBI is used here. A BLAST database of | ||
9 | * prototypical protein sequences serves as the reference. Each input | ||
10 | * sequence is BLASTed against this database. The first hit is used | ||
11 | * to assign a protein type to sequence. | ||
8 | */ | 12 | */ |
9 | void | 13 | void |
10 | assign_protein_type (hid_t file_id); | 14 | assign_protein_type (hid_t file_id); |
diff --git a/src/check_error.c b/src/check_error.c new file mode 100644 index 0000000..70c62c4 --- a/dev/null +++ b/src/check_error.c | |||
@@ -0,0 +1,14 @@ | |||
1 | #include "check_error.h" | ||
2 | #include <error.h> | ||
3 | #include <errno.h> | ||
4 | #include <stdlib.h> | ||
5 | |||
6 | void | ||
7 | check_error (const char *filename, const unsigned int linenum) | ||
8 | { | ||
9 | if (errno) | ||
10 | error_at_line (EXIT_FAILURE, errno, filename, linenum, | ||
11 | "An error has been detected within the application."); | ||
12 | |||
13 | return; | ||
14 | } | ||
diff --git a/src/check_error.h b/src/check_error.h new file mode 100644 index 0000000..33acc63 --- a/dev/null +++ b/src/check_error.h | |||
@@ -0,0 +1,11 @@ | |||
1 | #ifndef CHECK_ERROR_H | ||
2 | #define CHECK_ERROR_H | ||
3 | |||
4 | /* | ||
5 | * Check the error state. Reports and error message and exits if an | ||
6 | * error has occured. | ||
7 | */ | ||
8 | void | ||
9 | check_error (const char *filename, unsigned int linenum); | ||
10 | |||
11 | #endif // CHECK_ERROR_H | ||
diff --git a/src/check_h5_error.c b/src/check_h5_error.c new file mode 100644 index 0000000..30fc87c --- a/dev/null +++ b/src/check_h5_error.c | |||
@@ -0,0 +1,12 @@ | |||
1 | #include "check_h5_error.h" | ||
2 | #include <error.h> | ||
3 | #include <stdlib.h> | ||
4 | |||
5 | void | ||
6 | check_h5_error (herr_t status, const char* filename, unsigned int linenum) | ||
7 | { | ||
8 | error_at_line (EXIT_FAILURE, 0, filename, linenum, | ||
9 | "An error has been reported by the HDF5 API."); | ||
10 | |||
11 | return; | ||
12 | } | ||
diff --git a/src/check_h5_error.h b/src/check_h5_error.h new file mode 100644 index 0000000..74730cd --- a/dev/null +++ b/src/check_h5_error.h | |||
@@ -0,0 +1,12 @@ | |||
1 | #ifndef CHECK_H5_ERROR_H | ||
2 | #define CHECK_H5_ERROR_H | ||
3 | |||
4 | #include <hdf5.h> | ||
5 | |||
6 | /* | ||
7 | * Handle errors from the HDF5 API. | ||
8 | */ | ||
9 | void | ||
10 | check_h5_error (herr_t status, const char* filename, unsigned int linenum); | ||
11 | |||
12 | #endif // CHECK_H5_ERROR_H | ||
diff --git a/src/check_ncbi_error.c b/src/check_ncbi_error.c new file mode 100644 index 0000000..3caa7a9 --- a/dev/null +++ b/src/check_ncbi_error.c | |||
@@ -0,0 +1,13 @@ | |||
1 | #include "check_ncbi_error.h" | ||
2 | |||
3 | void | ||
4 | check_ncbi_error (ValNodePtr error_returns, | ||
5 | const char* filename, | ||
6 | unsigned int linenum) | ||
7 | { | ||
8 | error_at_line (EXIT_FAILURE, 0, filename, linenum, | ||
9 | "An error has been reported by the NCBI Toolkit API: %s", | ||
10 | BlastErrorToString (error_returns)); | ||
11 | |||
12 | return; | ||
13 | } | ||
diff --git a/src/check_ncbi_error.h b/src/check_ncbi_error.h new file mode 100644 index 0000000..c27c56d --- a/dev/null +++ b/src/check_ncbi_error.h | |||
@@ -0,0 +1,13 @@ | |||
1 | #ifndef CHECK_NCBI_ERROR_H | ||
2 | #define CHECK_NCBI_ERROR_H | ||
3 | |||
4 | #include <ncbi.h> | ||
5 | |||
6 | /* | ||
7 | * Handle errors from the NCBI Toolkit API. | ||
8 | */ | ||
9 | void | ||
10 | check_ncbi_error (ValNodePtr error_returns, | ||
11 | const char* filename, unsigned int linenum); | ||
12 | |||
13 | #endif // CHECK_NCBI_ERROR_H | ||
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index 493c7db..91ef415 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c | |||
@@ -6,6 +6,8 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "load_influenza_aa_dat.h" | 8 | #include "load_influenza_aa_dat.h" |
9 | #include "check_error.h" | ||
10 | #include "check_h5_error.h" | ||
9 | #include "hdf5_hl.h" | 11 | #include "hdf5_hl.h" |
10 | #include <string.h> | 12 | #include <string.h> |
11 | #include <stdlib.h> | 13 | #include <stdlib.h> |
@@ -140,7 +142,10 @@ load_influenza_aa_dat (hid_t file_id) | |||
140 | * Insert the records. | 142 | * Insert the records. |
141 | */ | 143 | */ |
142 | supplementary_data p_data; | 144 | supplementary_data p_data; |
143 | FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r"); | 145 | FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", |
146 | "r"); | ||
147 | if (dat == NULL) | ||
148 | check_error (__FILE__, __LINE__); | ||
144 | char *line = NULL; | 149 | char *line = NULL; |
145 | size_t len = 0; | 150 | size_t len = 0; |
146 | int current_line = 0; | 151 | int current_line = 0; |
@@ -205,13 +210,23 @@ load_influenza_aa_dat (hid_t file_id) | |||
205 | strncpy(p_data.full_length_indicator, strsep (&running, "\t"), | 210 | strncpy(p_data.full_length_indicator, strsep (&running, "\t"), |
206 | sizeof(p_data.full_length_indicator)); | 211 | sizeof(p_data.full_length_indicator)); |
207 | 212 | ||
208 | if (current_line == 1) | 213 | if (current_line == 1) |
209 | H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,1, | 214 | { |
210 | dst_size,field_names, dst_offset, field_type, | 215 | herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, |
211 | chunk_size, fill_data, compress, &p_data); | 216 | TABLE_NAME, NFIELDS, 1,dst_size, |
212 | else | 217 | field_names, dst_offset, field_type, |
213 | H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset, | 218 | chunk_size, fill_data, compress, |
214 | dst_sizes, &p_data); | 219 | &p_data); |
220 | if (status < 0) | ||
221 | check_h5_error (status, __FILE__, __LINE__); | ||
222 | } | ||
223 | else | ||
224 | { | ||
225 | herr_t status = H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, | ||
226 | dst_offset, dst_sizes, &p_data); | ||
227 | if (status < 0) | ||
228 | check_h5_error (status, __FILE__, __LINE__); | ||
229 | } | ||
215 | 230 | ||
216 | if (running) | 231 | if (running) |
217 | free (running); | 232 | free (running); |