-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/assign_protein_type.c | 135 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 67 | ||||
-rw-r--r-- | src/sequence_data.h | 16 | ||||
-rw-r--r-- | src/sequence_data_init.c | 37 | ||||
-rw-r--r-- | src/sequence_data_init.h | 14 |
7 files changed, 188 insertions, 96 deletions
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c | |||
@@ -1,12 +1,15 @@ | |||
1 | #include "assign_protein_type.h" | 1 | #include "assign_protein_type.h" |
2 | #include "check_ncbi_error.h" | 2 | #include "check_ncbi_error.h" |
3 | #include "check_h5_error.h" | 3 | #include "check_h5_error.h" |
4 | #include "sequence_data.h" | ||
5 | #include "sequence_data_init.h" | ||
4 | #include <ncbi.h> | 6 | #include <ncbi.h> |
5 | #include <readdb.h> | 7 | #include <readdb.h> |
6 | #include <blast.h> | 8 | #include <blast.h> |
7 | #include <salpacc.h> | 9 | #include <salpacc.h> |
8 | #include <stdbool.h> | 10 | #include <stdbool.h> |
9 | #include <hdf5_hl.h> | 11 | #include <hdf5_hl.h> |
12 | #include <error.h> | ||
10 | 13 | ||
11 | /* | 14 | /* |
12 | * BLAST database containing all of the influenza protein sequences. | 15 | * BLAST database containing all of the influenza protein sequences. |
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id) | |||
44 | * Get default BLAST options. | 47 | * Get default BLAST options. |
45 | */ | 48 | */ |
46 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); | 49 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); |
50 | |||
51 | /* | ||
52 | * Set to single hit mode. | ||
53 | */ | ||
54 | options->window_size = 0; | ||
55 | options->multiple_hits_only = false; | ||
56 | |||
47 | ValNodePtr error_returns = NULL; | 57 | ValNodePtr error_returns = NULL; |
48 | 58 | ||
49 | /* | 59 | /* |
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id) | |||
56 | if (status < 0) | 66 | if (status < 0) |
57 | check_h5_error (status, __FILE__, __LINE__); | 67 | check_h5_error (status, __FILE__, __LINE__); |
58 | 68 | ||
59 | /* | 69 | sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); |
60 | * todo: Allocate memory of nrecords for dst_buf. | 70 | |
61 | * | 71 | size_t dst_size; |
62 | * todo: Refactor code to share structres in read and write HDF5 | 72 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; |
63 | * calls. | 73 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; |
64 | */ | 74 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
75 | sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); | ||
65 | 76 | ||
66 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, | 77 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, |
67 | dst_sizes, dst_buf); | 78 | dst_sizes, dst_buf); |
68 | if (status < 0) | 79 | if (status < 0) |
69 | check_h5_error (status, __FILE__, __LINE__); | 80 | check_h5_error (status, __FILE__, __LINE__); |
70 | 81 | ||
71 | for (int i = 0; i < nrecords; i++) | ||
72 | { | ||
73 | |||
74 | } | ||
75 | |||
76 | /* | 82 | /* |
77 | * Read the sequence from the database by GI. | 83 | * Assign protein types to records for which the field is empty. |
78 | */ | 84 | */ |
79 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); | 85 | printf ("Records to process: %i\n", (int)nrecords); |
80 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | 86 | for (int i = 0; i < nrecords; i++) |
81 | |||
82 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
83 | "blastp", | ||
84 | REFDB, | ||
85 | options, | ||
86 | NULL, | ||
87 | &error_returns, | ||
88 | NULL); | ||
89 | |||
90 | if (error_returns != NULL) | ||
91 | check_ncbi_error (error_returns, __FILE__, __LINE__); | ||
92 | |||
93 | if (seqalign != NULL) | ||
94 | { | 87 | { |
95 | Char target_id_buf[BUFFER_LEN + 1]; | ||
96 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
97 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
98 | printf ("%s\n", target_id_buf); | ||
99 | } | ||
100 | |||
101 | // Clean up memory for the next ieration. | ||
102 | seqalign = SeqAlignSetFree (seqalign); | ||
103 | bsp = BioseqFree (bsp); | ||
104 | 88 | ||
89 | if (dst_buf[i].protein_type[0] == '\0') { | ||
90 | /* | ||
91 | * Read the sequence from the database by GI. | ||
92 | */ | ||
93 | Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); | ||
94 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | ||
95 | |||
96 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
97 | "blastp", | ||
98 | REFDB, | ||
99 | options, | ||
100 | NULL, | ||
101 | &error_returns, | ||
102 | NULL); | ||
103 | |||
104 | /* | ||
105 | * BLAST reported an error. Write it out and continue processing. | ||
106 | */ | ||
107 | if (error_returns != NULL) | ||
108 | { | ||
109 | printf ("Warning: An error has been reported by the NCBI Toolkit API " | ||
110 | "for sequence gi|%i: %s", | ||
111 | dst_buf[i].gi, BlastErrorToString (error_returns)); | ||
112 | |||
113 | } | ||
114 | |||
115 | /* | ||
116 | * A hit was found. Record the first hit as the protein type. | ||
117 | * Skip the first 6 characters and eat the "lcl|x_". | ||
118 | */ | ||
119 | else if (seqalign != NULL) | ||
120 | { | ||
121 | Char target_id_buf[BUFFER_LEN + 1]; | ||
122 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
123 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
124 | strncpy (dst_buf[i].protein_type, &target_id_buf[6], | ||
125 | sizeof (dst_buf[i].protein_type)); | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * BLAST did not find any hits. | ||
130 | */ | ||
131 | else | ||
132 | { | ||
133 | printf ("Warning: Unable to identify protein type for sequence gi|%i\n", | ||
134 | dst_buf[i].gi); | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * Clean up memory for the next ieration. | ||
139 | */ | ||
140 | seqalign = SeqAlignSetFree (seqalign); | ||
141 | bsp = BioseqFree (bsp); | ||
142 | |||
143 | /* | ||
144 | * Write the data out to the file. | ||
145 | */ | ||
146 | if ( (i % 1000 == 0) && (i > 0) ) | ||
147 | { | ||
148 | status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, | ||
149 | dst_size, dst_offset, dst_sizes, | ||
150 | &dst_buf[i-1000]); | ||
151 | if (status < 0) | ||
152 | check_h5_error (status, __FILE__, __LINE__); | ||
153 | |||
154 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
155 | if (status < 0) | ||
156 | check_h5_error (status, __FILE__, __LINE__); | ||
157 | |||
158 | printf ("Processed %i of %i records.\n", i, (int)nrecords); | ||
159 | } | ||
160 | } | ||
161 | |||
162 | } | ||
163 | |||
164 | free (dst_buf); | ||
165 | |||
105 | options = BLASTOptionDelete (options); | 166 | options = BLASTOptionDelete (options); |
106 | 167 | ||
107 | return; | 168 | return; |
108 | } | 169 | } |