1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
#include "assign_protein_type.h"
#include "check_ncbi_error.h"
#include "check_h5_error.h"
#include <ncbi.h>
#include <readdb.h>
#include <blast.h>
#include <salpacc.h>
#include <stdbool.h>
#include <hdf5_hl.h>
/*
* BLAST database containing all of the influenza protein sequences.
*/
#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
/*
* BLAST reference database of prototypical protein types.
*/
#define REFDB "/home/don/exp004/influenzadb/proteinnames"
#define BUFFER_LEN 50
void
assign_protein_type (hid_t file_id)
{
/*
* Iterate through the records for which no protein type has been
* assigned. Create a BioSeq Pointer to the data and then use this
* as input to the BLAST run against the reference database of
* prototypical sequence records.
*/
/*
* Make BLAST messages reportable.
*/
ErrSetMessageLevel (SEV_WARNING);
/*
* Open the BLAST sequence database.
*/
ReadDBFILEPtr seqdb = readdb_new (SEQDB, true);
/*
* Get default BLAST options.
*/
BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
ValNodePtr error_returns = NULL;
/*
* Read the data from HDF5 file.
*/
hsize_t nfields;
hsize_t nrecords;
herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
&nrecords);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
/*
* todo: Allocate memory of nrecords for dst_buf.
*
* todo: Refactor code to share structres in read and write HDF5
* calls.
*/
status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
dst_sizes, dst_buf);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
for (int i = 0; i < nrecords; i++)
{
}
/*
* Read the sequence from the database by GI.
*/
Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
"blastp",
REFDB,
options,
NULL,
&error_returns,
NULL);
if (error_returns != NULL)
check_ncbi_error (error_returns, __FILE__, __LINE__);
if (seqalign != NULL)
{
Char target_id_buf[BUFFER_LEN + 1];
SeqIdPtr target_id = SeqAlignId (seqalign, 1);
SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
printf ("%s\n", target_id_buf);
}
// Clean up memory for the next ieration.
seqalign = SeqAlignSetFree (seqalign);
bsp = BioseqFree (bsp);
options = BLASTOptionDelete (options);
return;
}
|