author | Don Pellegrino <don@drexel.edu> | 2010-03-28 08:13:39 (GMT) |
---|---|---|
committer | Don Pellegrino <don@drexel.edu> | 2010-03-28 08:13:39 (GMT) |
commit | 75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e (patch) (side-by-side diff) | |
tree | 35c5bdd4f7c44946192059c5d3c78980ba9c4b07 | |
parent | 1d29fba5de1dd0731564829dbf5aec572d161bd5 (diff) | |
download | exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.zip exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.gz exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.bz2 |
Testing options for loading meta-data/features/Entrez into HDF5.
-rw-r--r-- | src/load/load_asn.c | 173 | ||||
-rw-r--r-- | src/load/load_asn.h | 24 | ||||
-rw-r--r-- | src/load/load_features.c | 167 | ||||
-rw-r--r-- | src/load/load_features.h | 12 |
4 files changed, 376 insertions, 0 deletions
diff --git a/src/load/load_asn.c b/src/load/load_asn.c
new file mode 100644
index 0000000..fc27d84
--- a/dev/null
+++ b/src/load/load_asn.c
@@ -0,0 +1,173 @@
+#define _GNU_SOURCE
+#include "load_asn.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <asn.h>
+#include <objgbseq.h>
+#include <objsset.h>
+#include <sqnutils.h>
+
+/*
+ * Return STR, or the placeholder "(null)" when STR is NULL, so that the
+ * result can always be passed to a "%s" conversion.
+ */
+static const char *
+printable (const char *str)
+{
+  return (str != NULL) ? str : "(null)";
+}
+
+/*
+ * Print a summary of one record to stdout: identifiers, descriptions
+ * and annotations.  Any of the arguments may be NULL, in which case
+ * the corresponding section is printed with its heading only.
+ *
+ * oid:   object identifier of the record.
+ * id:    head of the list of sequence identifiers.
+ * descr: head of the list of descriptions.
+ * annot: head of the list of annotations.
+ */
+void
+print_asn (ObjectIdPtr oid, SeqIdPtr id, ValNodePtr descr, SeqAnnotPtr annot)
+{
+  /*
+   * Print the record identifiers.
+   */
+  printf (" IDENTIFIERS\n");
+  printf (" -----------\n");
+
+  /*
+   * The object identifier is printed once.  Testing it in a
+   * "while (oid != NULL)" loop never terminates for a record that has
+   * one, because nothing in the loop advances the pointer.
+   */
+  if (oid != NULL)
+    printf ("%i, %s\n", (int) oid->id, printable (oid->str));
+
+  for (; id != NULL; id = id->next)
+    {
+      char idval[256] = "";
+
+      SeqIdPrint (id, idval, PRINTID_FASTA_SHORT);
+      printf (" %s\n", idval);
+    }
+
+  /*
+   * Print descriptions.
+   * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/BIOSEQ.HTML#_Seq-descr:_Describing_the]
+   */
+  printf ("\n DESCRIPTIONS\n");
+  printf (" ------------\n");
+  for (; descr != NULL; descr = descr->next)
+    {
+      switch (descr->choice)
+        {
+        case Seq_descr_title:
+          printf (" TITLE: %s\n", printable ((char *) descr->data.ptrvalue));
+          break;
+        case Seq_descr_genbank:
+          printf (" GENBANK\n");
+          break;
+        case Seq_descr_pub:
+          printf (" PUB\n");
+          break;
+        case Seq_descr_create_date:
+          printf (" CREATE DATE\n");
+          break;
+        case Seq_descr_update_date:
+          printf (" UPDATE DATE\n");
+          break;
+        case Seq_descr_source:
+          printf (" BIOSOURCE\n");
+          break;
+        case Seq_descr_molinfo:
+          printf (" MOLINFO\n");
+          break;
+        default:
+          printf (" DESCRIPTION CHOICE=%i\n", descr->choice);
+          break;
+        }
+    }
+
+  /*
+   * Print annotations.
+   */
+  printf ("\n ANNOTATIONS\n");
+  printf (" -----------\n");
+  for (; annot != NULL; annot = annot->next)
+    {
+      printf (" ANNOTATION: %s, ", printable (annot->name));
+      if (annot->desc == NULL)
+        {
+          printf (" NONE\n");
+          continue;
+        }
+
+      switch (annot->desc->choice)
+        {
+        case Annot_descr_name:
+          printf (" NAME: %s\n",
+                  printable ((char *) annot->desc->data.ptrvalue));
+          break;
+        default:
+          printf (" CHOICE=%i\n", annot->desc->choice);
+          break;
+        }
+    }
+}
+
+/*
+ * Read the ASN.1 Seq-entry stored in FILE_NAME and print its contents
+ * to stdout.  FILE_ID is accepted but nothing is written to it here.
+ * Failures to open or read the file are reported on stderr and the
+ * function returns without printing any record.
+ *
+ * Based on example at
+ * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQUTIL.HTML].
+ */
+void
+load_asn (hid_t file_id, const char* file_name)
+{
+  char *asn_file = NULL;
+  AsnIoPtr aip = NULL;
+  SeqEntryPtr entries = NULL;
+
+  (void) file_id;
+
+  if (file_name == NULL)
+    {
+      fprintf (stderr, "load_asn: no file name given.\n");
+      return;
+    }
+
+  /*
+   * AsnIoOpen is handed a writable copy of the name; presumably its
+   * parameter is not const-qualified -- TODO confirm.
+   */
+  asn_file = strdup (file_name);
+  if (asn_file == NULL)
+    {
+      fprintf (stderr, "load_asn: out of memory.\n");
+      return;
+    }
+
+  aip = AsnIoOpen (asn_file, "r");
+  if (aip == NULL)
+    {
+      fprintf (stderr, "load_asn: cannot open %s.\n", file_name);
+      goto out;
+    }
+
+  entries = SeqEntryAsnRead (aip, NULL);
+  if (entries == NULL)
+    {
+      fprintf (stderr, "load_asn: cannot read a Seq-entry from %s.\n",
+               file_name);
+      goto out;
+    }
+
+  /*
+   * Data file statistics.
+   */
+  printf ("NODES: %i\tBIOSEQS: %i\n",
+          (int) ValNodeLen (entries), (int) BioseqCount (entries));
+  printf ("\n");
+
+  /*
+   * This loop needs to be corrected to handle nesting of sets.  Every
+   * top level entry is treated as a Bioseq-set and every member of a
+   * set as a Bioseq, without looking at the choice of the entry.
+   */
+  for (SeqEntryPtr sep = entries; sep != NULL; sep = sep->next)
+    {
+      BioseqSetPtr bsetp = (BioseqSetPtr) sep->data.ptrvalue;
+
+      /* An entry without data has nothing to print or to descend into.  */
+      if (bsetp == NULL)
+        continue;
+
+      printf ("BIOSEQSET\n");
+      printf ("\n");
+      print_asn (bsetp->id, NULL, bsetp->descr, bsetp->annot);
+      printf ("\n");
+
+      /*
+       * Process Bioseqs in the set.
+       */
+      for (SeqEntryPtr member = bsetp->seq_set; member != NULL;
+           member = member->next)
+        {
+          BioseqPtr bsp = (BioseqPtr) member->data.ptrvalue;
+
+          if (bsp == NULL)
+            continue;
+
+          printf ("BIOSEQ\n");
+          printf ("\n");
+          print_asn (NULL, bsp->id, bsp->descr, bsp->annot);
+          printf ("\n");
+        }
+    }
+
+ out:
+  /* Release only what was actually obtained on the way here.  */
+  if (entries != NULL)
+    SeqEntryFree (entries);
+  if (aip != NULL)
+    AsnIoClose (aip);
+  free (asn_file);
+}
diff --git a/src/load/load_asn.h b/src/load/load_asn.h
new file mode 100644
index 0000000..a7d54db
--- a/dev/null
+++ b/src/load/load_asn.h
@@ -0,0 +1,24 @@
+#ifndef LOAD_ASN_H
+#define LOAD_ASN_H
+
+#include <hdf5.h>
+
+/*
+ * Load the features and other meta-data pulled from Entrez via eFetch
+ * as ASN.1.
+ *
+ * Test: gi|453644
+ *
+ * Retrieving the ASN.1 file via eFetch for gi|453644 worked smoothly
+ * however the hierarchy of the ASN.1 is difficult to align with other
+ * data by GI. This is due to the Bioseqset returned lacking
+ * identifiers and the gi|453644 appearing as a Bioseq member of the
+ * set. It is positioned on the same hierarchical level as gi|453643.
+ * The containing set includes the PUB records. Comparatively the XML
+ * files returned via the same process list the gi|453644 at the top
+ * of the hierarchy and above the PUB records. This output appears to
+ * be more consistent with the perspective requested in the input.
+ */
+void load_asn (hid_t file_id, const char* file_name);
+
+#endif // LOAD_ASN_H
diff --git a/src/load/load_features.c b/src/load/load_features.c
new file mode 100644
index 0000000..b18031a
--- a/dev/null
+++ b/src/load/load_features.c
@@ -0,0 +1,167 @@
+#include "load_features.h"
+#include <libxml/parser.h>
+#include <stdbool.h>
+#include <asn.h>
+#include <objgbseq.h>
+
+/*
+ * An NCBI GBSeq structure to hold the data for the current record. 
+ */
+GBSeqPtr g;
+
+/*
+ * Set by lf_startElement and cleared by lf_endElement; lf_characters
+ * reports text only while it is set.
+ */
+bool in_element;
+
+/*
+ * SAX callback: trace the start of the document.
+ */
+static void
+lf_startDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  printf ("SAX.startDocument()\n");
+}
+
+/*
+ * SAX callback: trace the end of the document.
+ */
+static void
+lf_endDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  printf ("SAX.endDocument()\n");
+}
+
+/*
+ * SAX callback: trace a request for the entity NAME.  No entity is
+ * ever supplied, so the result is always NULL.
+ */
+static xmlEntityPtr
+lf_getEntity (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  printf ("SAX.getEntity(%s)\n", name);
+
+  return NULL;
+}
+
+/*
+ * SAX callback: trace the opening tag NAME together with its
+ * attributes and note that an element is open.  ATTS is either NULL or
+ * a NULL terminated array that is walked two slots at a time, a name
+ * followed by its value.
+ */
+static void
+lf_startElement (void *ctx ATTRIBUTE_UNUSED,
+                 const xmlChar *name, const xmlChar **atts)
+{
+  printf ("SAX.startElement(%s", (char *) name);
+
+  if (atts != NULL)
+    {
+      int slot = 0;
+
+      while (atts[slot] != NULL)
+        {
+          /* The attribute name ...  */
+          printf (", %s='", atts[slot]);
+          slot++;
+
+          /* ... and, when present, the value stored right after it.  */
+          if (atts[slot] != NULL)
+            printf ("%s'", atts[slot]);
+          slot++;
+        }
+    }
+
+  printf (")\n");
+
+  in_element = true;
+}
+
+/*
+ * SAX callback: trace the closing tag NAME and note that no element
+ * is open any more.
+ */
+static void
+lf_endElement (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  printf ("SAX.endElement(%s)\n", (char *) name);
+  in_element = false;
+}
+
+/*
+ * SAX callback: trace character data.  Nothing is printed unless an
+ * element is open; otherwise at most the first 30 of the LEN bytes at
+ * CH are shown, followed by LEN itself.
+ */
+static void
+lf_characters (void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
+{
+  char preview[40];
+  int shown = 0;
+
+  if (in_element == false)
+    return;
+
+  while (shown < len && shown < 30)
+    {
+      preview[shown] = ch[shown];
+      shown++;
+    }
+  preview[shown] = 0;
+
+  printf ("SAX.characters(%s, %d)\n", preview, len);
+}
+
+/*
+ * 1. Parse and load the XML file into memory.
+ * 2. Insert the XML into HDF5.
+ * For an example of parsing XML with libxml2 and SAX see:
+ * [http://git.gnome.org/browse/libxml2/tree/testSAX.c]. 
+ */
+/*
+ * Stream the XML file FILE_NAME through a libxml2 push parser whose
+ * SAX callbacks trace the parse on stdout.  FILE_ID is accepted but
+ * nothing is written to it here.  Problems (unreadable file, no parser
+ * context, read error, parse error) are reported through
+ * xmlGenericError.  The global record g is allocated on entry and
+ * released again before returning.
+ */
+void
+load_features (hid_t file_id, const char* file_name)
+{
+  /*
+   * The first read hands libxml2 the leading bytes of the document
+   * when the context is created; the rest is fed in small chunks.
+   * NOTE(review): the tiny chunk size looks inherited from testSAX.c;
+   * confirm before raising it, since it changes how character data is
+   * split across callbacks.
+   */
+  enum { SNIFF_BYTES = 4, CHUNK_BYTES = 3 };
+
+  /*
+   * Only the callbacks that trace the parse are set; naming the
+   * members keeps each function in its own slot and leaves every other
+   * handler NULL.
+   */
+  static xmlSAXHandler trace_handler = {
+    .getEntity = lf_getEntity,
+    .startDocument = lf_startDocument,
+    .endDocument = lf_endDocument,
+    .startElement = lf_startElement,
+    .endElement = lf_endElement,
+    .characters = lf_characters,
+    .initialized = 1
+  };
+
+  const char *shown_name = (file_name != NULL) ? file_name : "(null)";
+  char chars[10];
+  FILE *f = NULL;
+
+  (void) file_id;
+
+  g = GBSeqNew ();
+
+  LIBXML_TEST_VERSION;
+
+  in_element = false;
+
+  if (file_name != NULL)
+    f = fopen (file_name, "r");
+
+  if (f == NULL)
+    {
+      xmlGenericError (xmlGenericErrorContext,
+                       "Cannot read file %s.\n", shown_name);
+    }
+  else
+    {
+      size_t got = fread (chars, 1, SNIFF_BYTES, f);
+
+      if (got > 0)
+        {
+          xmlParserCtxtPtr ctxt
+            = xmlCreatePushParserCtxt (&trace_handler, NULL,
+                                       chars, (int) got, file_name);
+
+          if (ctxt == NULL)
+            {
+              xmlGenericError (xmlGenericErrorContext,
+                               "Cannot create an XML parser for %s.\n",
+                               shown_name);
+            }
+          else
+            {
+              int status;
+
+              while ((got = fread (chars, 1, CHUNK_BYTES, f)) > 0)
+                xmlParseChunk (ctxt, chars, (int) got, 0);
+
+              /* A zero length chunk with the terminate flag ends the parse.  */
+              status = xmlParseChunk (ctxt, chars, 0, 1);
+              if (status != 0)
+                xmlGenericError (xmlGenericErrorContext,
+                                 "XML parse error %d in %s.\n",
+                                 status, shown_name);
+
+              xmlFreeParserCtxt (ctxt);
+            }
+        }
+
+      if (ferror (f))
+        xmlGenericError (xmlGenericErrorContext,
+                         "Error while reading %s.\n", shown_name);
+
+      fclose (f);
+    }
+
+  GBSeqFree (g);
+  /* Do not leave the global pointing at the released record.  */
+  g = NULL;
+}
diff --git a/src/load/load_features.h b/src/load/load_features.h
new file mode 100644
index 0000000..932883f
--- a/dev/null
+++ b/src/load/load_features.h
@@ -0,0 +1,12 @@
+#ifndef LOAD_FEATURES_H
+#define LOAD_FEATURES_H
+
+#include <hdf5.h>
+
+/*
+ * Load 
the features and other meta-data pulled from Entrez via eFetch + * as XML. + */ +void load_features (hid_t file_id, const char* file_name); + +#endif // LOAD_FEATURES_H |