From 4498865663dc42c25faf72e6ff72675538cdd697 Mon Sep 17 00:00:00 2001
From: Don Pellegrino <don@drexel.edu>
Date: Sat, 16 Jan 2010 01:46:51 +0000
Subject: Implemented the loading of the influenza_aa.dat file.  Tested by

comparing an export of the data from the HDF5 file and the original
file.
---
diff --git a/README b/README
index 9caedb8..197d289 100644
--- a/README
+++ b/README
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that
 expose the various records as well as the transform routines that are
 used for aggregation of the data on the local system.
 
- LocalWords:  NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia
+BUILDING
+
+An autogen.sh script is provided to initialize the project directory
+with the necessary GNU Autotools configuration.
+
+When building on a Debian system, the mpi.h file is in a subdirectory
+of /usr/include and is therefore not found on the default include
+path.  To account for this, run the following before running
+./configure:
+
+  $ export CPPFLAGS=-I/usr/include/mpi
+
+TEST CASES
+
+The "load_influenza_aa_dat" function loads a single tab-delimited text
+file into a table structure in the HDF5 file.  The HDFView GUI can be
+used to open the loaded table and then export it back out as a text
+file.  The text file can then be compared with the original input to
+verify that the load was completed without error.
+
+  $ diff --report-identical-files \
+    /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \
+    Protein\ Sequences.txt 
+
+  Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and
+  Protein Sequences.txt are identical
+
+ LocalWords:  NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi
+ LocalWords:  autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt
diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia
index b8ad4af..277d53a 100644
--- a/doc/Data Deployments.dia
+++ b/doc/Data Deployments.dia
Binary files differ
diff --git a/src/aggregator.c b/src/aggregator.c
index ae5aa60..da6db08 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -24,7 +24,7 @@ main()
   /*
    * Close the HD5 file.
    */
-  herr_t status = H5Fclose (file_id);
+  H5Fclose (file_id);
 
   return 0;
 }
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 72aacb5..5af8a72 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
+/*
+ * Load the influenza_aa.dat tab-delimited text file into an HDF5
+ * binary table.
+ *
+ * todo: Handle NULL values occurring in numeric fields.
+ */
+
 #include "load_influenza_aa_dat.h"
 #include "hdf5_hl.h"
+#include <string.h>
+#include <stdlib.h>
 
 #define NFIELDS (hsize_t) 11
-//#define NRECORDS (hsize_t) 138052
-#define NRECORDS (hsize_t) 1
-#define TABLE_NAME "influenza_aa.dat"
+#define TABLE_NAME "Protein Sequences"
 
 void
 load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
   /*
    * Model the data using native types.
    */
-  typedef struct supplementary_data
+  typedef struct
   {
     char genbank_accession_number[9];
     char host[15];
     int genome_segment_number;
     char subtype[7];
     char country[25];
-    int year;
+    char year[8];
     int sequence_length;
     char virus_name[196];
     char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
 				 HOFFSET ( supplementary_data, gender ),
 				 HOFFSET ( supplementary_data, full_length_indicator )};
 
-  /*
-
-    Only needed for reading?
-
-  supplementary_data dst_buf[NRECORDS];
+  supplementary_data dst_buf[1];
 
   size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
 				sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
 				sizeof ( dst_buf[0].age ),
 				sizeof ( dst_buf[0].gender ),
 				sizeof ( dst_buf[0].full_length_indicator)};
-  */
 
   /*
-   * "Define field information."
-   */
-  const char *field_names[NFIELDS] =
-    { "GenBank accession number",
-      "Host",
-      "Genome segment number",
-      "Subtype",
-      "Country",
-      "Year",
-      "Sequence length",
-      "Virus name",
-      "Age",
-      "Gender",
-      "Full-length Indicator" };
-  hsize_t chunk_size = 10;
-  int *fill_data = NULL;
-  int compress = 0;
-  
-  /*
-   * "Initialize field type."
+   * Map the native types to HDF5 types for each field.
    */
   hid_t field_type[NFIELDS];
 
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
   H5Tset_size (country_type, 25 );
   field_type[4] = country_type;
 
-  field_type[5] = H5T_NATIVE_INT; 
+  hid_t year_type = H5Tcopy ( H5T_C_S1 );
+  H5Tset_size (year_type, 8);
+  field_type[5] = year_type;
 
   field_type[6] = H5T_NATIVE_INT;
 
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
   H5Tset_size (full_length_indicator_type, 4);
   field_type[10] = full_length_indicator_type;
 
-  supplementary_data p_data[NRECORDS] = {
-    {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",
-     "", "", "yes"}
-  };
+  /*
+   * Labels used for the fields in the table.
+   */
+  const char *field_names[NFIELDS] =
+    { "GenBank accession number",
+      "Host",
+      "Genome segment number",
+      "Subtype",
+      "Country",
+      "Year",
+      "Sequence length",
+      "Virus name",
+      "Age",
+      "Gender",
+      "Full-length Indicator" };
+
+  /*
+   * Table storage options.
+   */
+  hsize_t chunk_size = 10;
+  int *fill_data = NULL;
+  int compress = 0;
+
+  /*
+   * Insert the records.
+   */
+  supplementary_data p_data;
+  FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
+  char *line = NULL;
+  size_t len = 0;
+  int current_line = 0;
+
+  while (getline (&line, &len, dat) != -1) {
+
+    current_line++;
+    char *running = strdup (line);
+    char *token;
+    
+    /*
+     * Parse the line, handling the case of empty fields represented
+     * by sequential delimiters.
+     */
+    strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
+	    sizeof(p_data.genbank_accession_number));
+    
+    strncpy(p_data.host, strsep (&running, "\t"),
+	    sizeof(p_data.host));
+    
+    token = strsep (&running, "\t");
+    if (strcmp (token, "\0") == 0)
+      p_data.genome_segment_number = 0;
+    else
+      p_data.genome_segment_number = atoi(token);
+    
+    strncpy(p_data.subtype, strsep (&running, "\t"),
+	    sizeof(p_data.subtype));
+    
+    strncpy(p_data.country, strsep (&running, "\t"),
+	    sizeof(p_data.country));
+    
+    strncpy (p_data.year, strsep (&running, "\t"),
+	     sizeof(p_data.year));
+
+    token = strsep (&running, "\t");
+    if (strcmp (token, "\0") == 0)
+      p_data.sequence_length = 0;
+    else
+      p_data.sequence_length = atoi(token);
+    
+    strncpy(p_data.virus_name, strsep (&running, "\t"),
+	    sizeof(p_data.virus_name));
+    
+    strncpy(p_data.age, strsep (&running, "\t"),
+	    sizeof(p_data.age));
+    
+    strncpy(p_data.gender, strsep (&running, "\t"),
+	    sizeof(p_data.gender));
+    
+    strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
+	    sizeof(p_data.full_length_indicator));
+
+    if (current_line == 1)     
+      H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
+		      dst_size,field_names, dst_offset, field_type,
+		      chunk_size, fill_data, compress, &p_data);
+    else     
+      H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
+			 dst_sizes, &p_data);
+
+    if (running)
+      free (running);
+   
+  }
+  
+  if (line)
+    free (line);
 
-  herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,
-				  dst_size,field_names, dst_offset, field_type,
-				  chunk_size, fill_data, compress, p_data);
+  fclose (dat);
 
   H5Tclose (genbank_accession_number_type);
   H5Tclose (host_type);
   H5Tclose (subtype_type);
   H5Tclose (country_type);
+  H5Tclose (year_type);
   H5Tclose (virus_name_type);
   H5Tclose (age_type);
   H5Tclose (gender_type);
--
cgit v0.8.3.1-22-g547a