1 files changed, 20 insertions, 9 deletions
diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,33 @@ require(hdf5);
 
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
 
-A <- influenza.aa.dat;
-B <- influenza.faa;
-
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+# Join influenza.aa.dat to influenza.faa, matching accession to GB.
+A <- merge (x = influenza.aa.dat, y = influenza.faa,
+            by.x = "GenBank accession number", by.y = "GB");
+
+# Extend the joined table with gi.type.data via the shared GI column.
+B <- merge (x = A, y = gi.type.data,
+            by = "GI");
+
+# Compare the local copy with a query performed on the NCBI database:
+# the number of records and the first and last GB values in sorted
+# order should agree.  which() drops rows whose Year, Type or Protein
+# is NA so they cannot inflate the count; `T` would mask TRUE.
+ha.1978 <- B[which (B$Year == 1978 & B$Type == "A" & B$Protein == "HA"), ];
+nrow (ha.1978);
+ha.1978.gb <- ha.1978$"GenBank accession number";
+sort (ha.1978.gb);
 
 # All records for 1918.  Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
 
-summary (D);
+summary (C);
 
 # Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
 
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
 
 # All record with a year value.
 E <- A[A$Year != 0, ];