-rw-r--r-- | analysis/year.R | 29 |
1 files changed, 20 insertions, 9 deletions
diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,33 @@
 require(hdf5);
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
 
-A <- influenza.aa.dat;
-B <- influenza.faa;
-
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+A <- merge (influenza.aa.dat, influenza.faa,
+ by.x = "GenBank accession number",
+ by.y = "GB");
+
+B <- merge (A, gi.type.data,
+ by.x = "GI",
+ by.y = "GI");
+
+# Compare the local copy with a query performed on the NCBI database.
+# A quick check of the number of records returned and the first and
+# last set of GB values in sorted order should not show any
+# inconsistencies.
+T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
+nrow (T);
+U <- T$"GenBank accession number";
+sort (U);
 
 # All records for 1918. Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
 
-summary (D);
+summary (C);
 
 # Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
 
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
 
 # All record with a year value.
 E <- A[A$Year != 0, ]; |