6.1.1 Combining two files with identical row identifiers
# Load the R data file; the objects used below (golub.names, golub.all and
# golub.aml) are expected to come from it
load(file.path(DATADIR, "golub_cbind.RData"))
# Check the objects to make sure that they have the same number of elements/rows
# Remember, golub.names is a vector, the others are dataframes
# NOTE(review): the two dim() calls below were absent from the listing even
# though their output was shown; which output belongs to which object is
# inferred from the cbind() argument order - confirm
length(golub.names)
## [1] 7129
dim(golub.all)
## [1] 7129 55
dim(golub.aml)
## [1] 7129 21
# Enforce the check rather than relying on eyeballing the output: cbind() on
# data frames recycles a shorter input whose length divides the longer one,
# so a mismatch would not necessarily raise an error
stopifnot(
  length(golub.names) == nrow(golub.all),
  nrow(golub.all) == nrow(golub.aml)
)
# Then just join them together, column-wise and in this order
golub.new.df <- cbind(golub.names, golub.all, golub.aml)
dim(golub.new.df)
## [1] 7129 77
6.1.2 Combining two files with rows in common
# Join the two subsets on their row names.
# all = TRUE keeps rows that are present in only one of the two inputs
# (experiment with all.x / all.y to see the alternatives).
# sort = FALSE asks merge() not to sort the result on the key; for
# expression data that is probably unnecessary anyway.
golub.merge <- merge(
  all.subset, aml.subset,
  by = "row.names", all = TRUE, sort = FALSE
)
# merge() puts the shared identifiers into a leading 'Row.names' column:
# restore them as the row names of the result ...
rownames(golub.merge) <- golub.merge[["Row.names"]]
# ... and then drop that (first) column, which is now redundant
golub.merge <- golub.merge[, -1]
head(golub.merge[, seq_len(5)])
## X1 call X2 call.1 X3
## AFFX-BioB-M_st -41 A 19 A 19
## AFFX-BioB-3_st -831 A -743 A -1135
## AFFX-BioC-5_st -653 A -239 A -962
## AFFX-BioC-3_st -462 A -83 A -232
## AFFX-BioDn-5_st 75 A 182 A 208
## AFFX-BioDn-3_st 381 A 164 A 432
# Optionally bring the gene names back in, again matching on row names.
# all.y = TRUE keeps every row of golub.merge whether or not a name is
# found for it
golub.merge.named <- merge(
  golub.names, golub.merge,
  by = "row.names", all.y = TRUE
)
# Promote the leading 'Row.names' column to row names, then drop it
rownames(golub.merge.named) <- golub.merge.named[["Row.names"]]
golub.merge.named <- golub.merge.named[, -1]
# Have a quick look at what we've ended up with
golub.merge.named[seq_len(3), seq_len(6)]
## x X1 call X2 call.1
## A28102_at GB DEF = GABAa receptor alpha-3 subunit 151 A 263 P
## AB000114_at Osteomodulin 72 A 21 A
## AB000115_at mRNA 281 A 250 P
## X3
## A28102_at 88
## AB000114_at -27
## AB000115_at 358
6.1.3 Linking in reference data
# There are at least two ways of doing this exercise; either way the
# annotation file has to be read in first.
species.annot <- read.csv(file.path(DATADIR, "species-status-codes.csv"))
# Method one - build look-up vectors and attach them with cbind
# Look-up for NCA_status: the values come from column 3 of the annotation
# rows whose Field is "NCA_status", and are named by column 2 of the same
# rows (the code)
nca.status <- setNames(
  species.annot[species.annot[["Field"]] == "NCA_status", 3],
  species.annot[species.annot[["Field"]] == "NCA_status", 2]
)
# Index the look-up by name with each species' NCA_status value. The
# as.character() matters: it makes the look-up go by the code's text rather
# than by position (e.g. the integer codes of a factor)
NCA.description <- nca.status[as.character(species[["NCA_status"]])]
# Same recipe for Endemicity
endemicity.status <- setNames(
  species.annot[species.annot[["Field"]] == "Endemicity", 3],
  species.annot[species.annot[["Field"]] == "Endemicity", 2]
)
Endemicity.description <-
  endemicity.status[as.character(species[["Endemicity"]])]
# Finally, bind the two description vectors onto the species table; the new
# columns take their names from the variable names used here
species.annotated <- cbind(species, NCA.description, Endemicity.description)
head(species.annotated[, seq_len(5)])
## Taxon_Id Kingdom Class Family Scientific_name
## 1 706 animals amphibians Limnodynastidae Adelotus brevis
## 2 687 animals amphibians Limnodynastidae Kyarranus kundagungan
## 3 688 animals amphibians Limnodynastidae Kyarranus loveridgei
## 4 690 animals amphibians Limnodynastidae Lechriodus fletcheri
## 5 677 animals amphibians Limnodynastidae Limnodynastes convexiusculus
## 6 678 animals amphibians Limnodynastidae Limnodynastes dumerilii
# The second way is to use merge, and do some tidying up on the way
# First, pull out just the NCA_status entries from the annotation file
nca.status <- species.annot[species.annot[, "Field"] == "NCA_status", ]
# Match each species' NCA_status against the annotation's Code column.
# all.x = TRUE keeps species whose code has no match in the annotation
species.nca <- merge(species, nca.status, by.x = "NCA_status", by.y = "Code",
                     all.x = TRUE, sort = FALSE)
# Remove the unnecessary new 'Field' column
species.nca[, "Field"] <- NULL
# And the same for the endemicity
endemicity.status <- species.annot[species.annot[, "Field"] == "Endemicity", ]
species.annotated <- merge(species.nca, endemicity.status,
                           by.x = "Endemicity", by.y = "Code",
                           all.x = TRUE, sort = FALSE)
species.annotated[, "Field"] <- NULL
# Finally, rename the final two (description) columns to something useful.
# Their positions are derived from the column count instead of being
# hard-coded as 12:13, so this keeps working if 'species' gains or loses
# a column
colnames(species.annotated)[ncol(species.annotated) - 1:0] <-
  c("NCA.description", "Endemicity.description")