Data preparation, processing and interpretation with R

6 Integrating and summarising datasets

6.1.1 Combining two files with identical row identifiers

# Load the saved R objects for this exercise from the data directory
load(file = file.path(DATADIR, "golub_cbind.RData"))

# Before binding columns, confirm that every object has the same number of
# elements/rows. golub.names is a vector (hence length), whereas the other
# two objects are dataframes (hence dim)
length(golub.names)

## [1] 7129

dim(golub.all)

## [1] 7129   55

dim(golub.aml)

## [1] 7129   21

# The counts agree, so the three objects can simply be bound side by side;
# the result has 1 + 55 + 21 = 77 columns
golub.new.df <- cbind(golub.names, golub.all, golub.aml)
dim(golub.new.df)

## [1] 7129   77

6.1.2 Combining two files with rows in common

# Merge the two datasets on their row names.
#   all = TRUE   includes rows present in only one or other set (or
#                experiment with all.x / all.y for other behaviour)
#   sort = FALSE turns off sorting, although for expression data that's
#                probably unnecessary
golub.merge <- merge(all.subset, aml.subset, by = "row.names",
                     all = TRUE, sort = FALSE)

# When merging by row names, merge() returns the matched names in an extra
# leading column called 'Row.names'. Reassign them as the real row names ...
rownames(golub.merge) <- golub.merge[["Row.names"]]

# ... and finally remove the 'Row.names' column. It is addressed by name
# rather than by position, and assigning NULL never collapses the dataframe
# to a vector the way a negative column index can.
golub.merge[["Row.names"]] <- NULL
head(golub.merge[, 1:5])

##                   X1 call   X2 call.1    X3
## AFFX-BioB-M_st   -41    A   19      A    19
## AFFX-BioB-3_st  -831    A -743      A -1135
## AFFX-BioC-5_st  -653    A -239      A  -962
## AFFX-BioC-3_st  -462    A  -83      A  -232
## AFFX-BioDn-5_st   75    A  182      A   208
## AFFX-BioDn-3_st  381    A  164      A   432

# And if we want to add back in gene names.
# all.y = TRUE keeps every row of golub.merge even when golub.names has no
# entry for it. Sorting is left at its default here, so the rows of the
# result come back ordered by row name (see the output below).
golub.merge.named <- merge(golub.names, golub.merge,
                           by = "row.names", all.y = TRUE)
rownames(golub.merge.named) <- golub.merge.named[["Row.names"]]
golub.merge.named[["Row.names"]] <- NULL

# Have a quick look at what we've ended up with; the gene names appear in
# the column labelled 'x'
golub.merge.named[1:3, 1:6]

##                                                   x  X1 call  X2 call.1
## A28102_at   GB DEF = GABAa receptor alpha-3 subunit 151    A 263      P
## AB000114_at                            Osteomodulin  72    A  21      A
## AB000115_at                                    mRNA 281    A 250      P
##              X3
## A28102_at    88
## AB000114_at -27
## AB000115_at 358

6.1.3 Linking in reference data

# There are at least two ways of doing this exercise, but both need to
# read the status-code annotation file in first.
species.annot <- read.csv(file.path(DATADIR, "species-status-codes.csv"))

# Method one - create look-up vectors and use cbind
# Select the NCA_status rows of the annotation table once, then build a
# vector of the descriptions (column 3) named by their code (column 2)
nca.rows <- species.annot[species.annot[, "Field"] == "NCA_status", ]
nca.status <- setNames(nca.rows[, 3], nca.rows[, 2])

# Create a vector of the NCA descriptions by looking up the description
# for each NCA_status column value. Note the 'as.character': if NCA_status
# is a factor, indexing with it directly would use its integer codes rather
# than its text labels. Codes with no entry in the look-up vector give NA.
NCA.description <- nca.status[as.character(species[, "NCA_status"])]

# Then repeat the whole process for Endemicity
endemicity.rows <- species.annot[species.annot[, "Field"] == "Endemicity", ]
endemicity.status <- setNames(endemicity.rows[, 3], endemicity.rows[, 2])
Endemicity.description <-
  endemicity.status[as.character(species[, "Endemicity"])]

# Finally, cbind all the data together; the two description vectors become
# new columns named after the variables
species.annotated <- cbind(species, NCA.description, Endemicity.description)
head(species.annotated[, 1:5])

##   Taxon_Id Kingdom      Class          Family              Scientific_name
## 1      706 animals amphibians Limnodynastidae              Adelotus brevis
## 2      687 animals amphibians Limnodynastidae        Kyarranus kundagungan
## 3      688 animals amphibians Limnodynastidae         Kyarranus loveridgei
## 4      690 animals amphibians Limnodynastidae         Lechriodus fletcheri
## 5      677 animals amphibians Limnodynastidae Limnodynastes convexiusculus
## 6      678 animals amphibians Limnodynastidae      Limnodynastes dumerilii

# The second way is to use merge, and do some tidying up on the way
# First, pull out just the NCA_status entries from the annotation file
nca.status <- species.annot[species.annot[, "Field"] == "NCA_status", ]

# Give the description column (column 3 of the annotation table) its final
# name before merging. Doing it here means the result does not depend on how
# many columns 'species' has, and the two description columns never clash
# and pick up merge's '.x' / '.y' suffixes.
colnames(nca.status)[3] <- "NCA.description"

# all.x = TRUE keeps species rows whose code has no annotation entry
species.nca <- merge(species, nca.status,
                     by.x = "NCA_status", by.y = "Code",
                     all.x = TRUE, sort = FALSE)

# Remove the unnecessary new 'Field' column
species.nca[, "Field"] <- NULL

# And the same for the endemicity
endemicity.status <- species.annot[species.annot[, "Field"] == "Endemicity", ]
colnames(endemicity.status)[3] <- "Endemicity.description"
species.annotated <- merge(species.nca, endemicity.status,
                           by.x = "Endemicity", by.y = "Code",
                           all.x = TRUE, sort = FALSE)
species.annotated[, "Field"] <- NULL

# The final two columns of species.annotated are now NCA.description and
# Endemicity.description, so no positional renaming is needed afterwards