Data preparation, processing and interpretation with R

4 Importing and reformatting data files

4.1 Read in the datafile

## [1] "Gene Description\tGene Accession Number\t1\tcall\t2\tcall\t3\tcall\t4\tcall\t5\tcall\t6\tcall\t7\tcall\t8\tcall\t9\tcall\t10\tcall\t11\tcall\t12\tcall\t13\tcall\t14\tcall\t15\tcall\t16\tcall\t17\tcall\t18\tcall\t19\tcall\t20\tcall\t21\tcall\t22\tcall\t23\tcall\t24\tcall\t25\tcall\t26\tcall\t27\tcall\t34\tcall\t35\tcall\t36\tcall\t37\tcall\t38\tcall\t28\tcall\t29\tcall\t30\tcall\t31\tcall\t32\tcall\t33\tcall"
## [2] "AFFX-BioB-5_at (endogenous control)\tAFFX-BioB-5_at\t-214\tA\t-139\tA\t-76\tA\t-135\tA\t-106\tA\t-138\tA\t-72\tA\t-413\tA\t5\tA\t-88\tA\t-165\tA\t-67\tA\t-92\tA\t-113\tA\t-107\tA\t-117\tA\t-476\tA\t-81\tA\t-44\tA\t17\tA\t-144\tA\t-247\tA\t-74\tA\t-120\tA\t-81\tA\t-112\tA\t-273\tA\t-20\tA\t7\tA\t-213\tA\t-25\tA\t-72\tA\t-4\tA\t15\tA\t-318\tA\t-32\tA\t-124\tA\t-135\tA"                                          
## [3] "AFFX-BioB-M_at (endogenous control)\tAFFX-BioB-M_at\t-153\tA\t-73\tA\t-49\tA\t-114\tA\t-125\tA\t-85\tA\t-144\tA\t-260\tA\t-127\tA\t-105\tA\t-155\tA\t-93\tA\t-119\tA\t-147\tA\t-72\tA\t-219\tA\t-213\tA\t-150\tA\t-51\tA\t-229\tA\t-199\tA\t-90\tA\t-321\tA\t-263\tA\t-150\tA\t-233\tA\t-327\tA\t-207\tA\t-100\tA\t-252\tA\t-20\tA\t-139\tA\t-116\tA\t-114\tA\t-192\tA\t-49\tA\t-79\tA\t-186\tA"                           
## [4] "AFFX-BioB-3_at (endogenous control)\tAFFX-BioB-3_at\t-58\tA\t-1\tA\t-307\tA\t265\tA\t-76\tA\t215\tA\t238\tA\t7\tA\t106\tA\t42\tA\t-71\tA\t84\tA\t-31\tA\t-118\tA\t-126\tA\t-50\tA\t-18\tA\t-119\tA\t100\tA\t79\tA\t-157\tA\t-168\tA\t-11\tA\t-114\tA\t-85\tA\t-78\tA\t-76\tA\t-50\tA\t-57\tA\t136\tA\t124\tA\t-1\tA\t-125\tA\t2\tA\t-95\tA\t49\tA\t-37\tA\t-70\tA"                                                         
## [5] "AFFX-BioC-5_at (endogenous control)\tAFFX-BioC-5_at\t88\tA\t283\tA\t309\tA\t12\tA\t168\tA\t71\tA\t55\tA\t-2\tA\t268\tA\t219\tM\t82\tA\t25\tA\t173\tA\t243\tM\t149\tA\t257\tA\t301\tA\t78\tA\t207\tA\t218\tA\t132\tA\t-24\tA\t-36\tA\t255\tA\t316\tA\t54\tA\t81\tA\t101\tA\t132\tA\t318\tA\t325\tA\t392\tP\t241\tA\t193\tA\t312\tA\t230\tP\t330\tA\t337\tA"                                                                 
## [6] "AFFX-BioC-3_at (endogenous control)\tAFFX-BioC-3_at\t-295\tA\t-264\tA\t-376\tA\t-419\tA\t-230\tA\t-272\tA\t-399\tA\t-541\tA\t-210\tA\t-178\tA\t-163\tA\t-179\tA\t-233\tA\t-127\tA\t-205\tA\t-218\tA\t-403\tA\t-152\tA\t-146\tA\t-262\tA\t-151\tA\t-308\tA\t-317\tA\t-342\tA\t-418\tA\t-244\tA\t-439\tA\t-369\tA\t-377\tA\t-209\tA\t-396\tA\t-324\tA\t-191\tA\t-51\tA\t-139\tA\t-367\tA\t-188\tA\t-407\tA"

# Shortcuts: the data directory and the full path to the training data set
DATADIR <- "~/data/Data_interpretation_with_R"
Golub.file <- file.path(DATADIR, "data_set_ALL_AML_train.tsv")

# Inspect the start of the file. Only the first six lines are read, rather
# than loading the whole file and discarding everything but its head.
readLines(Golub.file, n = 6)

# Read the tab-separated file into the golub.df data frame.
# - header = TRUE: the first line holds the column names (spelled out, since
#   T is an ordinary variable that can be reassigned)
# - row.names = 2: the second column (gene accession number) becomes the
#   row names
# - quote = "" and comment.char = "": quote and '#' characters are treated
#   as ordinary text rather than as quoting or comment markers
golub.df <- read.table(Golub.file, sep = "\t", quote = "", header = TRUE,
                       row.names = 2, comment.char = "",
                       stringsAsFactors = FALSE)

4.2 Reformat the file

# Build a matrix holding only the intensity values.
# Column 1 of golub.df is the gene description; after it the samples come as
# (intensity, call) pairs, so the intensities are the even-numbered columns
# 2, 4, ..., 76 (38 samples).
golub.matrix <- as.matrix(golub.df[, seq(2, 76, by = 2)])

# Label each column with its sample status:
# the first 27 columns are "ALL", the final 11 are "AML"
colnames(golub.matrix) <- rep(c("ALL", "AML"), times = c(27, 11))

# Gene descriptions (column 1 of golub.df), named by the row names of
# golub.df so they can be looked up by the same identifiers as the matrix rows
golub.gnames <- setNames(golub.df[["Gene.Description"]], rownames(golub.df))

4.3 Transform data

# Clamp the intensities to the range [100, 16000]:
# every value below the floor is raised to 100 ...
golub.matrix <- replace(golub.matrix, golub.matrix < 100, 100)
# ... and every value above the ceiling is lowered to 16000.
golub.matrix <- replace(golub.matrix, golub.matrix > 16000, 16000)

# Base-10 logarithm of the clamped intensities
golub.log <- log10(golub.matrix)

4.4 Normalise data

# Before normalising, look at the mean of each column (MARGIN = 2 means
# "apply the function column by column") of the log-transformed matrix
apply(golub.log, MARGIN = 2, FUN = mean)

##      ALL      ALL      ALL      ALL      ALL      ALL      ALL      ALL 
## 2.412221 2.393360 2.441669 2.391413 2.390291 2.341846 2.384387 2.399676 
##      ALL      ALL      ALL      ALL      ALL      ALL      ALL      ALL 
## 2.444495 2.338539 2.363399 2.274308 2.386571 2.388623 2.395650 2.380295 
##      ALL      ALL      ALL      ALL      ALL      ALL      ALL      ALL 
## 2.482811 2.324345 2.328069 2.499497 2.306725 2.313102 2.331595 2.409902 
##      ALL      ALL      ALL      AML      AML      AML      AML      AML 
## 2.343512 2.352611 2.360633 2.382545 2.336199 2.427405 2.401558 2.386542 
##      AML      AML      AML      AML      AML      AML 
## 2.406045 2.296222 2.400678 2.361889 2.418098 2.451635

# ... and at the standard deviation of each column
apply(golub.log, MARGIN = 2, FUN = sd)

##       ALL       ALL       ALL       ALL       ALL       ALL       ALL 
## 0.5071723 0.5187630 0.5263826 0.4988002 0.5032199 0.4792356 0.4942427 
##       ALL       ALL       ALL       ALL       ALL       ALL       ALL 
## 0.5020369 0.5447880 0.4846996 0.4989016 0.4477448 0.5113511 0.5011462 
##       ALL       ALL       ALL       ALL       ALL       ALL       ALL 
## 0.5190887 0.4920936 0.5537143 0.4649320 0.4727689 0.5794321 0.4687063 
##       ALL       ALL       ALL       ALL       ALL       ALL       AML 
## 0.4640572 0.4649017 0.5043298 0.4843511 0.4877606 0.4773038 0.5001143 
##       AML       AML       AML       AML       AML       AML       AML 
## 0.4653518 0.5418554 0.5233421 0.5008276 0.5119954 0.4655509 0.5325124 
##       AML       AML       AML 
## 0.5000293 0.5186136 0.5380754

# Standardise each column: subtract the column mean so the average log value
# becomes zero, then divide by the column standard deviation.
# Centring and scaling are both the default behaviour of scale().
golub.norm <- scale(golub.log)

# Recheck the column means of the normalised matrix
apply(golub.norm, MARGIN = 2, FUN = mean)

##           ALL           ALL           ALL           ALL           ALL 
## -1.643583e-17 -5.344799e-17  2.832026e-16  9.722091e-17 -4.462088e-16 
##           ALL           ALL           ALL           ALL           ALL 
## -3.077379e-16 -1.595621e-16  3.048740e-16  3.319556e-16  2.381528e-16 
##           ALL           ALL           ALL           ALL           ALL 
## -3.858281e-16  5.794228e-17  3.542128e-16  1.424590e-16  1.303877e-16 
##           ALL           ALL           ALL           ALL           ALL 
## -1.626928e-16  3.069186e-16 -9.231404e-17  4.130346e-16 -3.053645e-16 
##           ALL           ALL           ALL           ALL           ALL 
##  2.734703e-16 -9.293533e-17  4.028515e-16  6.367562e-17 -4.141509e-16 
##           ALL           ALL           AML           AML           AML 
## -3.691710e-16 -2.962760e-16  6.240772e-17  3.223573e-16  2.943473e-16 
##           AML           AML           AML           AML           AML 
## -3.177875e-16 -2.160962e-16 -1.514798e-16  2.100244e-16  3.420917e-16 
##           AML           AML           AML 
##  2.160032e-17 -1.629890e-16 -7.954762e-17

# Recheck the column standard deviations of the normalised matrix
apply(golub.norm, MARGIN = 2, FUN = sd)

## ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## ALL ALL ALL ALL ALL ALL ALL ALL ALL AML AML AML AML AML AML AML AML AML 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## AML AML 
##   1   1

4.5 Export data

# Export a csv version of the normalised matrix.
# Row names are written as the first field of every data row, so the header
# needs a matching blank cell: col.names = NA adds it. With the default
# col.names = TRUE the header has one field fewer than the data rows and the
# column labels are shifted when the file is opened as a CSV.
write.table(golub.norm, file = "golub_normalised_data.csv", sep = ",",
            quote = FALSE, col.names = NA)
# OR: write.csv() writes the blank header cell for the row names by itself
# (note that, run after the line above, it overwrites the same file)
write.csv(golub.norm, file = "golub_normalised_data.csv")
# And of the gene name conversion table (names of golub.gnames -> description).
# quote = TRUE keeps descriptions containing commas in a single field;
# col.names = NA again keeps the header aligned with the data rows.
write.table(golub.gnames, file = "golub_gene_names.csv", sep = ",",
            quote = TRUE, col.names = NA)

Save the knitr document by clicking on the disk icon at the top of the R Markdown document. The icon may be greyed out if you have just created the HTML output, since the file is saved automatically before knitr runs.
Save your workspace by opening the Session menu at the top of the RStudio window and selecting “Save Workspace As”.