This is a data analysis pipeline linked to a manuscript in review in Biology Open, entitled “Forcing contacts between mitochondria and the endoplasmic reticulum extends lifespan in a Drosophila model of Alzheimer’s disease”. This pipeline relates to data included in Figure 3.
The input files for this analysis pipeline were deposited in ArrayExpress:
Experiment ArrayExpress accession: E-MTAB-8468
Title: mRNA transcription profiling by array of adult flies expressing a artificial construct designed to bring mitochondria and the endoplasmic reticulum in close proximity
The other relevant files are in this GitHub repository, following this link
Load CEL files into R using the oligo package since the old affy package does not work with ST arrays Make sure you run the commands inside the CEL Files folder
#Load CEL files into R using the oligo package since the old affy package does not work with ST arrays
#Make sure you run the commands inside the CEL Files folder
library(oligo)
setwd("/Users/miguel/Dropbox/RStudio/Juan/Data/CELFiles")
list = list.files(full.names=TRUE)
data = oligo::read.celfiles(list)
## Reading in : ./Titan_0180_11614MM_A05.CEL
## Reading in : ./Titan_0180_11614MM_A09.CEL
## Reading in : ./Titan_0180_11614MM_B05.CEL
## Reading in : ./Titan_0180_11614MM_B09.CEL
## Reading in : ./Titan_0180_11614MM_C05.CEL
## Reading in : ./Titan_0180_11614MM_C09.CEL
## Reading in : ./Titan_0180_11614MM_D05.CEL
## Reading in : ./Titan_0180_11614MM_D09.CEL
## Reading in : ./Titan_0180_11614MM_E05.CEL
## Reading in : ./Titan_0180_11614MM_E07.CEL
## Reading in : ./Titan_0180_11614MM_F05.CEL
## Reading in : ./Titan_0180_11614MM_F07.CEL
## Reading in : ./Titan_0180_11614MM_G05.CEL
## Reading in : ./Titan_0180_11614MM_G07.CEL
## Reading in : ./Titan_0180_11614MM_H05.CEL
## Reading in : ./Titan_0180_11614MM_H07.CEL
ph = data@phenoData
feat = data@featureData
#Merge sample metadata from Edinburgh Genomics with Juan's sample information
#Load required libraries
library(readr)
library(tidyr)
library(dplyr)
setwd("/Users/miguel/Dropbox/RStudio/Juan")
metadata_juan <- read.csv("data/11614MM_SampleData_BioOpen.csv")
metadata_edinburghG <- read.csv("data/11614MM_HybSetup_BioOpen.csv")
metadata <- inner_join (metadata_juan,metadata_edinburghG)
ph@data[ ,1] = c("Control_3days_01","Linker_30days_01",
"Control_3days_02","Linker_30days_02",
"Control_3days_03","Linker_30days_03",
"Control_3days_04","Linker_30days_04",
"Linker_3days_01","Control_30days_01",
"Linker_3days_02","Control_30days_02",
"Linker_3days_03","Control_30days_03",
"Linker_3days_04","Control_30days_04")
Pset = fitProbeLevelModel(data)
Histograms of raw data
#Create histograms of microarray data, using ggplot
library(ggplot2)
pmexp = pm(data)
sampleNames = vector()
logs = vector()
for (i in 1:16)
{
sampleNames = c(sampleNames,rep(ph@data[i,1],dim(pmexp)[1]))
logs = c(logs,log2(pmexp[,i]))
}
logData = data.frame(logInt=logs,sampleName=sampleNames)
dataHist2 = ggplot(logData, aes(logInt, colour = sampleName))
dataHist2 + geom_density()
Boxplots of raw data
#Create a BoxPlot of the raw data
dataBox = ggplot(logData,aes(sampleName,logInt))
dataBox + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Normalize the data using Robust Multi-array Average (RMA)
data.rma = rma(data)
## Background correcting
## Normalizing
## Calculating Expression
library(pd.drogene.1.1.st) # Load the platform design info for Affymetrix DroGene-1_1-st
data.rma_V02 = rma(data, target="core") #Summaries at the transcript level
## Background correcting
## Normalizing
## Calculating Expression
data.matrix = exprs(data.rma)
Annotate affymetrix probes using the affycoretools package
# Add annotation information using the affycoretools package
library(affycoretools)
library(pd.drogene.1.1.st) # Load the platform design info for Affymetrix DroGene-1_1-st
data.rma_V02 <- annotateEset(data.rma_V02, pd.drogene.1.1.st)
Quality control of the normalized data using box plots
#Create a BoxPlot of normalized intensities
sampleNames = vector()
normlogs = vector()
for (i in 1:16)
{
sampleNames = c(sampleNames,rep(ph@data[i,1],dim(data.matrix)[1]))
normlogs = c(normlogs,data.matrix[,i])
}
normData = data.frame(norm_logInt=normlogs,sampleName=sampleNames)
dataBox = ggplot(normData, aes(sampleName,norm_logInt))
dataBox + geom_boxplot() + ylim(2,16) + ggtitle("after normalization") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Summarize the variations in the data set using principal component analysis
#PCA Analysis
#Create a PCA plot using prcomp (default R)
#color=c('green','green','green','green','red','red','red','red','blue','blue','blue','blue','purple','purple','purple','purple','cyan','cyan','cyan','cyan','black','black','black','black')
#data.PC = prcomp(t(data.matrix),scale.=TRUE)
#plot(data.PC$x[1:24],col=color)
library("FactoMineR")
library("factoextra")
data_as_frame <- as.data.frame(t(data.matrix))
data_as_frame$sample <- c("Control_3days_01","Linker_30days_01",
"Control_3days_02","Linker_30days_02",
"Control_3days_03","Linker_30days_03",
"Control_3days_04","Linker_30days_04",
"Linker_3days_01","Control_30days_01",
"Linker_3days_02","Control_30days_02",
"Linker_3days_03","Control_30days_03",
"Linker_3days_04","Control_30days_04")
data_as_frame$sample_class <- c("Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days")
data_as_frame_ordered <- data_as_frame[, c(16324, 1:16323)]
data_sample_class <- data_as_frame_ordered[,1:16323]
PCA_mitoER <- PCA(data_sample_class[,-1], scale.unit = TRUE, ncp = 5, graph = FALSE)
fviz_pca_ind(PCA_mitoER, geom.ind = "point", col.ind = data_as_frame_ordered$sample_class,
addEllipses = TRUE,
legend.title = "Sample Class")
The plot above shows that with age, the variation in the data increases.
I therefore decided to focus on the arrays from young flies to assess the transcriptional changes in flies expressing the linker.
ph@data[ ,2] = c("Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Control_3days","Linker_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days",
"Linker_3days","Control_30days")
colnames(ph@data)[2]="source"
ph@data
## index source
## Titan_0180_11614MM_A05.CEL Control_3days_01 Control_3days
## Titan_0180_11614MM_A09.CEL Linker_30days_01 Linker_30days
## Titan_0180_11614MM_B05.CEL Control_3days_02 Control_3days
## Titan_0180_11614MM_B09.CEL Linker_30days_02 Linker_30days
## Titan_0180_11614MM_C05.CEL Control_3days_03 Control_3days
## Titan_0180_11614MM_C09.CEL Linker_30days_03 Linker_30days
## Titan_0180_11614MM_D05.CEL Control_3days_04 Control_3days
## Titan_0180_11614MM_D09.CEL Linker_30days_04 Linker_30days
## Titan_0180_11614MM_E05.CEL Linker_3days_01 Linker_3days
## Titan_0180_11614MM_E07.CEL Control_30days_01 Control_30days
## Titan_0180_11614MM_F05.CEL Linker_3days_02 Linker_3days
## Titan_0180_11614MM_F07.CEL Control_30days_02 Control_30days
## Titan_0180_11614MM_G05.CEL Linker_3days_03 Linker_3days
## Titan_0180_11614MM_G07.CEL Control_30days_03 Control_30days
## Titan_0180_11614MM_H05.CEL Linker_3days_04 Linker_3days
## Titan_0180_11614MM_H07.CEL Control_30days_04 Control_30days
groups = ph@data$source
f = factor(groups,levels=c("Control_3days","Control_30days","Linker_3days","Linker_30days"))
design = model.matrix(~ 0 + f)
colnames(design) = c("Control_3days","Control_30days","Linker_3days","Linker_30days")
design
## Control_3days Control_30days Linker_3days Linker_30days
## 1 1 0 0 0
## 2 0 0 0 1
## 3 1 0 0 0
## 4 0 0 0 1
## 5 1 0 0 0
## 6 0 0 0 1
## 7 1 0 0 0
## 8 0 0 0 1
## 9 0 0 1 0
## 10 0 1 0 0
## 11 0 0 1 0
## 12 0 1 0 0
## 13 0 0 1 0
## 14 0 1 0 0
## 15 0 0 1 0
## 16 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$f
## [1] "contr.treatment"
library(limma)
data.fit = lmFit(data.rma_V02,design)
data.fit$coefficients[1:10,]
## Control_3days Control_30days Linker_3days Linker_30days
## 18130001 1.845124 2.482081 2.290334 2.207351
## 18130003 3.816978 3.923237 4.005682 4.435453
## 18130005 3.340945 3.642506 4.081426 3.841263
## 18130007 1.599987 2.457995 1.651805 2.421834
## 18130009 2.549814 3.733490 2.533831 3.385800
## 18130011 2.201017 2.477259 2.047083 2.184250
## 18130013 3.629739 3.888842 2.857710 3.464217
## 18130015 2.199753 3.176709 2.162428 2.612304
## 18130017 2.538511 3.115224 2.785309 3.099264
## 18130019 1.795543 1.640211 1.711993 1.791020
contrast.matrix = makeContrasts(Linker_3days-Control_3days,
Linker_30days-Control_30days,
levels=design)
data.fit.con = contrasts.fit(data.fit,contrast.matrix)
data.fit.eb = eBayes(data.fit.con)
contrasts <- as.data.frame(data.fit.eb)
DEresults = decideTests(data.fit.eb,method='global',adjust.method="BH",p.value=0.05,lfc=1)
DEresults <- as.data.frame(DEresults)
write.csv(DEresults, "DEresults_V02.csv")
DEresults_filtered = subset(DEresults,
DEresults[,1] !=0 | DEresults[,2] !=0)
library(dplyr)
contrasts_filtered <- contrasts %>%
filter(!is.na(genes.SYMBOL))
After filtering, there are 14981 mapped probe IDs. I will now merge the results in the “DE_results_filtered” with the “contrasts_filtered” data frame
DEresults_filtered <- cbind(genes.PROBEID = rownames(DEresults_filtered), DEresults_filtered)
contrasts_DEresults <- inner_join(contrasts_filtered, DEresults_filtered, by = "genes.PROBEID")
contrasts_DEresultsV02 <- contrasts_DEresults[,c(9,1:2,10)]
The coefficients in “contrasts_DEresultsV02” define the log fold changes for the comparisons Now we will create two separate files for significant changes (-2 > FC > 2 and FDR < 5%) for 3 days and 30 days
# A list of genes with altered expression at 3 days
DEresults_3d <- contrasts_DEresultsV02 %>%
select (genes.SYMBOL,coefficients.Control_3days...Linker_3days,genes.GENENAME) %>%
filter (coefficients.Control_3days...Linker_3days >= 1 | coefficients.Control_3days...Linker_3days <= -1)
The code above generates a list of 58 genes with altered expression in 3-day-old flies
# A list of genes with altered expression at 30 days
DEresults_30d <- contrasts_DEresultsV02 %>%
select (genes.SYMBOL,coefficients.Control_30days...Linker_30days,genes.GENENAME) %>%
filter (coefficients.Control_30days...Linker_30days >= 1 | coefficients.Control_30days...Linker_30days <= -1)
The code above generates a list of 49 genes with altered expression in 30-day-old flies
Output both lists above as csv files for downstream analysis using the GeneMANIA Cytoscape plugin
write.csv(DEresults_3d, "GeneMANIAInputData_3d.csv")
write.csv(DEresults_30d, "GeneMANIAInputData_30d.csv")
GeneMANIA identified an enrichment by co-expression, 9.60, described by Zhao and colleagues.
Load the enriched network as a new matrix
GeneMANIA_Output <- read.csv("GeneMANIA_3d_Zhao_Haddad_2010.csv")
# Change the column name for genes in the GeneMANIA dataset so that it matches the name in the DEresults_3d
colnames(GeneMANIA_Output)[colnames(GeneMANIA_Output)=="gene.name"] <- "genes.SYMBOL"
# Join the two tables by genes.SYMBOL
GeneMANIA <- inner_join(DEresults_3d,GeneMANIA_Output, by="genes.SYMBOL")
## Warning: Column `genes.SYMBOL` joining factors with different levels,
## coercing to character vector
This resulted in 28 genes belonging to the Zhao dataset (genes involved in tolerance to high oxygen levels)
write.csv(GeneMANIA, "Genemania_output.csv")
Above is the Cytoscape GUI displaying the hypoxia network (left) and the score for that network, surrounded by a red box on the right.
Display the 28 genes associated with tolerance to hyperoxia in flies:
GeneMANIA[,"genes.SYMBOL"]
## [1] "CG17322" "CG15263" "CG11034" "CG17107" "CG33679" "CG43402" "CG16713"
## [8] "CG10659" "Mtk" "GstE1" "Mal-A6" "Cht4" "CG12780" "Damm"
## [15] "CG15870" "CG3264" "CG34040" "PGRP-SD" "CG10516" "CG11458" "Drsl4"
## [22] "CG7298" "TotA" "Diedel" "Npc2e" "Fst" "CG9616" "CG43125"