===== DeSeq2 Cavanagh =====
==== Clinical parameters ====
##-------------------------##
## R libraries ##
##-------------------------##
##---- general libraries ----##
library("data.table")
library("ggplot2")
##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##-------------------------##
## DESeq2 Clinical ##
##-------------------------##
##---- Runs DEseq2 for each of the clinical parameters ----##
sig_genes_by_clinical_parameter = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE)
# iterates through the clinical parameters and runs DESeq2:
for(clinical_parameter in colnames(patient_data))
{
if(clinical_parameter != "study_arm" & clinical_parameter != "group" & clinical_parameter != "centre" )
{
print(clinical_parameter)
clinical_parameter_data = patient_data[,clinical_parameter]
clinical_parameter_data = clinical_parameter_data[complete.cases(clinical_parameter_data)]
# tests for a binary parameter (e.g. gender)
if(length(summary(as.factor(clinical_parameter_data),maxsum=50000)) == 2)
{
min = min(as.numeric(clinical_parameter_data))
max = max(as.numeric(clinical_parameter_data))
group1 = row.names(subset(patient_data,patient_data[clinical_parameter] == min))
group2 = row.names(subset(patient_data,patient_data[clinical_parameter] == max))
}
# or must be a continuous paramter
else
{
q1 = summary(clinical_parameter_data)[2]
q3 = summary(clinical_parameter_data)[5]
group1 = row.names(subset(patient_data,patient_data[clinical_parameter] < q1))
group2 = row.names(subset(patient_data,patient_data[clinical_parameter] > q3))
}
# tests for a suitable number of samples
if(min(c(length(group1),length(group2))) >2)
{
# runs deseq2
condition = factor(c(rep("low", length(group1)), rep("high", length(group2))))
read_count_filtered = cbind(read_counts[,group1],read_counts[,group2])
countdata = as.matrix(read_count_filtered)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
coldata = data.frame(row.names=colnames(countdata), condition)
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds = DESeq(dds, parallel=T)
res = results(dds, c("condition","high","low"), cooksCutoff=0.2)
res = res[order(res$padj), ]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file=paste("deseq2/unmodelled/DE_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
samples = data.frame(cbind(c(group1,group2),condition))
write.table(samples, file=paste("deseq2/unmodelled/SAMPLES_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_clinical_parameter = data.frame(clinical_parameter,nrow(resdata_sig))
names(sig_genes_for_clinical_parameter)<-c("parameter","count")
sig_genes_by_clinical_parameter <- rbind(sig_genes_by_clinical_parameter, sig_genes_for_clinical_parameter)
}
}
}
write.table(sig_genes_by_clinical_parameter, file="deseq2/unmodelled/SIG_COUNTS.csv",row.names=FALSE, sep="\t", quote = FALSE)
==== HC vs MDD ====
##---- general libraries ----##
library("data.table")
library("ggplot2")
##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##-------------------------##
## DESeq2 HC vs MDD ##
##-------------------------##
sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE)
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16+MDD)
dds = DESeq(dds, parallel=T)
# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
##-------------------------##
## DESeq2 HC vs MDD Types ##
##-------------------------##
# gets the confounding variables
group = factor(patient_data$group)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
coldata = data.frame(row.names=colnames(read_counts),group,confound1,confound2,confound3,confound4)
# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+group)
dds = DESeq(dds, parallel=T)
# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_subgroups.csv",row.names=FALSE, sep="\t", quote = FALSE)
# gets the results
res = results(dds, c("group","HC","MDD_untreated"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_untreated.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD untreated" ,nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)
res = results(dds, c("group","HC","MDD_responder"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_responder.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD responder",nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)
res = results(dds, c("group","HC","MDD_resistant"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_resistant.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD resistant",nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)
==== HC vs MDD - limited model ====
##---- general libraries ----##
library("data.table")
library("ggplot2")
##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##-------------------------##
## DESeq2 HC vs MDD ##
##-------------------------##
sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE)
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4)
# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+MDD)
dds = DESeq(dds, parallel=T)
# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/limited_model/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##-------------------------##
## DESeq2 HC vs MDD ##
##-------------------------##
sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE)
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound4 = factor(patient_data$centre)
coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound4)
# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound4+MDD)
dds = DESeq(dds, parallel=T)
# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)
==== HC vs MDD - non parameteric ====
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- loads cavanagh data ----##
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- No model ----##
results = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
# get groups
group1 = results[,row.names(subset(samples, MDD == 0))]
group2 = results[,row.names(subset(samples, MDD == 1))]
nrow(group1)
nrow(group2)
ncol(group1)
ncol(group2)
# sets up columns
results$log2fold = rep(1,nrow(expression_data))
results$p = rep(1,nrow(expression_data))
results$p.adj = rep(1,nrow(expression_data))
# does the stats
for (row in 1:nrow(results))
{
print(row)
gene_group1 = unlist(group1[row,])
gene_group2 = unlist(group2[row,])
results[row,"log2fold"] = log(mean(gene_group2),2) - log(mean(gene_group1),2)
results[row,"p"] = wilcox.test(gene_group1,gene_group2, paired=FALSE)$p.value
}
results[["p.adj"]] = p.adjust(results[["p"]], method = p.adjust.methods, n = length(results[["p"]]))
# outputs data
results = results[,c("log2fold","p","p.adj")]
write.table(results, file="cavanagh/mann_whitney/DE_HC_vs_MDD.csv",row.names=TRUE, sep="\t", quote = FALSE)
==== Random cases vs controls ====
##---- general libraries ----##
library("data.table")
library("ggplot2")
##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##--------------------------------------------##
## 50 random deseq runs with 50 / 50 split ##
##--------------------------------------------##
read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))
for (counter in 1:50)
{
# set random seed
set.seed(counter)
# gets the random samples
samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),ncol(read_counts_trimmed)/2,replace=FALSE)])
samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
print(length(samples_random1))
print(length(samples_random2))
random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2))))
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
# runs DESeq2
countdata = as.matrix(random_samples_counts)
coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16)
dds = DESeq(dds, parallel=T,betaPrior=TRUE)
res = results(dds, c("condition","high","low"))
# gets the deseq results
res = res[order(res$padj), ]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file=paste("deseq2/random/50_50/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.25)
write.table(resdata_sig, file=paste("deseq2/random/50_50/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_top100 = head(resdata, 100)
write.table(resdata_top100, file=paste("deseq2/random/50_50/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
# updates the p-value distribution results
p_value_dist[,counter] = head(sort(resdata$p),10000)
write.table(p_value_dist, file="deseq2/random/50_50/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
# prints the patient data
#coldata = data.frame(coldata)
#row.names(coldata) = names(random_samples_counts)
write.table(data.frame(coldata), file=paste("deseq2/random/50_50/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)
}
##---------------------------------------------##
## 50 random deseq runs with 44 / 187 split ##
##---------------------------------------------##
read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))
for (counter in 1:50)
{
# set random seed
set.seed(counter)
# gets the random samples
samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
print(length(samples_random1))
print(length(samples_random2))
random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2))))
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
# runs DESeq2
countdata = as.matrix(random_samples_counts)
coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16)
dds = DESeq(dds, parallel=T,betaPrior=TRUE)
res = results(dds, c("condition","high","low"))
# gets the deseq results
res = res[order(res$padj), ]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file=paste("deseq2/random/44_187/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.25)
write.table(resdata_sig, file=paste("deseq2/random/44_187/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_top100 = head(resdata, 100)
write.table(resdata_top100, file=paste("deseq2/random/44_187/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
# updates the p-value distribution results
p_value_dist[,counter] = head(sort(resdata$p),10000)
write.table(p_value_dist, file="deseq2/random/44_187/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
# prints the patient data
#coldata = data.frame(coldata)
#row.names(coldata) = names(random_samples_counts)
write.table(data.frame(coldata), file=paste("deseq2/random/44_187/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)
}
Merge the random genes
cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50
cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv
mv temp.csv DE_SIGS_COMBINED.csv
done
cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187
cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv
mv temp.csv DE_SIGS_COMBINED.csv
done
cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50
cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187
cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
==== Randomised Cases vs Controls - limited or no model ====
##---- general libraries ----##
library("data.table")
library("ggplot2")
##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))
##-------------------------##
## Input Files ##
##-------------------------##
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- import data ----##
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---------------------------------------------##
## 50 random deseq runs with 44 / 187 split ##
##---------------------------------------------##
read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))
for (counter in 1:50)
{
# set random seed
set.seed(counter)
# gets the random samples
samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
print(length(samples_random1))
print(length(samples_random2))
random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2))))
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
# runs DESeq2
countdata = as.matrix(random_samples_counts)
coldata = data.frame(row.names=colnames(countdata),condition)
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds = DESeq(dds, parallel=T,betaPrior=TRUE)
res = results(dds, c("condition","high","low"))
# gets the deseq results
res = res[order(res$padj), ]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file=paste("deseq2/random/44_187_no_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.25)
write.table(resdata_sig, file=paste("deseq2/random/44_187_no_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_top100 = head(resdata, 100)
write.table(resdata_top100, file=paste("deseq2/random/44_187_no_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
# updates the p-value distribution results
p_value_dist[,counter] = head(sort(resdata$p),10000)
write.table(p_value_dist, file="deseq2/random/44_187_no_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
# prints the patient data
#coldata = data.frame(coldata)
#row.names(coldata) = names(random_samples_counts)
write.table(data.frame(coldata), file=paste("deseq2/random/44_187_no_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)
}
##---------------------------------------------##
## 50 random deseq runs with 44 / 187 split ##
##---------------------------------------------##
read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))
for (counter in 1:50)
{
# set random seed
set.seed(counter)
# gets the random samples
samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
print(length(samples_random1))
print(length(samples_random2))
random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2))))
# gets the confounding variables
MDD = factor(patient_data$MDD)
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
# runs DESeq2
countdata = as.matrix(random_samples_counts)
coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4)
dds = DESeq(dds, parallel=T,betaPrior=TRUE)
res = results(dds, c("condition","high","low"))
# gets the deseq results
res = res[order(res$padj), ]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file=paste("deseq2/random/44_187_limited_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.25)
write.table(resdata_sig, file=paste("deseq2/random/44_187_limited_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
resdata_top100 = head(resdata, 100)
write.table(resdata_top100, file=paste("deseq2/random/44_187_limited_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
# updates the p-value distribution results
p_value_dist[,counter] = head(sort(resdata$p),10000)
write.table(p_value_dist, file="deseq2/random/44_187_limited_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
# prints the patient data
#coldata = data.frame(coldata)
#row.names(coldata) = names(random_samples_counts)
write.table(data.frame(coldata), file=paste("deseq2/random/44_187_limited_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)
}
==== Convert results to gene symbols ====
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
converter = read.table(file="cavanagh/raw_data/ID_to_symbol.csv",header=TRUE, sep='\t', quote='',check.names = TRUE)
converted_BMI = merge(bmi, converter, by="ID")
converted_BMI_sig = subset(converted_BMI, p.adj < 0.01)
write.table(converted_BMI,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_symbols.csv")
write.table(converted_BMI_sig,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_sig_symbols.csv")
===== Limma Leday =====
library(Biobase)
library(GEOquery)
library(limma)
library(jetset)
library(sdef)
##----------------------##
## Download and Process ##
##----------------------##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday")
# download data
tmp <- getGEO("GSE98793", GSEMatrix=TRUE)
eset <- tmp[[1]]
# remove samples that failed Leday QC
sample_info = read.table(file="sample_IDs.csv",header=TRUE, sep='\t', quote='',check.names = TRUE)
valid_samples = subset(sample_info, keep == 0)$ID
eset = eset[,valid_samples]
# normalise
normalizeBetweenArrays(exprs(eset),method="quantile")
# remove unannotated probes
data <- featureData(eset)@data
data.f <- data[-which(data[["Gene Symbol"]] == ""), ]
nrow(data)
nrow(data.f)
eset.f <- eset[rownames(eset) %in% rownames(data.f), ]
#use the jetset to select only the probe set of highest jset score for each gene
jscores = jscores("hgu133plus2", rownames(eset.f))
unique_genes = data.frame(unique(jscores$symbol))
unique_genes$probe = NA
names(unique_genes) = c("symbols","probe")
for(i in 1:nrow(unique_genes))
{
gene_symbol = unique_genes[i,1]
duplicates = subset(jscores,jscores[["symbol"]] == gene_symbol)
unique_genes[i,2] = rownames(head(duplicates,1))[1]
}
nrow(unique_genes)
eset.fu <- eset.f[rownames(eset.f) %in% unique_genes$probe, ]
nrow(eset.fu)
# adds the covariates with user friendly names
pData(eset.fu)$group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
pData(eset.fu)$batch <- pData(eset.fu)$"batch"
pData(eset.fu)$gender <- pData(eset.fu)$"gender"
pData(eset.fu)$age <- pData(eset.fu)$"age"
pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"
##----------------------------------##
## Limma Uncorrected ##
##----------------------------------##
# Gender
mydesign2 <- model.matrix(~ gender, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="genderM", adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_gender.csv",row.names=FALSE, sep="\t", quote = FALSE)
# Age
q1 = summary(as.numeric(pData(eset.fu)$age))[2]
q3 = summary(as.numeric(pData(eset.fu)$age))[5]
group1 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) < q1))
group2 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) > q3))
group1_2 = c(group1,group2)
eset.fu.age = eset.fu[,group1_2]
pData(eset.fu.age)$age_group <- factor(c(rep("low", length(group1)), rep("high", length(group2))))
mydesign2 <- model.matrix(~ age_group, data=pData(eset.fu.age))
fit <- lmFit(eset.fu.age, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu.age), adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_age.csv",row.names=FALSE, sep="\t", quote = FALSE)
# Anxiety
mydesign2 <- model.matrix(~ anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_anxiety.csv",row.names=FALSE, sep="\t", quote = FALSE)
##--------------------------------##
## Limma Corrected ##
##--------------------------------##
#differential expression analysis performed via Limma
mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")
# save results
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE.csv",row.names=FALSE, sep="\t", quote = FALSE)
residuals <- data.frame(residuals(fit,eset.fu))
write.table(residuals, file="Residuals.csv",row.names=TRUE, sep="\t", quote = FALSE)
em <- exprs(eset.fu)
write.table(round(em,2), file="EM.csv",row.names=TRUE, sep="\t", quote = FALSE)
patient_data <- eset.fu@phenoData@data
write.table(patient_data, file="patient_data.csv",row.names=TRUE, sep="\t", quote = FALSE)
# batch corrected EM
em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch)
write.table(round(em_corrected,2), file="EM_batch_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE)
# batch and covariate corrected EM
covariates <- data.frame(cbind(pData(eset.fu)$gender,pData(eset.fu)$age,pData(eset.fu)$anxiety))
covariates <- data.matrix(covariates)
em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch, covariates = covariates)
write.table(round(em_corrected,2), file="EM_covariate_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE)
##----------------------##
## Randomised ##
##----------------------##
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))
for (counter in 1:50)
{
# set random seed
set.seed(counter)
print(counter)
# gets the random samples
group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
group_random = sample(group,length(group),replace=FALSE)
print(group_random)
#differential expression analysis performed via Limma
pData(eset.fu)$group <- factor(group_random)
pData(eset.fu)$batch <- pData(eset.fu)$"batch"
pData(eset.fu)$gender <- pData(eset.fu)$"gender"
pData(eset.fu)$age <- pData(eset.fu)$"age"
pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"
mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")
# save results
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file=paste("random/DE",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
res_sig = subset(res, P.Value < 0.001)
write.table(res_sig, file=paste("random/DE_sig_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
res_100 = head(res,100)
write.table(res_100, file=paste("random/DE_top100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
# updates the p-value distribution results
p_value_dist[,counter] = head(sort(res$P.Value),10000)
write.table(p_value_dist, file="random/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
}
##----------------------##
## Randomised Overlap ##
##----------------------##
for (counter in 1:50)
{
# set random seed
set.seed(counter)
print(counter)
## group 1
# gets the random samples
group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
group_random = sample(group,length(group),replace=FALSE)
print(group_random)
#differential expression analysis performed via Limma
pData(eset.fu)$group <- factor(group_random)
pData(eset.fu)$batch <- pData(eset.fu)$"batch"
pData(eset.fu)$gender <- pData(eset.fu)$"gender"
pData(eset.fu)$age <- pData(eset.fu)$"age"
pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"
mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")
# save results
res1 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
## group 2
# set random seed
set.seed(counter*100)
print(counter)
# gets the random samples
group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
group_random = sample(group,length(group),replace=FALSE)
print(group_random)
#differential expression analysis performed via Limma
pData(eset.fu)$group <- factor(group_random)
pData(eset.fu)$batch <- pData(eset.fu)$"batch"
pData(eset.fu)$gender <- pData(eset.fu)$"gender"
pData(eset.fu)$age <- pData(eset.fu)$"age"
pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"
mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")
# save results
res2 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
## overlap
# parse data
data_merged = merge(res1,res2, by="ID")
ids = data_merged$ID
data_trimmed = data_merged[,c("P.Value.x","P.Value.y")]
# get overlap
ratios = try(ratio(data_trimmed))
if(inherits(ratios, "try-error"))
{
next
}
print("ratios generated")
bays = try(baymod(ratios))
if(inherits(bays, "try-error"))
{
next
}
print("bays modelled")
sdef_features = try(data.frame(extractFeatures.R(ratios,bays,ids)$max))
if(inherits(sdef_features, "try-error"))
{
next
}
print("features extracted")
# write results
if (! is.null(sdef_features))
{
results = data_merged[data.frame(sdef_features)$Names,"Gene.Symbol.x"]
write.table(results, file=paste("random_sdef/DE_sig_",counter,".csv",sep=""),row.names=FALSE, col.names=FALSE, sep="\t", quote = FALSE)
}
}
Gets the counts for the random data
cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday/random
cat DE_sig_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_sig_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f7 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
===== Cibersort =====
Get the expression data as gene symbols
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- loads data ----##
cavanagh = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- converts cavanagh ids to symbols ----##
cavanagh_conversion = read.table(file="cavanagh/raw_data/ID_to_symbol.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
cavanagh_converted = merge(cavanagh, cavanagh_conversion, by=0)
##---- saves the data ----##
write.table(cavanagh_converted, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_symbols.csv",row.names=FALSE, sep="\t", quote = FALSE)
We run Cibersort on the modelled HC vs MDD EM file, using the default LM22 background set, no quantile and 100 permutations: https://cibersort.stanford.edu/upload.php
/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_2019/cavanagh/cibersort/CIBERSORT.Output_Job6.csv
===== ComBat Correction =====
## Correct with ComBat
library(devtools)
library(Biobase)
library(sva)
# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)
# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
# gets batch
confound4 = factor(patient_data$centre)
# corrects
modcombat = model.matrix(~confound1+confound2+confound3+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)
===== Combat correction - limited model=====
## Correct with ComBat
library(devtools)
library(Biobase)
library(sva)
# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/limited_model/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)
# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3)
# gets batch
confound4 = factor(patient_data$centre)
# corrects
modcombat = model.matrix(~confound1+confound2+confound3, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)
## Correct with ComBat
library(devtools)
library(Biobase)
library(sva)
# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)
# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
coldata = data.frame(row.names=colnames(EM),confound1,confound2)
# gets batch
confound4 = factor(patient_data$centre)
# corrects
modcombat = model.matrix(~confound1+confound2, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)
===== Percentile Correction =====
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
# method for correcting outliers
correct_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- qnt[1] - H
y[x > (qnt[2] + H)] <- qnt[2] + H
y
}
# pre Combat
expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
EM <- expression[0,]
for(row in 1:nrow(expression))
{
gene <- expression[(row),]
EM <- rbind(EM,correct_outliers(gene))
}
EM_IDs = EM
EM_IDs$IDs = row.names(EM)
EM_IDs = EM_IDs[,ncol(EM_IDs)]
EM = cbind(EM_IDs,EM)
colnames(EM)[1] = "IDs"
write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE)
# post Combat
expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
EM <- expression[0,]
for(row in 1:nrow(expression))
{
gene <- expression[(row),]
EM <- rbind(EM,correct_outliers(gene))
}
EM_IDs = EM
EM_IDs$IDs = row.names(EM)
EM_IDs = EM_IDs[,ncol(EM_IDs)]
EM = cbind(EM_IDs,EM)
colnames(EM)[1] = "IDs"
write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE)
===== Expected False Positives by FDR Threshold =====
==== Cavanagh ====
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- List of thresholds ----##
thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001)
##---- get the number of false positives ----##
false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds)))
names(false_positive_dist) = c("threshold","count","max")
false_positive_dist$threshold = thresholds
for (index in 1:length(thresholds)) {
threshold = thresholds[index]
print(threshold)
values <- vector(mode="numeric", length=50)
max = 0
for (counter in 1:50)
{
DE_data = read.table(file=paste("cavanagh/deseq2/random/44_187/DE_",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE)
values[counter] = nrow(subset(DE_data,p.adj < threshold))
if (nrow(subset(DE_data,p.adj < threshold)) > max)
{
max = nrow(subset(DE_data,p.adj < threshold))
}
}
false_positive_dist[index,2] = median(values)
false_positive_dist[index,3] = max
}
write.table(false_positive_dist, file="cavanagh/deseq2/random/44_187/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE)
==== Leday ====
##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- List of thresholds ----##
thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001)
##---- get the number of false positives ----##
false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds)))
names(false_positive_dist) = c("threshold","count","max")
false_positive_dist$threshold = thresholds
for (index in 1:length(thresholds)) {
threshold = thresholds[index]
print(threshold)
values <- vector(mode="numeric", length=50)
max = 0
for (counter in 1:50)
{
DE_data = read.table(file=paste("leday/random/DE",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE)
values[counter] = nrow(subset(DE_data,adj.P.Val < threshold))
if (nrow(subset(DE_data,adj.P.Val < threshold)) > max)
{
max = nrow(subset(DE_data,adj.P.Val < threshold))
}
}
false_positive_dist[index,2] = median(values)
false_positive_dist[index,3] = max
}
write.table(false_positive_dist, file="leday/random/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE)
===== Correlation Clusters =====
Pre-filters the data
##---------------------------------------------##
## ##
## Code to pre-filter the RNA-seq data ##
## ##
##---------------------------------------------##
#Clears the enrivoment
rm(list = ls())
#Load libraries
library(psych)
library(stats)
#loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
exp_values = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- Pre-filters the RNA-seq data ----##
# Remove genes
exp_values <- data.frame(subset(exp_values,apply(exp_values, 1, mean) >= 10))
# Identify genes with a high threshold
geneHighCoefOfVarFilter <- function(dataMatrix, coefOfVarThreshold) {
coefOfVars <- apply(dataMatrix, 1, function(x) { sd(x) / abs(mean(x)) })
fdata <- dataMatrix[coefOfVars < coefOfVarThreshold, ]
# return the filtered data
list(coefOfVars = coefOfVars, fdata=fdata)
}
# the smaller the threshold, the higher the experimental effect relative to the measurement precision
thr <- 0.15
# does the filtering
filtered <- geneHighCoefOfVarFilter(exp_values, thr)
filtered.combined <- na.omit(data.frame(filtered$fdata))
nrow(filtered.combined)
num.genes <- nrow(filtered.combined)
genes <- row.names(filtered.combined)
rnaSeq <- t(filtered.combined)
subjects <- row.names(rnaSeq)
my_subjs <- subjects
##---- Saves the data ----##
rnaSeq <- rnaSeq[my_subjs,]
num.genes <- ncol(rnaSeq)
save(num.genes, rnaSeq, my_subjs,file = "cavanagh/clusters/exp.filtered.corrected.Rdata")
Does the clustering
##---------------------------------------------##
## ##
## Code to do the clustering ##
## ##
##---------------------------------------------##
#loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/clusters/")
load("exp.filtered.corrected.Rdata")
#loads the libraries
library("dendextend")
library(WGCNA)
library(GSVA)
library(reshape2)
library(grid)
library(broom)
library(ggplot2)
library(ade4)
library(leaps)
library(dplyr)
library(flashClust)
# Prepares the data
pathnames <- list.files(pattern="[.]R$", path=paste(getwd(),"/funcs", sep =""), full.names=TRUE);
sapply(pathnames, FUN=source);
genes <- colnames(rnaSeq)
rnaseq_counts <- rnaSeq
expr <- as.matrix(t(rnaseq_counts))
##---- does the clustering ----##
corrThresh = 0.5
inputmat = rnaSeq
regs = genes
inputmat <- cor(inputmat)
adjacency <- inputmat > corrThresh
TOM <- TOMsimilarity(adjacency+0, TOMType = "unsigned");
dissTOM = 1-TOM
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 50;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, cutHeight = 0.95, distM = dissTOM,deepSplit = 2, pamRespectsDendro = FALSE,minClusterSize = minModuleSize);
mod_sizes <- table(dynamicMods)
length(mod_sizes)
mod_sizes
# gets all cluster members except first (-1) because first is "unassigned genes"
numMods <- sort(unique(names(dynamicMods)))[-1]
names(dynamicMods) <- colnames(adjacency)
modNumbers <- unique(dynamicMods)
names(dynamicMods) <- colnames(adjacency)
gene.mod.list <- lapply(modNumbers, function(x) {
cluster_genes <- dynamicMods[dynamicMods == x]
names(cluster_genes)
})
gene.mod.df <- melt(gene.mod.list)
rownames(gene.mod.df) <- gene.mod.df[,1]
# saves the results
write.table(gene.mod.df, file="meta_genes.csv",row.names=TRUE, sep="\t", quote = FALSE)
Stats and plots
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- general libraries ----##
library(ggplot2)
library(reshape)
library(amap)
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- number of clusters: ----##
number_of_clusters = 48
##---- plot metagene correlations: ----##
DGM_all = read.table(file="clusters/meta_genes.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
expression_data = read.table(file="deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
for(cluster_no in 1:number_of_clusters)
{
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
expression_data_cluster_corr_melted = melt(cor(t(expression_data_cluster)))
expression_data_cluster_corr_melted = subset(expression_data_cluster_corr_melted,value < 1)
ggp <- ggplot(expression_data_cluster_corr_melted, aes(x=value)) + geom_density(alpha=0.25) + xlab("SCC") + ylab("density") + theme_SL2() + theme(legend.position="bottom") + xlim(c(-1,1)) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_corr_plots/",cluster_no,"_corrplot.svg",sep=""), height=300/90, width=300/90)
print(ggp)
dev.off()
}
##---- plot metagene HC vs MDD ----##
patient_data_cavanagh = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
# violin plots
legend_position = "none"
mdd_ttest_list = data.frame(matrix(, nrow=number_of_clusters, ncol=4))
mdd_spearman_list = data.frame(matrix(, nrow=number_of_clusters, ncol=2))
for(cluster_no in 1:number_of_clusters)
{
# gets the cluster expression data
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
expression_data_cluster_scaled = data.frame(t(scale(t(expression_data_cluster))))
expression_data_cluster_scaled_colmeans = data.frame(colMeans(expression_data_cluster_scaled))
#stats for subtypes
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$group
group1 = subset(expression_data_cluster_scaled_colmeans,Diag=="HC")$colMeans.expression_data_cluster_scaled.
group2 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_resistant")$colMeans.expression_data_cluster_scaled.
group3 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_responder")$colMeans.expression_data_cluster_scaled.
group4 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_untreated")$colMeans.expression_data_cluster_scaled.
mdd_ttest_list[cluster_no,2] = t.test(group1,group2)$p.value
mdd_ttest_list[cluster_no,3] = t.test(group1,group3)$p.value
mdd_ttest_list[cluster_no,4] = t.test(group1,group4)$p.value
#plots the subtypes
x_axis_label = ""
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_boxplots_subtypes/",cluster_no,"_boxplot.svg",sep=""), height=160/90, width=400/90)
print(ggp)
dev.off()
#stats for disease state
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$MDD
group1 = subset(expression_data_cluster_scaled_colmeans, Diag==0)$colMeans.expression_data_cluster_scaled.
group2 = subset(expression_data_cluster_scaled_colmeans,Diag==1)$colMeans.expression_data_cluster_scaled.
mdd_ttest_list[cluster_no,1] = t.test(group1,group2)$p.value
#plots for disease state
x_axis_label = ""
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag)) + scale_x_discrete(breaks=c(0, 1),labels=c("HC","MDD"),limits=c(0, 1)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_boxplots_MDD/",cluster_no,"_boxplot.svg",sep=""), height=200/90, width=200/90)
print(ggp)
dev.off()
#stats for hamilton score
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$hamilton_score
mdd_spearman_list[cluster_no,1]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$estimate
mdd_spearman_list[cluster_no,2]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$p.value
#plots the hamilton score
x_axis_label = "Hamilton score"
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_point() + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_vs_hamilton_corr_plots/",cluster_no,"_dotplot.svg",sep=""), height=160/90, width=150/90)
print(ggp)
dev.off()
}
# BH correction of stats
mdd_ttest_list$HC_vs_MDD_FDR = p.adjust(mdd_ttest_list$X1, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_resistant_FDR = p.adjust(mdd_ttest_list$X2, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_responder_FDR = p.adjust(mdd_ttest_list$X3, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_untreated_FDR = p.adjust(mdd_ttest_list$X4, method = "BH", n = number_of_clusters)
names(mdd_ttest_list) = c("HC_vs_MDD_p","HC_vs_resistant_p","HC_vs_responder_p","HC_vs_untreated_p","HC_vs_MDD_pBH","HC_vs_resistant_pBH","HC_vs_responder_pBH","HC_vs_untreated_pBH")
write.table(mdd_ttest_list, file="clusters/cluster_violin_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE)
mdd_spearman_list$HAM_vs_Exp_SCC_FDR = p.adjust(mdd_spearman_list$X2, method = "BH", n = number_of_clusters)
names(mdd_spearman_list) = c("SCC","p","pBH")
write.table(mdd_spearman_list, file="clusters/cluster_vs_ham_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE)
##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
x.dd = as.dendrogram(x.cluster)
y.dd = as.dendrogram(y.cluster)
x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
x.order = order.dendrogram(x.dd.reorder)
y.order = order.dendrogram(y.dd.reorder)
if(cluster_x == TRUE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,x.order]
}
if(cluster_x == TRUE && cluster_y == FALSE)
{
mx_clustered = mx[,x.order]
}
if(cluster_x == FALSE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,]
}
if(cluster_x == FALSE && cluster_y == FALSE)
{
mx_clustered = mx
}
return(mx_clustered)
}
##----- Significant Genes Heatmap Function -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
if(nrow(matrix_scaled) >= 2)
{
matrix_scaled_renamed = as.matrix(matrix_scaled)
matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))
hm.palette <- colorRampPalette(colours)
ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
return(ggp)
}
else
{
return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
}
}
##---- Heatmap settings ----##
plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = TRUE
cluster_y = TRUE
##---- plot metagene Heatmaps ----##
for(cluster_no in 1:number_of_clusters)
{
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
heatmap_matrix = data.frame(t(scale(t(expression_data_cluster))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png(paste("clusters/cluster_heatmaps/",cluster_no,"_heatmap.png",sep=""), height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
}
##---- plot correlation heatmap ----##
plot_height = 2500
plot_width = 2500
cluster_x = TRUE
cluster_y = TRUE
#select the first 25 genes from each cluster (to make clustering quicker)
cluster_genes_25 = list()
for(cluster_no in 1:number_of_clusters)
{
cluster_genes = subset(DGM_all,L1 == cluster_no)
cluster_genes_top = head(cluster_genes,round(nrow(cluster_genes)/4))
cluster_genes_25 = rbind(cluster_genes_25,cluster_genes_top)
}
nrow(cluster_genes_25)
# makes the matrix
expression_data_clusters = na.omit(expression_data[row.names(cluster_genes_25),])
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_clusters_corr = cor(t(expression_data_clusters))
heatmap_matrix = data.frame(expression_data_clusters_corr)
nrow(heatmap_matrix)
ncol(heatmap_matrix)
# plots
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("clusters/correlation_heatmap.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
===== Plots =====
==== Cibersort ====
##---- libraries----##
library(ggplot2)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- loads data ----##
cavanagh = read.table(file="cavanagh/cibersort/CIBERSORT.Output_Job6_parsed.csv",row.names=1,header=TRUE, sep='\t', quote='',check.names = TRUE)
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(1,1,1,1), "cm")
)
}
##---- Cavanagh only plot ----##
names(cavanagh) = c("Monocytes","T cells CD4 memory resting","NK cells resting","T cells CD4 naive","T cells CD8","B cells naive","Macrophages M0","B cells memory","Neutrophils","T cells regulatory (Tregs)","Mast cells resting","Dendritic cells activated","T cells CD4 memory activated","Plasma cells","Macrophages M2","Eosinophils","Mast cells activated","Macrophages M1")
cavanagh_melted = melt(cavanagh)
ggp = ggplot(cavanagh_melted, aes(x=variable, y=value*100)) + geom_boxplot() + xlab("") + ylab("% of cell population") + theme_SL2() + theme(legend.position="right",axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
svg("plots/cibersort_cavanagh.svg", height=400/90, width=800/90)
print(ggp)
dev.off()
==== Signficant Confounders Bar Chart ====
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- number of sig genes by clinical parameter bar chart ----##
sig_genes_by_clinical_parameter = read.table(file="cavanagh/deseq2/unmodelled/SIG_COUNTS.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
sig_genes_by_clinical_parameter$IDs = row.names(sig_genes_by_clinical_parameter)
sig_genes_by_clinical_parameter$IDs = gsub('\\.', ' ', sig_genes_by_clinical_parameter$IDs)
sig_genes_by_clinical_parameter$IDs = gsub(' ', ' ', sig_genes_by_clinical_parameter$IDs)
sig_genes_by_clinical_parameter$IDs = factor(gsub('_', ' ', sig_genes_by_clinical_parameter$IDs))
row.names(sig_genes_by_clinical_parameter) = sig_genes_by_clinical_parameter$IDs
sig_genes_by_clinical_parameter = sig_genes_by_clinical_parameter[with(sig_genes_by_clinical_parameter, order(count)), ]
sig_genes_by_clinical_parameter_top = subset(sig_genes_by_clinical_parameter,count >= 5)
sig_genes_by_clinical_parameter_top$IDs = factor(sig_genes_by_clinical_parameter_top$IDs)
sig_genes_by_clinical_parameter_top$IDs <- factor(sig_genes_by_clinical_parameter_top$IDs, levels = sig_genes_by_clinical_parameter_top$IDs)
x_axis_label = "number of significant genes (adjusted p <0.01)"
y_axis_label = ""
bar_outline_size = 1
bar_transparency = 0.75
legend_position = "bottom"
data_label_size = 3
ggp <- ggplot(data=sig_genes_by_clinical_parameter_top , aes(x=IDs, y=count)) + geom_bar(colour="black",stat="identity", position = "dodge", alpha = bar_transparency) + coord_flip() + ylab(x_axis_label) + xlab(y_axis_label) + theme_SL2() + theme(axis.text.y = element_text(hjust=1),legend.position=legend_position, legend.spacing.x = unit(0.15, 'cm')) + scale_y_continuous(expand=c(0,0), limits=c(0,max(sig_genes_by_clinical_parameter_top$count)*1.25)) + geom_text(aes(label=count), position=position_dodge(width=0.9), size=data_label_size, show.legend = FALSE, hjust=-0.5)
svg("plots/sig_genes_barchart.svg", height=300/90, width=800/90, pointsize=5)
print(ggp)
dev.off()
==== Age, Gender, BMI heatmaps =====
##---- libraries----##
library(ggplot2)
library(amap)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- loads data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(1,1,1,1), "cm")
)
}
##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
x.dd = as.dendrogram(x.cluster)
y.dd = as.dendrogram(y.cluster)
x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
x.order = order.dendrogram(x.dd.reorder)
y.order = order.dendrogram(y.dd.reorder)
if(cluster_x == TRUE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,x.order]
}
if(cluster_x == TRUE && cluster_y == FALSE)
{
mx_clustered = mx[,x.order]
}
if(cluster_x == FALSE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,]
}
if(cluster_x == FALSE && cluster_y == FALSE)
{
mx_clustered = mx
}
return(mx_clustered)
}
##----- Significant Genes Heatmap Function -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
if(nrow(matrix_scaled) >= 2)
{
matrix_scaled_renamed = as.matrix(matrix_scaled)
matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))
hm.palette <- colorRampPalette(colours)
ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
return(ggp)
}
else
{
return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
}
}
##---- Heatmap settings ----##
plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = FALSE
cluster_y = TRUE
##---- Gender Heatmap ----##
samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_gender_heatmap.png", height=1750, width=1000, pointsize=5)
print(ggp)
dev.off()
##---- Age Heatmap ----##
samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_age_heatmap.png", height=2000, width=1000, pointsize=5)
print(ggp)
dev.off()
##---- BMI Heatmap ----##
samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_BMI_heatmap.png", height=2000, width=1000, pointsize=5)
print(ggp)
dev.off()
==== Age, Gender, BMI Boxplots ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- loads cavanagh data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
age_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
BMI_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- gender plots ----##
# 2 is female
gene = "ENSG00000012817"
expression_data_gene = t(expression_data[gene,])
groups = samples$gender
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
sample_groups = c("Male", "Female")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_gender_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000147050"
expression_data_gene = t(expression_data[gene,])
groups = samples$gender
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
sample_groups = c("Male", "Female")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_gender_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
##---- age plots ----##
# 2 is younger
gene = "ENSG00000169855"
groups = age_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Youngest", "Oldest")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_age_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000277586"
groups = age_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Youngest", "Oldest")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_age_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
##---- BMI plots ----##
# 2 is lower
gene = "ENSG00000133742"
groups = BMI_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Lowest", "Highest")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(c(0,500))
svg("plots/cavanagh_BMI_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000161681"
groups = BMI_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Lowest", "Highest")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_BMI_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()
==== Cavanagh P-Value Waterfall Plots ====
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- Plot settings ----##
x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)"
y_axis_label = "-log10p value"
line_type = "solid"
line_size = 1.5
##---- Cavanagh ----##
random = read.table(file="cavanagh/deseq2/random/44_187/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
gender = read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
age = read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_mdd = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_resistant = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_resistant.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_responder = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_responder.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_untreated = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_untreated.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
random_dist = head(sort(rowMeans(random)),1000)
gender_dist = head(sort(gender$p),1000)
age_dist = head(sort(age$p),1000)
bmi_dist = head(sort(bmi$p),1000)
hc_vs_mdd_dist = head(sort(hc_vs_mdd$p),1000)
hc_vs_resistant_dist = head(sort(hc_vs_resistant$p),1000)
hc_vs_responder_dist = head(sort(hc_vs_responder$p),1000)
hc_vs_untreated_dist = head(sort(hc_vs_untreated$p),1000)
rank = 1:1000
# all
p_dist_combined = data.frame(cbind(gender_dist,age_dist,bmi_dist,hc_vs_mdd_dist,hc_vs_resistant_dist,hc_vs_responder_dist,hc_vs_untreated_dist,random_dist,rank))
names(p_dist_combined) = c("male vs female", "youngest vs oldest", "lowest vs highest BMI", "HC vs MDD", "HC vs MDD resistant", "HC vs MDD responder", "HC vs MDD untreated","random", "rank")
p_dist_combined_melted = melt(p_dist_combined, id="rank")
ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_p_distribution.svg", height=250/90, width=450/90, pointsize=5)
print(ggp)
dev.off()
ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right")
svg("plots/cavanagh_p_distribution_key.svg", height=500/90, width=600/90)
print(ggp)
dev.off()
==== Cavanagh MDD Boxplots ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- loads cavanagh data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- MDD plots ----##
# 0 is HC
gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_mdd_down1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000115290"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_mdd_down2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
##---- Resistant plots ----##
# 1 is HC
samples_hc_and_resistant = subset(samples,group=="HC" | group=="MDD_resistant")
expression_data_hc_and_resistant = expression_data[,row.names(samples_hc_and_resistant)]
gene = "ENSG00000162738"
expression_data_gene = t(expression_data_hc_and_resistant[gene,])
groups = samples_hc_and_resistant$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("HC", "RES")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_resistant_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000182272"
expression_data_gene = t(expression_data_hc_and_resistant[gene,])
groups = samples_hc_and_resistant$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("HC", "RES")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_resistant_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
##---- Responder plots ----##
# 1 is HC
samples_hc_and_responder = subset(samples,group=="HC" | group=="MDD_responder")
expression_data_hc_and_responder = expression_data[,row.names(samples_hc_and_responder)]
gene = "ENSG00000142686"
expression_data_gene = t(expression_data_hc_and_responder[gene,])
groups = samples_hc_and_responder$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "3"))
sample_groups = c("HC", "RSP")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_responder_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000128039"
expression_data_gene = t(expression_data_hc_and_responder[gene,])
groups = samples_hc_and_responder$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "3"))
sample_groups = c("HC", "RSP")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_responder_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
##---- Untreated plots ----##
# 1 is HC
samples_hc_and_untreated = subset(samples,group=="HC" | group=="MDD_untreated")
expression_data_hc_and_untreated = expression_data[,row.names(samples_hc_and_untreated)]
gene = "ENSG00000165633"
expression_data_gene = t(expression_data_hc_and_untreated[gene,])
groups = samples_hc_and_untreated$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "4"))
sample_groups = c("HC", "UNT")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_untreated_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000075151"
expression_data_gene = t(expression_data_hc_and_untreated[gene,])
groups = samples_hc_and_untreated$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "4"))
sample_groups = c("HC", "UNT")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_untreated_up2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
==== Cavanagh Random Boxplots ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- Random 1 plots ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_1.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000186973"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G1", "G2")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random1_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000182272"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G1", "G2")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random1_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
##---- Random 2 plots ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_2.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000120658"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G3", "G4")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random2_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "ENSG00000115414"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G3", "G4")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random2_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
==== Cavanagh False Positive Counts Bar Charts ====
##---- libraries----##
library(ggplot2)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- load data ----##
false_positives = read.table(file="cavanagh/deseq2/random/44_187/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
##---- Make plots ----##
false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"))
false_positives_melt = melt(false_positives,id.vars="threshold")
ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("median false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,25))
svg("plots/cavanagh_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()
ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,2500))
svg("plots/cavanagh_false_positive_max.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()
==== Cavanagh Random Enriched GO ====
##---- general libraries ----##
library(ggplot2)
library(reshape)
##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)
##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- input files ----##
ontologies = read.table(file="cavanagh/deseq2/random/44_187/david/DAVID_at_least_3_GO_KEGG_REACTOME_BIOCARTA.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = subset(ontologies,FDR < 5)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")]
ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,5))
ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")
##---- plot table ----##
plot_height = 475/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2
table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_random_ontologies_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()
==== Leday MDD Boxplots ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- loads leday data ----##
expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
expression_data = expression_data[,rownames(samples)]
##---- MDD plots ----##
# 2 is HC
gene = "212531_at" # LCN2
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "212768_s_at" # OLFM4
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "207269_at" # DEFA4
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_3_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "231688_at" # MMP8
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_4_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "206676_at" # CEACAM8
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_5_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "205557_at" # BPI
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_6_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "206851_at" # RNASE3
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_7_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
gene = "206697_s_at" # HP
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_8_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()
==== Leday MDD heatmap ====
##---- libraries----##
library(ggplot2)
library(amap)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- loads data ----##
expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(1,1,1,1), "cm")
)
}
##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
x.dd = as.dendrogram(x.cluster)
y.dd = as.dendrogram(y.cluster)
x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
x.order = order.dendrogram(x.dd.reorder)
y.order = order.dendrogram(y.dd.reorder)
if(cluster_x == TRUE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,x.order]
}
if(cluster_x == TRUE && cluster_y == FALSE)
{
mx_clustered = mx[,x.order]
}
if(cluster_x == FALSE && cluster_y == TRUE)
{
mx_clustered = mx[y.order,]
}
if(cluster_x == FALSE && cluster_y == FALSE)
{
mx_clustered = mx
}
return(mx_clustered)
}
##----- Significant Genes Heatmap Function -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
if(nrow(matrix_scaled) >= 2)
{
matrix_scaled_renamed = as.matrix(matrix_scaled)
matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))
hm.palette <- colorRampPalette(colours)
ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
return(ggp)
}
else
{
return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
}
}
##---- Heatmap settings ----##
plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = FALSE
cluster_y = TRUE
##---- Heatmap ----##
#get the samples in order of HC then MDD
sample_data = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
control = unlist(row.names(subset(sample_data,group == 0)))
mdd = unlist(row.names(subset(sample_data,group == 1)))
samples = c(control,mdd)
MDD165 = read.table(file="leday/MDD165.txt", header=FALSE, sep='\t', quote='',check.names = TRUE)
names(MDD165) = "Gene.Symbol"
id_converter = read.table(file="leday/DE.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
sig_genes = merge(MDD165,id_converter, by="Gene.Symbol")$Gene.Symbol
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/leday_MDD165_heatmap.png", height=1750, width=1000, pointsize=5)
print(ggp)
dev.off()
==== Leday P Distribution Waterfall ====
##---- libraries ----##
library(ggplot2)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- Plot settings ----##
x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)"
y_axis_label = "-log10p value"
line_type = "solid"
line_size = 1.5
##---- Leday ----##
random = read.table(file="leday/random/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_mdd = read.table(file="leday/DE.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
male_vs_female = read.table(file="leday/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
young_vs_old = read.table(file="leday/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
anxiety_vs_control = read.table(file="leday/DE_anxiety.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
random_dist = head(sort(rowMeans(random)),1000)
hc_vs_mdd_dist = head(sort(hc_vs_mdd$P.Value),1000)
gender_dist = head(sort(male_vs_female$P.Value),1000)
age_dist = head(sort(young_vs_old$P.Value),1000)
anxiety_dist = head(sort(anxiety_vs_control$P.Value),1000)
rank = 1:1000
# all
p_dist_combined = data.frame(cbind(gender_dist,age_dist,anxiety_dist,hc_vs_mdd_dist,random_dist,rank))
names(p_dist_combined) = c("male vs female","youngest vs oldest","low vs high anxiety","HC vs MDD","random", "rank")
p_dist_combined_melted = melt(p_dist_combined, id="rank")
ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + ylim(0,10) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none")
svg("plots/leday_p_distribution.svg", height=250/90, width=450/90, pointsize=5)
print(ggp)
dev.off()
ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right")
svg("plots/leday_p_distribution_key.svg", height=500/90, width=600/90)
print(ggp)
dev.off()
==== Leday False Positive Count ====
##---- libraries----##
library(ggplot2)
library(reshape)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- load data ----##
false_positives = read.table(file="leday/random/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
##---- Make plots ----##
false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"))
false_positives_melt = melt(false_positives,id.vars="threshold")
ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("mean false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,200))
svg("plots/leday_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()
ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,5000))
svg("plots/leday_false_positive_max.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()
==== Leday random GO terms table ====
##---- general libraries ----##
library(ggplot2)
library(reshape)
##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)
##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- input files ----##
ontologies = read.table(file="leday/random/david/David_any1_GO_Kegg_Reactome_Biocarta.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = subset(ontologies,FDR < 50)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("Rank","Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")]
ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,2))
ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2)
names(ontologies_sig_trimmmed) = c("Rank","ID","Term","Term Size","Overlap","Enrichment","FDR")
##---- plot table ----##
plot_height = 300/90
plot_width = 850/90
header_size = 1
text_size = 1
border_thickness = 2
table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/leday_random_ontologies_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()
==== Venn diagram stats ====
# BMI
phyper(14,203,18000-203,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(1,203,18000-203,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(1,203,18000-203,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi
# Age
phyper(37,1244,18000-1244,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(26,1244,18000-1244,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(2,1244,18000-1244,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi
# Gender
phyper(18,625,18000-625,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(3,625,18000-625,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(0,625,18000-625,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi
# Combined
phyper(49,1863,18000-1863,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(27,1863,18000-1863,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(2,1863,18000-1863,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi
# MDD
phyper(5,123,18000-129,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Jansen
phyper(1,29,18000-29,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Mostafavi
phyper(0,29,18000-29,129, lower.tail = FALSE, log.p = FALSE) # Jansen vs Mostafavi
# Random
phyper(3,154,18000-154,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(2,154,18000-154,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(1,154,18000-154,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi
==== BMI Webgestaldt table ====
##---- general libraries ----##
library(ggplot2)
library(reshape)
##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)
##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- input files ----##
ontologies = read.table(file="cross_dataset/web_gestaldt/bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = head(ontologies,10)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")]
ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2))
ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")
##---- plot table ----##
plot_height = 300/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2
table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_BMI_webgestaldt_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()
==== Age Webgestaldt table ====
##---- general libraries ----##
library(ggplot2)
library(reshape)
##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)
##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- input files ----##
ontologies = read.table(file="cross_dataset/web_gestaldt/age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = head(ontologies,10)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")]
ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2))
ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")
##---- plot table ----##
plot_height = 300/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2
table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_Age_webgestaldt_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()
==== Webgestaldt scatter plots ====
library(ggplot2)
##---- Load data ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cross_dataset/web_gestaldt")
age = read.table(file="age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
bmi = read.table(file="bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
age_bmi = read.table(file="age_plus_BMI/enrichment_results_wg_result1566544579.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
mdd165 = read.table(file="MDD165/enrichment_results_wg_result1565946153.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
Jaansen = read.table(file="Jaansen/enrichment_results_wg_result1565953466.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
Mostafavi = read.table(file="Mostafavi/enrichment_results_wg_result1565955664.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- BMI ----##
# BMI vs Leday
merged= merge(mdd165,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_mdd165.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_mdd165_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Jaansen
merged= merge(Jaansen,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_Jansen.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_Jansen_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Mostafavi
merged= merge(Mostafavi,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_Mostafavi.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
##---- Age + BMI ----##
# BMI vs Leday
merged= merge(mdd165,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_mdd165.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_mdd165_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Jaansen
merged= merge(Jaansen,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_Jansen.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_Jansen_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Mostafavi
merged= merge(Mostafavi,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_Mostafavi.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
##---- Age ----##
# BMI vs Leday
merged= merge(mdd165,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_mdd165.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_mdd165_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Jaansen
merged= merge(Jaansen,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_Jansen.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_Jansen_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# BMI vs Mostafavi
merged= merge(Mostafavi,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_Mostafavi.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_age_Mostafavi_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
##---- MDD ----##
# leday vs Jaansen
merged= merge(mdd165,Jaansen, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_mdd165_Jansen.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_mdd165_Jansen_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# leday vs Mostafavi
merged= merge(Mostafavi,mdd165, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_mdd165_Mostafavi.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_mdd165_Mostafavi_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
# Jaansen vs Mostafavi
merged= merge(Mostafavi,Jaansen, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_Jaansen_Mostafavi.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20))
svg("WebGestaldt_Jaansen_Mostafavi_labelled.svg", height=350/90, width=350/90)
print(ggp)
dev.off()
cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman")
==== Cavanagh Immune Age Scatterplot ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- load data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- get meta gene ----##
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold > 0.5))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_up = data.frame(colMeans(expression_data_sig_zscore))
names(meta_gene_up) = "up"
nrow(expression_data_sig_zscore)
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold < -1))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_down = data.frame(colMeans(expression_data_sig_zscore))
names(meta_gene_down) = "down"
nrow(expression_data_sig_zscore)
meta_gene = data.frame(rowMeans(cbind(-meta_gene_down,meta_gene_up)))
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- plots ----##
age = data.frame(samples$age)
mdd = data.frame(samples$MDD)
group = data.frame(samples$group)
hamilton = data.frame(samples$hamilton_score)
plot_data = cbind(meta_gene,age,mdd,group,hamilton)
names(plot_data) = c("biological_age","chronological_age","MDD","group","hamilton")
hm.palette = colorRampPalette(c("blue","red"))
ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90)
print(ggp)
dev.off()
ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_age_metrics_MDD_scatterplot_BIG.svg", height=1000/90, width=1000/90)
print(ggp)
dev.off()
cor(meta_gene,age, method = "spearman")
==== Leday Immune Age Scatterplot ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- load data ----##
expression_data = read.table(file="leday/EM_batch_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- get meta gene ----##
sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC > 0))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_up = data.frame(colMeans(na.omit(expression_data_sig_zscore)))
names(meta_gene_up) = "up"
nrow(expression_data_sig_zscore)
sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC < -0))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_down = data.frame(colMeans(na.omit(expression_data_sig_zscore)))
names(meta_gene_down) = "down"
nrow(expression_data_sig_zscore)
meta_gene = data.frame(rowMeans(cbind(meta_gene_down,-meta_gene_up)))
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=12, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- plots ----##
age = data.frame(samples$age)
mdd = data.frame(samples$group)
group = data.frame(samples$batch)
plot_data = cbind(meta_gene,age,mdd,group)
names(plot_data) = c("biological_age","chronological_age","MDD","group")
hm.palette = colorRampPalette(c("blue","red"))
ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/leday_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90)
print(ggp)
dev.off()
ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/leday_age_metrics_MDD_scatterplot_BIG.svg", height=2000/90, width=2000/90)
print(ggp)
dev.off()
cor(meta_gene,age, method = "spearman")
temp = subset(mdd, samples.group == 0)
nrow(temp)
temp = subset(mdd, samples.group == 1)
nrow(temp)
===== Rebuttal Plots =====
==== HC vs MDD volcano plots ====
##---- general libraries ----##
library(ggplot2)
library(reshape)
##---- PDE sample and group names ----##
sample_groupings = factor(sample_groupings, levels = sample_groups)
##---- sets the working directory. If you move the SL2 folder please update this path ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 16, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 14, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 14, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 16, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 16, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=14, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(2.5,"line"),
plot.margin=unit(c(0.4,0.4,0.4,0.4), "cm")
)
}
##----- Volcano Plot Function -----##
make_volcano_plot <- function(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
{
ggp = ggplot(data=PDE_annotated, aes(x=log2fold, y=-log10(p), colour=sig, group=sig,fill=sig)) + geom_point(size=dot_size,alpha=dot_transparency) + scale_color_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + scale_fill_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position, legend.title = element_blank()) + geom_text(data=PDE_annotated_sig,aes(label=rownames(PDE_annotated_sig)),hjust=-0.1, vjust=-0.1, size=data_label_size, show.legend = FALSE) + xlim(c(-4,4))
return(ggp)
}
##----- Volcano Plot Variables -----##
plot_height = 500
plot_width = 600
non_significant_colour = "black"
significant_colour = "red"
dot_size = 1.5
dot_transparency = 1
significant_name = "non-significant"
non_significant_name = "significant"
x_axis_label = "log2 fold change"
y_axis_label = "-log10 p-value"
legend_position = "bottom"
##---- No model ----##
PDE_annotated = read.table(file="deseq2/unmodelled/DE_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)
data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
##---- No model ----##
PDE_annotated = read.table(file="deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)
data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
##---- limited model ----##
PDE_annotated = read.table(file="deseq2/limited_model/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)
data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
##---- limited model no BMI ----##
PDE_annotated = read.table(file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)
data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
==== HC vs MDD expression boxplots ====
##---- libraries----##
library(ggplot2)
##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
##---- SL2 Theme ----##
theme_SL2 <- function () {
theme_bw() %+replace%
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank(),
legend.background = element_rect(fill="transparent", colour=NA),
legend.key = element_rect(fill="transparent", colour=NA),
title = element_text(size = 24, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.text.y = element_text(size = 22, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
axis.text.x = element_text(size = 22, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"),
axis.title.y = element_text(size = 24, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
axis.title.x = element_text(size = 24, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
legend.text=element_text(size=24, family="Arial", face="bold"),
legend.title=element_blank(),
legend.key.size=unit(1,"line"),
plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
)
}
##---- plot size ----##
plot_height=600
plot_width=400
##---- loads cavanagh data ----##
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
##---- No model ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000188000"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000188659"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000232810"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
##---- Limited model no BMI ----##
expression_data = read.table(file="limited_model_no_BMI/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000081041"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000277632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
##---- Limited model ----##
expression_data = read.table(file="limited_model/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000277632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000137267"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
##---- full model ----##
expression_data = read.table(file="modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000115290"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000081041"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000196632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()
gene = "ENSG00000278828"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")
ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()