===== DeSeq2 Cavanagh ===== ==== Clinical parameters ==== ##-------------------------## ## R libraries ## ##-------------------------## ##---- general libraries ----## library("data.table") library("ggplot2") ##---- DESeq2 libraries ----## library("DESeq2") library("BiocParallel") register(MulticoreParam(25)) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##-------------------------## ## DESeq2 Clinical ## ##-------------------------## ##---- Runs DEseq2 for each of the clinical parameters ----## sig_genes_by_clinical_parameter = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) # iterates through the clinical parameters and runs DESeq2: for(clinical_parameter in colnames(patient_data)) { if(clinical_parameter != "study_arm" & clinical_parameter != "group" & clinical_parameter != "centre" ) { print(clinical_parameter) clinical_parameter_data = patient_data[,clinical_parameter] clinical_parameter_data = clinical_parameter_data[complete.cases(clinical_parameter_data)] # tests for a binary parameter (e.g. gender) if(length(summary(as.factor(clinical_parameter_data),maxsum=50000)) == 2) { min = min(as.numeric(clinical_parameter_data)) max = max(as.numeric(clinical_parameter_data)) group1 = row.names(subset(patient_data,patient_data[clinical_parameter] == min)) group2 = row.names(subset(patient_data,patient_data[clinical_parameter] == max)) } # or must be a continuous paramter else { q1 = summary(clinical_parameter_data)[2] q3 = summary(clinical_parameter_data)[5] group1 = row.names(subset(patient_data,patient_data[clinical_parameter] < q1)) group2 = row.names(subset(patient_data,patient_data[clinical_parameter] > q3)) } # tests for a suitable number of samples if(min(c(length(group1),length(group2))) >2) { # runs deseq2 condition = factor(c(rep("low", length(group1)), rep("high", length(group2)))) read_count_filtered = cbind(read_counts[,group1],read_counts[,group2]) countdata = as.matrix(read_count_filtered) countdata <- subset(countdata,apply(countdata, 1, mean) >= 1) coldata = data.frame(row.names=colnames(countdata), condition) dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) dds = DESeq(dds, parallel=T) res = results(dds, c("condition","high","low"), cooksCutoff=0.2) res = res[order(res$padj), ] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file=paste("deseq2/unmodelled/DE_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) samples = data.frame(cbind(c(group1,group2),condition)) write.table(samples, file=paste("deseq2/unmodelled/SAMPLES_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.01) sig_genes_for_clinical_parameter = data.frame(clinical_parameter,nrow(resdata_sig)) names(sig_genes_for_clinical_parameter)<-c("parameter","count") sig_genes_by_clinical_parameter <- rbind(sig_genes_by_clinical_parameter, sig_genes_for_clinical_parameter) } } } write.table(sig_genes_by_clinical_parameter, file="deseq2/unmodelled/SIG_COUNTS.csv",row.names=FALSE, sep="\t", quote = FALSE) ==== HC vs MDD ==== ##---- general libraries ----## library("data.table") library("ggplot2") ##---- DESeq2 libraries ----## library("DESeq2") library("BiocParallel") register(MulticoreParam(25)) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##-------------------------## ## DESeq2 HC vs MDD ## ##-------------------------## sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16) # filters the read counts countdata = as.matrix(read_counts) countdata <- subset(countdata,apply(countdata, 1, mean) >= 1) # runs deseq2 dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16+MDD) dds = DESeq(dds, parallel=T) # Get the EM resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2) resdata_IDs <-resdata resdata_IDs$IDs <- row.names(resdata) resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)] resdata <- cbind(resdata_IDs,resdata) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) # gets the results res = results(dds, c("MDD","0","1"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) ##-------------------------## ## DESeq2 HC vs MDD Types ## ##-------------------------## # gets the confounding variables group = factor(patient_data$group) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) coldata = data.frame(row.names=colnames(read_counts),group,confound1,confound2,confound3,confound4) # filters the read counts countdata = as.matrix(read_counts) countdata <- subset(countdata,apply(countdata, 1, mean) >= 1) # runs deseq2 dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+group) dds = DESeq(dds, parallel=T) # Get the EM resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2) resdata_IDs <-resdata resdata_IDs$IDs <- row.names(resdata) resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)] resdata <- cbind(resdata_IDs,resdata) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_subgroups.csv",row.names=FALSE, sep="\t", quote = FALSE) # gets the results res = results(dds, c("group","HC","MDD_untreated"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_untreated.csv",row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.01) sig_genes_for_comparison = data.frame("HC vs MDD untreated" ,nrow(resdata_sig)) names(sig_genes_for_comparison)<-c("parameter","count") sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison) res = results(dds, c("group","HC","MDD_responder"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_responder.csv",row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.01) sig_genes_for_comparison = data.frame("HC vs MDD responder",nrow(resdata_sig)) names(sig_genes_for_comparison)<-c("parameter","count") sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison) res = results(dds, c("group","HC","MDD_resistant"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_resistant.csv",row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.01) sig_genes_for_comparison = data.frame("HC vs MDD resistant",nrow(resdata_sig)) names(sig_genes_for_comparison)<-c("parameter","count") sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison) ==== HC vs MDD - limited model ==== ##---- general libraries ----## library("data.table") library("ggplot2") ##---- DESeq2 libraries ----## library("DESeq2") library("BiocParallel") register(MulticoreParam(25)) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##-------------------------## ## DESeq2 HC vs MDD ## ##-------------------------## sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4) # filters the read counts countdata = as.matrix(read_counts) countdata <- subset(countdata,apply(countdata, 1, mean) >= 1) # runs deseq2 dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+MDD) dds = DESeq(dds, parallel=T) # Get the EM resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2) resdata_IDs <-resdata resdata_IDs$IDs <- row.names(resdata) resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)] resdata <- cbind(resdata_IDs,resdata) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) # gets the results res = results(dds, c("MDD","0","1"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/limited_model/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##-------------------------## ## DESeq2 HC vs MDD ## ##-------------------------## sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound4 = factor(patient_data$centre) coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound4) # filters the read counts countdata = as.matrix(read_counts) countdata <- subset(countdata,apply(countdata, 1, mean) >= 1) # runs deseq2 dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound4+MDD) dds = DESeq(dds, parallel=T) # Get the EM resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2) resdata_IDs <-resdata resdata_IDs$IDs <- row.names(resdata) resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)] resdata <- cbind(resdata_IDs,resdata) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) # gets the results res = results(dds, c("MDD","0","1"), cooksCutoff=0.2) summary(res) res = res[order(res$pvalue),] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE) ==== HC vs MDD - non parameteric ==== ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- loads cavanagh data ----## samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- No model ----## results = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) # get groups group1 = results[,row.names(subset(samples, MDD == 0))] group2 = results[,row.names(subset(samples, MDD == 1))] nrow(group1) nrow(group2) ncol(group1) ncol(group2) # sets up columns results$log2fold = rep(1,nrow(expression_data)) results$p = rep(1,nrow(expression_data)) results$p.adj = rep(1,nrow(expression_data)) # does the stats for (row in 1:nrow(results)) { print(row) gene_group1 = unlist(group1[row,]) gene_group2 = unlist(group2[row,]) results[row,"log2fold"] = log(mean(gene_group2),2) - log(mean(gene_group1),2) results[row,"p"] = wilcox.test(gene_group1,gene_group2, paired=FALSE)$p.value } results[["p.adj"]] = p.adjust(results[["p"]], method = p.adjust.methods, n = length(results[["p"]])) # outputs data results = results[,c("log2fold","p","p.adj")] write.table(results, file="cavanagh/mann_whitney/DE_HC_vs_MDD.csv",row.names=TRUE, sep="\t", quote = FALSE) ==== Random cases vs controls ==== ##---- general libraries ----## library("data.table") library("ggplot2") ##---- DESeq2 libraries ----## library("DESeq2") library("BiocParallel") register(MulticoreParam(25)) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##--------------------------------------------## ## 50 random deseq runs with 50 / 50 split ## ##--------------------------------------------## read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1) p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000)) for (counter in 1:50) { # set random seed set.seed(counter) # gets the random samples samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),ncol(read_counts_trimmed)/2,replace=FALSE)]) samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)]) print(length(samples_random1)) print(length(samples_random2)) random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)] condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) # runs DESeq2 countdata = as.matrix(random_samples_counts) coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16) dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16) dds = DESeq(dds, parallel=T,betaPrior=TRUE) res = results(dds, c("condition","high","low")) # gets the deseq results res = res[order(res$padj), ] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file=paste("deseq2/random/50_50/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.25) write.table(resdata_sig, file=paste("deseq2/random/50_50/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_top100 = head(resdata, 100) write.table(resdata_top100, file=paste("deseq2/random/50_50/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) # updates the p-value distribution results p_value_dist[,counter] = head(sort(resdata$p),10000) write.table(p_value_dist, file="deseq2/random/50_50/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE) # prints the patient data #coldata = data.frame(coldata) #row.names(coldata) = names(random_samples_counts) write.table(data.frame(coldata), file=paste("deseq2/random/50_50/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE) } ##---------------------------------------------## ## 50 random deseq runs with 44 / 187 split ## ##---------------------------------------------## read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1) p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000)) for (counter in 1:50) { # set random seed set.seed(counter) # gets the random samples samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)]) samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)]) print(length(samples_random1)) print(length(samples_random2)) random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)] condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) # runs DESeq2 countdata = as.matrix(random_samples_counts) coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16) dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16) dds = DESeq(dds, parallel=T,betaPrior=TRUE) res = results(dds, c("condition","high","low")) # gets the deseq results res = res[order(res$padj), ] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file=paste("deseq2/random/44_187/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.25) write.table(resdata_sig, file=paste("deseq2/random/44_187/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_top100 = head(resdata, 100) write.table(resdata_top100, file=paste("deseq2/random/44_187/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) # updates the p-value distribution results p_value_dist[,counter] = head(sort(resdata$p),10000) write.table(p_value_dist, file="deseq2/random/44_187/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE) # prints the patient data #coldata = data.frame(coldata) #row.names(coldata) = names(random_samples_counts) write.table(data.frame(coldata), file=paste("deseq2/random/44_187/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE) } Merge the random genes cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50 cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv for i in DE_SIG_* do cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv mv temp.csv DE_SIGS_COMBINED.csv done cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187 cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv for i in DE_SIG_* do cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv mv temp.csv DE_SIGS_COMBINED.csv done cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50 cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv for i in DE_SIG_* do cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv done cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187 cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv for i in DE_SIG_* do cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv done cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv ==== Randomised Cases vs Controls - limited or no model ==== ##---- general libraries ----## library("data.table") library("ggplot2") ##---- DESeq2 libraries ----## library("DESeq2") library("BiocParallel") register(MulticoreParam(25)) ##-------------------------## ## Input Files ## ##-------------------------## ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- import data ----## read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---------------------------------------------## ## 50 random deseq runs with 44 / 187 split ## ##---------------------------------------------## read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1) p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000)) for (counter in 1:50) { # set random seed set.seed(counter) # gets the random samples samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)]) samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)]) print(length(samples_random1)) print(length(samples_random2)) random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)] condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) # runs DESeq2 countdata = as.matrix(random_samples_counts) coldata = data.frame(row.names=colnames(countdata),condition) dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) dds = DESeq(dds, parallel=T,betaPrior=TRUE) res = results(dds, c("condition","high","low")) # gets the deseq results res = res[order(res$padj), ] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file=paste("deseq2/random/44_187_no_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.25) write.table(resdata_sig, file=paste("deseq2/random/44_187_no_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_top100 = head(resdata, 100) write.table(resdata_top100, file=paste("deseq2/random/44_187_no_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) # updates the p-value distribution results p_value_dist[,counter] = head(sort(resdata$p),10000) write.table(p_value_dist, file="deseq2/random/44_187_no_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE) # prints the patient data #coldata = data.frame(coldata) #row.names(coldata) = names(random_samples_counts) write.table(data.frame(coldata), file=paste("deseq2/random/44_187_no_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE) } ##---------------------------------------------## ## 50 random deseq runs with 44 / 187 split ## ##---------------------------------------------## read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1) p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000)) for (counter in 1:50) { # set random seed set.seed(counter) # gets the random samples samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)]) samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)]) print(length(samples_random1)) print(length(samples_random2)) random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)] condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) # gets the confounding variables MDD = factor(patient_data$MDD) confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound4 = factor(patient_data$centre) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) # runs DESeq2 countdata = as.matrix(random_samples_counts) coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16) dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4) dds = DESeq(dds, parallel=T,betaPrior=TRUE) res = results(dds, c("condition","high","low")) # gets the deseq results res = res[order(res$padj), ] resdata = as.data.frame(res) resdata$ID = row.names(resdata) resdata = resdata[,c(7,2,5,6)] colnames(resdata) = c("ID","log2fold","p","p.adj") write.table(resdata, file=paste("deseq2/random/44_187_limited_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_sig = subset(resdata, p.adj < 0.25) write.table(resdata_sig, file=paste("deseq2/random/44_187_limited_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) resdata_top100 = head(resdata, 100) write.table(resdata_top100, file=paste("deseq2/random/44_187_limited_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) # updates the p-value distribution results p_value_dist[,counter] = head(sort(resdata$p),10000) write.table(p_value_dist, file="deseq2/random/44_187_limited_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE) # prints the patient data #coldata = data.frame(coldata) #row.names(coldata) = names(random_samples_counts) write.table(data.frame(coldata), file=paste("deseq2/random/44_187_limited_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE) } ==== Convert results to gene symbols ==== setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) converter = read.table(file="cavanagh/raw_data/ID_to_symbol.csv",header=TRUE, sep='\t', quote='',check.names = TRUE) converted_BMI = merge(bmi, converter, by="ID") converted_BMI_sig = subset(converted_BMI, p.adj < 0.01) write.table(converted_BMI,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_symbols.csv") write.table(converted_BMI_sig,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_sig_symbols.csv") ===== Limma Leday ===== library(Biobase) library(GEOquery) library(limma) library(jetset) library(sdef) ##----------------------## ## Download and Process ## ##----------------------## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday") # download data tmp <- getGEO("GSE98793", GSEMatrix=TRUE) eset <- tmp[[1]] # remove samples that failed Leday QC sample_info = read.table(file="sample_IDs.csv",header=TRUE, sep='\t', quote='',check.names = TRUE) valid_samples = subset(sample_info, keep == 0)$ID eset = eset[,valid_samples] # normalise normalizeBetweenArrays(exprs(eset),method="quantile") # remove unannotated probes data <- featureData(eset)@data data.f <- data[-which(data[["Gene Symbol"]] == ""), ] nrow(data) nrow(data.f) eset.f <- eset[rownames(eset) %in% rownames(data.f), ] #use the jetset to select only the probe set of highest jset score for each gene jscores = jscores("hgu133plus2", rownames(eset.f)) unique_genes = data.frame(unique(jscores$symbol)) unique_genes$probe = NA names(unique_genes) = c("symbols","probe") for(i in 1:nrow(unique_genes)) { gene_symbol = unique_genes[i,1] duplicates = subset(jscores,jscores[["symbol"]] == gene_symbol) unique_genes[i,2] = rownames(head(duplicates,1))[1] } nrow(unique_genes) eset.fu <- eset.f[rownames(eset.f) %in% unique_genes$probe, ] nrow(eset.fu) # adds the covariates with user friendly names pData(eset.fu)$group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1) pData(eset.fu)$batch <- pData(eset.fu)$"batch" pData(eset.fu)$gender <- pData(eset.fu)$"gender" pData(eset.fu)$age <- pData(eset.fu)$"age" pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety" ##----------------------------------## ## Limma Uncorrected ## ##----------------------------------## # Gender mydesign2 <- model.matrix(~ gender, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), coef="genderM", adjust="BH") res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) write.table(res, file="DE_gender.csv",row.names=FALSE, sep="\t", quote = FALSE) # Age q1 = summary(as.numeric(pData(eset.fu)$age))[2] q3 = summary(as.numeric(pData(eset.fu)$age))[5] group1 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) < q1)) group2 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) > q3)) group1_2 = c(group1,group2) eset.fu.age = eset.fu[,group1_2] pData(eset.fu.age)$age_group <- factor(c(rep("low", length(group1)), rep("high", length(group2)))) mydesign2 <- model.matrix(~ age_group, data=pData(eset.fu.age)) fit <- lmFit(eset.fu.age, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu.age), adjust="BH") res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) write.table(res, file="DE_age.csv",row.names=FALSE, sep="\t", quote = FALSE) # Anxiety mydesign2 <- model.matrix(~ anxiety, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), adjust="BH") res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) write.table(res, file="DE_anxiety.csv",row.names=FALSE, sep="\t", quote = FALSE) ##--------------------------------## ## Limma Corrected ## ##--------------------------------## #differential expression analysis performed via Limma mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH") # save results res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) write.table(res, file="DE.csv",row.names=FALSE, sep="\t", quote = FALSE) residuals <- data.frame(residuals(fit,eset.fu)) write.table(residuals, file="Residuals.csv",row.names=TRUE, sep="\t", quote = FALSE) em <- exprs(eset.fu) write.table(round(em,2), file="EM.csv",row.names=TRUE, sep="\t", quote = FALSE) patient_data <- eset.fu@phenoData@data write.table(patient_data, file="patient_data.csv",row.names=TRUE, sep="\t", quote = FALSE) # batch corrected EM em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch) write.table(round(em_corrected,2), file="EM_batch_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE) # batch and covariate corrected EM covariates <- data.frame(cbind(pData(eset.fu)$gender,pData(eset.fu)$age,pData(eset.fu)$anxiety)) covariates <- data.matrix(covariates) em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch, covariates = covariates) write.table(round(em_corrected,2), file="EM_covariate_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE) ##----------------------## ## Randomised ## ##----------------------## p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000)) for (counter in 1:50) { # set random seed set.seed(counter) print(counter) # gets the random samples group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1) group_random = sample(group,length(group),replace=FALSE) print(group_random) #differential expression analysis performed via Limma pData(eset.fu)$group <- factor(group_random) pData(eset.fu)$batch <- pData(eset.fu)$"batch" pData(eset.fu)$gender <- pData(eset.fu)$"gender" pData(eset.fu)$age <- pData(eset.fu)$"age" pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety" mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH") # save results res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) write.table(res, file=paste("random/DE",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) res_sig = subset(res, P.Value < 0.001) write.table(res_sig, file=paste("random/DE_sig_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) res_100 = head(res,100) write.table(res_100, file=paste("random/DE_top100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE) # updates the p-value distribution results p_value_dist[,counter] = head(sort(res$P.Value),10000) write.table(p_value_dist, file="random/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE) } ##----------------------## ## Randomised Overlap ## ##----------------------## for (counter in 1:50) { # set random seed set.seed(counter) print(counter) ## group 1 # gets the random samples group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1) group_random = sample(group,length(group),replace=FALSE) print(group_random) #differential expression analysis performed via Limma pData(eset.fu)$group <- factor(group_random) pData(eset.fu)$batch <- pData(eset.fu)$"batch" pData(eset.fu)$gender <- pData(eset.fu)$"gender" pData(eset.fu)$age <- pData(eset.fu)$"age" pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety" mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH") # save results res1 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) ## group 2 # set random seed set.seed(counter*100) print(counter) # gets the random samples group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1) group_random = sample(group,length(group),replace=FALSE) print(group_random) #differential expression analysis performed via Limma pData(eset.fu)$group <- factor(group_random) pData(eset.fu)$batch <- pData(eset.fu)$"batch" pData(eset.fu)$gender <- pData(eset.fu)$"gender" pData(eset.fu)$age <- pData(eset.fu)$"age" pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety" mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu)) fit <- lmFit(eset.fu, design = mydesign2) fit <- eBayes(fit) res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH") # save results res2 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title")) ## overlap # parse data data_merged = merge(res1,res2, by="ID") ids = data_merged$ID data_trimmed = data_merged[,c("P.Value.x","P.Value.y")] # get overlap ratios = try(ratio(data_trimmed)) if(inherits(ratios, "try-error")) { next } print("ratios generated") bays = try(baymod(ratios)) if(inherits(bays, "try-error")) { next } print("bays modelled") sdef_features = try(data.frame(extractFeatures.R(ratios,bays,ids)$max)) if(inherits(sdef_features, "try-error")) { next } print("features extracted") # write results if (! is.null(sdef_features)) { results = data_merged[data.frame(sdef_features)$Names,"Gene.Symbol.x"] write.table(results, file=paste("random_sdef/DE_sig_",counter,".csv",sep=""),row.names=FALSE, col.names=FALSE, sep="\t", quote = FALSE) } } Gets the counts for the random data cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday/random cat DE_sig_1.csv > DE_SIGS_COMBINED_counts.csv for i in DE_sig_* do cat $i DE_SIGS_COMBINED_counts.csv | cut -f7 > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv done cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv mv temp.csv DE_SIGS_COMBINED_counts.csv ===== Cibersort ===== Get the expression data as gene symbols ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- loads data ----## cavanagh = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- converts cavanagh ids to symbols ----## cavanagh_conversion = read.table(file="cavanagh/raw_data/ID_to_symbol.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) cavanagh_converted = merge(cavanagh, cavanagh_conversion, by=0) ##---- saves the data ----## write.table(cavanagh_converted, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_symbols.csv",row.names=FALSE, sep="\t", quote = FALSE) We run Cibersort on the modelled HC vs MDD EM file, using the default LM22 background set, no quantile and 100 permutations: https://cibersort.stanford.edu/upload.php /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_2019/cavanagh/cibersort/CIBERSORT.Output_Job6.csv ===== ComBat Correction ===== ## Correct with ComBat library(devtools) library(Biobase) library(sva) # loads the data setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") EM <- read.table("deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE, row.names=1) patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1) # gets the confounding variables confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}} confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median. for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}} confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault) confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}} confound9 = factor(patient_data$Venlafaxine) confound10 = factor(patient_data$live_with) confound11 = factor(patient_data$any.other.significant.negative.events) confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}} confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}} confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median. for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}} confound15 = factor(patient_data$Tobacco) confound16 = factor(patient_data$Magic_Mushroom) coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16) # gets batch confound4 = factor(patient_data$centre) # corrects modcombat = model.matrix(~confound1+confound2+confound3+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16, data=coldata) combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE) # saves resdata <- cbind(row.names(combat_edata),round(combat_edata,2)) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE) ===== Combat correction - limited model===== ## Correct with ComBat library(devtools) library(Biobase) library(sva) # loads the data setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") EM <- read.table("deseq2/limited_model/EM_HC_vs_MDD.csv", header=TRUE, row.names=1) patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1) # gets the confounding variables confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean. for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}} confound3 = factor(cut(confound3,4,labels=c("a","b","c","d"))) coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3) # gets batch confound4 = factor(patient_data$centre) # corrects modcombat = model.matrix(~confound1+confound2+confound3, data=coldata) combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE) # saves resdata <- cbind(row.names(combat_edata),round(combat_edata,2)) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE) ## Correct with ComBat library(devtools) library(Biobase) library(sva) # loads the data setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") EM <- read.table("deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv", header=TRUE, row.names=1) patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1) # gets the confounding variables confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d"))) confound2 = factor(patient_data$gender) coldata = data.frame(row.names=colnames(EM),confound1,confound2) # gets batch confound4 = factor(patient_data$centre) # corrects modcombat = model.matrix(~confound1+confound2, data=coldata) combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE) # saves resdata <- cbind(row.names(combat_edata),round(combat_edata,2)) colnames(resdata)[1] <- "ID" write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE) ===== Percentile Correction ===== setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") # method for correcting outliers correct_outliers <- function(x, na.rm = TRUE, ...) { qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...) H <- 1.5 * IQR(x, na.rm = na.rm) y <- x y[x < (qnt[1] - H)] <- qnt[1] - H y[x > (qnt[2] + H)] <- qnt[2] + H y } # pre Combat expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) EM <- expression[0,] for(row in 1:nrow(expression)) { gene <- expression[(row),] EM <- rbind(EM,correct_outliers(gene)) } EM_IDs = EM EM_IDs$IDs = row.names(EM) EM_IDs = EM_IDs[,ncol(EM_IDs)] EM = cbind(EM_IDs,EM) colnames(EM)[1] = "IDs" write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE) # post Combat expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) EM <- expression[0,] for(row in 1:nrow(expression)) { gene <- expression[(row),] EM <- rbind(EM,correct_outliers(gene)) } EM_IDs = EM EM_IDs$IDs = row.names(EM) EM_IDs = EM_IDs[,ncol(EM_IDs)] EM = cbind(EM_IDs,EM) colnames(EM)[1] = "IDs" write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE) ===== Expected False Positives by FDR Threshold ===== ==== Cavanagh ==== ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- List of thresholds ----## thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001) ##---- get the number of false positives ----## false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds))) names(false_positive_dist) = c("threshold","count","max") false_positive_dist$threshold = thresholds for (index in 1:length(thresholds)) { threshold = thresholds[index] print(threshold) values <- vector(mode="numeric", length=50) max = 0 for (counter in 1:50) { DE_data = read.table(file=paste("cavanagh/deseq2/random/44_187/DE_",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE) values[counter] = nrow(subset(DE_data,p.adj < threshold)) if (nrow(subset(DE_data,p.adj < threshold)) > max) { max = nrow(subset(DE_data,p.adj < threshold)) } } false_positive_dist[index,2] = median(values) false_positive_dist[index,3] = max } write.table(false_positive_dist, file="cavanagh/deseq2/random/44_187/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE) ==== Leday ==== ##---- working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- List of thresholds ----## thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001) ##---- get the number of false positives ----## false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds))) names(false_positive_dist) = c("threshold","count","max") false_positive_dist$threshold = thresholds for (index in 1:length(thresholds)) { threshold = thresholds[index] print(threshold) values <- vector(mode="numeric", length=50) max = 0 for (counter in 1:50) { DE_data = read.table(file=paste("leday/random/DE",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE) values[counter] = nrow(subset(DE_data,adj.P.Val < threshold)) if (nrow(subset(DE_data,adj.P.Val < threshold)) > max) { max = nrow(subset(DE_data,adj.P.Val < threshold)) } } false_positive_dist[index,2] = median(values) false_positive_dist[index,3] = max } write.table(false_positive_dist, file="leday/random/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE) ===== Correlation Clusters ===== Pre-filters the data ##---------------------------------------------## ## ## ## Code to pre-filter the RNA-seq data ## ## ## ##---------------------------------------------## #Clears the enrivoment rm(list = ls()) #Load libraries library(psych) library(stats) #loads the data setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") exp_values = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- Pre-filters the RNA-seq data ----## # Remove genes exp_values <- data.frame(subset(exp_values,apply(exp_values, 1, mean) >= 10)) # Identify genes with a high threshold geneHighCoefOfVarFilter <- function(dataMatrix, coefOfVarThreshold) { coefOfVars <- apply(dataMatrix, 1, function(x) { sd(x) / abs(mean(x)) }) fdata <- dataMatrix[coefOfVars < coefOfVarThreshold, ] # return the filtered data list(coefOfVars = coefOfVars, fdata=fdata) } # the smaller the threshold, the higher the experimental effect relative to the measurement precision thr <- 0.15 # does the filtering filtered <- geneHighCoefOfVarFilter(exp_values, thr) filtered.combined <- na.omit(data.frame(filtered$fdata)) nrow(filtered.combined) num.genes <- nrow(filtered.combined) genes <- row.names(filtered.combined) rnaSeq <- t(filtered.combined) subjects <- row.names(rnaSeq) my_subjs <- subjects ##---- Saves the data ----## rnaSeq <- rnaSeq[my_subjs,] num.genes <- ncol(rnaSeq) save(num.genes, rnaSeq, my_subjs,file = "cavanagh/clusters/exp.filtered.corrected.Rdata") Does the clustering ##---------------------------------------------## ## ## ## Code to do the clustering ## ## ## ##---------------------------------------------## #loads the data setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/clusters/") load("exp.filtered.corrected.Rdata") #loads the libraries library("dendextend") library(WGCNA) library(GSVA) library(reshape2) library(grid) library(broom) library(ggplot2) library(ade4) library(leaps) library(dplyr) library(flashClust) # Prepares the data pathnames <- list.files(pattern="[.]R$", path=paste(getwd(),"/funcs", sep =""), full.names=TRUE); sapply(pathnames, FUN=source); genes <- colnames(rnaSeq) rnaseq_counts <- rnaSeq expr <- as.matrix(t(rnaseq_counts)) ##---- does the clustering ----## corrThresh = 0.5 inputmat = rnaSeq regs = genes inputmat <- cor(inputmat) adjacency <- inputmat > corrThresh TOM <- TOMsimilarity(adjacency+0, TOMType = "unsigned"); dissTOM = 1-TOM # Call the hierarchical clustering function geneTree = hclust(as.dist(dissTOM), method = "average"); # Plot the resulting clustering tree (dendrogram) sizeGrWindow(12,9) plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity", labels = FALSE, hang = 0.04) # We like large modules, so we set the minimum module size relatively high: minModuleSize = 50; # Module identification using dynamic tree cut: dynamicMods = cutreeDynamic(dendro = geneTree, cutHeight = 0.95, distM = dissTOM,deepSplit = 2, pamRespectsDendro = FALSE,minClusterSize = minModuleSize); mod_sizes <- table(dynamicMods) length(mod_sizes) mod_sizes # gets all cluster members except first (-1) because first is "unassigned genes" numMods <- sort(unique(names(dynamicMods)))[-1] names(dynamicMods) <- colnames(adjacency) modNumbers <- unique(dynamicMods) names(dynamicMods) <- colnames(adjacency) gene.mod.list <- lapply(modNumbers, function(x) { cluster_genes <- dynamicMods[dynamicMods == x] names(cluster_genes) }) gene.mod.df <- melt(gene.mod.list) rownames(gene.mod.df) <- gene.mod.df[,1] # saves the results write.table(gene.mod.df, file="meta_genes.csv",row.names=TRUE, sep="\t", quote = FALSE) Stats and plots setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- general libraries ----## library(ggplot2) library(reshape) library(amap) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- number of clusters: ----## number_of_clusters = 48 ##---- plot metagene correlations: ----## DGM_all = read.table(file="clusters/meta_genes.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) expression_data = read.table(file="deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) for(cluster_no in 1:number_of_clusters) { cluster_IDs = subset(DGM_all,L1 == cluster_no) expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),]) expression_data_cluster_corr_melted = melt(cor(t(expression_data_cluster))) expression_data_cluster_corr_melted = subset(expression_data_cluster_corr_melted,value < 1) ggp <- ggplot(expression_data_cluster_corr_melted, aes(x=value)) + geom_density(alpha=0.25) + xlab("SCC") + ylab("density") + theme_SL2() + theme(legend.position="bottom") + xlim(c(-1,1)) + ggtitle(paste("cluster",cluster_no)) svg(paste("clusters/cluster_corr_plots/",cluster_no,"_corrplot.svg",sep=""), height=300/90, width=300/90) print(ggp) dev.off() } ##---- plot metagene HC vs MDD ----## patient_data_cavanagh = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) # violin plots legend_position = "none" mdd_ttest_list = data.frame(matrix(, nrow=number_of_clusters, ncol=4)) mdd_spearman_list = data.frame(matrix(, nrow=number_of_clusters, ncol=2)) for(cluster_no in 1:number_of_clusters) { # gets the cluster expression data cluster_IDs = subset(DGM_all,L1 == cluster_no) expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),]) expression_data_cluster_scaled = data.frame(t(scale(t(expression_data_cluster)))) expression_data_cluster_scaled_colmeans = data.frame(colMeans(expression_data_cluster_scaled)) #stats for subtypes expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$group group1 = subset(expression_data_cluster_scaled_colmeans,Diag=="HC")$colMeans.expression_data_cluster_scaled. group2 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_resistant")$colMeans.expression_data_cluster_scaled. group3 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_responder")$colMeans.expression_data_cluster_scaled. group4 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_untreated")$colMeans.expression_data_cluster_scaled. mdd_ttest_list[cluster_no,2] = t.test(group1,group2)$p.value mdd_ttest_list[cluster_no,3] = t.test(group1,group3)$p.value mdd_ttest_list[cluster_no,4] = t.test(group1,group4)$p.value #plots the subtypes x_axis_label = "" y_axis_label = "Cluster meta-gene\nexpression (Z-score)" ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no)) svg(paste("clusters/cluster_boxplots_subtypes/",cluster_no,"_boxplot.svg",sep=""), height=160/90, width=400/90) print(ggp) dev.off() #stats for disease state expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$MDD group1 = subset(expression_data_cluster_scaled_colmeans, Diag==0)$colMeans.expression_data_cluster_scaled. group2 = subset(expression_data_cluster_scaled_colmeans,Diag==1)$colMeans.expression_data_cluster_scaled. mdd_ttest_list[cluster_no,1] = t.test(group1,group2)$p.value #plots for disease state x_axis_label = "" y_axis_label = "Cluster meta-gene\nexpression (Z-score)" ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag)) + scale_x_discrete(breaks=c(0, 1),labels=c("HC","MDD"),limits=c(0, 1)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no)) svg(paste("clusters/cluster_boxplots_MDD/",cluster_no,"_boxplot.svg",sep=""), height=200/90, width=200/90) print(ggp) dev.off() #stats for hamilton score expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$hamilton_score mdd_spearman_list[cluster_no,1]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$estimate mdd_spearman_list[cluster_no,2]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$p.value #plots the hamilton score x_axis_label = "Hamilton score" y_axis_label = "Cluster meta-gene\nexpression (Z-score)" ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_point() + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no)) svg(paste("clusters/cluster_vs_hamilton_corr_plots/",cluster_no,"_dotplot.svg",sep=""), height=160/90, width=150/90) print(ggp) dev.off() } # BH correction of stats mdd_ttest_list$HC_vs_MDD_FDR = p.adjust(mdd_ttest_list$X1, method = "BH", n = number_of_clusters) mdd_ttest_list$HC_vs_resistant_FDR = p.adjust(mdd_ttest_list$X2, method = "BH", n = number_of_clusters) mdd_ttest_list$HC_vs_responder_FDR = p.adjust(mdd_ttest_list$X3, method = "BH", n = number_of_clusters) mdd_ttest_list$HC_vs_untreated_FDR = p.adjust(mdd_ttest_list$X4, method = "BH", n = number_of_clusters) names(mdd_ttest_list) = c("HC_vs_MDD_p","HC_vs_resistant_p","HC_vs_responder_p","HC_vs_untreated_p","HC_vs_MDD_pBH","HC_vs_resistant_pBH","HC_vs_responder_pBH","HC_vs_untreated_pBH") write.table(mdd_ttest_list, file="clusters/cluster_violin_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE) mdd_spearman_list$HAM_vs_Exp_SCC_FDR = p.adjust(mdd_spearman_list$X2, method = "BH", n = number_of_clusters) names(mdd_spearman_list) = c("SCC","p","pBH") write.table(mdd_spearman_list, file="clusters/cluster_vs_ham_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE) ##----- Clustering Method -----## cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method) y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method) x.dd = as.dendrogram(x.cluster) y.dd = as.dendrogram(y.cluster) x.dd.reorder = reorder(x.dd,0,FUN=reorder_function) y.dd.reorder = reorder(y.dd,0,FUN=reorder_function) x.order = order.dendrogram(x.dd.reorder) y.order = order.dendrogram(y.dd.reorder) if(cluster_x == TRUE && cluster_y == TRUE) { mx_clustered = mx[y.order,x.order] } if(cluster_x == TRUE && cluster_y == FALSE) { mx_clustered = mx[,x.order] } if(cluster_x == FALSE && cluster_y == TRUE) { mx_clustered = mx[y.order,] } if(cluster_x == FALSE && cluster_y == FALSE) { mx_clustered = mx } return(mx_clustered) } ##----- Significant Genes Heatmap Function -----## make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { if(nrow(matrix_scaled) >= 2) { matrix_scaled_renamed = as.matrix(matrix_scaled) matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered) matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered)) matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered)) hm.palette <- colorRampPalette(colours) ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank()) return(ggp) } else { return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this.")) } } ##---- Heatmap settings ----## plot_height = 750 plot_width = 500 colours = c("blue","yellow") distance_method = "spearman" clustering_method = "average" reorder_function = "average" cluster_x = TRUE cluster_y = TRUE ##---- plot metagene Heatmaps ----## for(cluster_no in 1:number_of_clusters) { cluster_IDs = subset(DGM_all,L1 == cluster_no) expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),]) heatmap_matrix = data.frame(t(scale(t(expression_data_cluster)))) nrow(heatmap_matrix) ncol(heatmap_matrix) ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png(paste("clusters/cluster_heatmaps/",cluster_no,"_heatmap.png",sep=""), height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() } ##---- plot correlation heatmap ----## plot_height = 2500 plot_width = 2500 cluster_x = TRUE cluster_y = TRUE #select the first 25 genes from each cluster (to make clustering quicker) cluster_genes_25 = list() for(cluster_no in 1:number_of_clusters) { cluster_genes = subset(DGM_all,L1 == cluster_no) cluster_genes_top = head(cluster_genes,round(nrow(cluster_genes)/4)) cluster_genes_25 = rbind(cluster_genes_25,cluster_genes_top) } nrow(cluster_genes_25) # makes the matrix expression_data_clusters = na.omit(expression_data[row.names(cluster_genes_25),]) cluster_IDs = subset(DGM_all,L1 == cluster_no) expression_data_clusters_corr = cor(t(expression_data_clusters)) heatmap_matrix = data.frame(expression_data_clusters_corr) nrow(heatmap_matrix) ncol(heatmap_matrix) # plots ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png("clusters/correlation_heatmap.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() ===== Plots ===== ==== Cibersort ==== ##---- libraries----## library(ggplot2) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- loads data ----## cavanagh = read.table(file="cavanagh/cibersort/CIBERSORT.Output_Job6_parsed.csv",row.names=1,header=TRUE, sep='\t', quote='',check.names = TRUE) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(1,1,1,1), "cm") ) } ##---- Cavanagh only plot ----## names(cavanagh) = c("Monocytes","T cells CD4 memory resting","NK cells resting","T cells CD4 naive","T cells CD8","B cells naive","Macrophages M0","B cells memory","Neutrophils","T cells regulatory (Tregs)","Mast cells resting","Dendritic cells activated","T cells CD4 memory activated","Plasma cells","Macrophages M2","Eosinophils","Mast cells activated","Macrophages M1") cavanagh_melted = melt(cavanagh) ggp = ggplot(cavanagh_melted, aes(x=variable, y=value*100)) + geom_boxplot() + xlab("") + ylab("% of cell population") + theme_SL2() + theme(legend.position="right",axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) svg("plots/cibersort_cavanagh.svg", height=400/90, width=800/90) print(ggp) dev.off() ==== Signficant Confounders Bar Chart ==== ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- number of sig genes by clinical parameter bar chart ----## sig_genes_by_clinical_parameter = read.table(file="cavanagh/deseq2/unmodelled/SIG_COUNTS.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) sig_genes_by_clinical_parameter$IDs = row.names(sig_genes_by_clinical_parameter) sig_genes_by_clinical_parameter$IDs = gsub('\\.', ' ', sig_genes_by_clinical_parameter$IDs) sig_genes_by_clinical_parameter$IDs = gsub(' ', ' ', sig_genes_by_clinical_parameter$IDs) sig_genes_by_clinical_parameter$IDs = factor(gsub('_', ' ', sig_genes_by_clinical_parameter$IDs)) row.names(sig_genes_by_clinical_parameter) = sig_genes_by_clinical_parameter$IDs sig_genes_by_clinical_parameter = sig_genes_by_clinical_parameter[with(sig_genes_by_clinical_parameter, order(count)), ] sig_genes_by_clinical_parameter_top = subset(sig_genes_by_clinical_parameter,count >= 5) sig_genes_by_clinical_parameter_top$IDs = factor(sig_genes_by_clinical_parameter_top$IDs) sig_genes_by_clinical_parameter_top$IDs <- factor(sig_genes_by_clinical_parameter_top$IDs, levels = sig_genes_by_clinical_parameter_top$IDs) x_axis_label = "number of significant genes (adjusted p <0.01)" y_axis_label = "" bar_outline_size = 1 bar_transparency = 0.75 legend_position = "bottom" data_label_size = 3 ggp <- ggplot(data=sig_genes_by_clinical_parameter_top , aes(x=IDs, y=count)) + geom_bar(colour="black",stat="identity", position = "dodge", alpha = bar_transparency) + coord_flip() + ylab(x_axis_label) + xlab(y_axis_label) + theme_SL2() + theme(axis.text.y = element_text(hjust=1),legend.position=legend_position, legend.spacing.x = unit(0.15, 'cm')) + scale_y_continuous(expand=c(0,0), limits=c(0,max(sig_genes_by_clinical_parameter_top$count)*1.25)) + geom_text(aes(label=count), position=position_dodge(width=0.9), size=data_label_size, show.legend = FALSE, hjust=-0.5) svg("plots/sig_genes_barchart.svg", height=300/90, width=800/90, pointsize=5) print(ggp) dev.off() ==== Age, Gender, BMI heatmaps ===== ##---- libraries----## library(ggplot2) library(amap) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- loads data ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(1,1,1,1), "cm") ) } ##----- Clustering Method -----## cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method) y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method) x.dd = as.dendrogram(x.cluster) y.dd = as.dendrogram(y.cluster) x.dd.reorder = reorder(x.dd,0,FUN=reorder_function) y.dd.reorder = reorder(y.dd,0,FUN=reorder_function) x.order = order.dendrogram(x.dd.reorder) y.order = order.dendrogram(y.dd.reorder) if(cluster_x == TRUE && cluster_y == TRUE) { mx_clustered = mx[y.order,x.order] } if(cluster_x == TRUE && cluster_y == FALSE) { mx_clustered = mx[,x.order] } if(cluster_x == FALSE && cluster_y == TRUE) { mx_clustered = mx[y.order,] } if(cluster_x == FALSE && cluster_y == FALSE) { mx_clustered = mx } return(mx_clustered) } ##----- Significant Genes Heatmap Function -----## make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { if(nrow(matrix_scaled) >= 2) { matrix_scaled_renamed = as.matrix(matrix_scaled) matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered) matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered)) matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered)) hm.palette <- colorRampPalette(colours) ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank()) return(ggp) } else { return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this.")) } } ##---- Heatmap settings ----## plot_height = 750 plot_width = 500 colours = c("blue","yellow") distance_method = "spearman" clustering_method = "average" reorder_function = "average" cluster_x = FALSE cluster_y = TRUE ##---- Gender Heatmap ----## samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE))) sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01)) heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples])))) nrow(heatmap_matrix) ncol(heatmap_matrix) ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png("plots/cavanagh_gender_heatmap.png", height=1750, width=1000, pointsize=5) print(ggp) dev.off() ##---- Age Heatmap ----## samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE))) sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01)) heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples])))) nrow(heatmap_matrix) ncol(heatmap_matrix) ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png("plots/cavanagh_age_heatmap.png", height=2000, width=1000, pointsize=5) print(ggp) dev.off() ##---- BMI Heatmap ----## samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE))) sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01)) heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples])))) nrow(heatmap_matrix) ncol(heatmap_matrix) ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png("plots/cavanagh_BMI_heatmap.png", height=2000, width=1000, pointsize=5) print(ggp) dev.off() ==== Age, Gender, BMI Boxplots ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- loads cavanagh data ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) age_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) BMI_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- gender plots ----## # 2 is female gene = "ENSG00000012817" expression_data_gene = t(expression_data[gene,]) groups = samples$gender plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") sample_groups = c("Male", "Female") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_gender_up_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000147050" expression_data_gene = t(expression_data[gene,]) groups = samples$gender plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") sample_groups = c("Male", "Female") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_gender_down_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() ##---- age plots ----## # 2 is younger gene = "ENSG00000169855" groups = age_samples expression_data_gene = t(expression_data[gene,row.names(groups)]) plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("Youngest", "Oldest") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_age_down_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000277586" groups = age_samples expression_data_gene = t(expression_data[gene,row.names(groups)]) plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("Youngest", "Oldest") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_age_up_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() ##---- BMI plots ----## # 2 is lower gene = "ENSG00000133742" groups = BMI_samples expression_data_gene = t(expression_data[gene,row.names(groups)]) plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("Lowest", "Highest") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(c(0,500)) svg("plots/cavanagh_BMI_up_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000161681" groups = BMI_samples expression_data_gene = t(expression_data[gene,row.names(groups)]) plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("Lowest", "Highest") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_BMI_down_boxplot.svg", height=200/90, width=150/90) print(ggp) dev.off() ==== Cavanagh P-Value Waterfall Plots ==== ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- Plot settings ----## x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)" y_axis_label = "-log10p value" line_type = "solid" line_size = 1.5 ##---- Cavanagh ----## random = read.table(file="cavanagh/deseq2/random/44_187/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) gender = read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) age = read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) hc_vs_mdd = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) hc_vs_resistant = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_resistant.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) hc_vs_responder = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_responder.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) hc_vs_untreated = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_untreated.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) random_dist = head(sort(rowMeans(random)),1000) gender_dist = head(sort(gender$p),1000) age_dist = head(sort(age$p),1000) bmi_dist = head(sort(bmi$p),1000) hc_vs_mdd_dist = head(sort(hc_vs_mdd$p),1000) hc_vs_resistant_dist = head(sort(hc_vs_resistant$p),1000) hc_vs_responder_dist = head(sort(hc_vs_responder$p),1000) hc_vs_untreated_dist = head(sort(hc_vs_untreated$p),1000) rank = 1:1000 # all p_dist_combined = data.frame(cbind(gender_dist,age_dist,bmi_dist,hc_vs_mdd_dist,hc_vs_resistant_dist,hc_vs_responder_dist,hc_vs_untreated_dist,random_dist,rank)) names(p_dist_combined) = c("male vs female", "youngest vs oldest", "lowest vs highest BMI", "HC vs MDD", "HC vs MDD resistant", "HC vs MDD responder", "HC vs MDD untreated","random", "rank") p_dist_combined_melted = melt(p_dist_combined, id="rank") ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none") svg("plots/cavanagh_p_distribution.svg", height=250/90, width=450/90, pointsize=5) print(ggp) dev.off() ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right") svg("plots/cavanagh_p_distribution_key.svg", height=500/90, width=600/90) print(ggp) dev.off() ==== Cavanagh MDD Boxplots ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- loads cavanagh data ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- MDD plots ----## # 0 is HC gene = "ENSG00000277075" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_mdd_down1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000115290" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_mdd_down2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ##---- Resistant plots ----## # 1 is HC samples_hc_and_resistant = subset(samples,group=="HC" | group=="MDD_resistant") expression_data_hc_and_resistant = expression_data[,row.names(samples_hc_and_resistant)] gene = "ENSG00000162738" expression_data_gene = t(expression_data_hc_and_resistant[gene,]) groups = samples_hc_and_resistant$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("HC", "RES") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_resistant_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000182272" expression_data_gene = t(expression_data_hc_and_resistant[gene,]) groups = samples_hc_and_resistant$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("HC", "RES") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_resistant_2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ##---- Responder plots ----## # 1 is HC samples_hc_and_responder = subset(samples,group=="HC" | group=="MDD_responder") expression_data_hc_and_responder = expression_data[,row.names(samples_hc_and_responder)] gene = "ENSG00000142686" expression_data_gene = t(expression_data_hc_and_responder[gene,]) groups = samples_hc_and_responder$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "3")) sample_groups = c("HC", "RSP") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_responder_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000128039" expression_data_gene = t(expression_data_hc_and_responder[gene,]) groups = samples_hc_and_responder$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "3")) sample_groups = c("HC", "RSP") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_responder_2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ##---- Untreated plots ----## # 1 is HC samples_hc_and_untreated = subset(samples,group=="HC" | group=="MDD_untreated") expression_data_hc_and_untreated = expression_data[,row.names(samples_hc_and_untreated)] gene = "ENSG00000165633" expression_data_gene = t(expression_data_hc_and_untreated[gene,]) groups = samples_hc_and_untreated$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "4")) sample_groups = c("HC", "UNT") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_untreated_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000075151" expression_data_gene = t(expression_data_hc_and_untreated[gene,]) groups = samples_hc_and_untreated$group plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "4")) sample_groups = c("HC", "UNT") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_untreated_up2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ==== Cavanagh Random Boxplots ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- Random 1 plots ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_1.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000186973" expression_data_gene = t(expression_data[gene,]) groups = samples$condition plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("G1", "G2") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_random1_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000182272" expression_data_gene = t(expression_data[gene,]) groups = samples$condition plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("G1", "G2") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_random1_2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ##---- Random 2 plots ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_2.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000120658" expression_data_gene = t(expression_data[gene,]) groups = samples$condition plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("G3", "G4") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_random2_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "ENSG00000115414" expression_data_gene = t(expression_data[gene,]) groups = samples$condition plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("1", "2")) sample_groups = c("G3", "G4") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/cavanagh_random2_2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ==== Cavanagh False Positive Counts Bar Charts ==== ##---- libraries----## library(ggplot2) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- load data ----## false_positives = read.table(file="cavanagh/deseq2/random/44_187/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ##---- Make plots ----## false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) false_positives_melt = melt(false_positives,id.vars="threshold") ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("median false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,25)) svg("plots/cavanagh_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5) print(ggp) dev.off() ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,2500)) svg("plots/cavanagh_false_positive_max.svg", height=250/90, width=350/90, pointsize=5) print(ggp) dev.off() ==== Cavanagh Random Enriched GO ==== ##---- general libraries ----## library(ggplot2) library(reshape) ##---- table libraries ----## library(grid) library(gridExtra) library(gtable) ##---- sets the working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- input files ----## ontologies = read.table(file="cavanagh/deseq2/random/44_187/david/DAVID_at_least_3_GO_KEGG_REACTOME_BIOCARTA.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ontologies_sig = subset(ontologies,FDR < 5) ontologies_sig$Rank = row.names(ontologies_sig) ontologies_sig_trimmmed = ontologies_sig[,c("Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")] ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,5)) ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2) names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR") ##---- plot table ----## plot_height = 475/90 plot_width = 800/90 header_size = 1 text_size = 1 border_thickness = 2 table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size))) gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1) svg("plots/cavanagh_random_ontologies_table.svg", height=plot_height, width=plot_width) grid.arrange(gt) dev.off() ==== Leday MDD Boxplots ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- loads leday data ----## expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) expression_data = expression_data[,rownames(samples)] ##---- MDD plots ----## # 2 is HC gene = "212531_at" # LCN2 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_1_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "212768_s_at" # OLFM4 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_2_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "207269_at" # DEFA4 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_3_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "231688_at" # MMP8 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_4_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "206676_at" # CEACAM8 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_5_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "205557_at" # BPI expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_6_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "206851_at" # RNASE3 expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_7_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() gene = "206697_s_at" # HP expression_data_gene = t(expression_data[gene,]) groups = samples$source_name_ch1 plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("2", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) svg("plots/leday_mdd_8_boxplot.svg", height=150/90, width=150/90) print(ggp) dev.off() ==== Leday MDD heatmap ==== ##---- libraries----## library(ggplot2) library(amap) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- loads data ----## expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(1,1,1,1), "cm") ) } ##----- Clustering Method -----## cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method) y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method) x.dd = as.dendrogram(x.cluster) y.dd = as.dendrogram(y.cluster) x.dd.reorder = reorder(x.dd,0,FUN=reorder_function) y.dd.reorder = reorder(y.dd,0,FUN=reorder_function) x.order = order.dendrogram(x.dd.reorder) y.order = order.dendrogram(y.dd.reorder) if(cluster_x == TRUE && cluster_y == TRUE) { mx_clustered = mx[y.order,x.order] } if(cluster_x == TRUE && cluster_y == FALSE) { mx_clustered = mx[,x.order] } if(cluster_x == FALSE && cluster_y == TRUE) { mx_clustered = mx[y.order,] } if(cluster_x == FALSE && cluster_y == FALSE) { mx_clustered = mx } return(mx_clustered) } ##----- Significant Genes Heatmap Function -----## make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) { if(nrow(matrix_scaled) >= 2) { matrix_scaled_renamed = as.matrix(matrix_scaled) matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered) matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered)) matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered)) hm.palette <- colorRampPalette(colours) ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank()) return(ggp) } else { return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this.")) } } ##---- Heatmap settings ----## plot_height = 750 plot_width = 500 colours = c("blue","yellow") distance_method = "spearman" clustering_method = "average" reorder_function = "average" cluster_x = FALSE cluster_y = TRUE ##---- Heatmap ----## #get the samples in order of HC then MDD sample_data = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) control = unlist(row.names(subset(sample_data,group == 0))) mdd = unlist(row.names(subset(sample_data,group == 1))) samples = c(control,mdd) MDD165 = read.table(file="leday/MDD165.txt", header=FALSE, sep='\t', quote='',check.names = TRUE) names(MDD165) = "Gene.Symbol" id_converter = read.table(file="leday/DE.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) sig_genes = merge(MDD165,id_converter, by="Gene.Symbol")$Gene.Symbol heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples])))) nrow(heatmap_matrix) ncol(heatmap_matrix) ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function) png("plots/leday_MDD165_heatmap.png", height=1750, width=1000, pointsize=5) print(ggp) dev.off() ==== Leday P Distribution Waterfall ==== ##---- libraries ----## library(ggplot2) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- Plot settings ----## x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)" y_axis_label = "-log10p value" line_type = "solid" line_size = 1.5 ##---- Leday ----## random = read.table(file="leday/random/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) hc_vs_mdd = read.table(file="leday/DE.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) male_vs_female = read.table(file="leday/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) young_vs_old = read.table(file="leday/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) anxiety_vs_control = read.table(file="leday/DE_anxiety.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) random_dist = head(sort(rowMeans(random)),1000) hc_vs_mdd_dist = head(sort(hc_vs_mdd$P.Value),1000) gender_dist = head(sort(male_vs_female$P.Value),1000) age_dist = head(sort(young_vs_old$P.Value),1000) anxiety_dist = head(sort(anxiety_vs_control$P.Value),1000) rank = 1:1000 # all p_dist_combined = data.frame(cbind(gender_dist,age_dist,anxiety_dist,hc_vs_mdd_dist,random_dist,rank)) names(p_dist_combined) = c("male vs female","youngest vs oldest","low vs high anxiety","HC vs MDD","random", "rank") p_dist_combined_melted = melt(p_dist_combined, id="rank") ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + ylim(0,10) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none") svg("plots/leday_p_distribution.svg", height=250/90, width=450/90, pointsize=5) print(ggp) dev.off() ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right") svg("plots/leday_p_distribution_key.svg", height=500/90, width=600/90) print(ggp) dev.off() ==== Leday False Positive Count ==== ##---- libraries----## library(ggplot2) library(reshape) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- load data ----## false_positives = read.table(file="leday/random/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ##---- Make plots ----## false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) false_positives_melt = melt(false_positives,id.vars="threshold") ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("mean false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,200)) svg("plots/leday_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5) print(ggp) dev.off() ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,5000)) svg("plots/leday_false_positive_max.svg", height=250/90, width=350/90, pointsize=5) print(ggp) dev.off() ==== Leday random GO terms table ==== ##---- general libraries ----## library(ggplot2) library(reshape) ##---- table libraries ----## library(grid) library(gridExtra) library(gtable) ##---- sets the working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- input files ----## ontologies = read.table(file="leday/random/david/David_any1_GO_Kegg_Reactome_Biocarta.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ontologies_sig = subset(ontologies,FDR < 50) ontologies_sig$Rank = row.names(ontologies_sig) ontologies_sig_trimmmed = ontologies_sig[,c("Rank","Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")] ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,2)) ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2) names(ontologies_sig_trimmmed) = c("Rank","ID","Term","Term Size","Overlap","Enrichment","FDR") ##---- plot table ----## plot_height = 300/90 plot_width = 850/90 header_size = 1 text_size = 1 border_thickness = 2 table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size))) gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1) svg("plots/leday_random_ontologies_table.svg", height=plot_height, width=plot_width) grid.arrange(gt) dev.off() ==== Venn diagram stats ==== # BMI phyper(14,203,18000-203,165, lower.tail = FALSE, log.p = FALSE) # Leday phyper(1,203,18000-203,129, lower.tail = FALSE, log.p = FALSE) # Jansen phyper(1,203,18000-203,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi # Age phyper(37,1244,18000-1244,165, lower.tail = FALSE, log.p = FALSE) # Leday phyper(26,1244,18000-1244,129, lower.tail = FALSE, log.p = FALSE) # Jansen phyper(2,1244,18000-1244,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi # Gender phyper(18,625,18000-625,165, lower.tail = FALSE, log.p = FALSE) # Leday phyper(3,625,18000-625,129, lower.tail = FALSE, log.p = FALSE) # Jansen phyper(0,625,18000-625,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi # Combined phyper(49,1863,18000-1863,165, lower.tail = FALSE, log.p = FALSE) # Leday phyper(27,1863,18000-1863,129, lower.tail = FALSE, log.p = FALSE) # Jansen phyper(2,1863,18000-1863,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi # MDD phyper(5,123,18000-129,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Jansen phyper(1,29,18000-29,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Mostafavi phyper(0,29,18000-29,129, lower.tail = FALSE, log.p = FALSE) # Jansen vs Mostafavi # Random phyper(3,154,18000-154,165, lower.tail = FALSE, log.p = FALSE) # Leday phyper(2,154,18000-154,129, lower.tail = FALSE, log.p = FALSE) # Jansen phyper(1,154,18000-154,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi ==== BMI Webgestaldt table ==== ##---- general libraries ----## library(ggplot2) library(reshape) ##---- table libraries ----## library(grid) library(gridExtra) library(gtable) ##---- sets the working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- input files ----## ontologies = read.table(file="cross_dataset/web_gestaldt/bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ontologies_sig = head(ontologies,10) ontologies_sig$Rank = row.names(ontologies_sig) ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")] ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2)) ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2) names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR") ##---- plot table ----## plot_height = 300/90 plot_width = 800/90 header_size = 1 text_size = 1 border_thickness = 2 table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size))) gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1) svg("plots/cavanagh_BMI_webgestaldt_table.svg", height=plot_height, width=plot_width) grid.arrange(gt) dev.off() ==== Age Webgestaldt table ==== ##---- general libraries ----## library(ggplot2) library(reshape) ##---- table libraries ----## library(grid) library(gridExtra) library(gtable) ##---- sets the working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- input files ----## ontologies = read.table(file="cross_dataset/web_gestaldt/age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ontologies_sig = head(ontologies,10) ontologies_sig$Rank = row.names(ontologies_sig) ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")] ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2)) ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2) names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR") ##---- plot table ----## plot_height = 300/90 plot_width = 800/90 header_size = 1 text_size = 1 border_thickness = 2 table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size))) gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt)) gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1) svg("plots/cavanagh_Age_webgestaldt_table.svg", height=plot_height, width=plot_width) grid.arrange(gt) dev.off() ==== Webgestaldt scatter plots ==== library(ggplot2) ##---- Load data ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cross_dataset/web_gestaldt") age = read.table(file="age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) bmi = read.table(file="bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) age_bmi = read.table(file="age_plus_BMI/enrichment_results_wg_result1566544579.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) mdd165 = read.table(file="MDD165/enrichment_results_wg_result1565946153.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) Jaansen = read.table(file="Jaansen/enrichment_results_wg_result1565953466.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) Mostafavi = read.table(file="Mostafavi/enrichment_results_wg_result1565955664.csv", header=TRUE, sep='\t', quote='',check.names = TRUE) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- BMI ----## # BMI vs Leday merged= merge(mdd165,bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_mdd165.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_mdd165_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Jaansen merged= merge(Jaansen,bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_Jansen.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_Jansen_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Mostafavi merged= merge(Mostafavi,bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_Mostafavi.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") ##---- Age + BMI ----## # BMI vs Leday merged= merge(mdd165,age_bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_mdd165.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_mdd165_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Jaansen merged= merge(Jaansen,age_bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_Jansen.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_Jansen_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Mostafavi merged= merge(Mostafavi,age_bmi, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_Mostafavi.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") ##---- Age ----## # BMI vs Leday merged= merge(mdd165,age, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_mdd165.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_mdd165_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Jaansen merged= merge(Jaansen,age, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_Jansen.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_Jansen_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # BMI vs Mostafavi merged= merge(Mostafavi,age, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_Mostafavi.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_age_Mostafavi_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") ##---- MDD ----## # leday vs Jaansen merged= merge(mdd165,Jaansen, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_mdd165_Jansen.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_mdd165_Jansen_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # leday vs Mostafavi merged= merge(Mostafavi,mdd165, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_mdd165_Mostafavi.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_mdd165_Mostafavi_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") # Jaansen vs Mostafavi merged= merge(Mostafavi,Jaansen, by="geneSet") sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05) ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_Jaansen_Mostafavi.svg", height=350/90, width=350/90) print(ggp) dev.off() ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1) + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20)) svg("WebGestaldt_Jaansen_Mostafavi_labelled.svg", height=350/90, width=350/90) print(ggp) dev.off() cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10), method = "spearman") ==== Cavanagh Immune Age Scatterplot ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- load data ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- get meta gene ----## sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold > 0.5)) expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,])))) meta_gene_up = data.frame(colMeans(expression_data_sig_zscore)) names(meta_gene_up) = "up" nrow(expression_data_sig_zscore) sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold < -1)) expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,])))) meta_gene_down = data.frame(colMeans(expression_data_sig_zscore)) names(meta_gene_down) = "down" nrow(expression_data_sig_zscore) meta_gene = data.frame(rowMeans(cbind(-meta_gene_down,meta_gene_up))) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- plots ----## age = data.frame(samples$age) mdd = data.frame(samples$MDD) group = data.frame(samples$group) hamilton = data.frame(samples$hamilton_score) plot_data = cbind(meta_gene,age,mdd,group,hamilton) names(plot_data) = c("biological_age","chronological_age","MDD","group","hamilton") hm.palette = colorRampPalette(c("blue","red")) ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none") svg("plots/cavanagh_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90) print(ggp) dev.off() ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none") svg("plots/cavanagh_age_metrics_MDD_scatterplot_BIG.svg", height=1000/90, width=1000/90) print(ggp) dev.off() cor(meta_gene,age, method = "spearman") ==== Leday Immune Age Scatterplot ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- load data ----## expression_data = read.table(file="leday/EM_batch_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- get meta gene ----## sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC > 0)) expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,])))) meta_gene_up = data.frame(colMeans(na.omit(expression_data_sig_zscore))) names(meta_gene_up) = "up" nrow(expression_data_sig_zscore) sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC < -0)) expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,])))) meta_gene_down = data.frame(colMeans(na.omit(expression_data_sig_zscore))) names(meta_gene_down) = "down" nrow(expression_data_sig_zscore) meta_gene = data.frame(rowMeans(cbind(meta_gene_down,-meta_gene_up))) ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=12, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- plots ----## age = data.frame(samples$age) mdd = data.frame(samples$group) group = data.frame(samples$batch) plot_data = cbind(meta_gene,age,mdd,group) names(plot_data) = c("biological_age","chronological_age","MDD","group") hm.palette = colorRampPalette(c("blue","red")) ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none") svg("plots/leday_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90) print(ggp) dev.off() ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none") svg("plots/leday_age_metrics_MDD_scatterplot_BIG.svg", height=2000/90, width=2000/90) print(ggp) dev.off() cor(meta_gene,age, method = "spearman") temp = subset(mdd, samples.group == 0) nrow(temp) temp = subset(mdd, samples.group == 1) nrow(temp) ===== Rebuttal Plots ===== ==== HC vs MDD volcano plots ==== ##---- general libraries ----## library(ggplot2) library(reshape) ##---- PDE sample and group names ----## sample_groupings = factor(sample_groupings, levels = sample_groups) ##---- sets the working directory. If you move the SL2 folder please update this path ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 16, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 14, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 14, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 16, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 16, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=14, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(2.5,"line"), plot.margin=unit(c(0.4,0.4,0.4,0.4), "cm") ) } ##----- Volcano Plot Function -----## make_volcano_plot <- function(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) { ggp = ggplot(data=PDE_annotated, aes(x=log2fold, y=-log10(p), colour=sig, group=sig,fill=sig)) + geom_point(size=dot_size,alpha=dot_transparency) + scale_color_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + scale_fill_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position, legend.title = element_blank()) + geom_text(data=PDE_annotated_sig,aes(label=rownames(PDE_annotated_sig)),hjust=-0.1, vjust=-0.1, size=data_label_size, show.legend = FALSE) + xlim(c(-4,4)) return(ggp) } ##----- Volcano Plot Variables -----## plot_height = 500 plot_width = 600 non_significant_colour = "black" significant_colour = "red" dot_size = 1.5 dot_transparency = 1 significant_name = "non-significant" non_significant_name = "significant" x_axis_label = "log2 fold change" y_axis_label = "-log10 p-value" legend_position = "bottom" ##---- No model ----## PDE_annotated = read.table(file="deseq2/unmodelled/DE_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) PDE_annotated$sig = PDE_annotated$p.adj < 0.01 PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01) data_label_size = 4.5 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() data_label_size = 0 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() ##---- No model ----## PDE_annotated = read.table(file="deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) PDE_annotated$sig = PDE_annotated$p.adj < 0.01 PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01) data_label_size = 4.5 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() data_label_size = 0 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() ##---- limited model ----## PDE_annotated = read.table(file="deseq2/limited_model/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) PDE_annotated$sig = PDE_annotated$p.adj < 0.01 PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01) data_label_size = 4.5 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() data_label_size = 0 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() ##---- limited model no BMI ----## PDE_annotated = read.table(file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) PDE_annotated$sig = PDE_annotated$p.adj < 0.01 PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01) data_label_size = 4.5 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() data_label_size = 0 ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5) print(ggp) dev.off() ==== HC vs MDD expression boxplots ==== ##---- libraries----## library(ggplot2) ##---- sets working directory ----## setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/") ##---- SL2 Theme ----## theme_SL2 <- function () { theme_bw() %+replace% theme( panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1), plot.background = element_blank(), legend.background = element_rect(fill="transparent", colour=NA), legend.key = element_rect(fill="transparent", colour=NA), title = element_text(size = 24, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.text.y = element_text(size = 22, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"), axis.text.x = element_text(size = 22, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), axis.title.y = element_text(size = 24, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"), axis.title.x = element_text(size = 24, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"), legend.text=element_text(size=24, family="Arial", face="bold"), legend.title=element_blank(), legend.key.size=unit(1,"line"), plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm") ) } ##---- plot size ----## plot_height=600 plot_width=400 ##---- loads cavanagh data ----## samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) ##---- No model ----## expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000188000" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_1_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000188659" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_2_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000158406" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_3_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000229314" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_4_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000232810" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_5_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() ##---- Limited model no BMI ----## expression_data = read.table(file="limited_model_no_BMI/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000081041" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_1_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000158406" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_2_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000277075" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_3_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000277632" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_4_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000229314" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_5_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() ##---- Limited model ----## expression_data = read.table(file="limited_model/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000277075" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_1_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000158406" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_2_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000277632" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_3_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000229314" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_4_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000137267" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_5_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() ##---- full model ----## expression_data = read.table(file="modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE) gene = "ENSG00000277075" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_1_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000115290" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_2_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000081041" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_3_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000196632" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_4_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off() gene = "ENSG00000278828" expression_data_gene = t(expression_data[gene,]) groups = samples$MDD plot_data = data.frame(cbind(expression_data_gene,groups)) names(plot_data) = c("expression","group") plot_data$group = factor(plot_data$group, levels=c("0", "1")) sample_groups = c("HC", "MDD") ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) + geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1) png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_5_boxplot.png", height=plot_height, width=plot_width) print(ggp) dev.off()