===== DeSeq2 Cavanagh ===== ==== Clinical parameters ====


##-------------------------##
##       R libraries       ##
##-------------------------##


##---- general libraries ----##
library("data.table")
library("ggplot2")

##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##-------------------------##
##     DESeq2 Clinical     ##
##-------------------------##

##---- Runs DEseq2 for each of the clinical parameters ----##
sig_genes_by_clinical_parameter = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) 

# iterates through the clinical parameters and runs DESeq2:
for(clinical_parameter in colnames(patient_data))
{
  if(clinical_parameter != "study_arm" & clinical_parameter != "group" & clinical_parameter != "centre" )
  {
    print(clinical_parameter)
    clinical_parameter_data = patient_data[,clinical_parameter]
    clinical_parameter_data = clinical_parameter_data[complete.cases(clinical_parameter_data)]
    
    # tests for a binary parameter (e.g. gender)
    if(length(summary(as.factor(clinical_parameter_data),maxsum=50000)) == 2) 
    {
    
      min = min(as.numeric(clinical_parameter_data))
      max = max(as.numeric(clinical_parameter_data))
      group1 = row.names(subset(patient_data,patient_data[clinical_parameter] == min))
      group2 = row.names(subset(patient_data,patient_data[clinical_parameter] == max))
    }
    # or must be a continuous paramter
    else
    {
      q1 = summary(clinical_parameter_data)[2]
      q3 = summary(clinical_parameter_data)[5]
      group1 = row.names(subset(patient_data,patient_data[clinical_parameter] < q1))
      group2 = row.names(subset(patient_data,patient_data[clinical_parameter] > q3))
    }
    # tests for a suitable number of samples
    if(min(c(length(group1),length(group2))) >2) 
    {           
      # runs deseq2
      condition = factor(c(rep("low", length(group1)), rep("high", length(group2)))) 
      read_count_filtered = cbind(read_counts[,group1],read_counts[,group2])
      countdata = as.matrix(read_count_filtered)
      countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)
      coldata = data.frame(row.names=colnames(countdata), condition)
      dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
      dds = DESeq(dds, parallel=T)
      res = results(dds, c("condition","high","low"), cooksCutoff=0.2)
      
      res = res[order(res$padj), ]
      resdata = as.data.frame(res)
      resdata$ID = row.names(resdata)
      resdata = resdata[,c(7,2,5,6)]
      colnames(resdata) = c("ID","log2fold","p","p.adj")
      write.table(resdata, file=paste("deseq2/unmodelled/DE_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
      
      samples = data.frame(cbind(c(group1,group2),condition))
      write.table(samples, file=paste("deseq2/unmodelled/SAMPLES_",clinical_parameter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

      resdata_sig = subset(resdata, p.adj < 0.01)
      sig_genes_for_clinical_parameter = data.frame(clinical_parameter,nrow(resdata_sig))
      names(sig_genes_for_clinical_parameter)<-c("parameter","count")
      sig_genes_by_clinical_parameter <- rbind(sig_genes_by_clinical_parameter, sig_genes_for_clinical_parameter)
    }
  }
}
write.table(sig_genes_by_clinical_parameter, file="deseq2/unmodelled/SIG_COUNTS.csv",row.names=FALSE, sep="\t", quote = FALSE)

==== HC vs MDD ====


##---- general libraries ----##
library("data.table")
library("ggplot2")

##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##-------------------------##
##     DESeq2 HC vs MDD    ##
##-------------------------##

sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) 

# gets the confounding variables
MDD = factor(patient_data$MDD) 
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)

coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)

# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)

# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16+MDD)
dds = DESeq(dds, parallel=T)

# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)

# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)


##-------------------------##
##  DESeq2 HC vs MDD Types ##
##-------------------------##

# gets the confounding variables
group = factor(patient_data$group) 
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)
coldata = data.frame(row.names=colnames(read_counts),group,confound1,confound2,confound3,confound4)

# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)

# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+group)
dds = DESeq(dds, parallel=T)

# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_subgroups.csv",row.names=FALSE, sep="\t", quote = FALSE)

# gets the results
res = results(dds, c("group","HC","MDD_untreated"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_untreated.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD untreated" ,nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)

res = results(dds, c("group","HC","MDD_responder"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_responder.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD responder",nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)

res = results(dds, c("group","HC","MDD_resistant"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/modelled/DE_HC_vs_MDD_resistant.csv",row.names=FALSE, sep="\t", quote = FALSE)
resdata_sig = subset(resdata, p.adj < 0.01)
sig_genes_for_comparison = data.frame("HC vs MDD resistant",nrow(resdata_sig))
names(sig_genes_for_comparison)<-c("parameter","count")
sig_genes_by_comparison <- rbind(sig_genes_by_comparison, sig_genes_for_comparison)

==== HC vs MDD - limited model ====


##---- general libraries ----##
library("data.table")
library("ggplot2")

##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##-------------------------##
##     DESeq2 HC vs MDD    ##
##-------------------------##

sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) 

# gets the confounding variables
MDD = factor(patient_data$MDD) 
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound4 = factor(patient_data$centre)


coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound3,confound4)

# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)

# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound3+confound4+MDD)
dds = DESeq(dds, parallel=T)

# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)

# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/limited_model/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##-------------------------##
##     DESeq2 HC vs MDD    ##
##-------------------------##

sig_genes_by_comparison = data.frame(parameter=character(),count=numeric(),stringsAsFactors=FALSE) 

# gets the confounding variables
MDD = factor(patient_data$MDD) 
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound4 = factor(patient_data$centre)


coldata = data.frame(row.names=colnames(read_counts),MDD,confound1,confound2,confound4)

# filters the read counts
countdata = as.matrix(read_counts)
countdata <- subset(countdata,apply(countdata, 1, mean) >= 1)

# runs deseq2
dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~confound1+confound2+confound4+MDD)
dds = DESeq(dds, parallel=T)

# Get the EM
resdata <- round(as.data.frame(counts(dds, normalized=TRUE)),2)
resdata_IDs <-resdata
resdata_IDs$IDs <- row.names(resdata)
resdata_IDs <-resdata_IDs[,ncol(resdata_IDs)]
resdata <- cbind(resdata_IDs,resdata)
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)

# gets the results
res = results(dds, c("MDD","0","1"), cooksCutoff=0.2)
summary(res)
res = res[order(res$pvalue),]
resdata = as.data.frame(res)
resdata$ID = row.names(resdata)
resdata = resdata[,c(7,2,5,6)]
colnames(resdata) = c("ID","log2fold","p","p.adj")
write.table(resdata, file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv",row.names=FALSE, sep="\t", quote = FALSE)

==== HC vs MDD - non parameteric ====


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- loads cavanagh data ----##
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- No model ----##
results = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

# get groups
group1 = results[,row.names(subset(samples, MDD == 0))]
group2 = results[,row.names(subset(samples, MDD == 1))]
nrow(group1)
nrow(group2)
ncol(group1)
ncol(group2)

# sets up columns
results$log2fold = rep(1,nrow(expression_data))
results$p =  rep(1,nrow(expression_data))
results$p.adj =  rep(1,nrow(expression_data))

# does the stats
for (row in 1:nrow(results))
{
 print(row)
 gene_group1 = unlist(group1[row,])
 gene_group2 = unlist(group2[row,])
 results[row,"log2fold"] = log(mean(gene_group2),2) - log(mean(gene_group1),2)
 results[row,"p"] = wilcox.test(gene_group1,gene_group2, paired=FALSE)$p.value
}
results[["p.adj"]] = p.adjust(results[["p"]], method = p.adjust.methods, n = length(results[["p"]]))

# outputs data
results = results[,c("log2fold","p","p.adj")]
write.table(results, file="cavanagh/mann_whitney/DE_HC_vs_MDD.csv",row.names=TRUE, sep="\t", quote = FALSE)

==== Random cases vs controls ====


##---- general libraries ----##
library("data.table")
library("ggplot2")

##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##--------------------------------------------##
##   50 random deseq runs with 50 / 50 split  ##
##--------------------------------------------##

read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))

for (counter in 1:50)
{
  # set random seed
  set.seed(counter)
  
  # gets the random samples
  samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),ncol(read_counts_trimmed)/2,replace=FALSE)])
  samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
  print(length(samples_random1))
  print(length(samples_random2))
  random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
  condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) 
  
  # gets the confounding variables
  MDD = factor(patient_data$MDD) 
  confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
  confound2 = factor(patient_data$gender)
  confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
  for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
  confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
  confound4 = factor(patient_data$centre)
  confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
  confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
  for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
  confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
  confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
  confound9 = factor(patient_data$Venlafaxine)
  confound10 = factor(patient_data$live_with)
  confound11 = factor(patient_data$any.other.significant.negative.events)
  confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
  confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
  for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
  confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
  confound15 = factor(patient_data$Tobacco)
  confound16 = factor(patient_data$Magic_Mushroom)

  # runs DESeq2
  countdata = as.matrix(random_samples_counts)
  coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
  dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16)
  dds = DESeq(dds, parallel=T,betaPrior=TRUE)
  res = results(dds, c("condition","high","low"))

  # gets the deseq results
  res = res[order(res$padj), ]
  resdata = as.data.frame(res)
  resdata$ID = row.names(resdata)
  resdata = resdata[,c(7,2,5,6)]
  colnames(resdata) = c("ID","log2fold","p","p.adj")
  write.table(resdata, file=paste("deseq2/random/50_50/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_sig = subset(resdata, p.adj < 0.25)
  write.table(resdata_sig, file=paste("deseq2/random/50_50/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_top100 = head(resdata, 100)
  write.table(resdata_top100, file=paste("deseq2/random/50_50/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

  # updates the p-value distribution results
  p_value_dist[,counter] = head(sort(resdata$p),10000)
  write.table(p_value_dist, file="deseq2/random/50_50/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
  
  # prints the patient data
  #coldata = data.frame(coldata)
  #row.names(coldata) = names(random_samples_counts)
  write.table(data.frame(coldata), file=paste("deseq2/random/50_50/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)

}



##---------------------------------------------##
##   50 random deseq runs with 44 / 187 split  ##
##---------------------------------------------##

read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))

for (counter in 1:50)
{
  # set random seed
  set.seed(counter)
  
  # gets the random samples
  samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
  samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
  print(length(samples_random1))
  print(length(samples_random2))
  random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
  condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) 
  
  # gets the confounding variables
  MDD = factor(patient_data$MDD) 
  confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
  confound2 = factor(patient_data$gender)
  confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
  for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
  confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
  confound4 = factor(patient_data$centre)
  confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
  confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
  for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
  confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
  confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
  confound9 = factor(patient_data$Venlafaxine)
  confound10 = factor(patient_data$live_with)
  confound11 = factor(patient_data$any.other.significant.negative.events)
  confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
  confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
  for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
  confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
  confound15 = factor(patient_data$Tobacco)
  confound16 = factor(patient_data$Magic_Mushroom)

  # runs DESeq2
  countdata = as.matrix(random_samples_counts)
  coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
  dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16)
  dds = DESeq(dds, parallel=T,betaPrior=TRUE)
  res = results(dds, c("condition","high","low"))

  # gets the deseq results
  res = res[order(res$padj), ]
  resdata = as.data.frame(res)
  resdata$ID = row.names(resdata)
  resdata = resdata[,c(7,2,5,6)]
  colnames(resdata) = c("ID","log2fold","p","p.adj")
  write.table(resdata, file=paste("deseq2/random/44_187/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_sig = subset(resdata, p.adj < 0.25)
  write.table(resdata_sig, file=paste("deseq2/random/44_187/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_top100 = head(resdata, 100)
  write.table(resdata_top100, file=paste("deseq2/random/44_187/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

  # updates the p-value distribution results
  p_value_dist[,counter] = head(sort(resdata$p),10000)
  write.table(p_value_dist, file="deseq2/random/44_187/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
  
  # prints the patient data
  #coldata = data.frame(coldata)
  #row.names(coldata) = names(random_samples_counts)
  write.table(data.frame(coldata), file=paste("deseq2/random/44_187/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)

}

Merge the random genes


cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50
cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv
mv temp.csv DE_SIGS_COMBINED.csv
done


cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187
cat DE_SIG_1.csv > DE_SIGS_COMBINED.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED.csv | cut -f1 | sort -u > temp.csv
mv temp.csv DE_SIGS_COMBINED.csv
done


cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/50_50
cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv


cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/deseq2/random/44_187
cat DE_SIG_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_SIG_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f1 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv

==== Randomised Cases vs Controls - limited or no model ====


##---- general libraries ----##
library("data.table")
library("ggplot2")

##---- DESeq2 libraries ----##
library("DESeq2")
library("BiocParallel")
register(MulticoreParam(25))


##-------------------------##
##       Input Files       ##
##-------------------------##

##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- import data ----##
read_counts = read.table(file="raw_data/counts_protein_coding_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
patient_data = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---------------------------------------------##
##   50 random deseq runs with 44 / 187 split  ##
##---------------------------------------------##

read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))

for (counter in 1:50)
{
  # set random seed
  set.seed(counter)
  
  # gets the random samples
  samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
  samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
  print(length(samples_random1))
  print(length(samples_random2))
  random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
  condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) 
  
  # gets the confounding variables
  MDD = factor(patient_data$MDD) 
  confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
  confound2 = factor(patient_data$gender)
  confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
  for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
  confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
  confound4 = factor(patient_data$centre)
  confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
  confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
  for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
  confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
  confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
  confound9 = factor(patient_data$Venlafaxine)
  confound10 = factor(patient_data$live_with)
  confound11 = factor(patient_data$any.other.significant.negative.events)
  confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
  confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
  for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
  confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
  confound15 = factor(patient_data$Tobacco)
  confound16 = factor(patient_data$Magic_Mushroom)

  # runs DESeq2
  countdata = as.matrix(random_samples_counts)
  coldata = data.frame(row.names=colnames(countdata),condition)
  dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
  dds = DESeq(dds, parallel=T,betaPrior=TRUE)
  res = results(dds, c("condition","high","low"))

  # gets the deseq results
  res = res[order(res$padj), ]
  resdata = as.data.frame(res)
  resdata$ID = row.names(resdata)
  resdata = resdata[,c(7,2,5,6)]
  colnames(resdata) = c("ID","log2fold","p","p.adj")
  write.table(resdata, file=paste("deseq2/random/44_187_no_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_sig = subset(resdata, p.adj < 0.25)
  write.table(resdata_sig, file=paste("deseq2/random/44_187_no_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_top100 = head(resdata, 100)
  write.table(resdata_top100, file=paste("deseq2/random/44_187_no_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

  # updates the p-value distribution results
  p_value_dist[,counter] = head(sort(resdata$p),10000)
  write.table(p_value_dist, file="deseq2/random/44_187_no_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
  
  # prints the patient data
  #coldata = data.frame(coldata)
  #row.names(coldata) = names(random_samples_counts)
  write.table(data.frame(coldata), file=paste("deseq2/random/44_187_no_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)

}







##---------------------------------------------##
##   50 random deseq runs with 44 / 187 split  ##
##---------------------------------------------##

read_counts_trimmed = subset(read_counts,apply(read_counts, 1, mean) >= 1)
p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))

for (counter in 1:50)
{
  # set random seed
  set.seed(counter)
  
  # gets the random samples
  samples_random1 = names(read_counts_trimmed[,sample(1:ncol(read_counts_trimmed),44,replace=FALSE)])
  samples_random2 = names(read_counts_trimmed[,-which(names(read_counts_trimmed) %in% samples_random1)])
  print(length(samples_random1))
  print(length(samples_random2))
  random_samples_counts = read_counts_trimmed[,c(samples_random1,samples_random2)]
  condition = factor(c(rep("low", length(samples_random1)), rep("high", length(samples_random2)))) 
  
  # gets the confounding variables
  MDD = factor(patient_data$MDD) 
  confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
  confound2 = factor(patient_data$gender)
  confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
  for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
  confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
  confound4 = factor(patient_data$centre)
  confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
  confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
  for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
  confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
  confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
  confound9 = factor(patient_data$Venlafaxine)
  confound10 = factor(patient_data$live_with)
  confound11 = factor(patient_data$any.other.significant.negative.events)
  confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
  confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
  for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
  confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
  for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
  confound15 = factor(patient_data$Tobacco)
  confound16 = factor(patient_data$Magic_Mushroom)

  # runs DESeq2
  countdata = as.matrix(random_samples_counts)
  coldata = data.frame(row.names=colnames(countdata),condition,confound1,confound2,confound3,confound4,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)
  dds = DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition+confound1+confound2+confound3+confound4)
  dds = DESeq(dds, parallel=T,betaPrior=TRUE)
  res = results(dds, c("condition","high","low"))

  # gets the deseq results
  res = res[order(res$padj), ]
  resdata = as.data.frame(res)
  resdata$ID = row.names(resdata)
  resdata = resdata[,c(7,2,5,6)]
  colnames(resdata) = c("ID","log2fold","p","p.adj")
  write.table(resdata, file=paste("deseq2/random/44_187_limited_model/DE_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_sig = subset(resdata, p.adj < 0.25)
  write.table(resdata_sig, file=paste("deseq2/random/44_187_limited_model/DE_SIG_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  resdata_top100 = head(resdata, 100)
  write.table(resdata_top100, file=paste("deseq2/random/44_187_limited_model/DE_TOP_100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

  # updates the p-value distribution results
  p_value_dist[,counter] = head(sort(resdata$p),10000)
  write.table(p_value_dist, file="deseq2/random/44_187_limited_model/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
  
  # prints the patient data
  #coldata = data.frame(coldata)
  #row.names(coldata) = names(random_samples_counts)
  write.table(data.frame(coldata), file=paste("deseq2/random/44_187_limited_model/DE_PATIENTS_",counter,".csv",sep=""),row.names=TRUE, sep="\t", quote = FALSE)

}

==== Convert results to gene symbols ====


setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
converter = read.table(file="cavanagh/raw_data/ID_to_symbol.csv",header=TRUE, sep='\t', quote='',check.names = TRUE)

converted_BMI = merge(bmi, converter, by="ID")
converted_BMI_sig = subset(converted_BMI, p.adj < 0.01)
write.table(converted_BMI,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_symbols.csv")
write.table(converted_BMI_sig,"cavanagh/deseq2/unmodelled/symbols/DE_BMI_sig_symbols.csv")

===== Limma Leday =====


library(Biobase)
library(GEOquery)
library(limma)
library(jetset)
library(sdef)

##----------------------##
## Download and Process ##
##----------------------##

setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday")

# download data
tmp <- getGEO("GSE98793", GSEMatrix=TRUE)
eset <- tmp[[1]]


# remove samples that failed Leday QC
sample_info = read.table(file="sample_IDs.csv",header=TRUE, sep='\t', quote='',check.names = TRUE)
valid_samples = subset(sample_info, keep == 0)$ID
eset = eset[,valid_samples]


# normalise
normalizeBetweenArrays(exprs(eset),method="quantile")

# remove unannotated probes
data <- featureData(eset)@data
data.f <- data[-which(data[["Gene Symbol"]] == ""), ]
nrow(data)
nrow(data.f)
eset.f <- eset[rownames(eset) %in% rownames(data.f), ]

#use the jetset to select only the probe set of highest jset score for each gene
jscores = jscores("hgu133plus2", rownames(eset.f))
unique_genes = data.frame(unique(jscores$symbol))
unique_genes$probe = NA
names(unique_genes) = c("symbols","probe")

for(i in 1:nrow(unique_genes)) 
{
  gene_symbol = unique_genes[i,1]
  duplicates = subset(jscores,jscores[["symbol"]] == gene_symbol)
  unique_genes[i,2] = rownames(head(duplicates,1))[1]
}
nrow(unique_genes)
eset.fu <- eset.f[rownames(eset.f) %in% unique_genes$probe, ]
nrow(eset.fu)

# adds the covariates with user friendly names
pData(eset.fu)$group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
pData(eset.fu)$batch <- pData(eset.fu)$"batch"
pData(eset.fu)$gender <- pData(eset.fu)$"gender"
pData(eset.fu)$age <- pData(eset.fu)$"age"
pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"


##----------------------------------##
##         Limma Uncorrected        ##
##----------------------------------##

# Gender
mydesign2 <- model.matrix(~ gender, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="genderM", adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_gender.csv",row.names=FALSE, sep="\t", quote = FALSE)

# Age 
q1 = summary(as.numeric(pData(eset.fu)$age))[2]
q3 = summary(as.numeric(pData(eset.fu)$age))[5]
group1 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) < q1))
group2 = row.names(subset(pData(eset.fu),as.numeric(pData(eset.fu)$age) > q3))
group1_2 = c(group1,group2)
eset.fu.age = eset.fu[,group1_2]
pData(eset.fu.age)$age_group <- factor(c(rep("low", length(group1)), rep("high", length(group2)))) 

mydesign2 <- model.matrix(~ age_group, data=pData(eset.fu.age))
fit <- lmFit(eset.fu.age, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu.age), adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_age.csv",row.names=FALSE, sep="\t", quote = FALSE)

# Anxiety
mydesign2 <- model.matrix(~ anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), adjust="BH")
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE_anxiety.csv",row.names=FALSE, sep="\t", quote = FALSE)



##--------------------------------##
##         Limma Corrected        ##
##--------------------------------##

#differential expression analysis performed via Limma
mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
fit <- lmFit(eset.fu, design = mydesign2)
fit <- eBayes(fit)
res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")

# save results
res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
write.table(res, file="DE.csv",row.names=FALSE, sep="\t", quote = FALSE)

residuals <- data.frame(residuals(fit,eset.fu))
write.table(residuals, file="Residuals.csv",row.names=TRUE, sep="\t", quote = FALSE)

em <- exprs(eset.fu)
write.table(round(em,2), file="EM.csv",row.names=TRUE, sep="\t", quote = FALSE)

patient_data <- eset.fu@phenoData@data
write.table(patient_data, file="patient_data.csv",row.names=TRUE, sep="\t", quote = FALSE)

# batch corrected EM
em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch)
write.table(round(em_corrected,2), file="EM_batch_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE)


# batch and covariate corrected EM
covariates <- data.frame(cbind(pData(eset.fu)$gender,pData(eset.fu)$age,pData(eset.fu)$anxiety))
covariates <- data.matrix(covariates)
em_corrected <- removeBatchEffect(em, batch=pData(eset.fu)$batch, covariates = covariates)
write.table(round(em_corrected,2), file="EM_covariate_corrected.csv",row.names=TRUE, sep="\t", quote = FALSE)




##----------------------##
##     Randomised       ##
##----------------------##

p_value_dist = data.frame(matrix(0, ncol = 50, nrow = 10000))

for (counter in 1:50)
{
  # set random seed
  set.seed(counter) 
  print(counter)
  
  # gets the random samples
  group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
  group_random = sample(group,length(group),replace=FALSE)
  print(group_random)

  #differential expression analysis performed via Limma
  pData(eset.fu)$group <- factor(group_random)
  pData(eset.fu)$batch <- pData(eset.fu)$"batch"
  pData(eset.fu)$gender <- pData(eset.fu)$"gender"
  pData(eset.fu)$age <- pData(eset.fu)$"age"
  pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"

  mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
  fit <- lmFit(eset.fu, design = mydesign2)
  fit <- eBayes(fit)
  res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")

  # save results
  res <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
  write.table(res, file=paste("random/DE",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  res_sig = subset(res, P.Value < 0.001)
  write.table(res_sig, file=paste("random/DE_sig_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)
  res_100 = head(res,100)
  write.table(res_100, file=paste("random/DE_top100_",counter,".csv",sep=""),row.names=FALSE, sep="\t", quote = FALSE)

  
  # updates the p-value distribution results
  p_value_dist[,counter] = head(sort(res$P.Value),10000)
  write.table(p_value_dist, file="random/P_DISTRIBUTION.csv",row.names=FALSE, sep="\t", quote = FALSE)
  
}



##----------------------##
##  Randomised Overlap  ##
##----------------------##


for (counter in 1:50)
{
  # set random seed
  set.seed(counter) 
  print(counter)
  
  
  ## group 1  
  
  # gets the random samples
  group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
  group_random = sample(group,length(group),replace=FALSE)
  print(group_random)

  #differential expression analysis performed via Limma
  pData(eset.fu)$group <- factor(group_random)
  pData(eset.fu)$batch <- pData(eset.fu)$"batch"
  pData(eset.fu)$gender <- pData(eset.fu)$"gender"
  pData(eset.fu)$age <- pData(eset.fu)$"age"
  pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"

  mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
  fit <- lmFit(eset.fu, design = mydesign2)
  fit <- eBayes(fit)
  res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")

  # save results
  res1 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))

  
  ## group 2
  
  # set random seed
  set.seed(counter*100) 
  print(counter)
  
  # gets the random samples
  group <- factor((pData(eset)$"subject group"=="CASE; major depressive disorder (MDD) patient")*1)
  group_random = sample(group,length(group),replace=FALSE)
  print(group_random)

  #differential expression analysis performed via Limma
  pData(eset.fu)$group <- factor(group_random)
  pData(eset.fu)$batch <- pData(eset.fu)$"batch"
  pData(eset.fu)$gender <- pData(eset.fu)$"gender"
  pData(eset.fu)$age <- pData(eset.fu)$"age"
  pData(eset.fu)$anxiety <- pData(eset.fu)$"anxiety"

  mydesign2 <- model.matrix(~ group + batch + gender + age + anxiety, data=pData(eset.fu))
  fit <- lmFit(eset.fu, design = mydesign2)
  fit <- eBayes(fit)
  res <- topTable(fit, number=nrow(eset.fu), coef="group1", adjust="BH")

  # save results
  res2 <- subset(res, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.Symbol","Gene.Title"))
  
    
  ## overlap
  
  # parse data
  data_merged = merge(res1,res2, by="ID")
  ids = data_merged$ID
  data_trimmed = data_merged[,c("P.Value.x","P.Value.y")]

  # get overlap
  ratios = try(ratio(data_trimmed))
  if(inherits(ratios, "try-error"))
  {
    next
  }
  print("ratios generated")
  
  bays = try(baymod(ratios))
  if(inherits(bays, "try-error"))
  {
    next
  }
  print("bays modelled")
  
  sdef_features = try(data.frame(extractFeatures.R(ratios,bays,ids)$max))
  if(inherits(sdef_features, "try-error"))
  {
    next
  }
  print("features extracted")
    
  # write results
  if (! is.null(sdef_features))
  {
  results = data_merged[data.frame(sdef_features)$Names,"Gene.Symbol.x"]
  write.table(results, file=paste("random_sdef/DE_sig_",counter,".csv",sep=""),row.names=FALSE, col.names=FALSE, sep="\t", quote = FALSE)
  }
}

Gets the counts for the random data


cd /GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/leday/random
cat DE_sig_1.csv > DE_SIGS_COMBINED_counts.csv
for i in DE_sig_*
do
cat $i DE_SIGS_COMBINED_counts.csv | cut -f7 > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv
done
cat DE_SIGS_COMBINED_counts.csv | sort | uniq -c | sort -nr > temp.csv
mv temp.csv DE_SIGS_COMBINED_counts.csv

===== Cibersort ===== Get the expression data as gene symbols


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- loads data ----##
cavanagh = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- converts cavanagh ids to symbols ----##
cavanagh_conversion = read.table(file="cavanagh/raw_data/ID_to_symbol.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
cavanagh_converted = merge(cavanagh, cavanagh_conversion, by=0)

##---- saves the data ----##
write.table(cavanagh_converted, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_symbols.csv",row.names=FALSE, sep="\t", quote = FALSE)

We run Cibersort on the modelled HC vs MDD EM file, using the default LM22 background set, no quantile and 100 permutations: https://cibersort.stanford.edu/upload.php


/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_2019/cavanagh/cibersort/CIBERSORT.Output_Job6.csv

===== ComBat Correction =====


## Correct with ComBat

library(devtools)
library(Biobase)
library(sva)


# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)

# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
confound5 = factor(patient_data$serious.illness.injury..or.assault) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound5)){if(is.na(confound5[i])){confound5[i] = median(as.numeric(na.omit(confound5)))}}
confound6 = factor(patient_data$eczema_adult_clean) #note - there is 101 NAs. We set these to the median.
for(i in 1:length(confound6)){if(is.na(confound6[i])){confound6[i] = median(as.numeric(na.omit(confound6)))}}
confound7 = factor(patient_data$close.relative.serious.illness.injury..or.assault)
confound8 = factor(patient_data$renal) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound8)){if(is.na(confound8[i])){confound8[i] = median(as.numeric(na.omit(confound8)))}}
confound9 = factor(patient_data$Venlafaxine)
confound10 = factor(patient_data$live_with)
confound11 = factor(patient_data$any.other.significant.negative.events)
confound12 = factor(patient_data$infections) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound12)){if(is.na(confound12[i])){confound12[i] = median(as.numeric(na.omit(confound12)))}}
confound13 = factor(patient_data$haematological) #note - there is 1 NA. We set these to the median.default
for(i in 1:length(confound13)){if(is.na(confound13[i])){confound13[i] = median(as.numeric(na.omit(confound13)))}}
confound14 = factor(patient_data$thyroid) #note - there is 1 NA. We set these to the median.
for(i in 1:length(confound14)){if(is.na(confound14[i])){confound14[i] = median(as.numeric(na.omit(confound14)))}}
confound15 = factor(patient_data$Tobacco)
confound16 = factor(patient_data$Magic_Mushroom)
coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3,confound5,confound6,confound7,confound8,confound9,confound10,confound11,confound12,confound13,confound14,confound15,confound16)

# gets batch
confound4 = factor(patient_data$centre)

# corrects
modcombat = model.matrix(~confound1+confound2+confound3+confound5+confound6+confound7+confound8+confound9+confound10+confound11+confound12+confound13+confound14+confound15+confound16, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)

# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/modelled/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)

===== Combat correction - limited model=====


## Correct with ComBat

library(devtools)
library(Biobase)
library(sva)


# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/limited_model/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)

# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
confound3 = patient_data$BMI #note - there are 5 NAs in BMI. We set these to the mean.
for(i in 1:length(confound3)){if(is.na(confound3[i])){confound3[i] = mean(na.omit(confound3))}}
confound3 = factor(cut(confound3,4,labels=c("a","b","c","d")))
coldata = data.frame(row.names=colnames(EM),confound1,confound2,confound3)

# gets batch
confound4 = factor(patient_data$centre)

# corrects
modcombat = model.matrix(~confound1+confound2+confound3, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)

# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)


## Correct with ComBat

library(devtools)
library(Biobase)
library(sva)


# loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")
EM <- read.table("deseq2/limited_model_no_BMI/EM_HC_vs_MDD.csv", header=TRUE, row.names=1)
patient_data <- read.table("raw_data/clinical_data_valid_samples.tsv", header=TRUE, row.names=1)

# gets the confounding variables
confound1 = factor(cut(patient_data$age,4,labels=c("a","b","c","d")))
confound2 = factor(patient_data$gender)
coldata = data.frame(row.names=colnames(EM),confound1,confound2)

# gets batch
confound4 = factor(patient_data$centre)

# corrects
modcombat = model.matrix(~confound1+confound2, data=coldata)
combat_edata = ComBat(dat=EM, batch=confound4, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)

# saves
resdata <- cbind(row.names(combat_edata),round(combat_edata,2))
colnames(resdata)[1] <- "ID"
write.table(resdata, file="deseq2/limited_model_no_BMI/EM_HC_vs_MDD_combat.csv",row.names=FALSE, sep="\t", quote = FALSE)

===== Percentile Correction =====


setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")


# method for correcting outliers
correct_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- qnt[1] - H
  y[x > (qnt[2] + H)] <- qnt[2] + H
  y
}


# pre Combat
expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
EM <- expression[0,]

for(row in 1:nrow(expression)) 
{
 gene <- expression[(row),]
 EM <- rbind(EM,correct_outliers(gene))
}

EM_IDs = EM
EM_IDs$IDs = row.names(EM)
EM_IDs = EM_IDs[,ncol(EM_IDs)]
EM = cbind(EM_IDs,EM)
colnames(EM)[1] = "IDs"

write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE)


# post Combat
expression = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
EM <- expression[0,]

for(row in 1:nrow(expression)) 
{
 gene <- expression[(row),]
 EM <- rbind(EM,correct_outliers(gene))
}

EM_IDs = EM
EM_IDs$IDs = row.names(EM)
EM_IDs = EM_IDs[,ncol(EM_IDs)]
EM = cbind(EM_IDs,EM)
colnames(EM)[1] = "IDs"

write.table(EM, file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv",row.names=FALSE, sep="\t", quote = FALSE)

===== Expected False Positives by FDR Threshold ===== ==== Cavanagh ====


##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- List of thresholds ----##
thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001)

##---- get the number of false positives ----##
false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds)))
names(false_positive_dist) = c("threshold","count","max")
false_positive_dist$threshold = thresholds

for (index in 1:length(thresholds)) {
  threshold = thresholds[index]
  print(threshold)
  values <- vector(mode="numeric", length=50)
  max = 0
  for (counter in 1:50)
  {
    DE_data = read.table(file=paste("cavanagh/deseq2/random/44_187/DE_",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE)
    values[counter] = nrow(subset(DE_data,p.adj < threshold))
    
    if (nrow(subset(DE_data,p.adj < threshold)) > max) 
    {
      max = nrow(subset(DE_data,p.adj < threshold))
    }
  }
  false_positive_dist[index,2] = median(values)
  false_positive_dist[index,3] = max

}

write.table(false_positive_dist, file="cavanagh/deseq2/random/44_187/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE)

==== Leday ====


##---- working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- List of thresholds ----##
thresholds = c(0.25,0.1,0.05,0.01,0.001,0.0001,0.00001,0.000001)

##---- get the number of false positives ----##
false_positive_dist = data.frame(matrix(0, ncol = 3, nrow = length(thresholds)))
names(false_positive_dist) = c("threshold","count","max")
false_positive_dist$threshold = thresholds

for (index in 1:length(thresholds)) {
  threshold = thresholds[index]
  print(threshold)
  values <- vector(mode="numeric", length=50)
  max = 0
  for (counter in 1:50)
  {
    DE_data = read.table(file=paste("leday/random/DE",counter,".csv",sep=""), header=TRUE, sep='\t', quote='',check.names = TRUE)
    values[counter] = nrow(subset(DE_data,adj.P.Val < threshold))
    
    if (nrow(subset(DE_data,adj.P.Val < threshold)) > max) 
    {
      max = nrow(subset(DE_data,adj.P.Val < threshold))
    }
  }
  false_positive_dist[index,2] = median(values)
  false_positive_dist[index,3] = max

}

write.table(false_positive_dist, file="leday/random/false_positive_count.csv",row.names=FALSE, sep="\t", quote = FALSE)

===== Correlation Clusters ===== Pre-filters the data


##---------------------------------------------##
##					       ##
##     Code to pre-filter the RNA-seq data     ##
##					       ##
##---------------------------------------------##

#Clears the enrivoment
rm(list = ls())


#Load libraries
library(psych)
library(stats)

#loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")
exp_values = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##---- Pre-filters the RNA-seq data ----##

# Remove genes
exp_values <- data.frame(subset(exp_values,apply(exp_values, 1, mean) >= 10))


# Identify genes with a high threshold
geneHighCoefOfVarFilter <- function(dataMatrix, coefOfVarThreshold) {
  coefOfVars <- apply(dataMatrix, 1, function(x) { sd(x) / abs(mean(x)) })
  fdata <- dataMatrix[coefOfVars < coefOfVarThreshold, ]
  
  # return the filtered data
  list(coefOfVars = coefOfVars, fdata=fdata)
}

# the smaller the threshold, the higher the experimental effect relative to the measurement precision
thr <- 0.15

# does the filtering
filtered <- geneHighCoefOfVarFilter(exp_values, thr)
filtered.combined <- na.omit(data.frame(filtered$fdata))
nrow(filtered.combined)

num.genes <- nrow(filtered.combined)
genes <- row.names(filtered.combined)

rnaSeq <- t(filtered.combined)
subjects <- row.names(rnaSeq)
my_subjs <- subjects


##---- Saves the data ----##

rnaSeq <- rnaSeq[my_subjs,]
num.genes <- ncol(rnaSeq)
save(num.genes, rnaSeq, my_subjs,file = "cavanagh/clusters/exp.filtered.corrected.Rdata")

Does the clustering


##---------------------------------------------##
##					       ##
##          Code to do the clustering          ##
##					       ##
##---------------------------------------------##

#loads the data
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/clusters/")
load("exp.filtered.corrected.Rdata")


#loads the libraries
library("dendextend")
library(WGCNA)
library(GSVA)
library(reshape2)
library(grid)
library(broom)
library(ggplot2)
library(ade4)
library(leaps) 
library(dplyr)
library(flashClust)


# Prepares the data
pathnames <- list.files(pattern="[.]R$", path=paste(getwd(),"/funcs", sep =""), full.names=TRUE);
sapply(pathnames, FUN=source);
genes <- colnames(rnaSeq)
rnaseq_counts <- rnaSeq
expr <- as.matrix(t(rnaseq_counts))


##---- does the clustering ----##
corrThresh = 0.5
inputmat = rnaSeq
regs = genes
inputmat <- cor(inputmat)
adjacency <- inputmat > corrThresh
TOM <- TOMsimilarity(adjacency+0, TOMType = "unsigned");
dissTOM = 1-TOM

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 50;

# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, cutHeight = 0.95, distM = dissTOM,deepSplit = 2, pamRespectsDendro = FALSE,minClusterSize = minModuleSize);
mod_sizes <- table(dynamicMods)
length(mod_sizes)
mod_sizes

# gets all cluster members except first (-1) because first is "unassigned genes"
numMods <- sort(unique(names(dynamicMods)))[-1]
names(dynamicMods) <- colnames(adjacency)
modNumbers <- unique(dynamicMods)
names(dynamicMods) <- colnames(adjacency)
gene.mod.list <- lapply(modNumbers, function(x) {
  cluster_genes <- dynamicMods[dynamicMods == x]
  names(cluster_genes)
})

gene.mod.df <- melt(gene.mod.list)
rownames(gene.mod.df) <- gene.mod.df[,1]

# saves the results
write.table(gene.mod.df, file="meta_genes.csv",row.names=TRUE, sep="\t", quote = FALSE)

Stats and plots


setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- general libraries ----##
library(ggplot2)
library(reshape)
library(amap)


##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- number of clusters:  ----##
number_of_clusters = 48


##---- plot metagene correlations:  ----##

DGM_all = read.table(file="clusters/meta_genes.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
expression_data = read.table(file="deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

for(cluster_no in 1:number_of_clusters)
{

cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
expression_data_cluster_corr_melted = melt(cor(t(expression_data_cluster)))
expression_data_cluster_corr_melted = subset(expression_data_cluster_corr_melted,value < 1)

ggp <- ggplot(expression_data_cluster_corr_melted, aes(x=value)) + geom_density(alpha=0.25) + xlab("SCC") + ylab("density") + theme_SL2() + theme(legend.position="bottom") + xlim(c(-1,1)) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_corr_plots/",cluster_no,"_corrplot.svg",sep=""), height=300/90, width=300/90)
print(ggp)
dev.off()

}

##---- plot metagene HC vs MDD  ----##

patient_data_cavanagh = read.table(file="raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

# violin plots
legend_position = "none"

mdd_ttest_list = data.frame(matrix(, nrow=number_of_clusters, ncol=4))
mdd_spearman_list = data.frame(matrix(, nrow=number_of_clusters, ncol=2))

for(cluster_no in 1:number_of_clusters)
{

# gets the cluster expression data
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
expression_data_cluster_scaled = data.frame(t(scale(t(expression_data_cluster))))
expression_data_cluster_scaled_colmeans = data.frame(colMeans(expression_data_cluster_scaled))

#stats for subtypes
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$group
group1 = subset(expression_data_cluster_scaled_colmeans,Diag=="HC")$colMeans.expression_data_cluster_scaled.
group2 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_resistant")$colMeans.expression_data_cluster_scaled.
group3 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_responder")$colMeans.expression_data_cluster_scaled.
group4 = subset(expression_data_cluster_scaled_colmeans,Diag=="MDD_untreated")$colMeans.expression_data_cluster_scaled.
mdd_ttest_list[cluster_no,2] = t.test(group1,group2)$p.value
mdd_ttest_list[cluster_no,3] = t.test(group1,group3)$p.value
mdd_ttest_list[cluster_no,4] = t.test(group1,group4)$p.value

#plots the subtypes
x_axis_label = ""
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_boxplots_subtypes/",cluster_no,"_boxplot.svg",sep=""), height=160/90, width=400/90)
print(ggp)
dev.off()

#stats for disease state
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$MDD
group1 = subset(expression_data_cluster_scaled_colmeans, Diag==0)$colMeans.expression_data_cluster_scaled.
group2 = subset(expression_data_cluster_scaled_colmeans,Diag==1)$colMeans.expression_data_cluster_scaled.
mdd_ttest_list[cluster_no,1] = t.test(group1,group2)$p.value

#plots for disease state
x_axis_label = ""
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag)) + scale_x_discrete(breaks=c(0, 1),labels=c("HC","MDD"),limits=c(0, 1)) + geom_boxplot() + ylim(min(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25), max(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled. * 1.25)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_boxplots_MDD/",cluster_no,"_boxplot.svg",sep=""), height=200/90, width=200/90)
print(ggp)
dev.off() 

#stats for hamilton score
expression_data_cluster_scaled_colmeans$Diag = patient_data_cavanagh$hamilton_score
mdd_spearman_list[cluster_no,1]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$estimate
mdd_spearman_list[cluster_no,2]=cor.test(expression_data_cluster_scaled_colmeans$colMeans.expression_data_cluster_scaled., expression_data_cluster_scaled_colmeans$Diag, method=c("spearman"))$p.value

#plots the hamilton score
x_axis_label = "Hamilton score"
y_axis_label = "Cluster meta-gene\nexpression (Z-score)"
ggp = ggplot(expression_data_cluster_scaled_colmeans, aes(x=Diag, y=colMeans.expression_data_cluster_scaled., group=Diag, fill=Diag)) + geom_point() + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position) + ggtitle(paste("cluster",cluster_no))
svg(paste("clusters/cluster_vs_hamilton_corr_plots/",cluster_no,"_dotplot.svg",sep=""), height=160/90, width=150/90)
print(ggp)
dev.off()

}

# BH correction of stats
mdd_ttest_list$HC_vs_MDD_FDR = p.adjust(mdd_ttest_list$X1, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_resistant_FDR = p.adjust(mdd_ttest_list$X2, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_responder_FDR = p.adjust(mdd_ttest_list$X3, method = "BH", n = number_of_clusters)
mdd_ttest_list$HC_vs_untreated_FDR = p.adjust(mdd_ttest_list$X4, method = "BH", n = number_of_clusters)
names(mdd_ttest_list) = c("HC_vs_MDD_p","HC_vs_resistant_p","HC_vs_responder_p","HC_vs_untreated_p","HC_vs_MDD_pBH","HC_vs_resistant_pBH","HC_vs_responder_pBH","HC_vs_untreated_pBH")
write.table(mdd_ttest_list, file="clusters/cluster_violin_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE)

mdd_spearman_list$HAM_vs_Exp_SCC_FDR = p.adjust(mdd_spearman_list$X2, method = "BH", n = number_of_clusters)
names(mdd_spearman_list) = c("SCC","p","pBH")
write.table(mdd_spearman_list, file="clusters/cluster_vs_ham_pvals.csv",row.names=TRUE, sep="\t", quote = FALSE)



##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
  x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
  y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
  
  x.dd = as.dendrogram(x.cluster)
  y.dd = as.dendrogram(y.cluster)
  x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
  y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
  x.order = order.dendrogram(x.dd.reorder)
  y.order = order.dendrogram(y.dd.reorder)

  if(cluster_x == TRUE && cluster_y == TRUE) 
  {
      mx_clustered = mx[y.order,x.order]
  }
  
  if(cluster_x == TRUE && cluster_y == FALSE)
  {
      mx_clustered = mx[,x.order]
  }
  
  if(cluster_x == FALSE && cluster_y == TRUE)
  {
      mx_clustered = mx[y.order,]
  }
  if(cluster_x == FALSE && cluster_y == FALSE) 
  {
      mx_clustered = mx
  }
  return(mx_clustered)
}

##----- Significant Genes Heatmap Function  -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{

  if(nrow(matrix_scaled) >= 2) 
  {
    matrix_scaled_renamed = as.matrix(matrix_scaled)  
    matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
    matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
    matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
    matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))

    hm.palette <- colorRampPalette(colours)
    ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
    return(ggp)
  }
  else
  {
    return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
  }
}


##---- Heatmap settings ----##

plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = TRUE
cluster_y = TRUE


##---- plot metagene Heatmaps  ----##

for(cluster_no in 1:number_of_clusters)
{

cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_cluster = na.omit(expression_data[row.names(cluster_IDs),])
heatmap_matrix = data.frame(t(scale(t(expression_data_cluster))))

nrow(heatmap_matrix)
ncol(heatmap_matrix)

ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png(paste("clusters/cluster_heatmaps/",cluster_no,"_heatmap.png",sep=""), height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()
}


##---- plot correlation heatmap  ----##
plot_height = 2500
plot_width = 2500
cluster_x = TRUE
cluster_y = TRUE


#select the first 25 genes from each cluster (to make clustering quicker)
cluster_genes_25 = list()
for(cluster_no in 1:number_of_clusters)
{
  cluster_genes = subset(DGM_all,L1 == cluster_no)
  cluster_genes_top = head(cluster_genes,round(nrow(cluster_genes)/4))
  cluster_genes_25 = rbind(cluster_genes_25,cluster_genes_top)
}

nrow(cluster_genes_25)

# makes the matrix
expression_data_clusters = na.omit(expression_data[row.names(cluster_genes_25),])
cluster_IDs = subset(DGM_all,L1 == cluster_no)
expression_data_clusters_corr = cor(t(expression_data_clusters))
heatmap_matrix = data.frame(expression_data_clusters_corr)

nrow(heatmap_matrix)
ncol(heatmap_matrix)

# plots
ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("clusters/correlation_heatmap.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

===== Plots ===== ==== Cibersort ====


##---- libraries----##
library(ggplot2)
library(reshape)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- loads data ----##
cavanagh = read.table(file="cavanagh/cibersort/CIBERSORT.Output_Job6_parsed.csv",row.names=1,header=TRUE, sep='\t', quote='',check.names = TRUE)

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(1,1,1,1), "cm")
        )
}

##---- Cavanagh only plot ----##
names(cavanagh) = c("Monocytes","T cells CD4 memory resting","NK cells resting","T cells CD4 naive","T cells CD8","B cells naive","Macrophages M0","B cells memory","Neutrophils","T cells regulatory (Tregs)","Mast cells resting","Dendritic cells activated","T cells CD4 memory activated","Plasma cells","Macrophages M2","Eosinophils","Mast cells activated","Macrophages M1")
cavanagh_melted = melt(cavanagh)

ggp = ggplot(cavanagh_melted, aes(x=variable, y=value*100)) + geom_boxplot() + xlab("") + ylab("% of cell population") + theme_SL2() + theme(legend.position="right",axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
svg("plots/cibersort_cavanagh.svg", height=400/90, width=800/90)
print(ggp)
dev.off()

==== Signficant Confounders Bar Chart ====


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- number of sig genes by clinical parameter bar chart ----##
sig_genes_by_clinical_parameter = read.table(file="cavanagh/deseq2/unmodelled/SIG_COUNTS.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

sig_genes_by_clinical_parameter$IDs = row.names(sig_genes_by_clinical_parameter)
sig_genes_by_clinical_parameter$IDs = gsub('\\.', ' ', sig_genes_by_clinical_parameter$IDs)
sig_genes_by_clinical_parameter$IDs = gsub('  ', ' ', sig_genes_by_clinical_parameter$IDs)
sig_genes_by_clinical_parameter$IDs = factor(gsub('_', ' ', sig_genes_by_clinical_parameter$IDs))
row.names(sig_genes_by_clinical_parameter) = sig_genes_by_clinical_parameter$IDs
sig_genes_by_clinical_parameter = sig_genes_by_clinical_parameter[with(sig_genes_by_clinical_parameter, order(count)), ]

sig_genes_by_clinical_parameter_top = subset(sig_genes_by_clinical_parameter,count >= 5)
sig_genes_by_clinical_parameter_top$IDs = factor(sig_genes_by_clinical_parameter_top$IDs)
sig_genes_by_clinical_parameter_top$IDs <- factor(sig_genes_by_clinical_parameter_top$IDs, levels = sig_genes_by_clinical_parameter_top$IDs)

x_axis_label = "number of significant genes (adjusted p <0.01)"
y_axis_label = ""
bar_outline_size = 1
bar_transparency = 0.75
legend_position = "bottom"
data_label_size = 3

ggp <- ggplot(data=sig_genes_by_clinical_parameter_top , aes(x=IDs, y=count)) + geom_bar(colour="black",stat="identity", position = "dodge", alpha = bar_transparency) + coord_flip() + ylab(x_axis_label)  + xlab(y_axis_label) + theme_SL2() + theme(axis.text.y = element_text(hjust=1),legend.position=legend_position, legend.spacing.x = unit(0.15, 'cm')) + scale_y_continuous(expand=c(0,0), limits=c(0,max(sig_genes_by_clinical_parameter_top$count)*1.25)) + geom_text(aes(label=count), position=position_dodge(width=0.9), size=data_label_size, show.legend = FALSE, hjust=-0.5)
svg("plots/sig_genes_barchart.svg", height=300/90, width=800/90, pointsize=5)
print(ggp)
dev.off()

==== Age, Gender, BMI heatmaps =====


##---- libraries----##
library(ggplot2)
library(amap)
library(reshape)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- loads data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(1,1,1,1), "cm")
        )
}

##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
  x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
  y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
  
  x.dd = as.dendrogram(x.cluster)
  y.dd = as.dendrogram(y.cluster)
  x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
  y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
  x.order = order.dendrogram(x.dd.reorder)
  y.order = order.dendrogram(y.dd.reorder)

  if(cluster_x == TRUE && cluster_y == TRUE) 
  {
      mx_clustered = mx[y.order,x.order]
  }
  
  if(cluster_x == TRUE && cluster_y == FALSE)
  {
      mx_clustered = mx[,x.order]
  }
  
  if(cluster_x == FALSE && cluster_y == TRUE)
  {
      mx_clustered = mx[y.order,]
  }
  if(cluster_x == FALSE && cluster_y == FALSE) 
  {
      mx_clustered = mx
  }
  return(mx_clustered)
}

##----- Significant Genes Heatmap Function  -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{

  if(nrow(matrix_scaled) >= 2) 
  {
    matrix_scaled_renamed = as.matrix(matrix_scaled)  
    matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
    matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
    matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
    matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))

    hm.palette <- colorRampPalette(colours)
    ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
    return(ggp)
  }
  else
  {
    return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
  }
}


##---- Heatmap settings ----##

plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = FALSE
cluster_y = TRUE

##---- Gender Heatmap ----##

samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)

ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_gender_heatmap.png", height=1750, width=1000, pointsize=5)
print(ggp)
dev.off()

##---- Age Heatmap ----##

samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)

ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_age_heatmap.png", height=2000, width=1000, pointsize=5)
print(ggp)
dev.off()

##---- BMI Heatmap ----##

samples = unlist(row.names(read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)))
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01))
heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)

ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/cavanagh_BMI_heatmap.png", height=2000, width=1000, pointsize=5)
print(ggp)
dev.off()

==== Age, Gender, BMI Boxplots ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- loads cavanagh data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
age_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
BMI_samples = read.table(file="cavanagh/deseq2/unmodelled/SAMPLES_BMI.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- gender plots ----##

# 2 is female
gene = "ENSG00000012817"
expression_data_gene = t(expression_data[gene,])
groups = samples$gender
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
sample_groups = c("Male", "Female")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_gender_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000147050"
expression_data_gene = t(expression_data[gene,])
groups = samples$gender
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
sample_groups = c("Male", "Female")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=sample_groups,labels=sample_groups,limits=sample_groups) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_gender_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()


##---- age plots ----##

# 2 is younger
gene = "ENSG00000169855"
groups = age_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Youngest", "Oldest")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_age_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000277586"
groups = age_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Youngest", "Oldest")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_age_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()


##---- BMI plots ----##

# 2 is lower
gene = "ENSG00000133742"
groups = BMI_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Lowest", "Highest")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(c(0,500))
svg("plots/cavanagh_BMI_up_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000161681"
groups = BMI_samples
expression_data_gene = t(expression_data[gene,row.names(groups)])
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("Lowest", "Highest")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_BMI_down_boxplot.svg", height=200/90, width=150/90)
print(ggp)
dev.off()

==== Cavanagh P-Value Waterfall Plots ====


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- Plot settings ----##
x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)"
y_axis_label = "-log10p value"
line_type = "solid"
line_size = 1.5


##---- Cavanagh ----##
random = read.table(file="cavanagh/deseq2/random/44_187/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
gender = read.table(file="cavanagh/deseq2/unmodelled/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
age = read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
bmi = read.table(file="cavanagh/deseq2/unmodelled/DE_BMI.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_mdd = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_resistant = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_resistant.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_responder = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_responder.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_untreated = read.table(file="cavanagh/deseq2/modelled/DE_HC_vs_MDD_untreated.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)

random_dist = head(sort(rowMeans(random)),1000)
gender_dist = head(sort(gender$p),1000)
age_dist = head(sort(age$p),1000)
bmi_dist = head(sort(bmi$p),1000)
hc_vs_mdd_dist = head(sort(hc_vs_mdd$p),1000)
hc_vs_resistant_dist = head(sort(hc_vs_resistant$p),1000)
hc_vs_responder_dist = head(sort(hc_vs_responder$p),1000)
hc_vs_untreated_dist = head(sort(hc_vs_untreated$p),1000)
rank = 1:1000

# all
p_dist_combined = data.frame(cbind(gender_dist,age_dist,bmi_dist,hc_vs_mdd_dist,hc_vs_resistant_dist,hc_vs_responder_dist,hc_vs_untreated_dist,random_dist,rank))
names(p_dist_combined) = c("male vs female", "youngest vs oldest", "lowest vs highest BMI", "HC vs MDD", "HC vs MDD resistant", "HC vs MDD responder", "HC vs MDD untreated","random", "rank")
p_dist_combined_melted = melt(p_dist_combined, id="rank")

ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_p_distribution.svg", height=250/90, width=450/90, pointsize=5)
print(ggp)
dev.off()

ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000000000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right")
svg("plots/cavanagh_p_distribution_key.svg", height=500/90, width=600/90)
print(ggp)
dev.off()

==== Cavanagh MDD Boxplots ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- loads cavanagh data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- MDD plots ----##

# 0 is HC
gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_mdd_down1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000115290"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_mdd_down2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()


##---- Resistant plots ----##

# 1 is HC
samples_hc_and_resistant = subset(samples,group=="HC" | group=="MDD_resistant")
expression_data_hc_and_resistant = expression_data[,row.names(samples_hc_and_resistant)]

gene = "ENSG00000162738"
expression_data_gene = t(expression_data_hc_and_resistant[gene,])
groups = samples_hc_and_resistant$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("HC", "RES")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_resistant_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000182272"
expression_data_gene = t(expression_data_hc_and_resistant[gene,])
groups = samples_hc_and_resistant$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("HC", "RES")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_resistant_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()


##---- Responder plots ----##

# 1 is HC
samples_hc_and_responder = subset(samples,group=="HC" | group=="MDD_responder")
expression_data_hc_and_responder = expression_data[,row.names(samples_hc_and_responder)]

gene = "ENSG00000142686"
expression_data_gene = t(expression_data_hc_and_responder[gene,])
groups = samples_hc_and_responder$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "3"))
sample_groups = c("HC", "RSP")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_responder_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000128039"
expression_data_gene = t(expression_data_hc_and_responder[gene,])
groups = samples_hc_and_responder$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "3"))
sample_groups = c("HC", "RSP")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "3"),labels=sample_groups,limits=c("1", "3")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_responder_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()


##---- Untreated plots ----##

# 1 is HC
samples_hc_and_untreated = subset(samples,group=="HC" | group=="MDD_untreated")
expression_data_hc_and_untreated = expression_data[,row.names(samples_hc_and_untreated)]

gene = "ENSG00000165633"
expression_data_gene = t(expression_data_hc_and_untreated[gene,])
groups = samples_hc_and_untreated$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "4"))
sample_groups = c("HC", "UNT")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_untreated_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000075151"
expression_data_gene = t(expression_data_hc_and_untreated[gene,])
groups = samples_hc_and_untreated$group
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "4"))
sample_groups = c("HC", "UNT")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "4"),labels=sample_groups,limits=c("1", "4")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_untreated_up2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

==== Cavanagh Random Boxplots ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- Random 1 plots ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_1.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000186973"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G1", "G2")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random1_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000182272"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G1", "G2")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random1_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()




##---- Random 2 plots ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_combat_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/deseq2/random/44_187/DE_PATIENTS_2.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000120658"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G3", "G4")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random2_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "ENSG00000115414"
expression_data_gene = t(expression_data[gene,])
groups = samples$condition
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("1", "2"))
sample_groups = c("G3", "G4")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("1", "2"),labels=sample_groups,limits=c("1", "2")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/cavanagh_random2_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

==== Cavanagh False Positive Counts Bar Charts ====


##---- libraries----##
library(ggplot2)
library(reshape)


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- load data ----##
false_positives = read.table(file="cavanagh/deseq2/random/44_187/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)

##---- Make plots ----##
false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"))
false_positives_melt = melt(false_positives,id.vars="threshold")

ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("median false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,25)) 
svg("plots/cavanagh_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()

ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,2500)) 
svg("plots/cavanagh_false_positive_max.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()

==== Cavanagh Random Enriched GO ====


##---- general libraries ----##
library(ggplot2)
library(reshape)

##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)

##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- input files ----##
ontologies = read.table(file="cavanagh/deseq2/random/44_187/david/DAVID_at_least_3_GO_KEGG_REACTOME_BIOCARTA.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = subset(ontologies,FDR < 5)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")]
ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,5))
ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")

##---- plot table ----##
plot_height = 475/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2

table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_random_ontologies_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()

==== Leday MDD Boxplots ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- loads leday data ----##
expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
expression_data = expression_data[,rownames(samples)]

##---- MDD plots ----## 

# 2 is HC
gene = "212531_at" # LCN2
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_1_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "212768_s_at" # OLFM4
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_2_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "207269_at" # DEFA4
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_3_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "231688_at" # MMP8
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_4_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "206676_at" # CEACAM8
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_5_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "205557_at" # BPI
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_6_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "206851_at" # RNASE3
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_7_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

gene = "206697_s_at" # HP
expression_data_gene = t(expression_data[gene,])
groups = samples$source_name_ch1
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("2", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=expression, group=group)) + geom_boxplot() + scale_x_discrete(breaks=c("2", "1"),labels=sample_groups,limits=c("2", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1))
svg("plots/leday_mdd_8_boxplot.svg", height=150/90, width=150/90)
print(ggp)
dev.off()

==== Leday MDD heatmap ====


##---- libraries----##
library(ggplot2)
library(amap)
library(reshape)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- loads data ----##
expression_data = read.table(file="leday/EM_covariate_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(1,1,1,1), "cm")
        )
}

##----- Clustering Method -----##
cluster_matrix <- function(mx,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{
  x.cluster = hclust(Dist(t(mx), method=distance_method), method=clustering_method)
  y.cluster = hclust(Dist(mx, method=distance_method), method=clustering_method)
  
  x.dd = as.dendrogram(x.cluster)
  y.dd = as.dendrogram(y.cluster)
  x.dd.reorder = reorder(x.dd,0,FUN=reorder_function)
  y.dd.reorder = reorder(y.dd,0,FUN=reorder_function)
  x.order = order.dendrogram(x.dd.reorder)
  y.order = order.dendrogram(y.dd.reorder)

  if(cluster_x == TRUE && cluster_y == TRUE) 
  {
      mx_clustered = mx[y.order,x.order]
  }
  
  if(cluster_x == TRUE && cluster_y == FALSE)
  {
      mx_clustered = mx[,x.order]
  }
  
  if(cluster_x == FALSE && cluster_y == TRUE)
  {
      mx_clustered = mx[y.order,]
  }
  if(cluster_x == FALSE && cluster_y == FALSE) 
  {
      mx_clustered = mx
  }
  return(mx_clustered)
}

##----- Significant Genes Heatmap Function  -----##
make_significant_genes_heatmap <- function(matrix_scaled,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
{

  if(nrow(matrix_scaled) >= 2) 
  {
    matrix_scaled_renamed = as.matrix(matrix_scaled)  
    matrix_scaled_renamed_clustered = cluster_matrix(matrix_scaled_renamed,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
    matrix_scaled_renamed_clustered_melted <- melt(matrix_scaled_renamed_clustered)
    matrix_scaled_renamed_clustered_melted$X1 = factor(matrix_scaled_renamed_clustered_melted$X1, levels=row.names(matrix_scaled_renamed_clustered))
    matrix_scaled_renamed_clustered_melted$X2 = factor(matrix_scaled_renamed_clustered_melted$X2, levels=colnames(matrix_scaled_renamed_clustered))

    hm.palette <- colorRampPalette(colours)
    ggp = ggplot(matrix_scaled_renamed_clustered_melted, aes(x = X2, y = X1, fill = value)) + geom_tile() + scale_fill_gradientn(colours = hm.palette(100)) + ylab('') + xlab('') + theme_SL2() + theme(legend.position="none", legend.title = element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(), axis.ticks=element_blank())
    return(ggp)
  }
  else
  {
    return(ggplot(data.frame()) + theme_SL2() + geom_blank() + ggtitle("There were too few genes to sensibly plot this."))
  }
}


##---- Heatmap settings ----##

plot_height = 750
plot_width = 500
colours = c("blue","yellow")
distance_method = "spearman"
clustering_method = "average"
reorder_function = "average"
cluster_x = FALSE
cluster_y = TRUE

##---- Heatmap ----##

#get the samples in order of HC then MDD
sample_data = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
control = unlist(row.names(subset(sample_data,group == 0)))
mdd = unlist(row.names(subset(sample_data,group == 1)))
samples = c(control,mdd)

MDD165 = read.table(file="leday/MDD165.txt", header=FALSE, sep='\t', quote='',check.names = TRUE)
names(MDD165) = "Gene.Symbol"
id_converter = read.table(file="leday/DE.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
sig_genes = merge(MDD165,id_converter, by="Gene.Symbol")$Gene.Symbol


heatmap_matrix = data.frame(t(scale(t(expression_data[sig_genes,samples]))))
nrow(heatmap_matrix)
ncol(heatmap_matrix)

ggp = make_significant_genes_heatmap(heatmap_matrix,colours,cluster_x,cluster_y,distance_method,clustering_method,reorder_function)
png("plots/leday_MDD165_heatmap.png", height=1750, width=1000, pointsize=5)
print(ggp)
dev.off()

==== Leday P Distribution Waterfall ====


##---- libraries ----##
library(ggplot2)
library(reshape)


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- Plot settings ----##
x_axis_label = "Top 250 genes ranked by p-value\n(1 = lowest p-value, 250 = highest p-value)"
y_axis_label = "-log10p value"
line_type = "solid"
line_size = 1.5


##---- Leday ----##
random = read.table(file="leday/random/P_DISTRIBUTION.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
hc_vs_mdd = read.table(file="leday/DE.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
male_vs_female = read.table(file="leday/DE_gender.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
young_vs_old = read.table(file="leday/DE_age.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
anxiety_vs_control = read.table(file="leday/DE_anxiety.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)

random_dist = head(sort(rowMeans(random)),1000)
hc_vs_mdd_dist = head(sort(hc_vs_mdd$P.Value),1000)
gender_dist = head(sort(male_vs_female$P.Value),1000)
age_dist = head(sort(young_vs_old$P.Value),1000)
anxiety_dist = head(sort(anxiety_vs_control$P.Value),1000)
rank = 1:1000

# all
p_dist_combined = data.frame(cbind(gender_dist,age_dist,anxiety_dist,hc_vs_mdd_dist,random_dist,rank))
names(p_dist_combined) = c("male vs female","youngest vs oldest","low vs high anxiety","HC vs MDD","random", "rank")
p_dist_combined_melted = melt(p_dist_combined, id="rank")

ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + ylim(0,10) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="none")
svg("plots/leday_p_distribution.svg", height=250/90, width=450/90, pointsize=5)
print(ggp)
dev.off()

ggp = ggplot(data=p_dist_combined_melted,aes(x=rank,y=-log(value+0.0000000001,10),group=variable,colour=variable)) + xlim(1,250) + geom_line(linetype=line_type,size=line_size) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position="right")
svg("plots/leday_p_distribution_key.svg", height=500/90, width=600/90)
print(ggp)
dev.off()

==== Leday False Positive Count ====


##---- libraries----##
library(ggplot2)
library(reshape)


##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- load data ----##
false_positives = read.table(file="leday/random/false_positive_count.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)

##---- Make plots ----##
false_positives$threshold = factor(false_positives$threshold,levels=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"))
false_positives_melt = melt(false_positives,id.vars="threshold")

ggp = ggplot(data=false_positives,aes(x=threshold,y=count)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("mean false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=count,label=round(count)),vjust=0,nudge_y=1)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,200)) 
svg("plots/leday_false_positive_counts.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()

ggp = ggplot(data=false_positives,aes(x=threshold,y=max)) + geom_bar(stat="identity") + xlab("adjusted p threshold") + ylab("max false positives") + theme_SL2() + theme(legend.position="none",axis.text.x = element_text(angle = 30, hjust=1, vjust=1)) + geom_text(data=false_positives,aes(x=threshold,y=max,label=round(max)),vjust=0,nudge_y=5)+ scale_x_discrete(breaks=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06"),labels=c("0.25","0.1","0.05","0.01","0.001","0.0001","0.00001","0.000001"),limits=c("0.25","0.1","0.05","0.01","0.001","1e-04","1e-05","1e-06")) + ylim(c(0,5000)) 
svg("plots/leday_false_positive_max.svg", height=250/90, width=350/90, pointsize=5)
print(ggp)
dev.off()

==== Leday random GO terms table ====


##---- general libraries ----##
library(ggplot2)
library(reshape)

##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)

##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- input files ----##
ontologies = read.table(file="leday/random/david/David_any1_GO_Kegg_Reactome_Biocarta.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = subset(ontologies,FDR < 50)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("Rank","Term","Name","Pop.Hits","Count","Fold.Enrichment","FDR")]
ontologies_sig_trimmmed$FDR = factor(round(ontologies_sig_trimmmed$FDR,2))
ontologies_sig_trimmmed$Fold.Enrichment = round(ontologies_sig_trimmmed$Fold.Enrichment,2)
names(ontologies_sig_trimmmed) = c("Rank","ID","Term","Term Size","Overlap","Enrichment","FDR")

##---- plot table ----##
plot_height = 300/90
plot_width = 850/90
header_size = 1
text_size = 1
border_thickness = 2

table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/leday_random_ontologies_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()

==== Venn diagram stats ====


# BMI
phyper(14,203,18000-203,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(1,203,18000-203,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(1,203,18000-203,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi

# Age
phyper(37,1244,18000-1244,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(26,1244,18000-1244,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(2,1244,18000-1244,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi

# Gender
phyper(18,625,18000-625,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(3,625,18000-625,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(0,625,18000-625,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi

# Combined
phyper(49,1863,18000-1863,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(27,1863,18000-1863,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(2,1863,18000-1863,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi

# MDD
phyper(5,123,18000-129,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Jansen
phyper(1,29,18000-29,165, lower.tail = FALSE, log.p = FALSE) # Leday vs Mostafavi
phyper(0,29,18000-29,129, lower.tail = FALSE, log.p = FALSE) # Jansen vs Mostafavi

# Random
phyper(3,154,18000-154,165, lower.tail = FALSE, log.p = FALSE) # Leday
phyper(2,154,18000-154,129, lower.tail = FALSE, log.p = FALSE) # Jansen
phyper(1,154,18000-154,29, lower.tail = FALSE, log.p = FALSE) # Mostafavi

==== BMI Webgestaldt table ====


##---- general libraries ----##
library(ggplot2)
library(reshape)

##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)

##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- input files ----##
ontologies = read.table(file="cross_dataset/web_gestaldt/bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = head(ontologies,10)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")]
ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2))
ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")

##---- plot table ----##
plot_height = 300/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2

table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_BMI_webgestaldt_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()

==== Age Webgestaldt table ====


##---- general libraries ----##
library(ggplot2)
library(reshape)

##---- table libraries ----##
library(grid)
library(gridExtra)
library(gtable)

##---- sets the working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- input files ----##
ontologies = read.table(file="cross_dataset/web_gestaldt/age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
ontologies_sig = head(ontologies,10)
ontologies_sig$Rank = row.names(ontologies_sig)
ontologies_sig_trimmmed = ontologies_sig[,c("geneSet","description","size","overlap","enrichmentRatio","FDR")]
ontologies_sig_trimmmed$FDR = factor(signif(ontologies_sig_trimmmed$FDR*100,2))
ontologies_sig_trimmmed$enrichmentRatio = round(ontologies_sig_trimmmed$enrichmentRatio,2)
names(ontologies_sig_trimmmed) = c("ID","Term","Term Size","Overlap","Enrichment","FDR")

##---- plot table ----##
plot_height = 300/90
plot_width = 800/90
header_size = 1
text_size = 1
border_thickness = 2

table_theme <- gridExtra::ttheme_minimal(core = list(fg_params=list(cex = text_size)),colhead = list(fg_params=list(cex = header_size)))
gt <- tableGrob(ontologies_sig_trimmmed, theme=table_theme, rows=NULL)
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 2, b = nrow(gt), l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, l = 1, r = ncol(gt))
gt <- gtable_add_grob(gt,grobs = rectGrob(gp = gpar(fill = NA, lwd = border_thickness)),t = 1, b = nrow(gt), l = 1, r = 1)
svg("plots/cavanagh_Age_webgestaldt_table.svg", height=plot_height, width=plot_width)
grid.arrange(gt)
dev.off()

==== Webgestaldt scatter plots ====


library(ggplot2)

##---- Load data ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cross_dataset/web_gestaldt")
age = read.table(file="age/enrichment_results_wg_result1566543491.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
bmi = read.table(file="bmi/enrichment_results_wg_result1565946369.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
age_bmi = read.table(file="age_plus_BMI/enrichment_results_wg_result1566544579.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
mdd165 = read.table(file="MDD165/enrichment_results_wg_result1565946153.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
Jaansen = read.table(file="Jaansen/enrichment_results_wg_result1565953466.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)
Mostafavi = read.table(file="Mostafavi/enrichment_results_wg_result1565955664.csv", header=TRUE, sep='\t', quote='',check.names = TRUE)


##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- BMI ----##

# BMI vs Leday
merged= merge(mdd165,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_mdd165.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_mdd165_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Jaansen
merged= merge(Jaansen,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_Jansen.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_Jansen_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Mostafavi
merged= merge(Mostafavi,bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_Mostafavi.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")



##---- Age + BMI ----##


# BMI vs Leday
merged= merge(mdd165,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_mdd165.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_mdd165_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Jaansen
merged= merge(Jaansen,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_Jansen.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_Jansen_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Mostafavi
merged= merge(Mostafavi,age_bmi, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_Mostafavi.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age + BMI") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_bmi_Mostafavi_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")





##---- Age ----##


# BMI vs Leday
merged= merge(mdd165,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_mdd165.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_mdd165_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Jaansen
merged= merge(Jaansen,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_Jansen.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Jansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_Jansen_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")


# BMI vs Mostafavi
merged= merge(Mostafavi,age, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_Mostafavi.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none")+ geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p age") + xlab("-log10p Mostafavi")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_age_Mostafavi_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")



##---- MDD ----##



# leday vs Jaansen
merged= merge(mdd165,Jaansen, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_mdd165_Jansen.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + ylab("-log10p Jansen") + xlab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_mdd165_Jansen_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")



# leday vs Mostafavi
merged= merge(Mostafavi,mdd165, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_mdd165_Mostafavi.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p Leday")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_mdd165_Mostafavi_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")



# Jaansen vs Mostafavi
merged= merge(Mostafavi,Jaansen, by="geneSet")
sig_both = subset(merged,FDR.x < 0.05 & FDR.y < 0.05)

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_Jaansen_Mostafavi.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

ggp = ggplot(data=merged, aes(x=-log(pValue.x,10), y=-log(pValue.y,10))) + geom_point(size=1.5,alpha=1)  + theme_SL2() + theme(legend.position="none") + geom_text(data = sig_both, aes(label=description.x),hjust=1, vjust=0,size=3,colour="red",alpha=0.7) + xlab("-log10p Mostafavi") + ylab("-log10p GO\nenrichment Jaansen")# + xlim(c(0,20)) + ylim(c(0,20))  
svg("WebGestaldt_Jaansen_Mostafavi_labelled.svg", height=350/90, width=350/90) 
print(ggp)
dev.off()

cor(-log(merged$pValue.x,10), -log(merged$pValue.y,10),  method = "spearman")

==== Cavanagh Immune Age Scatterplot ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- load data ----##
expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD_outlier_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##---- get meta gene ----##
sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold > 0.5))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_up = data.frame(colMeans(expression_data_sig_zscore))
names(meta_gene_up) = "up"
nrow(expression_data_sig_zscore)


sig_genes = row.names(subset(read.table(file="cavanagh/deseq2/unmodelled/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),p.adj<0.01 & log2fold < -1))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_down = data.frame(colMeans(expression_data_sig_zscore))
names(meta_gene_down) = "down"
nrow(expression_data_sig_zscore)

meta_gene = data.frame(rowMeans(cbind(-meta_gene_down,meta_gene_up)))



##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- plots ----##

age = data.frame(samples$age)
mdd = data.frame(samples$MDD)
group = data.frame(samples$group)
hamilton = data.frame(samples$hamilton_score)
plot_data = cbind(meta_gene,age,mdd,group,hamilton)
names(plot_data) = c("biological_age","chronological_age","MDD","group","hamilton")


hm.palette = colorRampPalette(c("blue","red"))

ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90)
print(ggp)
dev.off()

ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/cavanagh_age_metrics_MDD_scatterplot_BIG.svg", height=1000/90, width=1000/90)
print(ggp)
dev.off()
cor(meta_gene,age,  method = "spearman")

==== Leday Immune Age Scatterplot ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- load data ----##
expression_data = read.table(file="leday/EM_batch_corrected.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
samples = read.table(file="leday/patient_data.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##---- get meta gene ----##
sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC > 0))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_up = data.frame(colMeans(na.omit(expression_data_sig_zscore)))
names(meta_gene_up) = "up"
nrow(expression_data_sig_zscore)


sig_genes = row.names(subset(read.table(file="leday/DE_age.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE),P.Value<0.001 & logFC < -0))
expression_data_sig_zscore = data.frame(t(scale(t(expression_data[sig_genes,]))))
meta_gene_down = data.frame(colMeans(na.omit(expression_data_sig_zscore)))
names(meta_gene_down) = "down"
nrow(expression_data_sig_zscore)

meta_gene = data.frame(rowMeans(cbind(meta_gene_down,-meta_gene_up)))



##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 12, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 11, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 11, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 12, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 12, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=12, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}


##---- plots ----##

age = data.frame(samples$age)
mdd = data.frame(samples$group)
group = data.frame(samples$batch)
plot_data = cbind(meta_gene,age,mdd,group)
names(plot_data) = c("biological_age","chronological_age","MDD","group")


hm.palette = colorRampPalette(c("blue","red"))

ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/leday_age_metrics_MDD_scatterplot.svg", height=400/90, width=550/90)
print(ggp)
dev.off()

ggp = ggplot(data=plot_data, aes(x=chronological_age, y=biological_age,colour=MDD)) + geom_point(size=1,alpha=1) + geom_smooth(method='lm',formula=y~x) + scale_colour_gradientn(colours = hm.palette(100)) + ylab("Immune age") + xlab("Patient age") + theme_SL2() + theme(legend.position="none")
svg("plots/leday_age_metrics_MDD_scatterplot_BIG.svg", height=2000/90, width=2000/90)
print(ggp)
dev.off()
cor(meta_gene,age,  method = "spearman")

temp = subset(mdd, samples.group == 0)
nrow(temp) 

temp = subset(mdd, samples.group == 1)
nrow(temp)

===== Rebuttal Plots ===== ==== HC vs MDD volcano plots ====


##---- general libraries ----##
library(ggplot2)
library(reshape)

##---- PDE sample and group names ----##
sample_groupings = factor(sample_groupings, levels = sample_groups)

##---- sets the working directory. If you move the SL2 folder please update this path ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/cavanagh/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 16, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 14, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 14, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 16, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 16, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=14, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(2.5,"line"),
            plot.margin=unit(c(0.4,0.4,0.4,0.4), "cm")
        )
}

##----- Volcano Plot Function -----##
make_volcano_plot <- function(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size) 
{
  ggp = ggplot(data=PDE_annotated, aes(x=log2fold, y=-log10(p), colour=sig, group=sig,fill=sig)) + geom_point(size=dot_size,alpha=dot_transparency) + scale_color_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + scale_fill_manual(values=c(non_significant_colour,significant_colour),breaks=c(FALSE,TRUE),labels=c(significant_name, non_significant_name)) + xlab(x_axis_label) + ylab(y_axis_label) + theme_SL2() + theme(legend.position=legend_position, legend.title = element_blank()) + geom_text(data=PDE_annotated_sig,aes(label=rownames(PDE_annotated_sig)),hjust=-0.1, vjust=-0.1, size=data_label_size, show.legend = FALSE) + xlim(c(-4,4))
  return(ggp)
}

##----- Volcano Plot Variables -----##

plot_height = 500
plot_width = 600
non_significant_colour = "black"
significant_colour = "red"
dot_size = 1.5
dot_transparency = 1
significant_name = "non-significant"
non_significant_name = "significant"
x_axis_label = "log2 fold change"
y_axis_label = "-log10 p-value"
legend_position = "bottom"


##---- No model ----##
PDE_annotated = read.table(file="deseq2/unmodelled/DE_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)

data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_unmodelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()


##---- No model ----##
PDE_annotated = read.table(file="deseq2/modelled/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)

data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_modelled.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()



##---- limited model ----##
PDE_annotated = read.table(file="deseq2/limited_model/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)

data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()



##---- limited model no BMI ----##
PDE_annotated = read.table(file="deseq2/limited_model_no_BMI/DE_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)
PDE_annotated$sig = PDE_annotated$p.adj < 0.01
PDE_annotated_sig = subset(PDE_annotated,p.adj < 0.01)

data_label_size = 4.5
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_labelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

data_label_size = 0
ggp = make_volcano_plot(non_significant_colour,significant_colour,dot_size,dot_transparency,significant_name,non_significant_name,x_axis_label,y_axis_label,legend_position,data_label_size)
png("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/rebuttals/Friday_27th_sept_2019/volcano_plot_unlabelled_HC_vs_MDD_limited_model_no_BMI.png", height=plot_height, width=plot_width, pointsize=5)
print(ggp)
dev.off()

==== HC vs MDD expression boxplots ====


##---- libraries----##
library(ggplot2)

##---- sets working directory ----##
setwd("/GLAZgo/analysis/Johnathan_Cavanagh_MDD_RNA_seq_paper_2019/")

##---- SL2 Theme ----##
theme_SL2 <- function () { 
    theme_bw() %+replace% 
        theme(
	    panel.grid = element_blank(),
            panel.background = element_blank(),
            panel.border = element_rect(colour = "black", fill=NA, size=1),
            plot.background = element_blank(), 
            legend.background = element_rect(fill="transparent", colour=NA),
            legend.key = element_rect(fill="transparent", colour=NA),
            title = element_text(size = 24, margin = margin(r = 10),hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.text.y = element_text(size = 22, margin = margin(r = 5),hjust=1,vjust=0.5, family="Arial", face="bold",colour="black"),
            axis.text.x = element_text(size = 22, margin = margin(t = 5),hjust=0.5,vjust=1, family="Arial", face="bold",colour="black"), 
            axis.title.y = element_text(size = 24, margin = margin(r = 10),angle = 90,hjust=0.5,vjust=0.5, family="Arial", face="bold"),
            axis.title.x = element_text(size = 24, margin = margin(t = 10),hjust=0.5,vjust=1, family="Arial", face="bold"),
            legend.text=element_text(size=24, family="Arial", face="bold"),
            legend.title=element_blank(), 
            legend.key.size=unit(1,"line"),
            plot.margin=unit(c(0.1,0.1,0.1,0.1), "cm")
        )
}

##---- plot size ----##
plot_height=600
plot_width=400

##---- loads cavanagh data ----##
samples = read.table(file="cavanagh/raw_data/clinical_data_valid_samples.tsv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)


##---- No model ----##

expression_data = read.table(file="cavanagh/deseq2/modelled/EM_HC_vs_MDD.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000188000"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000188659"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000232810"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_no_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()




##---- Limited model no BMI ----##

expression_data = read.table(file="limited_model_no_BMI/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000081041"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000277632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_no_BMI_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()




##---- Limited model ----##

expression_data = read.table(file="limited_model/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000158406"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000277632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000229314"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()



gene = "ENSG00000137267"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_limited_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()





##---- full model ----##

expression_data = read.table(file="modelled/EM_HC_vs_MDD_combat.csv", header=TRUE,row.names = 1, sep='\t', quote='',check.names = TRUE)

gene = "ENSG00000277075"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_1_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000115290"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_2_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000081041"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_3_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000196632"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_4_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()


gene = "ENSG00000278828"
expression_data_gene = t(expression_data[gene,])
groups = samples$MDD
plot_data = data.frame(cbind(expression_data_gene,groups))
names(plot_data) = c("expression","group")
plot_data$group = factor(plot_data$group, levels=c("0", "1"))
sample_groups = c("HC", "MDD")

ggp = ggplot(plot_data, aes(x=group, y=log(expression,2), group=group)) +  geom_violin(trim=FALSE) + geom_boxplot(width=0.25) + scale_x_discrete(breaks=c("0", "1"),labels=sample_groups,limits=c("0", "1")) + xlab(element_blank()) + ylab(element_blank()) + theme_SL2() + theme(legend.position="none", axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0,max(log(plot_data$expression,2))*1.1)
png("rebuttals/Friday_27th_sept_2019/cavanagh_full_model_mdd_5_boxplot.png", height=plot_height, width=plot_width)
print(ggp)
dev.off()