############################################################################
###  R code for analyses and figures for the paper:
#
# Lightfoot E and O’Connell TC (2016). 
# "On The Use of Biomineral Oxygen Isotope Data to Identify Human Migrants in the Archaeological Record: 
# Intra-Sample Variation, Statistical Methods and Geographical Considerations." PLoS ONE 11(4).
# doi:10.1371/journal.pone.0153850
#
#
############################################################################
#  To cite this code, please cite:
#  O’Connell, TC. 2016. R code, https://www.repository.cam.ac.uk/handle/1810/252773
############################################################################
#
############################################################################
#  PLEASE READ THESE NOTES BEFORE USING THE CODE
############################################################################
#
#Please save this file as OConnell_R_code.R to use
#
#This file contains code to produce
#	** some extra variables necessary to produce the figures 
#	** the summary statistics data by site.
#	** a summary sheet for all data by site, including outlier percentages and numbers
#	** code for each of the figures (1,2,3,4,7,8,9,10). [Figs 5 and 6 were not done in R.]
#
#Please note, there is a single set of code for each figure, which can be used to produce both 
#the figure in the main paper (the PID data) and the version in supplementary 
#information (the full dataset). 
#To run the analyses on the PID dataset, I first selected the relevant datapoints from 
#the dataset in excel (where PID=1), and saved that to use in R.
#[I did not try to get so clever that I wrote different code for the PID subset.]
#
#For all the code, it relies on the use of a primary dataset called “Odataset”, 
#which is the dataset saved as a text tab delimited file.
#
#Some of the column headings in the dataset in the repository have been renamed, to make coding easier. 
#The original column headings are given first, then the ones used in the R code as follows:
#Site = Site
#Site-combined (nearby sites combined) = Site_combined
#Country = Country
#Continent = Continent
#Latitude = Lat
#Longitude = Long
#Grid square (European sites only) = Grid_sq
#Alt (masl) = Alt
#Modelled Mean Annual Precipitation (MAP) (‰) = MAP
#Bone/Tooth = B_T
#Tooth grading (1=PID, 2=other tooth, 3=n/a) = PID
#Phosphate  d18O SMOW (‰) = d18O
#
############################################################################
### WARNINGS ###
############################################################################
#Sometimes the figures are so complicated that R can’t cope with plotting them and gives a warning message that ends in
#Polygon edge not found”. Usually, just rerunning the code produces the figure.
#In extreme situations, a reboot of R does the trick.
#
#There are also warnings for the creation of duplicate factors for the sumcountry statistics etc
#"Warning message:
#"In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels,  :
#                                                                                duplicated levels in factors are deprecated
#These can be ignored in my experience.
#As can the warnings when exporting the data to the .csv files - has always done it fine for me.
#
#
############################################################################
### THE CODE ###
############################################################################
#
#Clear out any previous items in the environment, and clear out graphics
rm(list=ls())
graphics.off()
#Load relevant packages before using them
library(ggplot2)
library(aplpack)
library (car)
library (psych)
library(Hmisc)
library(grid)
library(gridExtra)
library(plyr)
library(reshape2)
#Defining a function for layout of viewports
vplayout <-function (x,y)
  viewport(layout.pos.row = x, layout.pos.col = y)
#
############################################################################
### Input the data ###
############################################################################
#
#Upload relevant dataset, which must be called Odataset.txt
#
#
############################################################################
### Sumary stats and outlier identification ###
############################################################################
#Create label for European-only sites
Odataset$Europe <- factor(Odataset$Continent, levels = c("Europe"))
#Create bins for altitude and MAP data by site for European sites only
#MAP bin only works for Europe, as that data are not inputted for the rest of the world
Odataset$MAPbin <- cut_width(Odataset$MAP, 1, boundary = 0, closed = c("right"))
Odataset$altbin <- cut_width(Odataset$Alt, 150, boundary = 0, closed = c("right"))
#In order to only create altbin for European sites (because of Creating Fig 9 later), create a second Alt and altbin for Europe
Odataset$Alt2 <- ifelse((Odataset$Continent %in% "Europe"), Odataset$Alt, NA)
Odataset$altbin2 <- cut_width(Odataset$Alt2, 150, boundary = 0, closed = c("right"))
#
#Statistics computed on d18O at each site: mean, median, SD, difference between individual's value 
#and the site mean and median (for Fig 2 & S2), as well as abs diff from site median (for MAD_Q3)
#
Odataset$Site_N <- ave(Odataset$d18O, Odataset$Site_combined, FUN=length)
Odataset$Site_mean_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=mean)
Odataset$Site_median_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=median)
Odataset$Site_SD_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=SD)
Odataset$Diff_mean_all <- (Odataset$d18O - Odataset$Site_mean_all)
Odataset$Diff_median_all <- (Odataset$d18O - Odataset$Site_median_all)
Odataset$Abs_diff_median_all <- abs(Odataset$Diff_median_all)
Odataset$Zscore_all <- (Odataset$Diff_mean_all / Odataset$Site_SD_all)
#
#Summary statistics data by site.
#This calculates the sample number (N), range, mean, sd, median, IQR, MAD_norm and MAD_Q3 
#for each sites in the dataset
#
Odataset_summary_all <- ddply(Odataset, .(Site_combined), summarize,
                              N = length(d18O), Mean=mean(d18O), Median=median(d18O),
                              Range=(max(d18O) - min(d18O)), SD=sd(d18O), 
                              IQR=IQR(d18O), MAD_norm=mad(d18O),
                              MAD_Q3=(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4))),
                              Spread_Range=(max(d18O) - min(d18O)), Spread_2SD=(4*sd(d18O)), 
                              Spread_1.5IQR=(4*IQR(d18O)), 
                              Spread_3MAD_norm=(6*mad(d18O)),
                              Spread_3MAD_Q3=(6*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))),
                              Twopermil_hi=(mean(d18O)+1), Twopermil_low=(mean(d18O)-1),
                              Range_hi=max(d18O), Range_low=min(d18O),
                              SD_hi=(mean(d18O)+2*sd(d18O)), SD_low=(mean(d18O)-2*sd(d18O)),
                              IQR_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
                              IQR_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
                              MAD_norm_hi=(median(d18O)+3*mad(d18O)), 
                              MAD_norm_low=(median(d18O)-3*mad(d18O)),
                              MAD_Q3_hi=(median(d18O)+3*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))),
                              MAD_Q3_low=(median(d18O)-3*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))) )
#
#Then merge the two datasets together, so that the summary data by site are also listed for
#each individual in the full dataset
#You have some redundant (doubled) variables in there, but it doesn't matter
Odataset_merge <- merge(Odataset, Odataset_summary_all, by = "Site_combined")
#
#Now for working out whether an individual is an outlier or not for their site by each method
#Methods = 2permil, 2SD, 1.5IQR, 3MAD_norm, 3MAD_Q3
Odataset_merge$Outlier_2permil <- ifelse(Odataset_merge$d18O > Odataset_merge$Twopermil_hi | Odataset_merge$d18O < Odataset_merge$Twopermil_low, 1, 0)
Odataset_merge$Outlier_2SD <- ifelse(Odataset_merge$d18O > Odataset_merge$SD_hi | Odataset_merge$d18O < Odataset_merge$SD_low, 1, 0)
Odataset_merge$Outlier_1.5IQR <- ifelse(Odataset_merge$d18O > Odataset_merge$IQR_hi | Odataset_merge$d18O < Odataset_merge$IQR_low, 1, 0)
Odataset_merge$Outlier_3MAD_norm <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_norm_hi | Odataset_merge$d18O < Odataset_merge$MAD_norm_low, 1, 0)
Odataset_merge$Outlier_3MAD_Q3 <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_Q3_hi | Odataset_merge$d18O < Odataset_merge$MAD_Q3_low, 1, 0)
#
#Now creating a summary sheet for all data by site, including outlier percentages and numbers
#This is basically Table S3 & is the basis of Figure 3
Odataset_stats_outliers <- ddply(Odataset_merge, .(Site_combined), summarize,
                                N = length(d18O), 
                                mean=mean(d18O),
                                sd=sd(d18O), 
                                Median=median(d18O),
                                Range=(max(d18O)-min(d18O)),
                                IQR=IQR(d18O),
                                MAD_norm=mad(d18O),
                                Spread_2SD=(4*sd(d18O)), 
                                Spread_1.5IQR=(4*IQR(d18O)), 
                                Spread_3MAD_norm=(6*mad(d18O)),
                                Spread_3MAD_Q3=mean(Spread_3MAD_Q3),
                                Outlier_2permil=sum(Outlier_2permil),
                                Outlier_2SD=sum(Outlier_2SD),
                                Outlier_1.5IQR=sum(Outlier_1.5IQR),
                                Outlier_3MAD_norm=sum(Outlier_3MAD_norm),
                                Outlier_3MAD_Q3=sum(Outlier_3MAD_Q3))
#                                
#Add in the percentages, as I am not good enough to work out how to do it in a single step (rounded to 2dpl)
Odataset_stats_outliers$Outlier_percent_2permil <- round((Odataset_stats_outliers$Outlier_2permil/Odataset_stats_outliers$N),2)
Odataset_stats_outliers$Outlier_percent_2SD <- round((Odataset_stats_outliers$Outlier_2SD/Odataset_stats_outliers$N),2)
Odataset_stats_outliers$Outlier_percent_1.5IQR <- round((Odataset_stats_outliers$Outlier_1.5IQR/Odataset_stats_outliers$N),2)
Odataset_stats_outliers$Outlier_percent_3MAD_norm <- round((Odataset_stats_outliers$Outlier_3MAD_norm/Odataset_stats_outliers$N),2)
Odataset_stats_outliers$Outlier_percent_3MAD_Q3 <- round((Odataset_stats_outliers$Outlier_3MAD_Q3/Odataset_stats_outliers$N),2)
#
#Exporting data for tables
write.csv(Odataset_stats_outliers, file="Odataset_stats_outliers.csv", sep = ",", col.names = NA, qmethod = "double")
#
############################################################################
#Fig 1 The range, SD, IQR and MAD of oxygen isotopic data from each site 
#in the full dataset plotted by site sample size, for all sites where N>4.
#Uses summary data by site, Odataset_all
#
#This requires data in long format, not wide, so it must be melted first.
Odataset_summary_all_long <- melt(Odataset_summary_all, id.vars = c("Site_combined","N"), 
                                       measure =c("Range", "SD", "IQR", "MAD_norm", "MAD_Q3",
                                                  "Spread_Range", "Spread_2SD", "Spread_1.5IQR", 
                                                  "Spread_3MAD_norm", "Spread_3MAD_Q3"))
#Now create new variables to make sure the facets are in the right order - order by 
# levels, and then instruct as to what the labels are.
#Labels are in plotmath so they can parsed directly.
#First a "statistic" for just Fig1a, then a "Criterion" for just FIg1b 
Odataset_summary_all_long$Statistic2 <- factor(Odataset_summary_all_long$variable, 
                                 levels = c("Range","SD","IQR", "MAD_norm", "MAD_Q3"), 
                                 labels = c("Range","SD","IQR","MAD[norm]","MAD[Q3]"))
Odataset_summary_all_long$Criterion2 <- factor(Odataset_summary_all_long$variable, 
                                                levels = c("Spread_Range","Spread_2SD","Spread_1.5IQR", "Spread_3MAD_norm", "Spread_3MAD_Q3"), 
                                                labels = c("Range","SD","IQR","MAD[norm]","MAD[Q3]"))
#
#Now for Figures 1a & 1b                                
fig_1a <- ggplot(data = subset(Odataset_summary_all_long[!is.na(Odataset_summary_all_long$Statistic2), ], N>4)) + 
                                 geom_point(aes(x=N, y=value), alpha=0.5) + 
                                 facet_grid(Statistic2 ~ ., margins = FALSE, labeller = label_parsed) + 
                                 labs(y="Statistic value (\u2030)", x ="Sample number") +
                                 theme_bw()
                                 
fig_1b <- ggplot(data = subset(Odataset_summary_all_long[!is.na(Odataset_summary_all_long$Criterion2), ], N>4)) + 
  geom_point(aes(x=N, y=value), alpha=0.5) + 
  facet_grid(Criterion2 ~ ., margins = FALSE, labeller = label_parsed) + 
  labs(y="Spread encompassed by the boundaries (\u2030)", x ="Sample number") +
  theme_bw()
#
#Combining them into a single figure ready for printing as an eps file
#Figure 1
cairo_ps("fig1.eps", width = 7.5, height = 8.75, pointsize = 12)
grid.newpage()
pushViewport(viewport(layout = grid.layout(1,2)))
vplayout <-function (x,y)
  viewport(layout.pos.row = x, layout.pos.col = y)
print (fig_1a, vp = vplayout (1,1))
print (fig_1b, vp = vplayout (1,2))
grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,1))
grid.text('b', x=0.04, y=0.98, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,2))
dev.off()
#
#
############################################################################
# Figure 2. Histogram of the difference between each individual’s oxygen isotope
# value and (a) their site mean oxygen isotopic value or (b) their site median
# oxygen isotopic value, for all individuals in the full data-subset.
#
#2a - mean
fig_2a <- ggplot(Odataset) + 
  geom_histogram(aes(x=Diff_mean_all), stat = "bin",binwidth=0.1) + 
  scale_x_continuous(breaks = c(-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6)) +
  labs(y="Count", x = expression("Individual "*delta^18 * O[PO4]  * " - "*"Site mean "*delta^18 * O[PO4]  *" "("\u2030"))) +
  theme_bw()
#2b - median
fig_2b <- ggplot(Odataset) + 
  geom_histogram(aes(x=Diff_median_all), stat = "bin",binwidth=0.1) + 
  scale_x_continuous(breaks = c(-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6)) +
  labs(y="Count", x = expression("Individual "*delta^18 * O[PO4]  * " - "*"Site median "*delta^18 * O[PO4]  *" "("\u2030"))) +
  theme_bw()
#
#Combining them into a single figure ready for printing as an eps file
#Fig 2
cairo_ps("fig2.eps", width = 5.2, height = 7, pointsize = 12)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2,1)))
vplayout <-function (x,y)
  viewport(layout.pos.row = x, layout.pos.col = y)
print (fig_2a, vp = vplayout (1,1))
print (fig_2b, vp = vplayout (2,1))
grid.text('a', x=0.03, y=0.95, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,1))
grid.text('b', x=0.03, y=0.95, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(2,1))
dev.off()
#
#
############################################################################
#Figure 3
#This requires data in long format, not wide, so it must be melted first.
Odataset_stats_outliers_long <- melt(Odataset_stats_outliers, id.vars = c("Site_combined","N"), 
                                  measure =c("Outlier_percent_2permil", "Outlier_percent_2SD", 
                                             "Outlier_percent_1.5IQR", "Outlier_percent_3MAD_norm", 
                                             "Outlier_percent_3MAD_Q3"), variable.name="Method", value.name="Fraction")

fig_3 <- ggplot(data = subset(Odataset_stats_outliers_long, N > 4)) + 
  geom_point(aes(x=N, y=(100*Fraction), shape = Method)) +
  theme_bw() + 
  labs(y="Percentage of outliers", x ="Sample number", shape = "Method") +
  scale_shape("Method",
              breaks = c("Outlier_percent_2permil", "Outlier_percent_2SD", 
                         "Outlier_percent_1.5IQR", "Outlier_percent_3MAD_norm", 
                         "Outlier_percent_3MAD_Q3"),
              labels = c("2\u2030","2SD","1.5IQR",bquote("3MAD"[norm]),bquote("3MAD"[Q3])))
#
#Printing Fig 3
cairo_ps("fig3.eps", width = 5.2, height = 4, pointsize = 12)
fig_3
dev.off()
#
#
############################################################################
#Figure 4: Density plot of each individual’s oxygen isotopic value expressed as Z score for 
#all specimens from sites where N>4 in the full dataset, with a fitted normal distribution curve 
#overlaid. Make a subset of Odataset first, as otherwise normal overlay doesn't work. 
Odataset_N4 <- subset(Odataset, Site_N>4)
fig_4 <- ggplot(Odataset_N4, aes(x=Zscore_all)) + 
  geom_histogram(aes(y=..density..), stat = "bin",binwidth=0.2) + 
  scale_x_continuous(breaks = c(-4,-2,0,2,4,6)) +
  labs(y="Density", x = expression(delta^18 * O[PO4]* " expressed as Z score (\u2030)")) +
  theme_bw() +
  stat_function(fun=dnorm, args=list(mean=mean(Odataset_N4$Zscore_all), 
                                     sd=sd(Odataset_N4$Zscore_all)))
#
#Printing Fig 4
cairo_ps("fig4.eps", width = 5.2, height = 4, pointsize = 12)
fig_4
dev.off()
#
#
############################################################################
#Figure 7. Histogram of each individual’s oxygen isotopic value for all 
# European individuals in the full dataset.
fig_7 <- ggplot(Odataset[!is.na(Odataset$Europe), ]) + 
  geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.2) + 
  scale_x_continuous(breaks = c(4,6,8,10,12,14,16,18,20)) +
  labs(y="Count", x = expression(delta^18 * O[PO4]  * " "("\u2030"))) +
  theme_bw()
#
#Printing Fig 7
cairo_ps("fig7.eps", width = 5.2, height = 4, pointsize = 12)
fig_7
dev.off()
#
#
############################################################################
#Figure 8
#This is six separate figures plotted from six separate subsets
#This is scatterplot by country.
#This is to put countries in the right order - make a new variable order by levels
Odataset$Country2 <- factor(Odataset$Country, 
                              levels = c("Ireland", "UK", "France","Belgium",
                                         "Netherlands","Germany","Denmark", "Norway", 
                                         "Finland","Austria", "Czech Rep", "Italy",
                                         "Croatia", "Bulgaria", "Greece", "Turkey"))
#
#Have put in [!is.na(Odataset$Europe), ] in every plot below which tells it not to plot non-European data
#
fig_8a <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Country2, d18O)) +
  geom_jitter(alpha=0.2, width = 0.6) +   theme_bw() +
  labs(x ="Country", y = expression(delta^18 * O[PO4]  * " "("\u2030")))  +
  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 8))
#This is boxplot by country. Shows IQR
fig_8b <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(x=Country2, y=d18O)) +
  geom_boxplot() + theme_bw() +
  labs(x ="Country", y = expression(delta^18 * O[PO4]  * " "("\u2030"))) +
  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 8))
#This is scatterplot by latitude
fig_8c <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Lat, d18O)) +
  geom_point(alpha=0.2) + theme_bw() +
  geom_smooth(method = "lm", colour = "black") +
  labs(x = expression("Degrees of Latitude"), y = expression(delta^18 * O[PO4]  * " "("\u2030")))
#This is scatterplot by longitude with alpha
fig_8d <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Long, d18O)) +
  geom_point(alpha=0.2) + theme_bw()+
  geom_smooth(method = "lm", colour = "black") +
  labs(x = expression("Degrees of Longitude"), y = expression(delta^18 * O[PO4]  * " "("\u2030")))
#This is scatterplot by altitude
fig_8e <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Alt, d18O)) +
  geom_point(alpha=0.2) + theme_bw()+
  geom_smooth(method = "lm", colour = "black") +
  labs(x = expression("Altitude "*"(m)"), y = expression(delta^18 * O[PO4]  * " "("\u2030")))
#This is scatterplot by MAP
fig_8f <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(MAP, d18O)) +
  geom_point(alpha=0.2) + theme_bw() +
  geom_smooth(method = "lm", colour = "black") +
  labs(x=expression("Site "*delta^18 * O[MAP] *" "("\u2030")), y = expression(delta^18 * O[PO4]  *" "("\u2030")))
#
#Combining all subplots into single Figure 8 for printing
cairo_ps("fig8.eps", width = 7.5, height = 8.75)
grid.arrange(arrangeGrob(fig_8a,fig_8b,fig_8c,fig_8d,fig_8e,fig_8f, heights=c(0.35,0.325, 0.325), ncol=2))
grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(1,1))
grid.text('b', x=0.53, y=0.98, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(2,1))
grid.text('c', x=0.03, y=0.635, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,1))
grid.text('d', x=0.53, y=0.635, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(4,1))
grid.text('e', x=0.03, y=0.315, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,1))
grid.text('f', x=0.53, y=0.315, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,2))
dev.off()
#
#
############################################################################
#For Figure 9, we need to compute the limits of each method by a separate grouping variable,
#i.e. country, grid_sq, altitude and MAP.
#We use only the European data, using the "[!is.na(Odataset$Europe), ]" command for alt (all the others are specific to Europe)
#First we compute the limits, and then export the data to csv, as that can be helpful (tables in paper)
#Then we melt the data to long form, for each of the methods, then finally we can plot them
#
#Summary limits for country
#First need to calc the variable diff to use in calculating MAD_Q3, then the summary
#By European country
#Make sure you have "Country2" in there - if not, do the first command
Odataset$Country2 <- factor(Odataset$Country, 
                            levels = c("Ireland", "UK", "France","Belgium",
                                       "Netherlands","Germany","Denmark", "Norway", 
                                       "Finland","Austria", "Czech Rep", "Italy",
                                       "Croatia", "Bulgaria", "Greece", "Turkey"))
#
Odataset$Country_median_all <- ave(Odataset$d18O, Odataset$Country2, FUN=median)
Odataset$Diff_country_median_all <- (Odataset$d18O - Odataset$Country_median_all)
Odataset$Abs_diff_country_median_all <- abs(Odataset$Diff_country_median_all)
#By grid sq
Odataset$Grid_sq_median_all <- ave(Odataset$d18O, Odataset$Grid_sq, FUN=median)
Odataset$Diff_grid_sq_median_all <- (Odataset$d18O - Odataset$Grid_sq_median_all)
Odataset$Abs_diff_grid_sq_median_all <- abs(Odataset$Diff_grid_sq_median_all)
#By altitude bin
Odataset$altbin_median_all <- ave(Odataset$d18O, Odataset$altbin2, FUN=median)
Odataset$Diff_altbin_median_all <- (Odataset$d18O - Odataset$altbin_median_all)
Odataset$Abs_diff_altbin_median_all <- abs(Odataset$Diff_altbin_median_all)
#By MAP bin
Odataset$MAPbin_median_all <- ave(Odataset$d18O, Odataset$MAPbin, FUN=median)
Odataset$Diff_MAPbin_median_all <- (Odataset$d18O - Odataset$MAPbin_median_all)
Odataset$Abs_diff_MAPbin_median_all <- abs(Odataset$Diff_MAPbin_median_all)
#
#Summary limits for country
summarycountry_all <- 
  ddply(Odataset, .(Country2), summarize,
        mean=mean(d18O),
        N=length(d18O),
        range_hi=max(d18O), range_low=min(d18O),
        sd_hi=(mean(d18O)+2*sd(d18O)), 
        sd_low=(mean(d18O)-2*sd(d18O)),
        iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
        iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
        madnorm_hi=(median(d18O)+3*mad(d18O)), 
        madnorm_low=(median(d18O)-3*mad(d18O)),
        madq3_hi=(median(d18O)+3*(median(Abs_diff_country_median_all)*(1/quantile(Abs_diff_country_median_all, 3/4)))),
        madq3_low=(median(d18O)-3*(median(Abs_diff_country_median_all)*(1/quantile(Abs_diff_country_median_all, 3/4)))) )
#
#Summary limits for grid sq
summarygridsq_all <- 
  ddply(Odataset, .(Grid_sq), summarize,
        mean=mean(d18O),
        N=length(d18O),
        range_hi=max(d18O), range_low=min(d18O),
        sd_hi=(mean(d18O)+2*sd(d18O)), 
        sd_low=(mean(d18O)-2*sd(d18O)),
        iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
        iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
        madnorm_hi=(median(d18O)+3*mad(d18O)), 
        madnorm_low=(median(d18O)-3*mad(d18O)),
        madq3_hi=(median(d18O)+3*(median(Abs_diff_grid_sq_median_all)*(1/quantile(Abs_diff_grid_sq_median_all, 3/4)))),
        madq3_low=(median(d18O)-3*(median(Abs_diff_grid_sq_median_all)*(1/quantile(Abs_diff_grid_sq_median_all, 3/4)))) )
#
#Summary limits for altitude
#First need to calc the variable diff to use in calculating MAD_Q3, then the summary 
summaryaltitude_all <- 
  ddply(Odataset, .(altbin2), summarize,
        mean=mean(d18O),
        N=length(d18O),
        range_hi=max(d18O), range_low=min(d18O),
        sd_hi=(mean(d18O)+2*sd(d18O)), 
        sd_low=(mean(d18O)-2*sd(d18O)),
        iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
        iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
        madnorm_hi=(median(d18O)+3*mad(d18O)), 
        madnorm_low=(median(d18O)-3*mad(d18O)),
        madq3_hi=(median(d18O)+3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all,3/4)))),
        madq3_low=(median(d18O)-3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all,3/4)))) )
#
#Summary limits for binned MAP
summaryMAP_all <- 
  ddply(Odataset, .(MAPbin), summarize,
        mean=mean(d18O),
        N=length(d18O),
        range_hi=max(d18O), range_low=min(d18O),
        sd_hi=(mean(d18O)+2*sd(d18O)), 
        sd_low=(mean(d18O)-2*sd(d18O)),
        iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
        iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
        madnorm_hi=(median(d18O)+3*mad(d18O)), 
        madnorm_low=(median(d18O)-3*mad(d18O)),
        madq3_hi=(median(d18O)+3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all, 3/4)))),
        madq3_low=(median(d18O)-3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all, 3/4))))
  )
#
#Exporting data for tables
write.csv(summarycountry_all, file="summarycountry_all.csv", sep = ",", col.names = NA, qmethod = "double")
write.csv(summarygridsq_all, file="summarygridsq_all.csv", sep = ",", col.names = NA, qmethod = "double")
write.csv(summaryaltitude_all, file="summaryaltitude_all.csv", sep = ",", col.names = NA, qmethod = "double")
write.csv(summaryMAP_all, file="summaryMAP_all.csv", sep = ",", col.names = NA, qmethod = "double")
#
#Now converting to long data format for each grouping
sumcountry_all <- melt(summarycountry_all, id.vars = c("Country2","N"), 
                       measure = c("range_hi", "range_low","sd_hi", 
                                   "sd_low", "iqr_hi", "iqr_low", 
                                   "madnorm_hi","madnorm_low","madq3_hi","madq3_low"))

sumgridsq_all <- melt(summarygridsq_all, id.vars = c("Grid_sq","N"), 
                      measure = c("range_hi", "range_low","sd_hi", 
                                  "sd_low", "iqr_hi", "iqr_low", 
                                  "madnorm_hi","madnorm_low","madq3_hi","madq3_low"))

sumalt_all <- melt(summaryaltitude_all, id.vars = c("altbin2","N"), 
                   measure = c("range_hi", "range_low","sd_hi", 
                               "sd_low", "iqr_hi", "iqr_low", 
                               "madnorm_hi","madnorm_low","madq3_hi","madq3_low"))

sumMAP_all <- melt(summaryMAP_all, id.vars = c("MAPbin","N"), 
                   measure = c("range_hi", "range_low","sd_hi", 
                               "sd_low", "iqr_hi", "iqr_low", 
                               "madnorm_hi","madnorm_low","madq3_hi","madq3_low"))

#Now all the limits into one long column, now fixing labels to remove hi and low
#By country
sumcountry_all$statistic <- factor(sumcountry_all$variable, 
                                   levels = c("range_hi","range_low","sd_hi", "sd_low",
                                              "iqr_hi","iqr_low","madnorm_hi", "madnorm_low",
                                              "madq3_hi", "madq3_low"), 
                                   labels = c("Range","Range","SD","SD","IQR","IQR",
                                              "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]"))
sumcountry_all$statistic2 <-factor(sumcountry_all$statistic)

#By grid sq
sumgridsq_all$statistic <- factor(sumgridsq_all$variable, 
                                  levels = c("range_hi","range_low","sd_hi", "sd_low",
                                             "iqr_hi","iqr_low","madnorm_hi", "madnorm_low",
                                             "madq3_hi", "madq3_low"), 
                                  labels = c("Range","Range","SD","SD","IQR","IQR",
                                             "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]"))

sumgridsq_all$statistic2 <-factor(sumgridsq_all$statistic)

#By alt
sumalt_all$statistic <- factor(sumalt_all$variable, 
                               levels = c("range_hi","range_low","sd_hi", "sd_low",
                                          "iqr_hi","iqr_low","madnorm_hi", "madnorm_low",
                                          "madq3_hi", "madq3_low"), 
                               labels = c("Range","Range","SD","SD","IQR","IQR",
                                          "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]"))

sumalt_all$statistic2 <-factor(sumalt_all$statistic)

#By MAP
sumMAP_all$statistic <- factor(sumMAP_all$variable, 
                               levels = c("range_hi","range_low","sd_hi", "sd_low",
                                          "iqr_hi","iqr_low","madnorm_hi", "madnorm_low",
                                          "madq3_hi", "madq3_low"), 
                               labels = c("Range","Range","SD","SD","IQR","IQR",
                                          "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]"))

sumMAP_all$statistic2 <-factor(sumMAP_all$statistic)
#
#Plots for Fig 9
#
#By country
sumcountry_all$country2 <- factor(sumcountry_all$Country, 
                                  levels = c("Ireland","UK","France","Belgium",
                                             "Netherlands","Germany","Denmark","Norway",
                                             "Finland","Austria","Czech Rep","Italy",
                                             "Croatia","Bulgaria","Greece","Turkey"))
sumcountry_all$statistic2 <- factor(sumcountry_all$statistic, 
                                    levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"),
                                    labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3])))

fig_9A <- ggplot(data = subset(sumcountry_all[!is.na(sumcountry_all$Country2), ], N > 4), aes(country2, value)) + 
  stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8, colour = "black") + 
  facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) +
  scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) +
  labs(x="Country", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030"))) +
  theme_bw() +  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6),
                      axis.text.y=element_text (size=8),
                      axis.title=element_text(size=8),
                      strip.text=element_text(size=8)) 
#
#By grid sq
sumgridsq_all$Grid_sq2 <- factor(sumgridsq_all$Grid_sq, 
                                 levels = c("A4","A5","B3","B4","B5","C4","D3","D4",
                                            "E2","E3", "E4","F2","F3","H2"))
sumgridsq_all$statistic2 <- factor(sumgridsq_all$statistic, 
                                   levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"),
                                   labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3])))
#
fig_9B <- ggplot(data = subset(sumgridsq_all[!is.na(sumgridsq_all$Grid_sq2), ], N > 4), aes(Grid_sq, value)) + 
  stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8, colour = "black") + 
  facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) +
  scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) +
  labs(x="Grid Square", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030"))) +
  theme_bw() + 
  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 4),
        axis.text.y=element_text (size=8),
        axis.title=element_text(size=8),
        strip.text=element_text(size=8)) 
#
#By alt
sumalt_all$statistic2 <- factor(sumalt_all$statistic, 
                                levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"),
                                labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3])))

fig_9C <- ggplot(data = subset(sumalt_all[!is.na(sumalt_all$altbin2), ], N > 4), aes(altbin2, value)) + 
  stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, 
               geom = "crossbar", width = 0.2, size = 0.8,colour = "black") +
  facet_grid(. ~ statistic2, margins = FALSE, labeller = label_parsed) +
  scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) +
  labs(x="Altitude (m)", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030")))+
  theme_bw() +
  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6),
        axis.text.y=element_text (size=8),
        axis.title=element_text(size=8),
        strip.text=element_text(size=8)) 
#
#By MAP
sumMAP_all$statistic2 <- factor(sumMAP_all$statistic, 
                                levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"),
                                labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3])))

fig_9D <- ggplot(data = subset(sumMAP_all[!is.na(sumMAP_all$MAPbin), ], N > 4), aes(MAPbin, value)) + 
  stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, 
               geom = "crossbar", width = 0.2, size = 0.8, colour = "black") +
  facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) +
  scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) +
  labs(x=expression("Site "*delta^18 * O[MAP] *" "("\u2030")), y = expression("'Local' "*delta^18 * O[PO4]*" signal "("\u2030"))) +
  theme_bw()  + 
  theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6),
        axis.text.y=element_text (size=8),
        axis.title=element_text(size=8),
        strip.text=element_text(size=8)) 

#Printing Fig 9
cairo_ps("fig9.eps", width = 7.5, height = 8.75, pointsize = 8)
grid.arrange(arrangeGrob(fig_9A,fig_9B,fig_9C,fig_9D, heights=c(0.265,0.235, 0.255,0.255), ncol=1))
grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(1,1))
grid.text('b', x=0.03, y=0.726, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(2,1))
grid.text('c', x=0.03, y=0.495, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(3,1))
grid.text('d', x=0.03, y=0.245, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(4,1))
dev.off()
#
#
############################################################################
#Fig 10
#This is done on the PID dataset ONLY, i.e. presorted so that only samples with PID=1 are included.
#Also note that R can't cope with the non-std letters, so Velim V's name is "Velim Veli_tak"
Odataset$Site_combined2 <- factor(Odataset$Site_combined, 
                                  levels = c("Ban Non Wat","Kaminaljuyu","Velim-Veli_tak"),
                                  labels = c("Ban Non Wat","Kaminaljuyu","Velim Velistak"))
fig_10 <- ggplot(Odataset[!is.na(Odataset$Site_combined2), ]) + 
  geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.4) + 
  facet_grid(Site_combined2 ~ ., margins = FALSE) + 
  scale_x_continuous(breaks = c(14,16,18,20)) +
  labs(y="Count", x = expression(delta^18 * O[PO4]  *" "("\u2030"))) +
  theme_bw()
#
#Printing Fig 10
cairo_ps("fig10.eps", width = 5.2, height = 6, pointsize = 12)
fig_10
dev.off()
#