############################################################################ ### R code for analyses and figures for the paper: # # Lightfoot E and O’Connell TC (2016). # "On The Use of Biomineral Oxygen Isotope Data to Identify Human Migrants in the Archaeological Record: # Intra-Sample Variation, Statistical Methods and Geographical Considerations." PLoS ONE 11(4). # doi:10.1371/journal.pone.0153850 # # ############################################################################ # To cite this code, please cite: # O’Connell, TC. 2016. R code, https://www.repository.cam.ac.uk/handle/1810/252773 ############################################################################ # ############################################################################ # PLEASE READ THESE NOTES BEFORE USING THE CODE ############################################################################ # #Please save this file as OConnell_R_code.R to use # #This file contains code to produce # ** some extra variables necessary to produce the figures # ** the summary statistics data by site. # ** a summary sheet for all data by site, including outlier percentages and numbers # ** code for each of the figures (1,2,3,4,7,8,9,10). [Figs 5 and 6 were not done in R.] # #Please note, there is a single set of code for each figure, which can be used to produce both #the figure in the main paper (the PID data) and the version in supplementary #information (the full dataset). #To run the analyses on the PID dataset, I first selected the relevant datapoints from #the dataset in excel (where PID=1), and saved that to use in R. #[I did not try to get so clever that I wrote different code for the PID subset.] # #For all the code, it relies on the use of a primary dataset called “Odataset”, #which is the dataset saved as a text tab delimited file. # #Some of the column headings in the dataset in the repository have been renamed, to make coding easier. #The original column headings are given first, then the ones used in the R code as follows: #Site = Site #Site-combined (nearby sites combined) = Site_combined #Country = Country #Continent = Continent #Latitude = Lat #Longitude = Long #Grid square (European sites only) = Grid_sq #Alt (masl) = Alt #Modelled Mean Annual Precipitation (MAP) (‰) = MAP #Bone/Tooth = B_T #Tooth grading (1=PID, 2=other tooth, 3=n/a) = PID #Phosphate d18O SMOW (‰) = d18O # ############################################################################ ### WARNINGS ### ############################################################################ #Sometimes the figures are so complicated that R can’t cope with plotting them and gives a warning message that ends in #Polygon edge not found”. Usually, just rerunning the code produces the figure. #In extreme situations, a reboot of R does the trick. # #There are also warnings for the creation of duplicate factors for the sumcountry statistics etc #"Warning message: #"In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, : # duplicated levels in factors are deprecated #These can be ignored in my experience. #As can the warnings when exporting the data to the .csv files - has always done it fine for me. # # ############################################################################ ### THE CODE ### ############################################################################ # #Clear out any previous items in the environment, and clear out graphics rm(list=ls()) graphics.off() #Load relevant packages before using them library(ggplot2) library(aplpack) library (car) library (psych) library(Hmisc) library(grid) library(gridExtra) library(plyr) library(reshape2) #Defining a function for layout of viewports vplayout <-function (x,y) viewport(layout.pos.row = x, layout.pos.col = y) # ############################################################################ ### Input the data ### ############################################################################ # #Upload relevant dataset, which must be called Odataset.txt # # ############################################################################ ### Sumary stats and outlier identification ### ############################################################################ #Create label for European-only sites Odataset$Europe <- factor(Odataset$Continent, levels = c("Europe")) #Create bins for altitude and MAP data by site for European sites only #MAP bin only works for Europe, as that data are not inputted for the rest of the world Odataset$MAPbin <- cut_width(Odataset$MAP, 1, boundary = 0, closed = c("right")) Odataset$altbin <- cut_width(Odataset$Alt, 150, boundary = 0, closed = c("right")) #In order to only create altbin for European sites (because of Creating Fig 9 later), create a second Alt and altbin for Europe Odataset$Alt2 <- ifelse((Odataset$Continent %in% "Europe"), Odataset$Alt, NA) Odataset$altbin2 <- cut_width(Odataset$Alt2, 150, boundary = 0, closed = c("right")) # #Statistics computed on d18O at each site: mean, median, SD, difference between individual's value #and the site mean and median (for Fig 2 & S2), as well as abs diff from site median (for MAD_Q3) # Odataset$Site_N <- ave(Odataset$d18O, Odataset$Site_combined, FUN=length) Odataset$Site_mean_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=mean) Odataset$Site_median_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=median) Odataset$Site_SD_all <- ave(Odataset$d18O, Odataset$Site_combined, FUN=SD) Odataset$Diff_mean_all <- (Odataset$d18O - Odataset$Site_mean_all) Odataset$Diff_median_all <- (Odataset$d18O - Odataset$Site_median_all) Odataset$Abs_diff_median_all <- abs(Odataset$Diff_median_all) Odataset$Zscore_all <- (Odataset$Diff_mean_all / Odataset$Site_SD_all) # #Summary statistics data by site. #This calculates the sample number (N), range, mean, sd, median, IQR, MAD_norm and MAD_Q3 #for each sites in the dataset # Odataset_summary_all <- ddply(Odataset, .(Site_combined), summarize, N = length(d18O), Mean=mean(d18O), Median=median(d18O), Range=(max(d18O) - min(d18O)), SD=sd(d18O), IQR=IQR(d18O), MAD_norm=mad(d18O), MAD_Q3=(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4))), Spread_Range=(max(d18O) - min(d18O)), Spread_2SD=(4*sd(d18O)), Spread_1.5IQR=(4*IQR(d18O)), Spread_3MAD_norm=(6*mad(d18O)), Spread_3MAD_Q3=(6*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))), Twopermil_hi=(mean(d18O)+1), Twopermil_low=(mean(d18O)-1), Range_hi=max(d18O), Range_low=min(d18O), SD_hi=(mean(d18O)+2*sd(d18O)), SD_low=(mean(d18O)-2*sd(d18O)), IQR_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), IQR_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), MAD_norm_hi=(median(d18O)+3*mad(d18O)), MAD_norm_low=(median(d18O)-3*mad(d18O)), MAD_Q3_hi=(median(d18O)+3*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))), MAD_Q3_low=(median(d18O)-3*(median(Abs_diff_median_all)*(1/quantile(Abs_diff_median_all, 3/4)))) ) # #Then merge the two datasets together, so that the summary data by site are also listed for #each individual in the full dataset #You have some redundant (doubled) variables in there, but it doesn't matter Odataset_merge <- merge(Odataset, Odataset_summary_all, by = "Site_combined") # #Now for working out whether an individual is an outlier or not for their site by each method #Methods = 2permil, 2SD, 1.5IQR, 3MAD_norm, 3MAD_Q3 Odataset_merge$Outlier_2permil <- ifelse(Odataset_merge$d18O > Odataset_merge$Twopermil_hi | Odataset_merge$d18O < Odataset_merge$Twopermil_low, 1, 0) Odataset_merge$Outlier_2SD <- ifelse(Odataset_merge$d18O > Odataset_merge$SD_hi | Odataset_merge$d18O < Odataset_merge$SD_low, 1, 0) Odataset_merge$Outlier_1.5IQR <- ifelse(Odataset_merge$d18O > Odataset_merge$IQR_hi | Odataset_merge$d18O < Odataset_merge$IQR_low, 1, 0) Odataset_merge$Outlier_3MAD_norm <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_norm_hi | Odataset_merge$d18O < Odataset_merge$MAD_norm_low, 1, 0) Odataset_merge$Outlier_3MAD_Q3 <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_Q3_hi | Odataset_merge$d18O < Odataset_merge$MAD_Q3_low, 1, 0) # #Now creating a summary sheet for all data by site, including outlier percentages and numbers #This is basically Table S3 & is the basis of Figure 3 Odataset_stats_outliers <- ddply(Odataset_merge, .(Site_combined), summarize, N = length(d18O), mean=mean(d18O), sd=sd(d18O), Median=median(d18O), Range=(max(d18O)-min(d18O)), IQR=IQR(d18O), MAD_norm=mad(d18O), Spread_2SD=(4*sd(d18O)), Spread_1.5IQR=(4*IQR(d18O)), Spread_3MAD_norm=(6*mad(d18O)), Spread_3MAD_Q3=mean(Spread_3MAD_Q3), Outlier_2permil=sum(Outlier_2permil), Outlier_2SD=sum(Outlier_2SD), Outlier_1.5IQR=sum(Outlier_1.5IQR), Outlier_3MAD_norm=sum(Outlier_3MAD_norm), Outlier_3MAD_Q3=sum(Outlier_3MAD_Q3)) # #Add in the percentages, as I am not good enough to work out how to do it in a single step (rounded to 2dpl) Odataset_stats_outliers$Outlier_percent_2permil <- round((Odataset_stats_outliers$Outlier_2permil/Odataset_stats_outliers$N),2) Odataset_stats_outliers$Outlier_percent_2SD <- round((Odataset_stats_outliers$Outlier_2SD/Odataset_stats_outliers$N),2) Odataset_stats_outliers$Outlier_percent_1.5IQR <- round((Odataset_stats_outliers$Outlier_1.5IQR/Odataset_stats_outliers$N),2) Odataset_stats_outliers$Outlier_percent_3MAD_norm <- round((Odataset_stats_outliers$Outlier_3MAD_norm/Odataset_stats_outliers$N),2) Odataset_stats_outliers$Outlier_percent_3MAD_Q3 <- round((Odataset_stats_outliers$Outlier_3MAD_Q3/Odataset_stats_outliers$N),2) # #Exporting data for tables write.csv(Odataset_stats_outliers, file="Odataset_stats_outliers.csv", sep = ",", col.names = NA, qmethod = "double") # ############################################################################ #Fig 1 The range, SD, IQR and MAD of oxygen isotopic data from each site #in the full dataset plotted by site sample size, for all sites where N>4. #Uses summary data by site, Odataset_all # #This requires data in long format, not wide, so it must be melted first. Odataset_summary_all_long <- melt(Odataset_summary_all, id.vars = c("Site_combined","N"), measure =c("Range", "SD", "IQR", "MAD_norm", "MAD_Q3", "Spread_Range", "Spread_2SD", "Spread_1.5IQR", "Spread_3MAD_norm", "Spread_3MAD_Q3")) #Now create new variables to make sure the facets are in the right order - order by # levels, and then instruct as to what the labels are. #Labels are in plotmath so they can parsed directly. #First a "statistic" for just Fig1a, then a "Criterion" for just FIg1b Odataset_summary_all_long$Statistic2 <- factor(Odataset_summary_all_long$variable, levels = c("Range","SD","IQR", "MAD_norm", "MAD_Q3"), labels = c("Range","SD","IQR","MAD[norm]","MAD[Q3]")) Odataset_summary_all_long$Criterion2 <- factor(Odataset_summary_all_long$variable, levels = c("Spread_Range","Spread_2SD","Spread_1.5IQR", "Spread_3MAD_norm", "Spread_3MAD_Q3"), labels = c("Range","SD","IQR","MAD[norm]","MAD[Q3]")) # #Now for Figures 1a & 1b fig_1a <- ggplot(data = subset(Odataset_summary_all_long[!is.na(Odataset_summary_all_long$Statistic2), ], N>4)) + geom_point(aes(x=N, y=value), alpha=0.5) + facet_grid(Statistic2 ~ ., margins = FALSE, labeller = label_parsed) + labs(y="Statistic value (\u2030)", x ="Sample number") + theme_bw() fig_1b <- ggplot(data = subset(Odataset_summary_all_long[!is.na(Odataset_summary_all_long$Criterion2), ], N>4)) + geom_point(aes(x=N, y=value), alpha=0.5) + facet_grid(Criterion2 ~ ., margins = FALSE, labeller = label_parsed) + labs(y="Spread encompassed by the boundaries (\u2030)", x ="Sample number") + theme_bw() # #Combining them into a single figure ready for printing as an eps file #Figure 1 cairo_ps("fig1.eps", width = 7.5, height = 8.75, pointsize = 12) grid.newpage() pushViewport(viewport(layout = grid.layout(1,2))) vplayout <-function (x,y) viewport(layout.pos.row = x, layout.pos.col = y) print (fig_1a, vp = vplayout (1,1)) print (fig_1b, vp = vplayout (1,2)) grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,1)) grid.text('b', x=0.04, y=0.98, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,2)) dev.off() # # ############################################################################ # Figure 2. Histogram of the difference between each individual’s oxygen isotope # value and (a) their site mean oxygen isotopic value or (b) their site median # oxygen isotopic value, for all individuals in the full data-subset. # #2a - mean fig_2a <- ggplot(Odataset) + geom_histogram(aes(x=Diff_mean_all), stat = "bin",binwidth=0.1) + scale_x_continuous(breaks = c(-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6)) + labs(y="Count", x = expression("Individual "*delta^18 * O[PO4] * " - "*"Site mean "*delta^18 * O[PO4] *" "("\u2030"))) + theme_bw() #2b - median fig_2b <- ggplot(Odataset) + geom_histogram(aes(x=Diff_median_all), stat = "bin",binwidth=0.1) + scale_x_continuous(breaks = c(-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6)) + labs(y="Count", x = expression("Individual "*delta^18 * O[PO4] * " - "*"Site median "*delta^18 * O[PO4] *" "("\u2030"))) + theme_bw() # #Combining them into a single figure ready for printing as an eps file #Fig 2 cairo_ps("fig2.eps", width = 5.2, height = 7, pointsize = 12) grid.newpage() pushViewport(viewport(layout = grid.layout(2,1))) vplayout <-function (x,y) viewport(layout.pos.row = x, layout.pos.col = y) print (fig_2a, vp = vplayout (1,1)) print (fig_2b, vp = vplayout (2,1)) grid.text('a', x=0.03, y=0.95, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(1,1)) grid.text('b', x=0.03, y=0.95, gp = gpar(fontsize = 18, fontface="bold"), vp=vplayout(2,1)) dev.off() # # ############################################################################ #Figure 3 #This requires data in long format, not wide, so it must be melted first. Odataset_stats_outliers_long <- melt(Odataset_stats_outliers, id.vars = c("Site_combined","N"), measure =c("Outlier_percent_2permil", "Outlier_percent_2SD", "Outlier_percent_1.5IQR", "Outlier_percent_3MAD_norm", "Outlier_percent_3MAD_Q3"), variable.name="Method", value.name="Fraction") fig_3 <- ggplot(data = subset(Odataset_stats_outliers_long, N > 4)) + geom_point(aes(x=N, y=(100*Fraction), shape = Method)) + theme_bw() + labs(y="Percentage of outliers", x ="Sample number", shape = "Method") + scale_shape("Method", breaks = c("Outlier_percent_2permil", "Outlier_percent_2SD", "Outlier_percent_1.5IQR", "Outlier_percent_3MAD_norm", "Outlier_percent_3MAD_Q3"), labels = c("2\u2030","2SD","1.5IQR",bquote("3MAD"[norm]),bquote("3MAD"[Q3]))) # #Printing Fig 3 cairo_ps("fig3.eps", width = 5.2, height = 4, pointsize = 12) fig_3 dev.off() # # ############################################################################ #Figure 4: Density plot of each individual’s oxygen isotopic value expressed as Z score for #all specimens from sites where N>4 in the full dataset, with a fitted normal distribution curve #overlaid. Make a subset of Odataset first, as otherwise normal overlay doesn't work. Odataset_N4 <- subset(Odataset, Site_N>4) fig_4 <- ggplot(Odataset_N4, aes(x=Zscore_all)) + geom_histogram(aes(y=..density..), stat = "bin",binwidth=0.2) + scale_x_continuous(breaks = c(-4,-2,0,2,4,6)) + labs(y="Density", x = expression(delta^18 * O[PO4]* " expressed as Z score (\u2030)")) + theme_bw() + stat_function(fun=dnorm, args=list(mean=mean(Odataset_N4$Zscore_all), sd=sd(Odataset_N4$Zscore_all))) # #Printing Fig 4 cairo_ps("fig4.eps", width = 5.2, height = 4, pointsize = 12) fig_4 dev.off() # # ############################################################################ #Figure 7. Histogram of each individual’s oxygen isotopic value for all # European individuals in the full dataset. fig_7 <- ggplot(Odataset[!is.na(Odataset$Europe), ]) + geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.2) + scale_x_continuous(breaks = c(4,6,8,10,12,14,16,18,20)) + labs(y="Count", x = expression(delta^18 * O[PO4] * " "("\u2030"))) + theme_bw() # #Printing Fig 7 cairo_ps("fig7.eps", width = 5.2, height = 4, pointsize = 12) fig_7 dev.off() # # ############################################################################ #Figure 8 #This is six separate figures plotted from six separate subsets #This is scatterplot by country. #This is to put countries in the right order - make a new variable order by levels Odataset$Country2 <- factor(Odataset$Country, levels = c("Ireland", "UK", "France","Belgium", "Netherlands","Germany","Denmark", "Norway", "Finland","Austria", "Czech Rep", "Italy", "Croatia", "Bulgaria", "Greece", "Turkey")) # #Have put in [!is.na(Odataset$Europe), ] in every plot below which tells it not to plot non-European data # fig_8a <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Country2, d18O)) + geom_jitter(alpha=0.2, width = 0.6) + theme_bw() + labs(x ="Country", y = expression(delta^18 * O[PO4] * " "("\u2030"))) + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 8)) #This is boxplot by country. Shows IQR fig_8b <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(x=Country2, y=d18O)) + geom_boxplot() + theme_bw() + labs(x ="Country", y = expression(delta^18 * O[PO4] * " "("\u2030"))) + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 8)) #This is scatterplot by latitude fig_8c <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Lat, d18O)) + geom_point(alpha=0.2) + theme_bw() + geom_smooth(method = "lm", colour = "black") + labs(x = expression("Degrees of Latitude"), y = expression(delta^18 * O[PO4] * " "("\u2030"))) #This is scatterplot by longitude with alpha fig_8d <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Long, d18O)) + geom_point(alpha=0.2) + theme_bw()+ geom_smooth(method = "lm", colour = "black") + labs(x = expression("Degrees of Longitude"), y = expression(delta^18 * O[PO4] * " "("\u2030"))) #This is scatterplot by altitude fig_8e <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(Alt, d18O)) + geom_point(alpha=0.2) + theme_bw()+ geom_smooth(method = "lm", colour = "black") + labs(x = expression("Altitude "*"(m)"), y = expression(delta^18 * O[PO4] * " "("\u2030"))) #This is scatterplot by MAP fig_8f <- ggplot(Odataset[!is.na(Odataset$Europe), ], aes(MAP, d18O)) + geom_point(alpha=0.2) + theme_bw() + geom_smooth(method = "lm", colour = "black") + labs(x=expression("Site "*delta^18 * O[MAP] *" "("\u2030")), y = expression(delta^18 * O[PO4] *" "("\u2030"))) # #Combining all subplots into single Figure 8 for printing cairo_ps("fig8.eps", width = 7.5, height = 8.75) grid.arrange(arrangeGrob(fig_8a,fig_8b,fig_8c,fig_8d,fig_8e,fig_8f, heights=c(0.35,0.325, 0.325), ncol=2)) grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(1,1)) grid.text('b', x=0.53, y=0.98, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(2,1)) grid.text('c', x=0.03, y=0.635, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,1)) grid.text('d', x=0.53, y=0.635, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(4,1)) grid.text('e', x=0.03, y=0.315, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,1)) grid.text('f', x=0.53, y=0.315, gp = gpar(fontsize = 14, fontface="bold"), vp=vplayout(3,2)) dev.off() # # ############################################################################ #For Figure 9, we need to compute the limits of each method by a separate grouping variable, #i.e. country, grid_sq, altitude and MAP. #We use only the European data, using the "[!is.na(Odataset$Europe), ]" command for alt (all the others are specific to Europe) #First we compute the limits, and then export the data to csv, as that can be helpful (tables in paper) #Then we melt the data to long form, for each of the methods, then finally we can plot them # #Summary limits for country #First need to calc the variable diff to use in calculating MAD_Q3, then the summary #By European country #Make sure you have "Country2" in there - if not, do the first command Odataset$Country2 <- factor(Odataset$Country, levels = c("Ireland", "UK", "France","Belgium", "Netherlands","Germany","Denmark", "Norway", "Finland","Austria", "Czech Rep", "Italy", "Croatia", "Bulgaria", "Greece", "Turkey")) # Odataset$Country_median_all <- ave(Odataset$d18O, Odataset$Country2, FUN=median) Odataset$Diff_country_median_all <- (Odataset$d18O - Odataset$Country_median_all) Odataset$Abs_diff_country_median_all <- abs(Odataset$Diff_country_median_all) #By grid sq Odataset$Grid_sq_median_all <- ave(Odataset$d18O, Odataset$Grid_sq, FUN=median) Odataset$Diff_grid_sq_median_all <- (Odataset$d18O - Odataset$Grid_sq_median_all) Odataset$Abs_diff_grid_sq_median_all <- abs(Odataset$Diff_grid_sq_median_all) #By altitude bin Odataset$altbin_median_all <- ave(Odataset$d18O, Odataset$altbin2, FUN=median) Odataset$Diff_altbin_median_all <- (Odataset$d18O - Odataset$altbin_median_all) Odataset$Abs_diff_altbin_median_all <- abs(Odataset$Diff_altbin_median_all) #By MAP bin Odataset$MAPbin_median_all <- ave(Odataset$d18O, Odataset$MAPbin, FUN=median) Odataset$Diff_MAPbin_median_all <- (Odataset$d18O - Odataset$MAPbin_median_all) Odataset$Abs_diff_MAPbin_median_all <- abs(Odataset$Diff_MAPbin_median_all) # #Summary limits for country summarycountry_all <- ddply(Odataset, .(Country2), summarize, mean=mean(d18O), N=length(d18O), range_hi=max(d18O), range_low=min(d18O), sd_hi=(mean(d18O)+2*sd(d18O)), sd_low=(mean(d18O)-2*sd(d18O)), iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), madnorm_hi=(median(d18O)+3*mad(d18O)), madnorm_low=(median(d18O)-3*mad(d18O)), madq3_hi=(median(d18O)+3*(median(Abs_diff_country_median_all)*(1/quantile(Abs_diff_country_median_all, 3/4)))), madq3_low=(median(d18O)-3*(median(Abs_diff_country_median_all)*(1/quantile(Abs_diff_country_median_all, 3/4)))) ) # #Summary limits for grid sq summarygridsq_all <- ddply(Odataset, .(Grid_sq), summarize, mean=mean(d18O), N=length(d18O), range_hi=max(d18O), range_low=min(d18O), sd_hi=(mean(d18O)+2*sd(d18O)), sd_low=(mean(d18O)-2*sd(d18O)), iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), madnorm_hi=(median(d18O)+3*mad(d18O)), madnorm_low=(median(d18O)-3*mad(d18O)), madq3_hi=(median(d18O)+3*(median(Abs_diff_grid_sq_median_all)*(1/quantile(Abs_diff_grid_sq_median_all, 3/4)))), madq3_low=(median(d18O)-3*(median(Abs_diff_grid_sq_median_all)*(1/quantile(Abs_diff_grid_sq_median_all, 3/4)))) ) # #Summary limits for altitude #First need to calc the variable diff to use in calculating MAD_Q3, then the summary summaryaltitude_all <- ddply(Odataset, .(altbin2), summarize, mean=mean(d18O), N=length(d18O), range_hi=max(d18O), range_low=min(d18O), sd_hi=(mean(d18O)+2*sd(d18O)), sd_low=(mean(d18O)-2*sd(d18O)), iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), madnorm_hi=(median(d18O)+3*mad(d18O)), madnorm_low=(median(d18O)-3*mad(d18O)), madq3_hi=(median(d18O)+3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all,3/4)))), madq3_low=(median(d18O)-3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all,3/4)))) ) # #Summary limits for binned MAP summaryMAP_all <- ddply(Odataset, .(MAPbin), summarize, mean=mean(d18O), N=length(d18O), range_hi=max(d18O), range_low=min(d18O), sd_hi=(mean(d18O)+2*sd(d18O)), sd_low=(mean(d18O)-2*sd(d18O)), iqr_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), iqr_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), madnorm_hi=(median(d18O)+3*mad(d18O)), madnorm_low=(median(d18O)-3*mad(d18O)), madq3_hi=(median(d18O)+3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all, 3/4)))), madq3_low=(median(d18O)-3*(median(Abs_diff_MAPbin_median_all)*(1/quantile(Abs_diff_MAPbin_median_all, 3/4)))) ) # #Exporting data for tables write.csv(summarycountry_all, file="summarycountry_all.csv", sep = ",", col.names = NA, qmethod = "double") write.csv(summarygridsq_all, file="summarygridsq_all.csv", sep = ",", col.names = NA, qmethod = "double") write.csv(summaryaltitude_all, file="summaryaltitude_all.csv", sep = ",", col.names = NA, qmethod = "double") write.csv(summaryMAP_all, file="summaryMAP_all.csv", sep = ",", col.names = NA, qmethod = "double") # #Now converting to long data format for each grouping sumcountry_all <- melt(summarycountry_all, id.vars = c("Country2","N"), measure = c("range_hi", "range_low","sd_hi", "sd_low", "iqr_hi", "iqr_low", "madnorm_hi","madnorm_low","madq3_hi","madq3_low")) sumgridsq_all <- melt(summarygridsq_all, id.vars = c("Grid_sq","N"), measure = c("range_hi", "range_low","sd_hi", "sd_low", "iqr_hi", "iqr_low", "madnorm_hi","madnorm_low","madq3_hi","madq3_low")) sumalt_all <- melt(summaryaltitude_all, id.vars = c("altbin2","N"), measure = c("range_hi", "range_low","sd_hi", "sd_low", "iqr_hi", "iqr_low", "madnorm_hi","madnorm_low","madq3_hi","madq3_low")) sumMAP_all <- melt(summaryMAP_all, id.vars = c("MAPbin","N"), measure = c("range_hi", "range_low","sd_hi", "sd_low", "iqr_hi", "iqr_low", "madnorm_hi","madnorm_low","madq3_hi","madq3_low")) #Now all the limits into one long column, now fixing labels to remove hi and low #By country sumcountry_all$statistic <- factor(sumcountry_all$variable, levels = c("range_hi","range_low","sd_hi", "sd_low", "iqr_hi","iqr_low","madnorm_hi", "madnorm_low", "madq3_hi", "madq3_low"), labels = c("Range","Range","SD","SD","IQR","IQR", "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]")) sumcountry_all$statistic2 <-factor(sumcountry_all$statistic) #By grid sq sumgridsq_all$statistic <- factor(sumgridsq_all$variable, levels = c("range_hi","range_low","sd_hi", "sd_low", "iqr_hi","iqr_low","madnorm_hi", "madnorm_low", "madq3_hi", "madq3_low"), labels = c("Range","Range","SD","SD","IQR","IQR", "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]")) sumgridsq_all$statistic2 <-factor(sumgridsq_all$statistic) #By alt sumalt_all$statistic <- factor(sumalt_all$variable, levels = c("range_hi","range_low","sd_hi", "sd_low", "iqr_hi","iqr_low","madnorm_hi", "madnorm_low", "madq3_hi", "madq3_low"), labels = c("Range","Range","SD","SD","IQR","IQR", "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]")) sumalt_all$statistic2 <-factor(sumalt_all$statistic) #By MAP sumMAP_all$statistic <- factor(sumMAP_all$variable, levels = c("range_hi","range_low","sd_hi", "sd_low", "iqr_hi","iqr_low","madnorm_hi", "madnorm_low", "madq3_hi", "madq3_low"), labels = c("Range","Range","SD","SD","IQR","IQR", "MAD[norm]","MAD[norm]","MAD[Q3]","MAD[Q3]")) sumMAP_all$statistic2 <-factor(sumMAP_all$statistic) # #Plots for Fig 9 # #By country sumcountry_all$country2 <- factor(sumcountry_all$Country, levels = c("Ireland","UK","France","Belgium", "Netherlands","Germany","Denmark","Norway", "Finland","Austria","Czech Rep","Italy", "Croatia","Bulgaria","Greece","Turkey")) sumcountry_all$statistic2 <- factor(sumcountry_all$statistic, levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"), labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3]))) fig_9A <- ggplot(data = subset(sumcountry_all[!is.na(sumcountry_all$Country2), ], N > 4), aes(country2, value)) + stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8, colour = "black") + facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) + scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) + labs(x="Country", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030"))) + theme_bw() + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6), axis.text.y=element_text (size=8), axis.title=element_text(size=8), strip.text=element_text(size=8)) # #By grid sq sumgridsq_all$Grid_sq2 <- factor(sumgridsq_all$Grid_sq, levels = c("A4","A5","B3","B4","B5","C4","D3","D4", "E2","E3", "E4","F2","F3","H2")) sumgridsq_all$statistic2 <- factor(sumgridsq_all$statistic, levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"), labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3]))) # fig_9B <- ggplot(data = subset(sumgridsq_all[!is.na(sumgridsq_all$Grid_sq2), ], N > 4), aes(Grid_sq, value)) + stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8, colour = "black") + facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) + scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) + labs(x="Grid Square", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030"))) + theme_bw() + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 4), axis.text.y=element_text (size=8), axis.title=element_text(size=8), strip.text=element_text(size=8)) # #By alt sumalt_all$statistic2 <- factor(sumalt_all$statistic, levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"), labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3]))) fig_9C <- ggplot(data = subset(sumalt_all[!is.na(sumalt_all$altbin2), ], N > 4), aes(altbin2, value)) + stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8,colour = "black") + facet_grid(. ~ statistic2, margins = FALSE, labeller = label_parsed) + scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) + labs(x="Altitude (m)", y = expression("'Local' "*delta^18 * O[PO4] *" signal "("\u2030")))+ theme_bw() + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6), axis.text.y=element_text (size=8), axis.title=element_text(size=8), strip.text=element_text(size=8)) # #By MAP sumMAP_all$statistic2 <- factor(sumMAP_all$statistic, levels = c("Range","SD","IQR", "MAD[norm]", "MAD[Q3]"), labels = c("Range",bquote(2*"SD"),bquote(1.5*"IQR"), bquote(3*"MAD"[norm]), bquote(3*"MAD"[Q3]))) fig_9D <- ggplot(data = subset(sumMAP_all[!is.na(sumMAP_all$MAPbin), ], N > 4), aes(MAPbin, value)) + stat_summary(fun.y = range, fun.ymin = min, fun.ymax = max, geom = "crossbar", width = 0.2, size = 0.8, colour = "black") + facet_grid(. ~ statistic2, margins = FALSE,labeller = label_parsed) + scale_y_continuous(breaks = c(4,6,8,10,12,14,16,18,20,22,24)) + labs(x=expression("Site "*delta^18 * O[MAP] *" "("\u2030")), y = expression("'Local' "*delta^18 * O[PO4]*" signal "("\u2030"))) + theme_bw() + theme(axis.text.x=element_text(angle = 90, hjust = 1, size = 6), axis.text.y=element_text (size=8), axis.title=element_text(size=8), strip.text=element_text(size=8)) #Printing Fig 9 cairo_ps("fig9.eps", width = 7.5, height = 8.75, pointsize = 8) grid.arrange(arrangeGrob(fig_9A,fig_9B,fig_9C,fig_9D, heights=c(0.265,0.235, 0.255,0.255), ncol=1)) grid.text('a', x=0.03, y=0.98, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(1,1)) grid.text('b', x=0.03, y=0.726, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(2,1)) grid.text('c', x=0.03, y=0.495, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(3,1)) grid.text('d', x=0.03, y=0.245, gp = gpar(fontsize = 12, fontface="bold"),vp=vplayout(4,1)) dev.off() # # ############################################################################ #Fig 10 #This is done on the PID dataset ONLY, i.e. presorted so that only samples with PID=1 are included. #Also note that R can't cope with the non-std letters, so Velim V's name is "Velim Veli_tak" Odataset$Site_combined2 <- factor(Odataset$Site_combined, levels = c("Ban Non Wat","Kaminaljuyu","Velim-Veli_tak"), labels = c("Ban Non Wat","Kaminaljuyu","Velim Velistak")) fig_10 <- ggplot(Odataset[!is.na(Odataset$Site_combined2), ]) + geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.4) + facet_grid(Site_combined2 ~ ., margins = FALSE) + scale_x_continuous(breaks = c(14,16,18,20)) + labs(y="Count", x = expression(delta^18 * O[PO4] *" "("\u2030"))) + theme_bw() # #Printing Fig 10 cairo_ps("fig10.eps", width = 5.2, height = 6, pointsize = 12) fig_10 dev.off() #