############################################################################
#  R code to identify outliers from a univariate dataset, for multiple sites.
############################################################################
#As used in the paper:
#Lightfoot E and O’Connell TC (2016). 
#"On The Use of Biomineral Oxygen Isotope Data to Identify Human Migrants in the Archaeological Record: 
#Intra-Sample Variation, Statistical Methods and Geographical Considerations." PLoS ONE 11(4).
#doi:10.1371/journal.pone.0153850
#
############################################################################
#  To cite this code, please cite:
#  O’Connell, TC. 2016. R code, https://www.repository.cam.ac.uk/handle/1810/252773
############################################################################
#
#Please save this file as OConnell_R_code_outlier_ident.R to use
#
#This file contains code to produce:
#	** an histogram of d18O by each site in your dataset, exported as "histogram.eps"
#	** the summary statistics data by site, as well as outlier limits for the methods 
#		of SD, IQR, and MAD_norm. 
#	** a summary sheet for all data by site, including outlier numbers
#	** an exported version of the latter summary sheet as a .csv file, called “Odataset_stats.csv”
#
#For all the code, it relies on the use of a primary dataset called “Odataset”.
#To use the code as it stands, you will need to import your data as a file called “Odataset”.
#There should be a column with your site name(s) called “Site”
#and a column with the oxygen isotope data called “d18O”
#
############################################################################
### WARNINGS ###
############################################################################
#Sometimes the figures are so complicated that R can’t cope with plotting them and gives a warning message that ends in
#Polygon edge not found”. Usually, just rerunning the code produces the figure.
#In extreme situations, a reboot of R does the trick.
#
#
############################################################################
### The Code ###
############################################################################
#Clear out any previous items in the environment, and clear out graphics
rm(list=ls())
graphics.off()
#Load relevant packages before using them
library(ggplot2)
library(aplpack)
library (car)
library (psych)
library(Hmisc)
library(grid)
library(gridExtra)
library(plyr)
#
############################################################################
### Input yr data ###
############################################################################
#
#  Upload relevant dataset, assumed to be called Odataset.txt
#
#
############################################################################
###  Histogram of each individual’s oxygen isotopic value for each site in the dataset.
############################################################################
#
histogram <- ggplot(Odataset) + 
  geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.2) + 
  facet_grid(Site ~ ., margins = FALSE) + 
  labs(y="Count", x = expression(delta^18 * O[PO4]  * " "("\u2030"))) +
  theme_bw()
#
#If it is not phosphate data, then can change the x expression to carbonate or nothing by 
#substituting what is in the square brckets
#
#  Printing figure
#Need to export to something that can handle delta values. I use cairo, and export to eps.
#You can also export to pdf using cairo_pdf
cairo_ps("histogram.eps", width = 5.2, height = 4, pointsize = 12)
histogram
dev.off()
#
############################################################################
###  Summary statistics data by site. ###
############################################################################
#
#For each site in your dataset, this calculates a series of different statistics, as well as
#outlier limits for the methods of SD, IQR, and MAD_norm. 
#In our paper, we do not recommend the use of the SD method, but I include it here for completeness.
#
Odataset_summary <- ddply(Odataset, .(Site), summarize,
                              N = length(d18O), Mean=mean(d18O), Median=median(d18O),
                              Range=(max(d18O) - min(d18O)), SD=sd(d18O),
                              Max=max(d18O),Min=min(d18O),
                              IQR=IQR(d18O), MAD_norm=mad(d18O),
                              Spread_2SD=(4*sd(d18O)), 
                              Spread_1.5IQR=(4*IQR(d18O)), 
                              Spread_3MAD_norm=(6*mad(d18O)),
                              SD_hi=(mean(d18O)+2*sd(d18O)), SD_low=(mean(d18O)-2*sd(d18O)),
                              IQR_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), 
                              IQR_low=(quantile(d18O,1/4)-1.5*IQR(d18O)),
                              MAD_norm_hi=(median(d18O)+3*mad(d18O)), 
                              MAD_norm_low=(median(d18O)-3*mad(d18O)) )
#
#Then merge the two datasets together, so that the summary data by site are also listed for
#each individual in the full dataset
#You have some redundant (doubled) variables in there, but it doesn't matter
Odataset_merge <- merge(Odataset, Odataset_summary, by = "Site")

#Now for working out whether an individual is an outlier or not for their site by each method
#Methods = 2SD, 1.5IQR, 3MAD_norm
Odataset_merge$Outlier_2SD <- ifelse(Odataset_merge$d18O > Odataset_merge$SD_hi | Odataset_merge$d18O < Odataset_merge$SD_low, 1, 0)
Odataset_merge$Outlier_1.5IQR <- ifelse(Odataset_merge$d18O > Odataset_merge$IQR_hi | Odataset_merge$d18O < Odataset_merge$IQR_low, 1, 0)
Odataset_merge$Outlier_3MAD_norm <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_norm_hi | Odataset_merge$d18O < Odataset_merge$MAD_norm_low, 1, 0)

#Now creating a summary sheet for all data by site, including outlier percentages and numbers
Odataset_stats <- ddply(Odataset_merge, .(Site), summarize,
                                N = length(d18O), 
                                Mean=mean(d18O),
                                SD=sd(d18O), 
                                Median=median(d18O),
                                Skew=round(skew(d18O),3),
                                Range=(max(d18O)-min(d18O)),
                                Max=max(d18O),
                                Min=min(d18O),
                                IQR=IQR(d18O),
                                MAD_norm=mad(d18O),
                                Spread_2SD=(4*sd(d18O)), 
                                Spread_1.5IQR=(4*IQR(d18O)), 
                                Spread_3MAD_norm=(6*mad(d18O)),
                                Outlier_2SD=sum(Outlier_2SD),
                                Outlier_1.5IQR=sum(Outlier_1.5IQR),
                                Outlier_3MAD_norm=sum(Outlier_3MAD_norm) )
#
############################################################################
### Exporting data to a csv file
############################################################################
write.csv(Odataset_stats, file="Odataset_stats.csv", sep = ",", col.names = NA, qmethod = "double")
#