############################################################################ # R code to identify outliers from a univariate dataset, for multiple sites. ############################################################################ #As used in the paper: #Lightfoot E and O’Connell TC (2016). #"On The Use of Biomineral Oxygen Isotope Data to Identify Human Migrants in the Archaeological Record: #Intra-Sample Variation, Statistical Methods and Geographical Considerations." PLoS ONE 11(4). #doi:10.1371/journal.pone.0153850 # ############################################################################ # To cite this code, please cite: # O’Connell, TC. 2016. R code, https://www.repository.cam.ac.uk/handle/1810/252773 ############################################################################ # #Please save this file as OConnell_R_code_outlier_ident.R to use # #This file contains code to produce: # ** an histogram of d18O by each site in your dataset, exported as "histogram.eps" # ** the summary statistics data by site, as well as outlier limits for the methods # of SD, IQR, and MAD_norm. # ** a summary sheet for all data by site, including outlier numbers # ** an exported version of the latter summary sheet as a .csv file, called “Odataset_stats.csv” # #For all the code, it relies on the use of a primary dataset called “Odataset”. #To use the code as it stands, you will need to import your data as a file called “Odataset”. #There should be a column with your site name(s) called “Site” #and a column with the oxygen isotope data called “d18O” # ############################################################################ ### WARNINGS ### ############################################################################ #Sometimes the figures are so complicated that R can’t cope with plotting them and gives a warning message that ends in #Polygon edge not found”. Usually, just rerunning the code produces the figure. #In extreme situations, a reboot of R does the trick. # # ############################################################################ ### The Code ### ############################################################################ #Clear out any previous items in the environment, and clear out graphics rm(list=ls()) graphics.off() #Load relevant packages before using them library(ggplot2) library(aplpack) library (car) library (psych) library(Hmisc) library(grid) library(gridExtra) library(plyr) # ############################################################################ ### Input yr data ### ############################################################################ # # Upload relevant dataset, assumed to be called Odataset.txt # # ############################################################################ ### Histogram of each individual’s oxygen isotopic value for each site in the dataset. ############################################################################ # histogram <- ggplot(Odataset) + geom_histogram(aes(x=d18O), stat = "bin",binwidth=0.2) + facet_grid(Site ~ ., margins = FALSE) + labs(y="Count", x = expression(delta^18 * O[PO4] * " "("\u2030"))) + theme_bw() # #If it is not phosphate data, then can change the x expression to carbonate or nothing by #substituting what is in the square brckets # # Printing figure #Need to export to something that can handle delta values. I use cairo, and export to eps. #You can also export to pdf using cairo_pdf cairo_ps("histogram.eps", width = 5.2, height = 4, pointsize = 12) histogram dev.off() # ############################################################################ ### Summary statistics data by site. ### ############################################################################ # #For each site in your dataset, this calculates a series of different statistics, as well as #outlier limits for the methods of SD, IQR, and MAD_norm. #In our paper, we do not recommend the use of the SD method, but I include it here for completeness. # Odataset_summary <- ddply(Odataset, .(Site), summarize, N = length(d18O), Mean=mean(d18O), Median=median(d18O), Range=(max(d18O) - min(d18O)), SD=sd(d18O), Max=max(d18O),Min=min(d18O), IQR=IQR(d18O), MAD_norm=mad(d18O), Spread_2SD=(4*sd(d18O)), Spread_1.5IQR=(4*IQR(d18O)), Spread_3MAD_norm=(6*mad(d18O)), SD_hi=(mean(d18O)+2*sd(d18O)), SD_low=(mean(d18O)-2*sd(d18O)), IQR_hi=(quantile(d18O,3/4)+1.5*IQR(d18O)), IQR_low=(quantile(d18O,1/4)-1.5*IQR(d18O)), MAD_norm_hi=(median(d18O)+3*mad(d18O)), MAD_norm_low=(median(d18O)-3*mad(d18O)) ) # #Then merge the two datasets together, so that the summary data by site are also listed for #each individual in the full dataset #You have some redundant (doubled) variables in there, but it doesn't matter Odataset_merge <- merge(Odataset, Odataset_summary, by = "Site") #Now for working out whether an individual is an outlier or not for their site by each method #Methods = 2SD, 1.5IQR, 3MAD_norm Odataset_merge$Outlier_2SD <- ifelse(Odataset_merge$d18O > Odataset_merge$SD_hi | Odataset_merge$d18O < Odataset_merge$SD_low, 1, 0) Odataset_merge$Outlier_1.5IQR <- ifelse(Odataset_merge$d18O > Odataset_merge$IQR_hi | Odataset_merge$d18O < Odataset_merge$IQR_low, 1, 0) Odataset_merge$Outlier_3MAD_norm <- ifelse(Odataset_merge$d18O > Odataset_merge$MAD_norm_hi | Odataset_merge$d18O < Odataset_merge$MAD_norm_low, 1, 0) #Now creating a summary sheet for all data by site, including outlier percentages and numbers Odataset_stats <- ddply(Odataset_merge, .(Site), summarize, N = length(d18O), Mean=mean(d18O), SD=sd(d18O), Median=median(d18O), Skew=round(skew(d18O),3), Range=(max(d18O)-min(d18O)), Max=max(d18O), Min=min(d18O), IQR=IQR(d18O), MAD_norm=mad(d18O), Spread_2SD=(4*sd(d18O)), Spread_1.5IQR=(4*IQR(d18O)), Spread_3MAD_norm=(6*mad(d18O)), Outlier_2SD=sum(Outlier_2SD), Outlier_1.5IQR=sum(Outlier_1.5IQR), Outlier_3MAD_norm=sum(Outlier_3MAD_norm) ) # ############################################################################ ### Exporting data to a csv file ############################################################################ write.csv(Odataset_stats, file="Odataset_stats.csv", sep = ",", col.names = NA, qmethod = "double") #