## MRDS-with-uncertain-dups.R ## Code for the paper "Accounting for uncertainty in duplicate identification and group size judgments in ## mark-recapture distance sampling", by ## O.N.P. Hamilton, S.E. Kincaid, R. Constantine, L. Kozmian-Ledward, C.G. Walker, & R.M. Fewster ## All code in this file is written by Rachel Fewster, r.fewster@auckland.ac.nz. ## Code for imputing missing group-size values is written by Sophie Kincaid and is ## supplied separately in Impute.R (automatically sourced when this file is sourced). ## ---------------------------------------------------------------------------------------------------------------------- ## DATA AND FILES REQUIRED: ## ---------------------------------------------------------------------------------------------------------------------- ## ## 1. Raw data: this is the data frame with one row for every sighting by every observer. ## There is no pairing of sightings into duplicates in the raw data file. ## The raw data for our analysis is in the R object dolphin.dat, which is automatically loaded ## from the file "SAVE-data-and-survey-lengths.RData" when the file MRDS-with-uncertain-dups.R is sourced. ## See below for details of the required columns in dolphin.dat. ## ## 2. transect.lengths.by.survey.in.km : this is a vector of length K specifying the length of transect ## flown for each of the K aerial surveys flown in the overall data set. In our case, K=22. ## The transect length vector for our 22 surveys is automatically loaded ## from the file "SAVE-data-and-survey-lengths.RData" along with the data. ## ## ---------------------------------------------------------------------------------------------------------------------- ## COLUMNS IN THE RAW DATA ## ---------------------------------------------------------------------------------------------------------------------- ## The raw data (dolphin.dat in our case) needs the following columns: ## ## - Survey, transect.id and transect.label: transect indicators. Transects in our survey were divided ## into segments with short breaks between segments where survey effort was interrupted by land, ## e.g. on transect 1 there were segments 1A, 1B,..., 1F as denoted by the column transect.label. ## If transects are not divided into segments, the transect.label column can repeat the transect.id column. ## Sightings are only treated as candidate duplicates if they are from the same Survey and transect segment. ## If there is only one aerial survey, put Survey=1 everywhere (the code will look for the Survey column). ## ## - object: a sighting ID. ## - angle: downward angle of the sighting from the aircraft. ## ## - hour, minute, second, am.pm : time of sighting: these columns are converted to seconds-since-midnight ## in the function compile.data.func() below. ## ## - distance : derived perpendicular distance of sighting, using the downward angle and the formula in ## Appendix S2 of the paper. Altitude in our case was 152.4 metres. For example, an angle of ## 20 degrees is 418 metres from the trackline; an angle of 90 degrees is 0 metres from the trackline. ## ## - ObserverCode: the observer who made the sighting. ## - ObserverPosition: M1: front left; M2: front right; M3: rear left; M4: rear right. ## Thus the observer teams are front (M1-M2) and rear (M3-M4). ## Sightings are only considered as candidate duplicates if they are from the same side of the ## aircraft and viewed by different observer teams: pairings M1-M3 or M2-M4. ## ## - min, best, max: observers' judgements for group size. 
Often include missing data (NA). ## ---------------------------------------------------------------------------------------------------------------------- ## The next two lines are automatically run when this code file is sourced: load("SAVE-data-and-survey-lengths.RData") ## Load in the raw data and the transect lengths flown load("SAVE-modelgrid.RData") ## Load in the model grid for the suite of MRDS models investigated source("Impute.R") ## Source Sophie Kincaid's code for imputing missing group-size values require(mgcv) if(F){ ##---------------------------------------------------------------------------------------------------------------- ## HOW TO USE: ##---------------------------------------------------------------------------------------------------------------- ## Cut and paste the commands below. ## First source the code in this file: source("MRDS-with-uncertain-dups.R") ## This auto-loads the second code file Impute.R and the data dolphin.dat and transect lengths flown ## for each of the 22 surveys in our data. ## It also loads the object grid.allmodels, which is a list of 512 possible models: see Hamilton et al ## Appendix S3. ## Look at the data: head(dolphin.dat) ## Look at Model 198: the selected model in the Hamilton et al paper. ## The line below specifies which of the 15 parameters (alpha1, ..., epsilon3) are included in model 198: ## see Hamilton et al, Appendix S3 for details. grid.allmodels[[198]] ## To get started, try out an MRDS model with bootstrap-by-survey (ER included), ## fixed time and angle tolerances of T=5 seconds and A=10 degrees, ## and B=5 replicates, each using model 198. (In practice, B should be much greater than 5.) ## Note that mrds.wrap also takes arguments for survey area and transect lengths flown per survey ## that are defaulted to the values used in Hamilton et al. ## Assign the output to res.fixed.ER. This takes a few minutes as each replicate involves three model ## fits: one for each of the three protocols, "impute.all", "impute.missing", and "impute.none" for group sizes. res.fixed.ER <- mrds.wrap(dolphin.dat, fixedTol=T, ## run as a fixed tolerance analysis fixed.args=c(timeTol=5, angleTol=10), ## fixed tolerance choices bootstrap.by.survey=T, ## ER included model=grid.allmodels[[198]], ## model 198 Nsim=5, ## B=5 plotit=T) ## Plots model fits for each of the B iterations res.fixed.noER <- mrds.wrap(dolphin.dat, fixedTol=T, ## run as a fixed tolerance analysis fixed.args=c(timeTol=5, angleTol=10), ## fixed tolerance choices bootstrap.by.survey=F, ## ER omitted model=grid.allmodels[[198]], ## model 198 Nsim=5, ## B=5 plotit=T) ## Plots model fits for each of the B iterations ## Plot the results and view the summary text. ## (Run with B much greater than 5 for a more meaningful display.) ## The following line displays boxplots of the results and prints summary output to the screen. ## Any results deemed ludicrous (over 6000 here) are discarded before plotting and are excluded from ## summaries: set discard.above=Inf if no results are to be discarded. ## Use plot.legend=F to turn off the legend. ## This next line detects all objects with names that begin with "res.fixed", and plots them in a single boxplot. mrds.fixedtol.boxplot(stem="res.fixed", paperplot=T, mark.angletol=F, discard.above=6000) ## Now try out a similar analysis using the probabilistic "discrepancy"-based approach: ## the mixture model fit will be plotted for every iteration, as well as the fitted MRDS models. 
res.discrep.ER <- mrds.wrap(dolphin.dat, fixedTol=F, ## run as a probabilistic analysis bootstrap.by.survey=T, ## ER included model=grid.allmodels[[198]], ## model 198 Nsim=5, ## B=5 plotit=T) ## Plots model fits for each of the B iterations res.discrep.noER <- mrds.wrap(dolphin.dat, fixedTol=F, ## run as a probabilistic analysis bootstrap.by.survey=F, ## ER omitted model=grid.allmodels[[198]], ## model 198 Nsim=5, ## B=5 plotit=T) ## Plots model fits for each of the B iterations ## Plot the results and view the summary text. mrds.discrep.boxplot(stem="res.discrep", discard.above=6000) ## ------------------------------------------------------------------------------ ## Check model fit for a single set of bootstrap data: ## ------------------------------------------------------------------------------ ## First generate the data (using a random seed specified below) and fit the desired model, e.g. model 198: ## check.modelfit.func defaults to a probabilistic discrepancy analysis with impute.all protocol. onefit.198 <- check.modelfit.func(seed=3, model=grid.allmodels[[198]]) ## Subsequently, use the ready-fitted onefit.198 for further plots. ## Use plot.type="curves" to display the detection function curves: check.modelfit.func(seed=3, readyfit=onefit.198, plot.type="curves") ## Use plot.type="gof" to display goodness-of-fit plots as in Hamilton et al Appendix S3: check.modelfit.func(seed=3, readyfit=onefit.198, plot.type="gof") ## The functions above give quick results with only B=5 iterations. ## Now set B=500 or B=2000 and re-run to replicate the analyses in Hamilton et al. ## ------------------------------------------------------------------------------------------------- ## Model-selection and inspection: ## ------------------------------------------------------------------------------------------------- ## For the model-selection process in Hamilton et al, use the functions grid.mrds.func and grid.process.func. ## This analysis is very lengthy, as all 512 models are fit to each data set. ## The data sets are generated by specified random seeds, then all 512 models are fitted. This is done one ## dataset at a time by grid.mrds.func. ## For example, to create a set of results (called results set "C") where each result consists of all 512 possible ## models applied to the same set of data, use commands as follows. ## The first set of data is bootstrapped data created with random seed 42 and using the probabilistic ## discrepancy analysis (fixedTol=F). The command below fits all 512 models to this data set and stores ## the results in Cbootgrid.mrds.res42. Cbootgrid.mrds.res42 <- grid.mrds.func(seed=42, fixedTol=F, fixed.args=NULL, groupsize.col = "impute.all", bootstrap.by.survey=T) ## The second set of data follows the same protocol but is created with random seed 315: Cbootgrid.mrds.res315 <- grid.mrds.func(seed=315, fixedTol=F, fixed.args=NULL, groupsize.col = "impute.all", bootstrap.by.survey=T) ## And so on for numerous different datasets: we used 20 in Hamilton et al. ## Finally process the results. resCgrid <- grid.process.func(boot=T, which.set="C") ## resCgrid contains many items for inspection and further processing. See grid.process.func for details. 
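## ------------------------------------------------------------------------------
## The "distance" column in the raw data is the perpendicular distance derived from the downward
## angle using the Appendix S2 formula. A minimal sketch of that conversion is below, assuming the
## 152.4 metre survey altitude quoted above; angle.to.distance is an illustrative helper only,
## not a function defined in this file.
angle.to.distance <- function(angle.deg, altitude.m=152.4){
   ## Perpendicular distance (metres) from the trackline for a downward sighting angle in degrees:
   altitude.m / tan(angle.deg*pi/180)
}
angle.to.distance(20)   ## approximately 418 metres, matching the example above
angle.to.distance(90)   ## effectively 0 metres: the group is directly below the aircraft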
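## ------------------------------------------------------------------------------
## In the probabilistic analysis, each pair's discrepancy r is converted into an acceptance probability
## P(duplicate | r) = cpt1/(cpt1 + cpt2) inside discrepancy.func. The sketch below re-creates that curve
## from the mixture estimates that mrds.wrap stores for each iteration. It assumes res.discrep.ER from
## the commands above is available and that its first iteration fitted successfully, and it re-uses
## dtruncNorm.func, the truncated-Normal density helper used throughout this file.
fit1 <- unlist(res.discrep.ER$discrep.fit[1,])   ## mu1, sig1, mu2, sig2, dupP for iteration 1
rvals <- seq(0.01, 1, length=100)
cpt1 <- fit1["dupP"]*dtruncNorm.func(rvals, m=fit1["mu1"], s=fit1["sig1"])
cpt2 <- (1-fit1["dupP"])*dtruncNorm.func(rvals, m=fit1["mu2"], s=fit1["sig2"])
plot(rvals, cpt1/(cpt1+cpt2), type="l", xlab="Discrepancy, r", ylab="P( accept as a duplicate | r )")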
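## ------------------------------------------------------------------------------
## mrds.wrap converts the estimated number of individuals in the covered strips (the Nchat column)
## into a density and a survey-region total, using covered area = 2*(w/1000)*length flown (km^2) and
## the 4352.369 km^2 region area. A minimal arithmetic sketch with made-up numbers (not results from
## the paper):
Nchat.example <- 120                        ## hypothetical estimate of individuals in the covered strips
covered.example <- 2*(400/1000)*2200        ## w=400 m either side of 2200 km flown: 1760 km^2
Nchat.example/covered.example               ## density: about 0.068 individuals per km^2
(Nchat.example/covered.example)*4352.369    ## scaled to the survey region: about 297 individuals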
} ## ---------------------------------------------------------------------------------------------------------------------- ## FUNCTIONS START HERE ## ---------------------------------------------------------------------------------------------------------------------- ## ---------------------------------------------------------------------------------------------------------------------- ## mrds.wrap is the main wrapper code for all MRDS analyses ## Calls functions needed to create duplicate-reconciled data and fit an MRDS model. ## ---------------------------------------------------------------------------------------------------------------------- mrds.wrap <- function(raw.dat=dolphin.dat, ## raw.dat is the raw data with no duplicate matches: one row per sighting per observer. survey.lengths=transect.lengths.by.survey.in.km, ## surveylengths gives transect lengths flown for each of the K aerial surveys, in order from 1:K. surveyRegionArea=4352.369, ## total area of the survey region in km^2, excluding islands and peninsulas fixedTol=T, ## determines whether to use fixed tolerance or probabilistic analysis fixed.args=(if(fixedTol) c(timeTol=10, angleTol=15) else NULL), ## choice of fixed-tolerance arguments if using this analysis: defaults to T=10, A=15 in line above discrep.args=(if(fixedTol) NULL else c(discrepmax = 1, tdiffmax = 400, histbar = 0.1, plot.discrep=T, print.discrep=F)), ## choice of probabilistic arguments if using the probabilistic analysis bootstrap.by.survey=T, ## whether or not to include bootstrap by survey (bootstrap.by.survey=T is "ER-included" in paper) imputecolnames=c("impute.all", "impute.missing", "impute.none"), ## which of the impute types to include for group size imputation allparnames = c("al1", "al2", "al3", "be1", "be2", "be3", "gam1", "gam2", "gam3", "delt1", "delt2", "delt3", "eps1", "eps2", "eps3"), ## specifies the full set of parameter names for MRDS detection functions model = allparnames, ## specifies which of the parameters to include in the model: e.g. specifying ## model=c("al1","al2","al3","be1","be2","be3") would denote the model with only alpha and beta ## parameters start.pars = c(al1 = 3, al2 = 3, al3 = 3, be1 = -0.03, be2 = -0.03, be3 = -0.03, gam1 = -1e-04, gam2 = -1e-04, gam3 = -1e-04, delt1 = 0.02, delt2 = 0.02, delt3 = 0.02, eps1 = 1e-05, eps2 = 1e-05, eps3 = 1e-05), ## starting values for detection model parameters for likelihood maximization routine w=400, ## truncation distance in metres Nsim=100, ## number of bootstrap replicates (B in the paper) front.ymin=60, ## minimum distance in metres at which the front observer team can see plotit=T, paperplot=F){ ## mrds.wrap 1/4/16 ## MRDS wrapper: accommodates EITHER: ## - fixed time and angle tolerances (fixedTol=T and fixed.args supplied, discrep.args=NULL); ## OR: ## - probabilistic discrepancy-based analysis (fixedTol=F and discrep.args supplied, fixed.args=NULL); ## ## For either choice, include bootstrap or not according to bootstrap.by.survey=T or F. ## ## Surveys in the data must be labeled 1, 2, ..., K, where K=length(survey.lengths) is the number of surveys. ## ## This function: ## 1. Takes the raw data supplied with one row per sighting (no duplicate reconciliation). ## 2. Uses region area and total transect length supplied. ## 3. 
## If fixedTol=T (fixed tolerance analysis):
## for the given time tolerance and angle tolerance, and Nsim times, creates a data frame with
## duplicates resolved for this time and angle tolerance;
## note that this will not be the same each time, because there is randomness involved in
## decisions for resolving clusters of multiple duplicates. This step uses compile.data.func.
## If fixedTol=F (discrepancy analysis):
## fits the discrepancy.func mixture model to determine P(duplicate)=Pdup for every pair of sightings,
## then selects which pairs are declared duplicates at random with probabilities given by Pdup, and
## resolves any clusters of multiple duplicates. This again all happens within compile.data.func.
## Note that the discrepancy model is re-fit for every iteration in mrds.wrap, which is necessary
## for bootstrapping. (If not bootstrapping, this does mean repeatedly fitting the same
## discrepancy model to the same data, but no matter as it is very quick (~0.02sec).)
## 4. Given the data for a single iteration, uses the code in Impute.R by Sophie Kincaid to add three
## columns: impute.all, impute.missing, and impute.none, each imputing group size according to
## the respective protocol. This also happens within compile.data.func.
## 5. For each of the three impute protocols, fits the MRDS model specified to the resulting data, and
## calculates estimated density.
## 6. If bootstrap.by.survey=T, every iteration includes resampling the data according to survey.
## This is used to include encounter rate variance in the overall uncertainty.
## For example, if there are 22 surveys, each survey's data is included or not included as a whole,
## and might be included more than once. If the survey is included twice (say), the resolution of
## duplicates occurs separately for the different occurrences. Thus they might not be
## exact copies of each other in the data frame once multiple duplicates have been resolved.
##
## The output from this function is a list of three data frames, giving the Nsim results from each of
## the three impute protocols.
##
## If plotit=T, plots the detection functions from each iteration as it goes.
K <- length(survey.lengths)  ## Number of surveys
survnames <- 1:K
## ------------------------------------------------------------------------------------------------------------------
## Start the iterations:
## ------------------------------------------------------------------------------------------------------------------
## Assemble the results objects:
## Hard-code output column names, even if only some of them will be used:
colnam <- c("Nchat", allparnames, "minimum", "AIC", "code.123good", "iterations", "density.indivs", "total.indivs")
all.out <- list(impute.all=matrix(NA, nrow=Nsim, ncol=length(colnam)),
                impute.missing=matrix(NA, nrow=Nsim, ncol=length(colnam)),
                impute.none=matrix(NA, nrow=Nsim, ncol=length(colnam)))
colnames(all.out$impute.all)<-colnames(all.out$impute.missing)<-colnames(all.out$impute.none)<-colnam
## Now remove any that won't be included:
all.out <- all.out[imputecolnames]
## If it's a discrepancy (probabilistic) analysis, include another data frame giving the results of the
## discrepancy fit for each iteration.
if(!fixedTol){
    discrepnam <- c("mu1", "sig1", "mu2", "sig2", "dupP")
    all.out$discrep.fit <- matrix(NA, nrow=Nsim, ncol=length(discrepnam))
    colnames(all.out$discrep.fit) <- discrepnam
}
## Add another data frame that gives the transect length flown and the covered area for each iteration.
## If not bootstrapping, this should be the same for all iterations, but we'll keep the same data structure ## regardless. all.out$covered.area <- matrix(NA, nrow=Nsim, ncol=2) colnames(all.out$covered.area) <- c("lengthFlown.km", "coveredArea.km2") ## ------------------------------------------------------------------------------------------------------------------ ## Iterations: for(sim in 1:Nsim){ catline("\n================================\n") catline("Iteration ", sim, "\n") catline("Generating data...") ## If bootstrapping, resample by survey before resolving duplicates: if(bootstrap.by.survey){ cat(" Bootstrapping: ") boot.surv <- sample(x=survnames, size=length(survnames), replace=T) ## Find the covered area for this bootstrap replicate, in km^2. ## w is measured in metres, so use w/1000 to convert to km. totalLength.sim <- sum(survey.lengths[boot.surv]) coveredArea.sim <- 2*(w/1000)*totalLength.sim catline(boot.surv) catline("Total length flown (km):", totalLength.sim) catline("Covered area (km^2):", coveredArea.sim) ## Create the bootstrap data: base.dat <- NULL ## Insert the data from the corresponding survey into base.dat, BUT give it a new ## survey number. For example, if true-data survey 1 was selected twice in the bootstrap, ## we want to treat these as if they are different surveys when it comes to pairing off ## duplicates. To distinguish from the real surveys, call the bootstrap-data ## surveys 101, 102, ..., 122. for(bs.ind in 1:length(boot.surv)) { bs <- boot.surv[bs.ind] dat.surv <- raw.dat[raw.dat$Survey==bs,] dat.surv$Survey <- rep(100+bs.ind, nrow(dat.surv)) base.dat <- rbind(base.dat, dat.surv) } } else { ## If not bootstrapping, use the fixed data set and the fixed survey.lengths: totalLength.sim <- sum(survey.lengths) coveredArea.sim <- 2*(w/1000)*totalLength.sim catline("Total length flown (km):", totalLength.sim) catline("Covered area (km^2):", coveredArea.sim) base.dat <- raw.dat } ## Enter the area covered details into the data frame: all.out$covered.area[sim,] <- c(totalLength.sim, coveredArea.sim) ## Generate data by reconciling duplicates and imputing group sizes by the three protocols. ## Use "try" here, because if there are no duplicates then compile.data.func will crash, ## but we won't get a successful fit anyway. So crash it at the compile.data.func stage and ## just skip the result in the bootstrapping. dat.sim.objs <- try(compile.data.func(data=base.dat, fixedTol=fixedTol, fixed.args=fixed.args, discrep.args=discrep.args, printit = F, paperplot=paperplot)) ## Early exit if running for plots only: if(paperplot){ print("Plotting complete: quitting function because of paperplot=T argument.") if(!inherits(dat.sim.objs, "try-error")) return(dat.sim.objs$data.out) else return(NULL) } ## Only proceed if compile.data.func was successful: if(!inherits(dat.sim.objs, "try-error")){ dat.sim <- dat.sim.objs$data.out ## If it's a discrepancy analysis, put the discrepancy fit into all.out$discrep.fit: if(!fixedTol){ all.out$discrep.fit[sim,] <- dat.sim.objs$discrep.summary } ## Continue to fit models: ## use "try" again so that if something goes wrong, the whole function won't crash. for(imputecol in imputecolnames){ ## Fit the model for this impute column: catline("Fitting model using", imputecol, ".... 
") res.sim.col <- try( mrds.func(dat.in=dat.sim, groupsize.col=imputecol, allparnames=allparnames, model=model, w=w, front.ymin=front.ymin, start.pars=start.pars, printit=F)) ## If fitting was successful, append results to resmat, and plot if required: if(!inherits(res.sim.col, "try-error")){ ## Derive the estimated density of individuals and total number of individuals ## in the survey region. Use as.vector here to purge pesky names attributes. if(plotit) mrds.plot(res.sim.col) derived.ests <- as.vector(c(res.sim.col["Nchat"]/coveredArea.sim, res.sim.col["Nchat"]/coveredArea.sim*surveyRegionArea)) names(derived.ests) <- c("density.indivs", "total.indivs") res.sim.col <- c(res.sim.col, derived.ests) all.out[[imputecol]][sim,] <- res.sim.col[colnam] } } ## End of fitting the model for a particular impute column. } ## End of successful execution of compile.data.func. } ## End of iterations. ## Convert from matrices into data frames: if("impute.all" %in% imputecolnames) all.out$impute.all <- data.frame(all.out$impute.all) if("impute.missing" %in% imputecolnames) all.out$impute.missing <- data.frame(all.out$impute.missing) if("impute.none" %in% imputecolnames) all.out$impute.none <- data.frame(all.out$impute.none) all.out$covered.area <- data.frame(all.out$covered.area) ## Append extra details to all.out: if(fixedTol){ all.out$type <- c(fixedTol=T, discrep=F) all.out$timeTol <- as.vector(fixed.args["timeTol"]) all.out$angleTol <- as.vector(fixed.args["angleTol"]) } else{ all.out$type <- c(fixedTol=F, discrep=T) all.out$discrep.fit <- data.frame(all.out$discrep.fit) all.out$discrep.args <- discrep.args } all.out$w <- w all.out$front.ymin <- front.ymin all.out$start.pars <- start.pars all.out$area.details <- c(Nsurveys=K, Survey.area.km2=surveyRegionArea) all.out$survey.lengths <- survey.lengths all.out$call <- match.call() all.out$bootstrap.by.survey <- bootstrap.by.survey all.out } ############################################################ create.caphists.func <- function(dat.in, groupsize.col, w, front.ymin){ ## Create.caphists.func 1/4/16 ## Code that was previously at the top of mrds.func is now in a separate function that can ## be called by multiple functions. ## Takes a raw duplicate data frame, output from compile.data.func()$data.out, ## and creates the capture history, omega=1, 2, 3; the concensus distance, y; and the ## required groupsize, s, for every observation. ## groupsize.col could be "impute.missing", "impute.all", "impute.none". ## Find the groupsize vector: groupsize <- dat.in[, groupsize.col] if(length(groupsize)!= nrow(dat.in)) stop("Can't find groupsize column or something wrong?") ## Create the data frame for estimation: dat <- dat.in[, c("distance", "ObserverPosition", "distance.1", "ObserverPosition.1")] names(dat) <- c("ytmp1", "OP1", "ytmp2", "OP2") ## Ensure that ObserverPositions are characters, not factors: dat$OP1 <- as.character(dat$OP1) dat$OP2 <- as.character(dat$OP2) ## Decide a single distance, y, for duplicate observations: ## Start with the ytmp1 distance, which was "distance" in the original data frame: dat$y <- dat$ytmp1 ## If the ytmp2 distance, or "distance.1" in the original data frame, is present, then take the mean of ## the two distances as being the final distance for the observation. dat$y[!is.na(dat$ytmp2)] <- (dat$ytmp1[!is.na(dat$ytmp2)] + dat$ytmp2[!is.na(dat$ytmp2)] )/2 ## Add the imputed group sizes and call it column "s": dat$s <- groupsize ## If any group sizes are 0, remove them from the dataframe. 
## A group size of 0 implies that for this imputation, it was deemed to be a false entry.
## This can happen if the min, best, and max groupsize estimates are missing. The count value is in
## that case taken from a uniform distribution between 0 and the average group size for the whole data set.
dat <- dat[dat$s>0,]
## Remove any entries with y greater than the truncation distance, w:
dat <- dat[dat$y <= w,]
## Remove any entries where the front observer is given a sighting with distance y < front.ymin.
## [Missing here: the remainder of create.caphists.func and the opening of the discrepancy-analysis
##  function, down to the assignment below.]
ourNotDup.sameside.discrep <- discrep[sameside==1 & (tdiff>10 | adiff>15)]
## Add a column to dat that says whether this observation is in the ourNotDup.sameside.discrep category:
## dat$ourNotDup.sameSide <- as.numeric(sameside==1 & (tdiff>10 | adiff>15))
## Add column to say whether this sighting is included as a duplicate according to
## tdiff<=10 and adiff<=15:
dat$ourDup.sameSide <- as.numeric(tdiff<=10 & adiff<=15 & sameside==1)
## Estimate the Normal parameters that optimise the likelihood of the duplicates or non-duplicates that
## the fixed-tolerance analysis would have identified as above: this is for creating start-values for the mixture
## model.
likOneCpt.func <- function(ourpars){
    ## Likelihood for a single discrepancy x is
    ## dtruncNorm.func(x, m=mu, s=sig)
    ## Define normDat to be the data to be optimized, outside this function.
    mu <- ourpars[1]
    sig <- ourpars[2]
    if(mu<0 | sig<0) return(10000*max(abs(ourpars)))
    nllike <- sum(log( dtruncNorm.func(normDat, m=mu, s=sig) ))*(-1)
    if(printit) cat(format(ourpars, digits=3), " nllike=", nllike, "\n")
    ## If things have gone wrong, give a ridiculous answer:
    if(is.na(nllike)) nllike <- 10000
    if(is.infinite(nllike)) nllike <- 20000
    return(nllike)
}
normDat <- ourDup.discrep
ourDup.est <- nlm(likOneCpt.func, p=c(0.2, 0.2), hessian=F, typsize=c(0.2, 0.2), iterlim=1500, gradtol=1e-4)
ourDup.meanest <- ourDup.est$estimate[1]
ourDup.sdest <- ourDup.est$estimate[2]
if(printit){
    cat("=================================\n")
    cat("Estimating a truncated Normal density on pairs with tdiff<=10 and adiff<=15 :hardcoded values for plotting likely duplicates and for providing start values for mixture fit\n")
    print(ourDup.est)
    cat("\n\n")
}
## ------------------------------------------------------------------------------------------
## PLOTS
breakvals <- seq(0, by=histbar, length=ceiling(max(discrep)/histbar)+1)
densvals <- seq(1e-5, max(breakvals), length=200)
if(plotit & !paperplot){
    ## If paperplot=T, the layout is set in paper.discrepancy.plot; this whole code block is only wanted
    ## if paperplot=F.
    layout(matrix(1:4, byrow=F, ncol=2))
    par(mar=c(3, 3.5, 3, 1.5), mgp=c(2, 0.8, 0))
    ourDup.density <- dtruncNorm.func(densvals, m=ourDup.meanest, s=ourDup.sdest)
    if(length(ourDup.discrep)>0){
        hist(ourDup.discrep, breaks=breakvals, xlim=range(breakvals), ylim=c(0, max(c(6, ourDup.density))),
             probability=T,
             main=paste("Pairs with timeDiff <= 10 and angleDiff <= 15 and same side:\n estimated mean=",
                        round(ourDup.meanest,2), " ; sd=", round(ourDup.sdest, 2), sep=""),
             col="lightblue", xlab="Measured discrepancy between pair")
        lines(densvals, ourDup.density, col=4, lwd=2)
    }
}
## ------------------------------------------------------------------------------------------
## Discrepancies of observations that the fixed-tolerance analysis would NOT identify as
## duplicates with the hardcoded limits: again for providing start-values for the mixture model.
normDat <- ourNotDup.discrep ourNotDup.est <- nlm(likOneCpt.func, p=c(0.5, 0.5), hessian=F, typsize=c(0.5, 0.5), iterlim=1500, gradtol=1e-4) ourNotDup.meanest <- ourNotDup.est$estimate[1] ourNotDup.sdest <- ourNotDup.est$estimate[2] if(printit){ cat("=================================\n") cat("Estimating a truncated Normal density on pairs with tdiff>10 and/or adiff>15 :hardcoded values for plotting likely NON-duplicates and for providing start values for mixture fit\n") print(ourNotDup.est) cat("\n\n") } if(plotit & !paperplot){ ourNotDup.density <- dtruncNorm.func(densvals, m=ourNotDup.meanest, s=ourNotDup.sdest) if(length(ourNotDup.discrep)>0){ hist(ourNotDup.discrep, breaks=breakvals, xlim=range(breakvals), probability=T, main=paste( "Pairs with timeDiff > 10 and/or angleDiff > 15 and/or diff sides:\n estimated mean=", round(ourNotDup.meanest,2), " ; sd=", round(ourNotDup.sdest, 2), sep=""), col="lightblue", xlab="Measured discrepancy between pair") lines(densvals, ourNotDup.density, col=4, lwd=2) } } ## ------------------------------------------------------------------------------------------ ## Now create the mixture function: mix.func <- function(pars){ ## pars are: (muNear, sigNear, muFar, sigFar, dupP) ## The probability dupP is the probability for the first component (near or probable-duplicate ## discrepancies). ## Log-likelihood for a single discrepancy "x" where it's unknown whether it's a duplicate or not, ## is a mixture of two truncated Normals: ## log ( dupP*dtruncNorm(x, m=muNear, s=sigNear)+ ## (1-dupP)*dtruncNorm(x, m=muFar, s=sigFar)) ## However, if we know that the pair is a non-duplicate because they come from different sides, their ## contribution is just dtruncNorm(x, m=muFar, s=sigFar) ## Unwrap the elements of pars: muNear <- pars[1] sigNear <- pars[2] muFar <- pars[3] sigFar <- pars[4] dupP <- pars[5] ## Reject if the algorithm has selected parameters outside limits: cheap and dirty tricks if (muNear<0 | sigNear<0 | muFar<0 | sigFar<0) return(10000*max(abs(pars))) if (dupP<0 | dupP>1) return(20000*max(abs(pars))) ## Log-likelihood for a single discrepancy "x" if it's not known it's a non-duplicate is: ## log ( dupP*dtruncNorm(x, m=muNear, s=sigNear)+ ## (1-dupP)*dtruncNorm(x, m=muFar, s=sigFar) ) ## ## Do this calculation first, then replace all known non-duplicates with their correct values ## log ( dtruncNorm(x, m=muFar, s=sigFar) ): ## First fill in all the entries for unknowns: loglik <- log ( dupP*dtruncNorm.func(discrep, m=muNear, s=sigNear)+ (1-dupP)*dtruncNorm.func(discrep, m=muFar, s=sigFar)) ## Then replace the entries where the discrepancies are on different sides: loglik[which.diffsides] <- log(dtruncNorm.func(discrep[which.diffsides], m=muFar, s=sigFar)) ## Finally add: nllike <- (-1)*sum(loglik) ## Print current estimates: if(printit) cat(format(pars, digits=3), " nllike=", nllike, "\n") ## If things have gone wrong, give a ridiculous answer: if(is.na(nllike)) nllike <- 10000 if(is.infinite(nllike)) nllike <- 20000 return(nllike) } ## Create the starting parameters from the informal estimates gained above, using our own ## judgements about what the duplicates might be: ## for dupP use length(ourDup.discrep)/length(discrep). 
muNear.start <- ourDup.meanest sigNear.start <- ourDup.sdest muFar.start <- ourNotDup.meanest sigFar.start <- ourNotDup.sdest dupP.start <- length(ourDup.discrep)/length(discrep) start.pars <- c(muNear.start, sigNear.start, muFar.start, sigFar.start, dupP.start) mix.res <- nlm(mix.func, p=start.pars, hessian=F, typsize=start.pars, iterlim=1500, gradtol=1e-6) if(printit){ cat("=================================\n") cat("Estimating a mix of truncated Normals on all discrepancies:\n") cat("Starting parameters: ", start.pars, "\n") print(mix.res) } ## Unpack the MLEs from mix.res: muNear.hat <- mix.res$estimate[1] sigNear.hat <- mix.res$estimate[2] muFar.hat <- mix.res$estimate[3] sigFar.hat <- mix.res$estimate[4] dupP.hat <- mix.res$estimate[5] if(plotit){ overall.cpt1 <- dupP.hat*dtruncNorm.func(densvals, m=muNear.hat, s=sigNear.hat) overall.cpt2 <- (1-dupP.hat)*dtruncNorm.func(densvals, m=muFar.hat, s=sigFar.hat) overall.density <- overall.cpt1 + overall.cpt2 start.density <- dupP.start*dtruncNorm.func(densvals, m=muNear.start, s=sigNear.start) + (1-dupP.start)*dtruncNorm.func(densvals, m=muFar.start, s=sigFar.start) ## ------------------------------------------------------------------------------------------ ## Now plot all same-side discrepancies and superimpose the estimated function: ## note the hint of bimodality in the overall plot. ## NOTE: The histogram should only show same-side discrepancies, ## because the mixture model involving Pdup is only for same-side discrepancies. ## Discrepancies for opposite-side pairs are all assigned to the higher mixture component and ## so their shape should not be fitted by the mixture model. hist(discrep[which.sameside], breaks=breakvals, xlim=range(breakvals), probability=T, col="lightblue", xlab="Discrepancy within pair, r", cex.lab=1.2, cex.axis=1, ylim=c(0, max(overall.density)), main="") lines(densvals, overall.density, col="red3", lwd=4) ## lines(densvals, start.density, col=4, lwd=2, lty=2) ## Overlay the component densities: lines(densvals, overall.cpt1, lwd=1, lty=2, col="grey30") lines(densvals, overall.cpt2, lwd=1, lty=2, col="grey30") if(paperplot) title(main="Fitted mixture model for same-side discrepancies", cex.main=1.5) else title(main=paste("All discrepancies: mu1=", round(muNear.hat,2), " ; sig1=", round(sigNear.hat, 2), " ;\n mu2=", round(muFar.hat, 2), " ; sig2=", round(sigFar.hat, 2), " ; P(dup)=", round(dupP.hat, 2), sep="")) ## Plot probability of duplicate if(discrepmax ==100) pdupvals <- seq(1e-5, 1.5*max(ourDup.discrep), length=100) else pdupvals <- seq(1e-5, discrepmax, length=100) cpt1 <- dupP.hat*dtruncNorm.func(pdupvals, m=muNear.hat, s=sigNear.hat) cpt2 <- (1-dupP.hat)*dtruncNorm.func(pdupvals, m=muFar.hat, s=sigFar.hat) plot(pdupvals, cpt1/(cpt1+cpt2), type="l", xlab="Discrepancy, r", ylim=c(-0.05, 1.05), ylab="P( accept as a duplicate | r )", lwd=3, cex.lab=1.2, cex.axis=1, pty="n") abline(h=c(0, 0.5, 1), col="grey60", lty=3) lines(pdupvals, cpt1/(cpt1+cpt2), lwd=3) if(paperplot) title(main="Duplicate acceptance function", cex.main=1.5) else title(main="P(accept as a duplicate) as a function of discrepancy, r") ## Rug plot of the observations the fixed-tolerance analysis would have selected as duplicates: suppressWarnings(rug(ourDup.discrep, side=3, tck=0.05, col="red3")) ## Rug plot of the observations on the SAME SIDE that the fixed-tolerance analysis ## would not have selected as duplicates: suppressWarnings(rug(ourNotDup.sameside.discrep, side=1, tck=0.05, col=4)) if(paperplot){ legend(max(pdupvals), 0.9, 
xjust=1, col=c("red3", 4), legend=c("Satisfies A=15, T=10", "Fails A=15, T=10"), lty=c(1,1), bg="white", cex=1.1) } else legend(max(pdupvals), 0.9, xjust=1, col=c("red3", 4), legend=c("Satisfy A<=15, T<=10", "Fail, but on same side"), lty=c(1,1), bg="white") } ## Calculate P(duplicate | d) for all entries in the data frame that are on the SAME SIDE: cpt1.dat <- dupP.hat*dtruncNorm.func(dat$discrep, m=muNear.hat, s=sigNear.hat) cpt2.dat <- (1-dupP.hat)*dtruncNorm.func(dat$discrep, m=muFar.hat, s=sigFar.hat) Pdup.dat <- cpt1.dat / (cpt1.dat + cpt2.dat) ## Remove any observations on different sides (give them a P(duplicate) of zero): Pdup.dat <- Pdup.dat * dat$sameSide dat$Pdup <- Pdup.dat ## Compile objects for returning: mix.est <- mix.res$estimate names(mix.est) <- c("mu1", "sig1", "mu2", "sig2", "dupP") return(list(outdat=dat, mix.est=mix.est)) } ############################################################## compile.data.func <- function(data, fixedTol=T, fixed.args=(if(fixedTol) c(timeTol=10, angleTol=15) else NULL), discrep.args=(if(fixedTol) NULL else c(discrepmax = 1, tdiffmax = 400, histbar = 0.05, plot.discrep=T, print.discrep=F)), printit=T, paperplot=F){ ## compile.data.func 26/3/15 ## Identifies duplicates and deals with cases where there are clusters of interconnected potential ## duplicates. ## ## If fixedTol=T, this function identifies duplicates as all those who are within timeTol seconds and angleTol ## degrees of each other. For example if timeTol=5, if -5 <= time(B) - time(A) <= 5 then the ## two are compatible. If fixedTol=T, then discrep.args should be supplied as NULL for clarity. ## ## If fixedTol=F, this function performs a discrepancy analysis to gain P(duplicate) for each pair on a ## common flight segment / survey. If fixedTol=F, then fixed.args should be supplied as NULL for clarity. ## ## Regardless of fixedTol, clusters of multiple duplicates are resolved using build.dup.data.func: ## see the blurb for that function. ## ## RETURNS THE FOLLOWING OBJECTS: ## discrep.summary: NA if fixedTol=T, otherwise gives the estimated mixture components from ## the discrepancy analysis. ## ## data.in: the unprocessed data input to this function as argument "data". ## ## orig: this is the data frame of all possible pairs that could be duplicates, before multiple-duplicate ## clusters are resolved. index1 and index2 refer to ROW INDICES of the original data frame "data". ## ## final: as for orig, but having removed all rows that correspond to pairs that are dealt with in the ## multiple-duplicate resolutions. Note that "cluster" name will change between orig and final, because ## we rename clusters every time we search for conflicts; but otherwise final is a subset of orig. ## ## repaired: pun intended, is a list of "re-paired" sightings created during multiple-duplicate conflict ## resolution. Each element of repaired is itself a list, with components "anchor" - the anchor sighting ## in the resolution; "merge" - the sighting or sightings that connect with anchor and are merged to ## resolve the conflict; min1, best1, max1: the group size estimates for the anchor sighting; and ## min2, best2, max2: the merged group size estimates for the merged sightings. ## ## data.out : this is the final data with one row per reconciled sighting or sighting-pair (including ## merged ones if appropriate). 
To link data.out back to the input data, use the final two columns ## orig.row and orig.row.1, NOT the columns marked object and object.1, because objects in the ## input data frame don't correspond exactly to rows in the input data frame (and, to spawn confusion, they ## are very similar but not the same). ## ## destination: this is a record-keeping data frame that shows what happened to all the rows in the ## original data frame. Possible statuses: L = lone sighting in final df; P or S = Primary or Secondary ## sighting in a duplicate pair that did not involve any conflict resolution; A or M = Anchor or Merged ## sighting in a duplicate pair that did involve conflict resolution. ## ## To link between input data, data.out, and destination: ## e.g. say data.out has a row with orig.row=255, orig.row.1=254+256 ## then look at input data[c(255, 254, 256),] to see the three sightings concerned; ## look at destination[c(255, 254, 256),] and you'll see sighting 255 has status A, 254 and 256 both ## have status M, and all three sightings have destination row 254; ## so check the destination row 254 in data.out: ## data.out[254,] - sure enough, this reflects the anchor sighting unchanged in the first set of columns, ## and the merge of the other two sightings in the second set of columns. ## -------------------------------------------------------------------------------------- ## Housekeeping: data$ObserverCode <- as.character(data$ObserverCode) data$ObserverPosition <- as.character(data$ObserverPosition) data$am.pm <- as.character(data$am.pm) data$transect.label <- as.character(data$transect.label) ## Ensure the arguments supplied make sense with whether fixedTol = T or fixedTol = F: if(fixedTol){ ## Fixed tolerance analysis: if(!is.null(discrep.args)) stop("discrep.args should be NULL if fixedTol=T") timeTol <- fixed.args["timeTol"] angleTol <- fixed.args["angleTol"] } else{ ## Discrepancy / Pdup analysis: if(!is.null(fixed.args)) stop("fixed.args should be NULL if fixedTol=F") } ## -------------------------------------------------------------------------------------- ## Create secs.from.12am column for data to enable easy time comparisons: ## Hours: secs.from.12am <- data$hour*3600 ## Add 12 hours for pm times: secs.from.12am[data$am.pm=="p.m."] <- secs.from.12am[data$am.pm=="p.m."]+12*3600 ## Minutes: secs.from.12am <- secs.from.12am + data$minute*60 ## Seconds: secs.from.12am <- secs.from.12am + data$second ## Form into a column of "data": data$secs.from.12am <- secs.from.12am ##------------------------------------------------------------------------------------------- ## Create the data frame pairs.df. This consists of all pairs of sightings on the same ## survey with the same transect.label, but with different observers. The pairs can be ## either same side or different side of plane. ## Create a survey.transect.id, e.g. survey 4 transect.label=5B will have id "4*5B". ## Two sightings are only considered for a pair if they have the same survey.transect.id. survey.transect.id <- paste(data$Survey, data$transect.label, sep="*") ## Now run through the data identifying allowed pairs: ## index1 and index2 reference the rows of "data" corresponding to the pair members. nobs <- nrow(data) ## common.st.id is a list where each list element contains the indices (rows of "data") corresponding ## to each survey.transect.id. For example, element "8*6A" = c(120 121 122 123 124 125 126 127) ## because all these indices are on the same survey 8 and transect label 6A. 
common.st.id <- split(1:nobs, survey.transect.id)
## Now form all possible pairs from each list element using the function combn(vec, 2).
## Unfortunately if vec has only a single element, combn will interpret it as 1:vec, which we don't want,
## so use a loop instead of using lapply.
pairs.df <- NULL
for(st in 1:length(common.st.id)) if(length(common.st.id[[st]])>1){
    pairs.df <- rbind(pairs.df, t(combn(common.st.id[[st]], 2)))
}
pairs.df <- data.frame(pairs.df)
## Sort by index1:
names(pairs.df) <- c("index1", "index2")
pairs.df <- pairs.df[order(pairs.df$index1),]
## -----------------------------------------------------------------------------------------------------------------------
## For each pair in pairs.df, find timeDiff, angleDiff, sameSide, and sameObs:
## sameSide is 1 if the pair was observed from the same side of the plane (pairings M1-M3 or M2-M4)
## and 0 if opposite sides.
## sameObs is 1 if the pair was observed by the same observer (so it can't be a duplicate).
pairs.df$angleDiff <- abs(data$angle[pairs.df$index1] - data$angle[pairs.df$index2])
pairs.df$timeDiff <- abs(data$secs.from.12am[pairs.df$index1] - data$secs.from.12am[pairs.df$index2])
## Create vector "side" to say which side each sighting in "data" is on, for ease.
## Let the possible sides be 13 (for M1 or M3) and 24 (for M2 or M4) so they are stored as numbers
## but clearly specify the link to M1/M2/M3/M4.
side <- rep(13, nobs)
side[data$ObserverPosition %in% c("M2", "M4")] <- 24
pairs.df$sameSide <- as.numeric(side[pairs.df$index1]==side[pairs.df$index2])
pairs.df$sameObs <- as.numeric(data$ObserverCode[pairs.df$index1]== data$ObserverCode[pairs.df$index2])
## Remove from pairs.df any sightings made by the same observer:
## NOTE: we don't necessarily have to do this, but it does give better results in discrepancy.res,
## and results from the same observer are not quite in the population of maybe-duplicates-at-large
## because there is bound to be a bit of a gap between them.
pairs.df <- pairs.df[pairs.df$sameObs==0,]
rownames(pairs.df) <- 1:nrow(pairs.df)
## -----------------------------------------------------------------------------------------------------------------------
## Now proceed differently for finding Pdup according to whether it's a fixed tolerance analysis
## or a discrepancy analysis. If it's fixed tolerance, Pdup is 1 for all pairs within the tolerances, and 0
## for all pairs outside. If it's a discrepancy analysis, Pdup is estimated by discrepancy.func.
if(fixedTol){
    ## ----------------------------------
    ## Fixed tolerance analysis:
    ## ----------------------------------
    ## Identify which pairs are duplicates according to the timeTol and angleTol specified:
    pairs.df$dup <- as.numeric(pairs.df$timeDiff<= timeTol & pairs.df$angleDiff <= angleTol &
                               pairs.df$sameSide==1)
    ## Create a new data frame, duplicates.df, with only the duplicate pairs in it:
    duplicates.df <- pairs.df[pairs.df$dup==1,]
    ## Create a dummy column Pdup that contains the probability of being a duplicate:
    ## this is for consistency with the discrepancy.func output and for use in build.dup.data.func.
    ## The Pdup column here will be entirely 1's because all rows are definitely selected as duplicates.
duplicates.df$Pdup <- rep(1, nrow(duplicates.df)) discrep.summary <- NA } else{ ## ---------------------------------- ## Discrepancy analysis: ## ---------------------------------- discrep.res <- discrepancy.func(dat=pairs.df, discrepmax=discrep.args["discrepmax"], tdiffmax=discrep.args["tdiffmax"], histbar=discrep.args["histbar"], printit=discrep.args["print.discrep"], plotit=discrep.args["plot.discrep"], paperplot=paperplot) discrep.summary <- discrep.res$mix.est duplicates.df <- discrep.res$outdat ## Use the Pdup column to determine which pairs are selected as duplicates: duplicates.df$dup <- rbinom(nrow(duplicates.df), 1, duplicates.df$Pdup) ## Only include the rows that are selected as duplicates: duplicates.df <- duplicates.df[duplicates.df$dup==1,] } ## ---------------------------------------------------------------------------------------- ## Build the new data with duplicates identified and inserted into the same row as each other. ## Farm out to function build.dup.data.func to create duplicate data, including random ## selections for multiple duplicates, from the pairs in duplicates.df: dupdat.res <- build.dup.data.func(data=data, duplicates.df=duplicates.df, printit=printit) data.out <- dupdat.res$data.out ## ---------------------------------------------------------------------------------------- ## Impute group size by the three protocols and add results to three columns of data.out. ## 1. impute.all imputes all group sizes; ## 2. impute.missing imputes only those group sizes that are missing; ## 3. impute no group sizes; insert 0 if all estimates are missing. ## Calls function "impute" in Impute.R by Sophie Kincaid: impute.res <- impute(fixed=data.out, iterations=1) ## impute.all and impute.missing are data frames; impute.none is a single vector because (for a single data ## frame) there is no randomness in it. data.out$impute.all <- impute.res$impute.all[,1] data.out$impute.missing <- impute.res$impute.missing[,1] data.out$impute.none <- impute.res$impute.none ## ---------------------------------------------------------------------------------------- ## Collate and return all objects: return(list(data.in=data, orig=dupdat.res$dup.orig, final=dupdat.res$dup.final, repaired=dupdat.res$repaired.list, destination=dupdat.res$destination, data.out=data.out, discrep.summary=discrep.summary)) } ########################################################### build.dup.data.func <- function(data, duplicates.df, printit){ ## build.dup.data.func 29/3/15 ## Builds a data frame with duplicates included, including resolution of clusters of multiple duplicates. ## This resolution involves randomness, so different results will be obtained each time. ## Suitable for calling from compile.data.func. ## ## Resolves clusters of multiple duplicates as follows: ## 1. Given a cluster of interconnected duplicate possibilities, pick a single anchor observation ## at random from all those in the cluster, with probability proportional to the number of times ## the index appears in the cluster. ## For example, if the cluster is index100=index101 and index101=index102, then we assemble ## all indices in the cluster: (100, 101, 101, 102), and then pick one of these at random with probabilities ## given by the corresponding Pdup column. 
If this is a fixed-tolerance analysis, all Pdups will be 1 so ## all indices will be picked with equal probability, but multiple occurrences get multiple chances of ## being picked; so in the example above we'd pick index 101 to be the anchor with probability 1/2, ## and indices 100 and 102 to be the anchor with probability 1/4 each. However if there is a Pdup ## record from a discrepancy analysis associated with each pair, then this is taken into account: for example ## if Pdup(index100=index101)=0.2 and Pdup(index101=index102)=0.9, then we have indices ## (100, 101, 101, 102) and Pdup=(0.2, 0.9, 0.2, 0.9) so P(pick 100) = 0.2/(0.2+0.9+0.2+0.9), ## P(pick 101) = (0.9+0.2)/(0.2+0.9+0.2+0.9), etc. ## ## 2. Leaves all records for the anchor alone (min, best, and max) and calls them min1, best1, max1 ## in the "re-paired" data list. Merges everything connecting with anchor as the second sighting: ## this second sighting could be a merge of 1, 2, or more rows of the original data frame. The ## min, best, and max of the merged sighting is the sum of the constituent ones, with NAs ignored ## (count as 0). ## ## 3. When sightings are merged to resolve multiple duplicates, take the distance of the merged sighting ## as the weighted average of the original distances, weighted by group size. The group size used is ## best, if it exists; if not, then the average of min and max or whichever of these exist; and if neither of ## these exist we just use group size 1 for the weighting (under the assumption that the group was ## probably a small one if no estimate of group size was offered at all). ## ## RETURNS THE FOLLOWING OBJECTS: ## dup.orig: this is the data frame of all possible pairs that could be duplicates, before multiple-duplicate ## clusters are resolved. index1 and index2 refer to ROW INDICES of the original data frame "data". ## ## dup.final: as for orig, but having removed all rows that correspond to pairs that are dealt with in the ## multiple-duplicate resolutions. Note that "cluster" name will change between orig and final, because ## we rename clusters every time we search for conflicts; but otherwise final is a subset of orig. ## ## repaired.list: pun intended, is a list of "re-paired" sightings created during multiple-duplicate conflict ## resolution. Each element of repaired is itself a list, with components "anchor" - the anchor sighting ## in the resolution; "merge" - the sighting or sightings that connect with anchor and are merged to ## resolve the conflict; min1, best1, max1: the group size estimates for the anchor sighting; and ## min2, best2, max2: the merged group size estimates for the merged sightings. ## ## data.out : this is the final data with one row per reconciled sighting or sighting-pair (including ## merged ones if appropriate). To link data.out back to the input data, use the final two columns ## orig.row and orig.row.1, NOT the columns marked object and object.1, because objects in the ## input data frame don't correspond exactly to rows in the input data frame (and, to spawn confusion, they ## are very similar but not the same). ## ## destination: this is a record-keeping data frame that shows what happened to all the rows in the ## original data frame. Possible statuses: L = lone sighting in final df; P or S = Primary or Secondary ## sighting in a duplicate pair that did not involve any conflict resolution; A or M = Anchor or Merged ## sighting in a duplicate pair that did involve conflict resolution. 
## ## To link between input data, data.out, and destination: ## e.g. say data.out has a row with orig.row=255, orig.row.1=254+256 ## then look at input data[c(255, 254, 256),] to see the three sightings concerned; ## look at destination[c(255, 254, 256),] and you'll see sighting 255 has status A, 254 and 256 both ## have status M, and all three sightings have destination row 254; ## so check the destination row 254 in data.out: ## data.out[254,] - sure enough, this reflects the anchor sighting unchanged in the first set of columns, ## and the merge of the other two sightings in the second set of columns. nobs <- nrow(data) ## ---------------------------------------------------------------------------------------- ## Find clusters of duplicates that need to be reconciled: dup.orig <- dup.cluster.func(duplicates.df) dup.orig$dup <- rep(1, nrow(dup.orig)) ## Add to dup.orig the min/best/max estimates from each observer: dup.orig$min1 <- data$min[dup.orig$index1] dup.orig$best1 <- data$best[dup.orig$index1] dup.orig$max1 <- data$max[dup.orig$index1] ## dup.orig$min2 <- data$min[dup.orig$index2] dup.orig$best2 <- data$best[dup.orig$index2] dup.orig$max2 <- data$max[dup.orig$index2] ## ---------------------------------------------------------------------------------------- ## Resolve multiple duplicates. ## dup.final is the same as dup.orig but having removed all rows involving ## indices that are selected for pairing (including merges). No new rows are added to ## dup.final; we only remove rows. ## The selected and merged results are stored in a separate data frame called repaired.list ## (deliberate pun) dup.final <- dup.orig repaired.list <- list() clus.size <- table(dup.final$cluster) while(any(clus.size>1)){ ## If any(clus.size>1), clusters with more than one pair need to be resolved. ## First find a cluster with size>1. clusname <- min(as.numeric(names(clus.size)[clus.size>1])) resolve.df <- dup.final[dup.final$cluster==clusname,] ## Collate all indices involved in the cluster, including multiples: ## for example, if the cluster is 100=101 and 101=102, then all.inds = c(100, 101, 101, 102). all.inds <- c(resolve.df$index1, resolve.df$index2) ## Probability weightings for selecting the target are gained from the Pdup column. ## This might be 1 everywhere (for fixed tolerance analyses), in which case targets are ## selected with probability proportional to how often they appear. In the case of the discrepancy ## analysis, the probabilities will be <1 and the weightings will favour selection of targets with ## the highest probability of being a duplicate. ind.probweights <- c(resolve.df$Pdup, resolve.df$Pdup) ## Pick target or anchor observation, at random from those in all.inds, with weightings ## ind.probweights. ## In the example above, if all inds have Pdup=1, then 101 will be selected with probability 1/2, ## and 100 and 102 with probability 1/4 each. If Pdup=c(0.2, 0.4, 0.2, 0.4) (note it is cycled ## like this) then 101 will be selected with probability (0.4+0.2)/(0.2+0.4+0.2+0.4), 100 will be ## selected with probability 0.2/(0.2+0.4+0.2+0.4), etc. anchor <- sample(all.inds, 1, prob=ind.probweights) ## Merge everything connecting with anchor and add the results to repaired.list. 
merge <- NULL
for(rw in 1:nrow(resolve.df)){
    if(resolve.df$index1[rw]==anchor) merge <- c(merge, resolve.df$index2[rw]) else if(resolve.df$index2[rw]==anchor) merge <- c(merge, resolve.df$index1[rw])
}
## Remove all rows from dup.final that include any of c(anchor,merge):
dup.final <- dup.final[!(dup.final$index1 %in% c(anchor, merge)),]
dup.final <- dup.final[!(dup.final$index2 %in% c(anchor, merge)),]
## Put the results into repaired.list:
resolve.list <- list(anchor=anchor, merge=merge)
## Anchor retains the same min, best, max as it always had:
resolve.list$min1 <- data$min[anchor]
resolve.list$best1 <- data$best[anchor]
resolve.list$max1 <- data$max[anchor]
## Merged groups add the mins, bests, and maxes. Use sum with na.rm=T so that NAs are ignored.
if(all(is.na(data$min[merge]))) resolve.list$min2 <- NA else resolve.list$min2 <- sum(data$min[merge], na.rm=T)
if(all(is.na(data$best[merge]))) resolve.list$best2 <- NA else resolve.list$best2 <- sum(data$best[merge], na.rm=T)
if(all(is.na(data$max[merge]))) resolve.list$max2 <- NA else resolve.list$max2 <- sum(data$max[merge], na.rm=T)
## For some combinations of NAs, it could happen that we end up with best<min or best>max.
## In these cases, replace the offending entries to be NA. Note that the order
## in which this is done does make a difference, and we are attempting to be vague when
## we have conflicting information. It is just a quick fix for a situation that won't arise often.
if(!is.na(resolve.list$min2) & !is.na(resolve.list$best2)){
    if(resolve.list$min2 > resolve.list$best2) resolve.list$best2 <- NA
}
if(!is.na(resolve.list$best2) & !is.na(resolve.list$max2)){
    if(resolve.list$best2 > resolve.list$max2) resolve.list$best2 <- NA
}
if(!is.na(resolve.list$min2) & !is.na(resolve.list$max2)){
    if(resolve.list$min2 > resolve.list$max2) resolve.list$min2 <- NA
}
resolve.name <- paste("anchor", anchor, sep="")
repaired.list[[resolve.name]] <- resolve.list
## Print outcome from this loop:
if(printit){
    cat("\n--------------------------------------------------------------------------\n")
    cat("Sizes of duplicate clusters:\n")
    print(sort(clus.size))
    cat("\nResolving cluster", clusname, ":\n")
    print(resolve.df)
    cat("Anchor observation = ", anchor, "\n")
    cat("Merging observations:", merge, "\n")
    cat("Resolved elements:\n")
    print(unlist(resolve.list))
    cat("\n")
    ## readline()
}
## Finally recreate clusters and clus.size to cycle round the loop if necessary.
dup.final <- dup.cluster.func(dup.final)
clus.size <- table(dup.final$cluster)
}
## ---------------------------------------------------------------------------------------------------------------------------------------
## Create the new data frame, data.out, with one row per observation, in which duplicates are
## inserted into the same row as each other.
## First create a data frame "destination" that describes what is happening to each observation in the initial
## data frame. Each observation can have status "L", "P", "S", "A", or "M":
## "L" for "lone observation", i.e. if status[i] = "L" then row i of initial data "data" is transmitted
## to a row of data.out by itself and there is no duplicate sighting.
## "P" for "primary observation", i.e. if status[i]="P" then row i of "data" is the primary observation
## in a duplicate pair in data.out. Being the primary observation just means that it is in the first set
## of columns - names "min", "best", "max", etc - it doesn't have any implication about it being a better
## quality observation than the secondary one or about who made the observation.
## "S" for "secondary observation", i.e. the second set of columns in a duplicate pair. ## "A" for "anchor observation", i.e. it has been selected (at random) to be the anchor observation for ## resolving a duplicate cluster. In practice this will make it indistinguishable from "P" for primary obsn. ## "M" for "merged observation": i.e. it is connected to an anchor observation and will appear as a ## constituent of the secondary columns for this row. Note that it might be merged only with itself, ## in which case it is indistinguishable from "S" for secondary obs; but it might also be merged with ## other sightings to make a larger group. ## Find sightings that will be anchors or merged: anchor.vec <- unlist(lapply(repaired.list, function(x) x$anchor)) merge.vec <- unlist(lapply(repaired.list, function(x) x$merge)) ## Check that there is no overlap among anchor.vec, merge.vec, dup.final$index1, and dup.final$index2: all.dups <- sort(c(anchor.vec, merge.vec, dup.final$index1, dup.final$index2)) if(any(duplicated(all.dups))) { cat("Something's wrong! Some rows of data have been selected for more than one treatment.\n") print(all.dups[duplicated(all.dups)]) stop("Some rows of data appear more than once in all.dups.") } destination <- data.frame(orig.row=1:nobs) ## Find status of all rows of the initial data frame. ## Start off all observations with status "L": destination$status <- rep("L", nobs) ## All observations in dup.final$index1 will have status "P": destination$status[dup.final$index1] <- "P" ## All observations in dup.final$index2 will have status "S": destination$status[dup.final$index2] <- "S" ## All observations in "anchor" positions of repaired.list will have status "A": destination$status[anchor.vec] <- "A" ## All observations in "merged" positions of repaired.list will have status "M": destination$status[merge.vec] <- "M" ## Select columns for going forward to impute and MRDS functions: outnames <- c("Survey", "transect.id", "transect.label", "object", "distance", "ObserverCode", "ObserverPosition", "min", "best", "max") outnames.categorical <- c("Survey", "transect.id", "transect.label", "ObserverCode", "ObserverPosition") outnames.1 <- paste(outnames, ".1", sep="") ## Put all lone observations into data.out first, before adding extra columns: data.out <- data[destination$status=="L", outnames] ## Record the destination rows of all these sightings in destination$dest.row: destination$dest.row <- rep(NA, nobs) destination$dest.row[destination$status=="L"] <- 1:nrow(data.out) ## Now add extra columns to data.out to accommodate duplicate sightings and their details: data.out[, outnames.1] <- NA ## Put orig.row and orig.row.1 prominently in the final columns: data.out$orig.row <- which(destination$status=="L") data.out[, "orig.row.1"] <- NA row.counter <- nrow(data.out) ncl <- ncol(data.out) ## Run through primary and secondary sightings in dup.final, adding them to data.out: for(i in 1:nrow(dup.final)){ row.counter <- row.counter + 1 ind1 <- dup.final$index1[i] ind2 <- dup.final$index2[i] data.out <- rbind(data.out, rep(NA, ncl)) data.out[row.counter, outnames] <- data[ind1, outnames] data.out[row.counter, outnames.1] <- data[ind2, outnames] ## Specify original row of each sighting: data.out[row.counter, c("orig.row", "orig.row.1")] <- c(ind1, ind2) ## Specify destination row in data frame destination: destination$dest.row[c(ind1,ind2)] <- row.counter } ## Convert the object columns to character, because they will need to be for merged sightings: data.out$object <- 
as.character(data.out$object) data.out$object.1 <- as.character(data.out$object.1) data.out$orig.row <- as.character(data.out$orig.row) data.out$orig.row.1 <- as.character(data.out$orig.row.1) ## Run through anchor and merged sightings in repaired.list, adding them to data.out: if(length(anchor.vec)>0) for(i in 1:length(anchor.vec)){ row.counter <- row.counter + 1 ## Put in the anchor observation: ind1 <- repaired.list[[i]]$anchor data.out <- rbind(data.out, rep(NA, ncl)) data.out[row.counter, outnames] <- data[ind1, outnames] data.out[row.counter, "orig.row"] <- ind1 ## Deal with the merged sightings: inds.merged <- repaired.list[[i]]$merge ## It should be the case that Survey, transect.id, transect.label, ObserverCode, and ObserverPosition ## are identical for each of the merged sightings. Check this here and dump if not. for(colm in outnames.categorical){ ## For example, colm="Survey", or colm="transect.id": merged.colm <- data[inds.merged, colm] if(any(merged.colm!=merged.colm[1])){ print(data[inds.merged,]) stop(paste("Trying to merge data records with different values of ", colm, "field??")) } else data.out[row.counter, paste(colm, ".1", sep="")] <- merged.colm[1] } ## Insert the merged min.1, best.1, max.1: data.out[row.counter, c("min.1", "best.1", "max.1")] <- c(repaired.list[[i]]$min2, repaired.list[[i]]$best2, repaired.list[[i]]$max2) ## Calculate the distance of the merged sighting as a group-size-weighted average ## of the component distances. (Note: we are averaging distances, not angles; because we ## aim to measure the distance to the group centroid.) ## For each sighting in inds.merged, use: ## - best if best exists; ## - mean(min, max, na.rm=T) if best is NA; ## - if this still gives us NA (i.e. none of min, best, and max exist for this sighting), set it to 1. groupsize.merged <- data$best[inds.merged] for(j in 1:length(inds.merged)) if(is.na(groupsize.merged[j])){ groupsize.merged[j] <- mean(c(data$min[inds.merged[j]], data$max[inds.merged[j]]), na.rm=T) if(is.na(groupsize.merged[j])) groupsize.merged[j] <- 1 } ## Enter data.out$distance.1 as the average of data$distance[inds.merged] ## weighted by groupsize.merged: distance.average <- sum( data$distance[inds.merged]*groupsize.merged)/sum(groupsize.merged) data.out[row.counter, "distance.1"] <- distance.average ## Print to check if wanted: if(F){ print(data.frame(ind=inds.merged, best=data$best[inds.merged], min=data$min[inds.merged], max=data$max[inds.merged], final=groupsize.merged, dist=data$distance[inds.merged],final.dist=rep(distance.average,length(inds.merged)))) readline() } ## For the merged object.1 column, paste the original objects separated by "+": data.out[row.counter, "object.1"] <- paste(data$object[inds.merged], collapse="+") ## Likewise for merged orig.row.1: data.out[row.counter, "orig.row.1"] <- paste(inds.merged, collapse="+") ## Specify destination row in data frame destination: destination$dest.row[c(ind1,inds.merged)] <- row.counter } row.names(data.out) <- 1:nrow(data.out) return(list(dup.orig=dup.orig, dup.final=dup.final, repaired.list=repaired.list, data.out=data.out, destination=destination)) } ########################################################### dup.cluster.func <- function(dupres){ ## dup.cluster.func 27/3/15 ## Finds clusters of multiple duplicates from data frame dupres. ## dupres has the format of a "pairs" data frame, generated from discrepancy.func. ## The choice of duplicates has already been made from the probabilities. 
## dupres needs to have columns index1, index2, and dup, where index1 and index2 refer to the ## rows of the master data frame, and dup is 1 when a pair has been selected as a duplicate and 0 ## otherwise. ## If dup is 1 everywhere, it doesn't matter (i.e. if only pre-selected duplicates are fed to this function, ## that's fine). res <- dupres[dupres$dup==1,] res$cluster <- NULL ## In case res has already been clustered before, remove its previous clusters. ## Cluster will be 0 if dup=0, otherwise it will give the number of a cluster. nr <- nrow(res) cluster <- rep(0, nr) ## Run through rows of res allotting cluster numbers: cluster.counter <- 1 for(i in 1:nr){ ## Check whether res$index1[i] is already in a cluster: clus.ind1 <- c(cluster[res$index1==res$index1[i]], cluster[res$index2==res$index1[i]]) ## Check whether res$index2[i] is already in a cluster: clus.ind2 <- c(cluster[res$index1==res$index2[i]], cluster[res$index2==res$index2[i]]) clus.both <- c(clus.ind1, clus.ind2) ## If any of clus.both are non-zero, we have pre-existing clusters to sort out. ## If not, allot this row the current cluster number. if(all(clus.both==0)){ ## No pre-existing clusters to sort out: cluster[i] <- cluster.counter cluster.counter <- cluster.counter + 1 } else{ ## There are pre-existing clusters to sort out. clus.prev <- clus.both[clus.both!=0] ## If there is only one pre-existing cluster, then add row i to this cluster: if(length(clus.prev)==1) cluster[i] <- clus.prev else{ ## If length(clus.prev)>1, there are multiple clusters that all need to be merged. ## Delete these previous clusters and rename them all to the current cluster.counter. merge.rows <- which(cluster %in% clus.prev) ## Add the current row under consideration to this cluster: merge.rows <- c(merge.rows, i) cluster[merge.rows] <- cluster.counter cluster.counter <- cluster.counter + 1 } ## End of more than one pre-existing cluster to merge } ## End of pre-existing clusters to sort out } ## End of examining row i res$cluster <- cluster return(res) } ############################################################# mrds.func <- function(dat.in, groupsize.col="impute.all", w=400, front.ymin=60, start.pars=c(al1=3, al2=3, al3=3, be1=-0.03, be2=-0.03, be3=-0.03, gam1=-0.0001, gam2=-0.0001, gam3=-0.0001, delt1=0.02, delt2=0.02, delt3=0.02, eps1=0.00001, eps2=0.00001, eps3=0.00001), allparnames=c("al1","al2","al3", "be1", "be2", "be3", "gam1", "gam2", "gam3", "delt1", "delt2", "delt3", "eps1", "eps2", "eps3"), ## All possible parameters for inclusion model=allparnames, ## Specifies which of the parameters to include in the model printit=T){ ## mrds.func 29/3/2016 ## Fits the MRDS detection model and gains estimates of all detection parameters and of Nc ## given duplicate-reconciled data supplied in dat.in. ## ## The full model suite available includes any combination of terms in the next 3 lines: ## logit(P(rear | dist, size)) = al1 + be1*dist + gam1*dist^2 + delt1*size + eps1*size*dist ## logit(P(front | not-rear; dist, size)) = al2 + be2*dist + gam2*dist^2 + delt2*size + eps2*size*dist ## logit(P(front | rear; dist, size)) = al3 + be3*dist + gam3*dist^2 + delt3*size + eps3*size*dist ## Any of these parameters not included in the "model" argument will be ignored, so we ## can try a range of models. ## NOTE: the start.pars vector must have names specified, e.g. al1=3 etc, ## so that the correct values are drawn for the desired parameters. 
##
## This function conducts estimation of population size in R, given dat.in which contains a set of
## resolved duplicates and various columns "impute.missing", "impute.all", and "impute.none"
## for the different ways of imputing group-size given the assembly of "min", "best", and "max" records
## from the one or two observers.
##
## dat.in is gained from compile.data.func()$data.out.
##
## w is the right-truncation distance: set at 400 metres, which cuts out about 3% of the sightings in our data.
##
## front.ymin is the minimum distance at which a front observer can see. The front observers are
## M1 and M2.
##
## Starting parameters are based on estimates from a trial run of the optimization, and have been
## successful based on results from grid.mrds.func.
##
## The returned output contains values Nchat (estimated number of dolphins in the covered area);
## MLE parameter estimates al1, ..., eps3;
## minimum (the minimum negative log likelihood attained, which is the value at the MLEs);
## code.123good : optimization convergence code. Code 1 or 2 in this column should be OK.
## Code 3 is probably OK. Code 4 or 5 means the optimization probably didn't converge and you can't trust
## the answers.
## Create the capture-history data from dat.in. The output data, dat, has a capture history,
## consensus distance y, and groupsize s, for each observation.
dat <- create.caphists.func(dat.in=dat.in, groupsize.col=groupsize.col, w=w, front.ymin=front.ymin)
n <- nrow(dat)
## Create a table of look-up values for s. For the likelihood, we need to integrate pdot(y, s) with respect to
## y for every value of s that occurs in the data. So create a vector of the unique values of s, and create
## another vector that specifies how many of each value there are.
## This means we don't have to perform the integration more times than we need:
## e.g. if lots of s-values are s=1, we only perform this integration once.
## The values are in svals, and the number of times this value occurs is in sfreq.
stab <- table(dat$s)
svals <- as.numeric(names(stab))
sfreq <- as.numeric(stab)
nsvals <- length(svals)
## Create objects for easy reference:
yvec <- dat$y
svec <- dat$s
omvec <- dat$omega
## ------------------------------------------------------------------------------------------------------------------
## Now for the likelihood function, nllike.func, for the negative log-likelihood:
nllike.func <- function(pars, returnNchat=F){
    ## Use returnNchat=T if you want to calculate the estimated Nc (number of dolphins in the
    ## covered area) for a given value of pars, e.g. when you've obtained the final MLEs.
    ## Unpack the parameter vector into alpha, beta, gamma, delta, and epsilon parameters:
    parvals <- rep(0, length(allparnames))
    names(parvals) <- allparnames
    ## Fill in elements of parvals with the corresponding parameters as specified in the "model" vector:
    parvals[model] <- pars
    ## For example, if model=c("al1", "be2", "gam3") then parvals would be 0 everywhere
    ## except for the three elements "al1", "be2", "gam3" that would be populated by the values in pars.
    ## Any unused parameters are given the value 0, which means they won't influence the corresponding
    ## expressions.
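    ## As a quick worked illustration (not from the paper): with the default starting values al1=3 and
    ## be1=-0.03, and all other parameters excluded from the model (hence 0), the rear-team detection
    ## probability at y = 100 metres is invlogit.func(3 - 0.03*100) = invlogit.func(0) = 0.5,
    ## regardless of group size.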
    al1 <- parvals["al1"]; al2 <- parvals["al2"]; al3 <- parvals["al3"]
    be1 <- parvals["be1"]; be2 <- parvals["be2"]; be3 <- parvals["be3"]
    gam1 <- parvals["gam1"]; gam2 <- parvals["gam2"]; gam3 <- parvals["gam3"]
    delt1 <- parvals["delt1"]; delt2 <- parvals["delt2"]; delt3 <- parvals["delt3"]
    eps1 <- parvals["eps1"]; eps2 <- parvals["eps2"]; eps3 <- parvals["eps3"]
    ## For the likelihood, we need p_R(y, s) for every observation:
    pRvec <- 1/( 1 + exp(-al1 - be1*yvec - gam1*yvec^2 - delt1*svec - eps1*svec*yvec) )
    ## Bit of redundant calculation coming next, but it's probably easiest to calculate p_{F | R}
    ## and p_{F | notR} for every observation too, instead of having to manipulate the indices for strictly
    ## the observations that need each quantity.
    pFnotRvec <- 1/( 1 + exp(-al2 - be2*yvec - gam2*yvec^2 - delt2*svec - eps2*svec*yvec) )
    pFRvec <- 1/( 1 + exp(-al3 - be3*yvec - gam3*yvec^2 - delt3*svec - eps3*svec*yvec) )
    ## Ensure that pF is 0 for any y<60 (or front.ymin). If any front-team detections remained at
    ## y < front.ymin, the likelihood would include log(0) and converge falsely; however these lines
    ## should have no effect in practice because we remove any observations with y < front.ymin
    ## for the front team when the capture histories are created.
    pFnotRvec[yvec < front.ymin] <- 0
    pFRvec[yvec < front.ymin] <- 0
    ## pdot.func is for finding integral_(u=0)^w pdot(u, sval) du for a particular value of sval:
    pdot.func <- function(u, sval){
        pR <- 1/( 1 + exp(-al1 - be1*u - gam1*u^2 - delt1*sval - eps1*sval*u) )
        pFnotR <- rep(0, length(u))
        pFnotR[u>front.ymin] <- 1/( 1 + exp(-al2 - be2*u[u>front.ymin] - gam2*(u[u>front.ymin])^2 - delt2*sval - eps2*sval*u[u>front.ymin]) )
        pdot <- pR + (1-pR)*pFnotR
        ## To see pdot values plotted for all u values, uncomment the points() line below, and
        ## use something like this at the command line first: plot(-1, xlim=c(0, 500), ylim=c(0, 1))
        ## points(u, pdot, pch=".")
        pdot
    }
    int.pdot.s <- rep(0, nsvals)
    for(i in 1:nsvals){
        int.pdot.s[i] <- integrate(pdot.func, lower=0, upper=w, sval=svals[i])$value
    }
    ## Check svals and int.pdot.s by uncommenting this below
    ## return(data.frame(svals, int.pdot.s))
    ## and returning just below the end of this function something like:
    ## return(nllike.func(c(0, 0, -0.001, -0.002, 0.003, 0.004, 0.2, -0.05)))
    ## If returnNchat=T, take a diversion here. Calculate Nchat = w * sum(sfreq * svals / int.pdot.s).
    if(returnNchat==T){
        Nchat <- w * sum(sfreq * svals / int.pdot.s)
        return(Nchat)
    }
    ## Now compile the first term of the log-likelihood:
    ## sum_svals sfreq * log(int.pdot).
    sum.log.int.pdot <- sum( sfreq * log(int.pdot.s))
    ## Find the omega terms:
    logPomega <- rep(0, n)
    ## omega = 1: history (1, 0), Front team only. P(omega=1,0) = (1-pR)*pFnotR
    logPomega[omvec==1] <- log(1 - pRvec[omvec==1]) + log(pFnotRvec[omvec==1])
    ## omega = 2: history (0, 1), Rear team only. P(omega=0,1) = pR*(1-pFR)
    logPomega[omvec==2] <- log(pRvec[omvec==2]) + log(1 - pFRvec[omvec==2])
    ## omega = 3: history (1, 1), Both teams.
P(omega=1,1) = pR*pFR logPomega[omvec==3] <- log(pRvec[omvec==3]) + log(pFRvec[omvec==3]) sum.logPomega <- sum(logPomega) ## Finally return the neg-log likelihood: nllike <- sum.log.int.pdot - sum.logPomega ## If things have gone wrong, give nllike a large positive value: ## print(c(sum.log.int.pdot, sum.logPomega, nllike)) if(is.na(nllike)) nllike <- 100000*(5+max(abs(pars))) if(is.infinite(nllike)) nllike <- 200000*(10+max(abs(pars))) if(printit) cat(format(pars, digits=3), " nllike=", nllike, "\n") return(nllike) } ## --------------------------------------------------------------------------------------------- ## Uncomment lines like this to get a single evaluation of the likelihood: ## return(nllike.func(c(0, 0, -0.001, -0.002, 0.003, 0.004, 0.2, -0.05))) ## return(nllike.func(start.pars)) ## Find the MLE: start.pars <- start.pars[model] mle.res <- nlm(nllike.func, p=start.pars, hessian=F, typsize=start.pars, iterlim=1500, gradtol=1e-4) if(printit) print(mle.res) mle.params <- mle.res$estimate mle.code.123good <- mle.res$code mle.min <- mle.res$minimum ## Use the MLEs to find Nchat: this has been coded into nllike.func with returnNchat=T for ease of coding. Nchat <- nllike.func(pars=mle.params, returnNchat=T) ## Calculate the AIC: AIC = 2*npar - 2 * max( log like ) and we want the minimum AIC. ## So AIC = 2*npar - 2 * (- min (-log like)) = 2*npar + 2*min(negloglike) ## Also ## AICc = AIC + 2*npar*(npar+1)/(nrow(dat.in)-npar-1) according to MARK manual page 111 (4-34). npar <- length(model) AIC <- 2*npar + 2*mle.min AICc <- AIC + 2*npar*(npar+1)/(nrow(dat.in)-npar-1) ## Compile the final results for return: allpar.out <- rep(NA, length(allparnames)) names(allpar.out) <- allparnames allpar.out[model] <- mle.params mle.out <- c(Nchat, allpar.out, mle.min, mle.code.123good, w, front.ymin, mle.res$iterations, AIC, AICc) names(mle.out) <- c("Nchat", allparnames, "minimum", "code.123good", "w", "front.ymin", "iterations", "AIC", "AICc") attributes(mle.out)$allparnames <- allparnames attributes(mle.out)$model <- model mle.out } ############################################################# mrds.plot <- function(mle.res, svals = c(1, 10, 20, 100), which.legend="all", plot.Nc=T){ ## mrds.plot 1/8/14 ## Plots the detection functions derived from the output from mrds.func. ## svals give the values of group sizes at which detection functions will be plotted. ## For a given value of s in svals, the detection function will be plotted for y=0 to y=w. ## ## If which.legend="all", the legend is plotted in each subplot. Otherwise, if which.legend=1, 2, 3, 4 etc, the ## legend is plotted only in the corresponding subplot. par(mfrow=c(2,2), oma=c(1, 0, 1, 0), mar=c(3, 4, 3, 1.5), mgp=c(1.8, 0.5, 0)) if(mle.res["code.123good"]>3) print("WARNING! 
Convergence code is 4 or 5: you should not trust the results from this analysis.") ## Extract the parameters and settings for convenience: w <- mle.res["w"] front.ymin <- mle.res["front.ymin"] allparnames <- attributes(mle.res)$allparnames ## Replace any NA's in the parameters with 0s: mle.res[allparnames][is.na(mle.res[allparnames])] <- 0 ## Quick way of unpacking all the parameters in one go: for(i in 1:length(allparnames)) assign(allparnames[i], mle.res[allparnames[i]]) Nchat <- mle.res["Nchat"] ## pdot.func is for finding integral_(u=0)^w pdot(u, sval) du for a particular value of sval: pdot.func <- function(u, sval){ pR <- 1/( 1 + exp(-al1 - be1*u - gam1*u^2 - delt1*sval - eps1*sval*u) ) pFnotR <- rep(0, length(u)) pFnotR[u>front.ymin] <- 1/( 1 + exp(-al2 - be2*u[u>front.ymin] - gam2*(u[u>front.ymin])^2 - delt2*sval - eps2*sval*u[u>front.ymin]) ) pdot <- pR + (1-pR)*pFnotR } ## -------------------------------------------------------------------------------------------- ## Go through s-values one at a time to find the detection functions and the overall probability of detection. yvec <- c(seq(0, front.ymin-0.00001, length=20), seq(front.ymin, w, length=100)) nsvals <- length(svals) ## Determine legend plotting: if(is.numeric(which.legend)) s.legend <- svals[which.legend] else s.legend <- svals for(s in svals){ pRvec <- 1/( 1 + exp(-al1 - be1*yvec - gam1*yvec^2 - delt1*s - eps1*s*yvec) ) pFnotRvec <- 1/( 1 + exp(-al2 - be2*yvec - gam2*yvec^2 - delt2*s - eps2*s*yvec) ) pFRvec <- 1/( 1 + exp(-al3 - be3*yvec - gam3*yvec^2 - delt3*s - eps3*s*yvec) ) ## Ensure that pF is 0 for any y < front.ymin : pFnotRvec[yvec < front.ymin] <- 0 pFRvec[yvec < front.ymin] <- 0 ## Now create int.pdot = integral_0^w pdot(u, s) du for this s-value: int.pdot.s <- integrate(pdot.func, lower=0, upper=w, sval=s)$value Epdot.s <- int.pdot.s/w ## Plot: plot(yvec, pRvec, type="l", lwd=4, ylim=c(0, 1), xlab="Distance, y", ylab="Detection Probability", cex.lab=1.3) lines(yvec, pFRvec, col=2, lwd=3) lines(yvec, pFnotRvec, col=3, lwd=3) abline(h=1, col="grey", lty=2) abline(h=0.5, col="grey", lty=2) if(s %in% s.legend) legend("topright", col=c(1, 3, 2), lwd=c(2, 2, 2), legend=c(expression(p[scriptstyle(R)]), expression(p[scriptstyle(F*"|"*bar(R))]), expression(p[scriptstyle(F*"|"*R)])), bg="white", cex=1.6) title(main=paste("s =", s, ": E(p.) =", format(round(Epdot.s, 2), nsmall=2)), cex.main=1.6) } ## Overall title giving the estimated Nchat for this mle.res: if(plot.Nc) title(main=paste("Estimated Nc =", round(Nchat)), outer=T, line=-0.2, adj=0.02, cex.main=1.6) } ############################################################# mrds.fixedtol.boxplot <- function(stem="res.fixedtol", plotwhat="total.indivs",titlestr="fixed tolerance analyses", mark.angletol=T, discard.above=Inf, ylim=NULL, plot.legend=T, paperplot=F, horiz.line=NULL, plot.cv=T, cairo.export=F){ ## mrds.fixedtol.boxplot 31/3/15 ## Searches for all objects with name matching "stem", which should be results from mrds.wrap ## with fixedtol=T. Each object has results for impute.all, impute.missing, and impute.none. ## Plots a triplet of boxplots for these three, in different colours. ## Puts boxplots for all the different objects on the same figure, sorted first in angle tolerance ## (which seems to be the more impactful factor) then in time tolerance. ## ## If mark.angletol=T, delimits the boxplots corresponding to the same angle tolerances on the chart. 
## ## Argument "discard.above" enables us to discard any results that have apparently converged but are ## patently absurd. Need to use this with manual diagnosis to determine where there is a break between ## sensible answers and absurd ones. ## ## horiz.line can only be a number; have not coded the more elaborate options in mrds.discrep.boxplot. ## ## cairo.export is needed for high-res Cairo exports: the dot-dash lengths of dashed lines need to be ## adjusted. ## Decide on colours for impute.all, impute.missing, and impute.none: col.all <- "lightgreen" col.missing <- "yellow" col.none <- "salmon" colvec <- c(col.all, col.missing, col.none) ## First find all objects that match "stem". ## This behaves differently according to whether RMF's special function "ls.true" is present or not. if(exists("ls.true")) objnames <- ls.true(pattern=stem, envir=.GlobalEnv) else objnames <- ls(pattern=stem, envir=.GlobalEnv) ## Create vectors timeTolvec, angleTolvec, wvec, front.yminvec, that contain the corresponding results ## from each of objnames: nobj <- length(objnames) timeTolvec <- angleTolvec <- bootvec <- rep(NA, nobj) for(i in 1:nobj){ objres <- eval(parse(text=objnames[i])) timeTolvec[i] <- objres$timeTol angleTolvec[i] <- objres$angleTol bootvec[i] <- objres$bootstrap.by.survey } ## Sort objects according to bootstrap or not first, then angletol which seems to matter more, ## and then timetol which seems to matter less: obj.order <- order(bootvec, angleTolvec, timeTolvec) objnames <- objnames[obj.order] timeTolvec <- timeTolvec[obj.order] angleTolvec <- angleTolvec[obj.order] bootvec <- bootvec[obj.order] ## labels gives a label to each cluster of three boxplots. ## boxlabels gives a label only to the middle of each cluster of three boxplots. labels <- paste("A", angleTolvec, ".T", timeTolvec, sep="") labels[bootvec] <- paste("B", labels[bootvec], sep="") boxlabels <- rep("", 3*nobj) if(paperplot){ boxlabels[3*(1:nobj)-1] <- paste("T=", timeTolvec, sep="") } else{ boxlabels[3*(1:nobj)-1] <- paste("A:", angleTolvec, " T:", timeTolvec, sep="") boxlabels[3*(1:nobj)-1][bootvec] <- paste("B", boxlabels[3*(1:nobj)-1][bootvec]) } ## Create the list of components for boxplots: comps <- list() for(i in 1:nobj){ ## For each of the objects below, remove any with code.123good > 3 or plotwhat is NA or ## plotwhat is > discard.above: ## Add object i impute.all: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.all", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".A", sep="")]] <- comp.tmp ## Add object i impute.missing: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.missing", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".M", sep="")]] <- comp.tmp ## Add object i impute.none: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.none", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".N", sep="")]] <- comp.tmp } ## Plot the boxplots: if(is.null(ylim)){ ylim.range <- range(unlist(comps)) ylim <- ylim.range if(plot.cv) { ylim <- ylim + c(-0.03*diff(ylim.range)) cv.y <- mean(min(ylim), min(ylim.range)) } } else cv.y <- min(ylim) + diff(ylim)/80 if(paperplot){ bp <- boxplot(comps, pars=list(medlty="blank", boxfill=colvec, outwex = 0.3, boxwex = 0.6, 
outpch=NA, outlty="solid", staplewex=0.6, staplelwd=1, whisklty=1, whisklwd=0.4, outlwd=0.4), cex.axis=1.3, ylim=ylim, names=boxlabels, xaxt="n", lwd=0.5) ## Add horizontal axis labels: axis(side=1, at=3*(1:nobj)-1, labels=boxlabels[boxlabels!=""], tck=0, lwd=0, cex.axis=1.4, mgp=c(0,0.5,0)) ## For high-res cairo export, use lty="4F" which multiplies the default dot-space of 1,3 to 4,15 ## (because 15 = F in hex): see more under horiz.line. if(cairo.export) abline(v=3*(1:nobj)+0.5, col="grey55", lty="4F") else abline(v=3*(1:nobj)+0.5, col="grey55", lty=3) } else bp <- boxplot(comps, pars=list(medlty="blank", boxfill=colvec, outwex = 0.3, boxwex = 0.6, outpch=NA, outlty="solid", staplewex=0.6, staplelwd=1, whisklty=1, whisklwd=0.4, outlwd=0.4), cex.axis=0.8, ylim=ylim, xlim=c(1, 3*nobj), names=boxlabels, las=2) ## Plot the horizontal line if required: if(!is.null(horiz.line)){ if(!is.numeric(horiz.line)) horiz.line <- mean(comps[[paste0("ER included.", horiz.line)]]) ## For the cairo.export, use lty="FF" below to get 15 pixels on and 15 pixels off: "F" is 15 in hex. ## The default for lty=2 is the sequence "44". See "line type" in help(par) for more. if(cairo.export) abline(h=horiz.line, lty="FF", lwd=1, col="grey25") else abline(h=horiz.line, lty=2, lwd=1, col="grey25") cat("horiz.line=", horiz.line, "\n") } ## Use the line below if you want to give labels to every boxplot ## names=names(comps), las=2) ## Add line across each box to display the mean of each set of estimates: for(i in 1:length(comps)) lines(c(i - 0.4, i + 0.4), rep(mean(comps[[i]]), 2), lwd = 2) ## Add the title: if(titlestr!="") title(main=paste("Estimates of", plotwhat, ":", titlestr)) ## --------------------------------------------------------------------------------------------------- ## Results summary data frame: sum.df <- data.frame(name=rep(objnames, each=3), boot=as.numeric(rep(bootvec, each=3)), t.tol=rep(timeTolvec, each=3), a.tol=rep(angleTolvec, each=3), impute=rep(c("all", "missing", "none"), times=nobj)) ## Add columns for mean, 95% CI low, 95% CI high, %CV, and standard error: sum.df$est <- unlist(lapply(comps, mean)) sum.df$ci95.lo <- unlist(lapply(comps, function(x) quantile(x, (1-0.95)/2))) sum.df$ci95.hi <- unlist(lapply(comps, function(x) quantile(x, 1-(1-0.95)/2))) sum.df$CI.95 <- paste("(", round(sum.df$ci95.lo), ",", round(sum.df$ci95.hi), ")", sep="") sum.df$CV.pc <- unlist(lapply(comps, function(x) sqrt(var(x))/abs(mean(x))*100)) sum.df$se <- unlist(lapply(comps, function(x) sqrt(var(x)))) ## Print results: sum.print <- sum.df sum.print$est <- round(sum.print$est) sum.print$CV.pc <- round(sum.print$CV.pc) ## If desired, plot the CV rounded percentages below each boxplot: if(plot.cv){ text(1:length(comps), rep(cv.y, length(comps)), sum.print$CV.pc) text(0, cv.y, "%CV", adj=0.6) } ## Remove unwanted columns from sum.print and sum.df: sum.print$se <- NULL sum.print$ci95.lo <- NULL sum.print$ci95.hi <- NULL sum.df$CI.95 <- NULL catline("Results for", plotwhat, ":") print(sum.print, row.names=F) ## --------------------------------------------------------------------------------------------------- ## Mark angle tolerances on the chart if desired: if(mark.angletol){ angle.limits <- c(0, which(diff(sum.df$a.tol)!=0), length(sum.df$a.tol)) abline(v=angle.limits+0.5, col=1, lty=1) for(i in 2:length(angle.limits)) if(paperplot) text(c(0.7, 10, 19)[i-1]+1, 0.99*max(ylim), paste("A=", sum.df$a.tol[angle.limits[i]], sep=""), cex=1.5, col=1, adj=1) else text(angle.limits[i]+0.2, max(ylim), paste("A=", 
sum.df$a.tol[angle.limits[i]], sep=""), cex=1.5, col="grey45", adj=1) } ## Add a legend to explain the colours: ## (This has to be done after the angle limits are marked, so as to be plotted on top). if(plot.legend) legend("topleft", pch=22, col=1, pt.bg=colvec, legend=c("Impute All", "Impute Missing", "Impute None"), pt.cex=2, bg="white") ## Return results to full precision: list(quantity=plotwhat, results=sum.df) } ############################################################# mrds.discrep.boxplot <- function(stem="res.discrep", plotwhat="total.indivs", titlestr="discrepancy analyses", discard.above=Inf, ylim=NULL, horiz.line="A", plot.legend=T, plot.cv=T, cairo.export=F){ ## mrds.discrep.boxplot 3/4/15 ## Searches for all objects with name matching "stem", which should be results from mrds.wrap ## with fixedtol=F. Each object has results for impute.all, impute.missing, and impute.none. ## Plots a triplet of boxplots for these three, in different colours. ## ## Argument "discard.above" enables us to discard any results that have apparently converged but are ## patently absurd. Need to use this with manual diagnosis to determine where there is a break between ## sensible answers and absurd ones. ## ## horiz.line can be a number, or "A", "M", "N", standing for "all", "missing", "none"; or NULL for no line. ## If it is supplied, a horizontal line will be plotted at the mean bootstrap-by-survey estimate for ## impute.** for the corresponding **, e.g. impute.missing if horiz.line="missing". ## ## cairo.export is needed for high-res Cairo exports: the dot-dash lengths of dashed lines need to be ## adjusted. ## Decide on colours for impute.all, impute.missing, and impute.none: col.all <- "lightgreen" col.missing <- "yellow" col.none <- "salmon" colvec <- c(col.all, col.missing, col.none) ## First find all objects that match "stem". ## This behaves differently according to whether RMF's special function "ls.true" is present or not. if(exists("ls.true")) objnames <- ls.true(pattern=stem, envir=.GlobalEnv) else objnames <- ls(pattern=stem, envir=.GlobalEnv) nobj <- length(objnames) ## Determine whether the objects involve bootstrap or not: bootvec <- rep(NA, nobj) for(i in 1:nobj){ objres <- eval(parse(text=objnames[i])) bootvec[i] <- objres$bootstrap.by.survey } ## Sort objects according to bootstrap or not: obj.order <- order(bootvec) objnames <- objnames[obj.order] bootvec <- bootvec[obj.order] ## labels gives a label to each cluster of three boxplots. ## boxlabels gives a label only to the middle of each cluster of three boxplots. 
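## For example (illustration only), with nobj=2 result objects, one without and one with
## bootstrap-by-survey, sorted so that bootvec=c(FALSE, TRUE), the code below gives:
## labels    = c("ER omitted", "ER included")
## boxlabels = c("", "ER omitted", "", "", "ER included", "")
## i.e. each triplet of boxes (impute all / missing / none) receives one central label.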
labels <- rep("ER omitted", nobj) labels[bootvec] <- "ER included" boxlabels <- rep("", 3*nobj) boxlabels[3*(1:nobj)-1] <- ("ER omitted") boxlabels[3*(1:nobj)-1][bootvec] <- "ER included" ## Create the list of components for boxplots: comps <- list() for(i in 1:nobj){ ## For each of the objects below, remove any with code.123good > 3 or plotwhat is NA or ## plotwhat is > discard.above: ## Add object i impute.all: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.all", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".A", sep="")]] <- comp.tmp ## Add object i impute.missing: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.missing", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".M", sep="")]] <- comp.tmp ## Add object i impute.none: comp.tmp <- eval(parse(text=paste(objnames[i], "$impute.none", sep=""))) comp.tmp <- comp.tmp[[plotwhat]][comp.tmp$code.123good<=3] comp.tmp <- comp.tmp[!is.na(comp.tmp) & (comp.tmp < discard.above)] comps[[paste(labels[i], ".N", sep="")]] <- comp.tmp } ## Plot the boxplots: ylim.range <- range(unlist(comps)) if(is.null(ylim)){ ylim <- ylim.range if(plot.cv) { ylim <- ylim + c(-0.03*diff(ylim.range)) cv.y <- mean(min(ylim), min(ylim.range)) } } else cv.y <- min(ylim) + diff(ylim)/80 cat("ylim: ", ylim, "\n") bp <- boxplot(comps, pars=list(medlty="blank", boxfill=colvec, outwex = 0.3, boxwex = 0.6, outpch=NA, outlty="solid", staplewex=0.6, staplelwd=1, whisklty=1, whisklwd=0.4, outlwd=0.4), cex.axis=1.3, ylim=ylim, names=boxlabels, xaxt="n", lwd=0.5) ## Add horizontal axis labels: axis(side=1, at=3*(1:nobj)-1, labels=boxlabels[boxlabels!=""], tck=0, lwd=0, cex.axis=1.4) ## Add vertical divider line: abline(v=3*nobj/2+0.5) ## Plot the horizontal line if required: if(!is.null(horiz.line)){ if(!is.numeric(horiz.line)) horiz.line <- mean(comps[[paste0("ER included.", horiz.line)]]) ## For the cairo.export, use lty="FF" below to get 15 pixels on and 15 pixels off: "F" is 15 in hex. ## The default for lty=2 is the sequence "44". See "line type" in help(par) for more. 
if(cairo.export) abline(h=horiz.line, lty="FF", lwd=1, col="grey25") else abline(h=horiz.line, lty=2, lwd=1, col="grey25") cat("horiz.line=", horiz.line, "\n") } ## Use the line below if you want to give labels to every boxplot ## names=names(comps), las=2) ## Add line across each box to display the mean of each set of estimates: for(i in 1:length(comps)) lines(c(i - 0.4, i + 0.4), rep(mean(comps[[i]]), 2), lwd = 2) ## Add the title: if(titlestr!="") title(main=paste("Estimates of", plotwhat, ":", titlestr)) ## Add a legend to explain the colours: if(plot.legend) legend("topleft", pch=22, col=1, pt.bg=colvec, legend=c("Impute All", "Impute Missing", "Impute None"), pt.cex=2, inset=0.05, cex=1.2) ## --------------------------------------------------------------------------------------------------- ## Results summary data frame: sum.df <- data.frame(name=rep(objnames, each=3), boot=as.numeric(rep(bootvec, each=3)), impute=rep(c("all", "missing", "none"), times=nobj)) ## Add columns for mean, 95% CI low, 95% CI high, %CV, and standard error: sum.df$est <- unlist(lapply(comps, mean)) sum.df$ci95.lo <- unlist(lapply(comps, function(x) quantile(x, (1-0.95)/2))) sum.df$ci95.hi <- unlist(lapply(comps, function(x) quantile(x, 1-(1-0.95)/2))) sum.df$CI.95 <- paste("(", round(sum.df$ci95.lo), ",", round(sum.df$ci95.hi), ")", sep="") sum.df$CV.pc <- unlist(lapply(comps, function(x) sqrt(var(x))/abs(mean(x))*100)) sum.df$se <- unlist(lapply(comps, function(x) sqrt(var(x)))) ## Print results: sum.print <- sum.df sum.print$est <- round(sum.print$est) sum.print$CV.pc <- round(sum.print$CV.pc) ## If desired, plot the CV rounded percentages below each boxplot: if(plot.cv){ text(1:length(comps), rep(cv.y, length(comps)), sum.print$CV.pc) text(0.75, cv.y, "%CV", adj=1) } ## Remove unwanted columns from sum.print and sum.df: sum.print$se <- NULL sum.print$ci95.lo <- NULL sum.print$ci95.hi <- NULL sum.df$CI.95 <- NULL catline("Results for", plotwhat, ":") print(sum.print, row.names=F) ## --------------------------------------------------------------------------------------------------- ## Return results to full precision: list(quantity=plotwhat, results=sum.df) } ######################################################### testfit.func <- function(stem="res.fixedtol", checkwhat="iterations"){ ## testfit.func 2/4/15 ## Cycles through all objects that match stem, then checks properties of the specified column, ## in each of impute.all, impute.missing, impute.none results. ## checkwhat can be "iterations" (don't want values too small); ## "minimum" (don't want values of either 10000 or 20000 which indicate convergence to a stooge); ## "code.123good" (want values of 1, 2, or 3). ## First find all objects that match "stem". ## This behaves differently according to whether RMF's special function "ls.true" is present or not. 
if(exists("ls.true")) objnames <- ls.true(pattern=stem, envir=.GlobalEnv) else objnames <- ls(pattern=stem, envir=.GlobalEnv) for(nm in objnames){ res <- eval(parse(text=nm)) catline(nm) catline() ## Remove any NA observations: which.missing.all <- which(is.na(res$impute.all$minimum)) if(length(which.missing.all)>0){ res$impute.all <- res$impute.all[-which.missing.all,] catline("Removing", length(which.missing.all), "missing rows from impute.all") } which.missing.missing <- which(is.na(res$impute.missing$minimum)) if(length(which.missing.missing)>0){ res$impute.missing <- res$impute.missing[-which.missing.missing,] catline("Removing", length(which.missing.missing), "missing rows from impute.missing") } which.missing.none <- which(is.na(res$impute.none$minimum)) if(length(which.missing.none)>0){ res$impute.none <- res$impute.none[-which.missing.none,] catline("Removing", length(which.missing.none), "missing rows from impute.none") } if(checkwhat=="minimum"){ par(mfrow=c(3,1)) hist(res$impute.all[[checkwhat]], col="lightgreen", main="Impute All") hist(res$impute.missing[[checkwhat]], col="yellow", main="Impute Missing") hist(res$impute.none[[checkwhat]], col="salmon", main="Impute None") for(imp in c("impute.all", "impute.missing", "impute.none")){ print(res[[imp]][[checkwhat]] [ (10000-1 < res[[imp]][[checkwhat]] & res[[imp]][[checkwhat]] < 10000+1 ) | (20000-1 < res[[imp]][[checkwhat]] & res[[imp]][[checkwhat]] < 20000+1 ) ]) } } else if(checkwhat=="total.indivs"){ catline("Printing highest 500...") print(tail(sort(res$impute.all[[checkwhat]]), 500), digits=3) catline("Printing highest 500...") print(tail(sort(res$impute.missing[[checkwhat]]), 500), digits=3) catline("Printing highest 500...") print(tail(sort(res$impute.none[[checkwhat]]), 500), digits=3) } else{ print(table(res$impute.all[[checkwhat]])) print(table(res$impute.missing[[checkwhat]])) print(table(res$impute.none[[checkwhat]])) } catline() readline() } } ########################################################## quadratic.ys.func <- function(yvals=seq(0, 150, length=100), svals=c(1, 10, 50, 100), pars=c(2, -15, -1, 1, 1)/100){ ## quadratic.ys.func 30/3/16 ## For testing shapes of detection models with linear and quadratic terms in y and a term in y*s. out.mat <- matrix(0, nrow=length(svals), ncol=length(yvals)) for(i in 1:length(svals)){ out.mat[i,] <- invlogit.func(pars[1] + pars[2]*yvals + pars[3]*yvals^2 + pars[4]*svals[i] + pars[5]*yvals*svals[i]) } plot(-1, xlab="y", ylab="out", type="n", xlim=range(yvals), ylim=range(as.vector(out.mat))) for(i in 1:length(svals)) lines(yvals, out.mat[i,], col=i, lwd=2) } ########################################################## invlogit.func <- function(expr) 1/(1 + exp(-expr)) ########################################################## grid.mrds.func <- function(seed=3, raw.dat=dolphin.dat, fixedTol=T, fixed.args =c(timeTol = 10, angleTol = 15), groupsize.col="impute.all", bootstrap.by.survey=F, K=22){ ## grid.mrds.func 30/3/2016 ## Hardcodes all 512 possible combinations of gam, delt, and eps parameters for the mrds.func model. ## For example, one combination is gam1, gam2, delt3, eps1, eps3, with the rest missing from the model. ## All models assessed have all three alpha and beta parameters included, but might have any of the ## gamma, delta, and epsilon parameters missing. ## ## This function creates a single set of data with the seed given, then ## fits mrds.func for every model combination to this single set of data. 
## The output is a list of two outputs: ests gives the numeric output from mrds.func for each of the ## 512 models, while models is a list of model specifications for each one. ## Thus the result from model=models[[i]] is found in ests[i,]. ## ## USAGE: ## grid.mrds.res <- grid.mrds.func() ## grid.process.func() ## view results survnames <- 1:K set.seed(seed) ## If bootstrapping, resample by survey before resolving duplicates: if(bootstrap.by.survey){ catline("Bootstrapping data... ") boot.surv <- sample(x=survnames, size=length(survnames), replace=T) ## Create the bootstrap data: base.dat <- NULL ## Insert the data from the corresponding survey into base.dat, BUT give it a new ## survey number. For example, if true-data survey 1 was selected twice in the bootstrap, ## we want to treat these as if they are different surveys when it comes to pairing off ## duplicates. To distinguish from the real surveys, call the bootstrap-data ## surveys 101, 102, ..., 122. for(bs.ind in 1:length(boot.surv)) { bs <- boot.surv[bs.ind] dat.surv <- raw.dat[raw.dat$Survey==bs,] dat.surv$Survey <- rep(100+bs.ind, nrow(dat.surv)) base.dat <- rbind(base.dat, dat.surv) } } else{ ## Not bootstrapping: base.dat <- raw.dat } dat.try <- try(compile.data.func(data=base.dat, fixedTol=fixedTol, fixed.args=fixed.args, printit=F)) ## If there are no duplicates in the data, quit with NULL: if(inherits(dat.try, "try-error")) return(NULL) ## Otherwise, continue: dat <- dat.try$data.out ## Finished this section with a single set of (possibly bootstrapped) data, to which ## all 512 models will be fitted. ## --------------------------------------------------------------------------------------------------------- ## Now formulate models. lowparnames <- c("al1", "al2", "al3", "be1", "be2", "be3") highparnames <- c("gam1", "gam2", "gam3", "delt1", "delt2", "delt3", "eps1", "eps2", "eps3") ## Crass but simple way of creating all 2^9 = 512 possible models: model.grid <- expand.grid(c(0, 1), c(0, 1), c(0, 1), c(0, 1), c(0, 1), c(0, 1), c(0, 1), c(0, 1), c(0, 1)) names(model.grid) <- highparnames model.list <- vector("list", nrow(model.grid)) for(i in 1:nrow(model.grid)) model.list[[i]] <- c(lowparnames, highparnames[model.grid[i,]==1]) ## Prepare the results objects: (hard-coding 23 columns in the output) res.out <- list(ests=matrix(0, nrow=nrow(model.grid), ncol=23), models=model.list) ## Start fitting the models: for(model.index in 1:nrow(model.grid)){ catline("Fitting model", model.index, "...") res.mod <- mrds.func(dat.in=dat, groupsize.col=groupsize.col, w = 400, front.ymin = 60, model=model.list[[model.index]], printit=F) res.out$ests[model.index,] <- as.numeric(res.mod) if(model.index==1) colnames(res.out$ests) <- names(res.mod) ## Save in-progress results: grid.in.progress <- list(ests=res.out$ests[1:model.index,], models=res.out$models[1:model.index]) save(list="grid.in.progress", file="SAVEMRDS_in_progress") } res.out$ests <- as.data.frame(res.out$ests) res.out$ests$model <- 1:nrow(model.grid) res.out$ests$deltaAIC <- res.out$ests$AIC - min(res.out$ests$AIC) res.out$ests$deltaAICc <- res.out$ests$AICc - min(res.out$ests$AICc) return(res.out) } ########################################################## grid.process.func <- function(boot=T, which.set="C", test.model=512){ ## grid.process.func 1/4/16 ## Processes the results from grid.mrds.func model selection to inspect outcomes and find ## persistently-best models. ## ## The output from grid.mrds.func is saved with a name specifying which settings were used in running it. 
## These names are hard-coded below. For example, results set A fitted to bootstrapped data (ER included) ## is named "Abootgrid.mrds.res". Results set A fitted to non-bootstrapped data (ER omitted) is named ## Agrid.mrds.res. These results objects need to be made available to this function before it is run. ## ## Analyses run for the Hamilton et al paper: ## If which.set="A" then the results are from time tolerance T=10, angle tolerance A=15, impute missing. ## If which.set="B" then the results are from time tolerance T=5, angle tolerance A=10, impute all. ## If which.set="C" then the results are from the discrepancy analysis with impute all. ## Final results used in Hamilton et al paper (August 2016) are from Set C: Discrepancy Analysis. ## ## All set A results are of the form grid.mrds.resx where "x" is the seed used to generate them: ## for example, grid.mrds.res3 or grid.mrds.res142. They are loaded from SAVE.grid.mrds.res. ## These different seeds are called different "runs". The set B results have the form Bgrid.mrds.resx. ## ## The boot=T results are respectively bootgrid.mrds.resx and Bbootgrid.mrds.resx. ## ## test.model is a proposal model that could be used for all runs - the aim is to see whether Nchat ## varies substantially between the selected model with deltaAIC=0 and the test model. Default 512 ## means it is the full model with all parameters. ## ## Returned: ## model.best contains all model names that scored an AIC<2 for any of the runs. ## model.deltaAIC.res contains the deltaAIC for every run for each of the models in model.best. ## model.average.delAIC.less.than.2 contains all models with average deltaAIC across runs being less than 2. ## best.models is the list of the model formulations in model.average.delAIC.less.than.2 ## best.models.brief omits the first 6 elements (the alphas and the betas) from each of these models ## so the differences between them can be better seen. ## Nc.test.vs.selected is the difference in Nchat between the best model and a single proposed ## test model. ## Cheap and easy coding below to find the right names for which.set=A, B, or C, and boot=T or F: if(which.set=="A"){ if(boot){ ## load("SAVE.Abootgrid.mrds.res") ## Or make available some other way stemname <- "Abootgrid.mrds.res" splitname <- "Abootgrid.mrds." } else{ ## load("SAVE.Agrid.mrds.res") stemname <- "Agrid.mrds.res" splitname <- "Agrid.mrds." } } if(which.set=="B"){ if(boot){ ## load("SAVE.Bbootgrid.mrds.res") stemname <- "Bbootgrid.mrds.res" splitname <- "Bbootgrid.mrds." } else{ ## load("SAVE.Bgrid.mrds.res") stemname <- "Bgrid.mrds.res" splitname <- "Bgrid.mrds." } } if(which.set=="C"){ if(boot){ ## load("SAVE.Cbootgrid.mrds.res") stemname <- "Cbootgrid.mrds.res" splitname <- "Cbootgrid.mrds." } else{ ## load("SAVE.Cgrid.mrds.res") stemname <- "Cgrid.mrds.res" splitname <- "Cgrid.mrds." } } model.best <- NULL ## All models with deltaAIC<2 for any of the runs. Nc.test.vs.selected <- NULL nruns <- 0 for(gnm in ls(pattern=stemname, envir=.GlobalEnv)){ print(gnm) res <- eval(parse(text=gnm))$ests catline("All convergence codes in results: ", unique(res$code.123good)) print(head(res[order(res$deltaAIC), c("model", "deltaAIC", "iterations", "code.123good", "Nchat")], 5)) ## Prints out the best 5 models. 
## Find the difference between Nchat under the best model minus Nchat under the test model: Nc.test.vs.selected <- rbind(Nc.test.vs.selected, c(Nc.sel=res$Nchat[which.min(res$deltaAIC)], Nc.test=res$Nchat[res$model==test.model], code.sel=res$code.123good[which.min(res$deltaAIC)], code.test=res$code.123good[res$model==test.model])) ## All of the maximizations seem to have converged fine. model.best <- c(model.best, res$model[res$deltaAIC<2]) nruns <- nruns+1 ## readline() } model.best <- sort(unique(model.best)) ## Create a data frame giving the deltaAIC for each of the models in model.best. model.deltaAIC.res <- data.frame(model=model.best) for(gnm in ls(pattern=stemname, envir=.GlobalEnv)){ res <- eval(parse(text=gnm))$ests colnm <- strsplit(gnm, split=splitname)[[1]][2] model.deltaAIC.res[colnm] <- res$deltaAIC[model.best] } ## Find which models give consistently best results: model.summary <- data.frame(model=model.deltaAIC.res$model[order(rowSums(model.deltaAIC.res[,-1]))], sumdelAIC=sort(rowSums(model.deltaAIC.res[,-1]))) model.average.delAIC.less.than.2 <- model.summary[model.summary$sumdelAIC <2*nruns, ] best.models <- eval(parse(text=gnm))$model[model.average.delAIC.less.than.2$model] best.models.brief <- lapply(best.models, function(x)x[-(1:6)]) Nc.test.vs.selected <- data.frame(Nc.test.vs.selected) Nc.test.vs.selected$diff <- Nc.test.vs.selected$Nc.test - Nc.test.vs.selected$Nc.sel Nc.test.vs.selected$percdiff <- round(Nc.test.vs.selected$diff/Nc.test.vs.selected$Nc.sel*100) return(list(Nc.test.vs.selected = Nc.test.vs.selected, model.best=model.best, model.deltaAIC.res=model.deltaAIC.res, model.average.delAIC.less.than.2=model.average.delAIC.less.than.2, best.models=best.models, best.models.brief=best.models.brief)) } ####################################################### check.modelfit.func <- function(seed=3, model, readyfit=NULL, svals.FgivenR=list(1:150), gamdf.FgivenR=rep(0, length(svals.FgivenR)), svals.RgivenF=list(1:4, 5:20, 21:100), gamdf.RgivenF=rep(0, length(svals.RgivenF)), raw.dat=dolphin.dat, fixedTol=F, fixed.args = NULL,#c(timeTol = 10, angleTol = 15), groupsize.col="impute.all", w=400, front.ymin=60, twowin=T, plot.gam=F, plot.type="one.window") { ## check.modelfit.func 1/4/2016 ## Given a random seed, creates the duplicate-reconciled data set corresponding to that seed, from raw.dat ## which doesn't have any duplicates. ## If readyfit is supplied, it is the output from a previous iteration of check.modelfit.func. ## It then skips the model-fitting and only plots the graphics. ## If readyfit is NULL, data will be generated using the random seed specified, and the model specified ## will be fitted to it. ## ## Plots the fitted model using mrds.plot, and then creates charts of P(front | rear) and P(rear | front) ## in the raw data (blue points) and via a fitted GAM logistic-regression model (red or green dashed lines). ## plot.gam doubles up as a paperplot argument: if plot.gam=F, the GAM curves will not be plotted, ## and the Estimated Nc in the outer margin won't be either. ## ## plot.type determines how to deal with the two plots that appear: ## plot.type="two.windows" opens two R plotting windows for the two plots; ## plot.type="one.window" plots the two consecutively, suitable for exporting to a PDF; ## plot.type="curves" plots the first plot only, of curves; ## plot.type="gof" plots the second plot only, of data underlying the fitted probability curves. ## The default ("one.window") just plots the first window and then overplots the second window. 
## ## EXAMPLES: ## Initially: ## delres <- check.modelfit.func(seed=3, model=grid.allmodels[[198]], gamdf.RgivenF=c(3, 0, 0)) ## Then subsequently, use the ready-fitted delres to do plots only: ## graphics.off();del <- check.modelfit.func(readyfit=delres, gamdf.RgivenF=c(3, 0, 0)) ## if(!fixedTol) fixed.args <- NULL ## If readyfit is supplied, unpack it: if(!is.null(readyfit)){ dat <- readyfit$data model <- readyfit$model seed <- readyfit$seed raw.dat <- readyfit$raw.dat fixed.args <- readyfit$fixed.args groupsize.col <- readyfit$groupsize.col w <- readyfit$w front.ymin <- readyfit$front.ymin mrds.out <- readyfit$mrds.out } else{ ## Readyfit is not supplied. ## Generate the duplicate data frame: set.seed(seed) dat.in <- compile.data.func(data=raw.dat, fixedTol = fixedTol, fixed.args=fixed.args, printit=F)$data.out dat <- create.caphists.func(dat.in=dat.in, groupsize.col=groupsize.col, w=w, front.ymin=front.ymin) ## Fit the model: mrds.out <- mrds.func(dat.in=dat.in, groupsize.col=groupsize.col, w=w, front.ymin = front.ymin, model = model, printit = T) } ## ------------------------------------------------------------------------------------------ ## Unpack results from the model: allparnames <- attributes(mrds.out)$allparnames ## Replace any NA's in the parameters with 0s: mrds.out[allparnames][is.na(mrds.out[allparnames])] <- 0 ## Quick way of unpacking all the parameters in one go: for(i in 1:length(allparnames)) assign(allparnames[i], mrds.out[allparnames[i]]) ## ------------------------------------------------------------------------------------------ ## Plot the fitted model using mrds.plot. Arbitrarily set plot.Nc=plot.gam below. if(plot.type!="gof") mrds.plot(mrds.out, which.legend=2, plot.Nc=plot.gam) if(plot.type=="curves") return("Finished") ## ------------------------------------------------------------------------------------------ ## FRONT TEAM SUCCESSES FROM REAR TEAM DETECTIONS: ## Now create the data frame "rear" of all sightings made by the rear observer team: reardat <- dat[dat$omega %in% c(2,3), ] reardat <- reardat[reardat$y>=front.ymin, ] ## Create a binary observation: 1 if the front team saw it, 0 otherwise. ## This is easy because omega=3 if sighted by both, and omega=2 if sighted by rear only, ## so front=omega-2. 
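## (Optional check, uncomment after the next line to confirm the recoding:
##  table(omega=reardat$omega, front=reardat$front)
##  should show omega=2 rows all with front=0, and omega=3 rows all with front=1.)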
reardat$front <- reardat$omega-2 ## Run through each set of svals in svals.FgivenR: counter <- 0 for(i in 1:length(svals.FgivenR)){ if(twowin & counter %% 4 ==0){ if(plot.type=="two.windows") X11() par(mfrow=c(2,2), mar=c(3, 4, 3, 1.5), mgp=c(1.8, 0.5, 0)) } counter <- counter+1 svals.i <- svals.FgivenR[[i]] reardat.i <- reardat[reardat$s %in% svals.i,] s.string <- paste0("s = ", min(svals.i), ":", max(svals.i)) ## Plot the raw data for front given rear: ybreaks <- hist(reardat.i$y, plot=F)$breaks plot(reardat.i$y, reardat.i$front, pch=16, xlab="Distance, y", ylab="Detection by front team", main=paste0("Front team success from rear team detections: ", s.string), type="n", xlim=range(ybreaks), cex.lab=1.2) new.y <- seq(front.ymin, max(ybreaks), length=50) ## Create barplots of proportion of rear detections sighted by front in each y-bin ## determined by hist(): ybreaks[1] <- front.ymin for(b in 1:(length(ybreaks)-1)){ which.bin <- which(ybreaks[b] <= reardat.i$y & reardat.i$y < ybreaks[b+1]) prop.bin <- sum(reardat.i$front[which.bin])/length(which.bin) rect(ybreaks[b], 0, ybreaks[b+1], prop.bin, col="grey85", border="grey65") } ## Rug plot for the observations: points(reardat.i$y, reardat.i$front, pch="|", cex=0.9, col="maroon") ## Overplot the fitted GAM in red dash: ## If plot.gam, fit and plot a logistic regression to the front given rear successes: if(plot.gam){ if(gamdf.FgivenR[i] !=0) ## Specific request for GAM df for this category: reargam.i <- gam(front ~ s(y, fx=T, k=gamdf.FgivenR[i]), data=reardat.i, family=binomial(link="logit")) else ## No specific request for GAM df: reargam.i <- gam(front ~ s(y), data=reardat.i, family=binomial(link="logit")) lines(new.y, predict.gam(reargam.i, type="response", newdata=data.frame(y=new.y)), pch=16, col=2, lwd=3, lty=2) } ## Overplot the fitted MRDS curve: al3 + be3*dist + gam3*dist^2 + delt3*size + eps3*size*dist ## using results from mrds.out in blue points: pFgivenR <- invlogit.func( al3 + be3*reardat.i$y + gam3*(reardat.i$y^2) + delt3*reardat.i$s + eps3*reardat.i$s*reardat.i$y) points(reardat.i$y, pFgivenR, col=4, pch=1) } ## ------------------------------------------------------------------------------------------ ## REAR TEAM SUCCESSES FROM FRONT TEAM DETECTIONS: ## Create the data frame "front" of all sightings made by the front observer team: frontdat <- dat[dat$omega %in% c(1,3), ] ## Create a binary observation: 1 if the rear team saw it, 0 otherwise. 
## omega=3 if sighted by both, and omega=1 if sighted by front only: frontdat$rear <- rep(0, nrow(frontdat)) frontdat$rear[frontdat$omega==3] <- 1 ## Run through each set of svals in svals.RgivenF: for(i in 1:length(svals.RgivenF)){ if(twowin & counter %% 4 ==0){ if(plot.type=="two.windows") X11() par(mfrow=c(2,2)) } svals.i <- svals.RgivenF[[i]] s.string <- paste0("s = ", min(svals.i), ":", max(svals.i)) frontdat.i <- frontdat[frontdat$s %in% svals.i,] ## Plot the raw data for rear given front: ybreaks <- hist(frontdat.i$y, plot=F)$breaks plot(frontdat.i$y, frontdat.i$rear, pch=16, xlab="Distance, y", ylab="Detection by rear team", main=paste0("Rear team success from front team detections: ", s.string), type="n", xlim=range(ybreaks), cex.lab=1.2) new.y <- seq(min(ybreaks), max(ybreaks), length=50) ## Create barplots of proportion of front detections sighted by rear in each y-bin ## determined by hist(): for(b in 1:(length(ybreaks)-1)){ which.bin <- which(ybreaks[b] <= frontdat.i$y & frontdat.i$y < ybreaks[b+1]) prop.bin <- sum(frontdat.i$rear[which.bin])/length(which.bin) rect(ybreaks[b], 0, ybreaks[b+1], prop.bin, col="grey85", border="grey65") } ## Rug plot for the observations: points(frontdat.i$y, frontdat.i$rear, pch="|", cex=0.9, col="maroon") ## If required, fit and plot a logistic regression GAM: if(plot.gam){ ## Fit a logistic regression to the rear given front successes: if(gamdf.RgivenF[i] !=0){ ## Specific request for GAM df for this category: frontgam.i <- gam(rear ~ s(y, fx=T, k=gamdf.RgivenF[i]), data=frontdat.i, family=binomial(link="logit")) } else ## No specific request for GAM df: frontgam.i <- gam(rear ~ s(y), data=frontdat.i, family=binomial(link="logit")) ## Overplot the fitted GAM in green dash: lines(new.y, predict.gam(frontgam.i, type="response", newdata=data.frame(y=new.y)), pch=16, col=3, lwd=3, lty=2) } ## Calculate the results from MRDS: ## P(R|F) = P(F|R)P(R)/ (P(F|R)P(R) + P(F|notR)P(notR)) dist <- frontdat.i$y size <- frontdat.i$s ## P(R) = invlogit.func(al1 + be1*dist + gam1*dist^2 + delt1*size + eps1*size*dist) pR.i <- invlogit.func(al1 + be1*dist + gam1*dist^2 + delt1*size + eps1*size*dist) ## P(F|R)=invlogit.func( al3 + be3*dist + gam3*dist^2 + delt3*size + eps3*size*dist): pFgivenR.i <- invlogit.func(al3 + be3*dist + gam3*dist^2 + delt3*size + eps3*size*dist) ## P(F|notR) = invlogit.func(al2 + be2*dist + gam2*dist^2 + delt2*size + eps2*size*dist): pFgivenNotR.i <- invlogit.func(al2 + be2*dist + gam2*dist^2 + delt2*size + eps2*size*dist) ## Then P(R|F) using Bayes rule: pRgivenF.i <- pFgivenR.i*pR.i / (pFgivenR.i*pR.i + pFgivenNotR.i*(1-pR.i)) ## Overplot the fitted MRDS curve: al3 + be3*dist + gam3*dist^2 + delt3*size + eps3*size*dist ## using results from mrds.out in blue points: points(frontdat.i$y, pRgivenF.i, col=4, pch=1) } return(list(raw.dat=raw.dat, data=dat, model=model, seed=seed, fixedTol=fixedTol, fixed.args=fixed.args, groupsize.col=groupsize.col, w=w, front.ymin=front.ymin, mrds.out=mrds.out)) } ############################################################### catline <- function (...) { cat(..., "\n") } ##########################################################
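## A minimal, hypothetical illustration of dup.cluster.func (not part of the analysis above).
## The toy pairs 100=101 and 101=102 share sighting 101, so they should fall into a single cluster,
## while the pair 200=201 should form a cluster of its own.
if(F){
    toy.pairs <- data.frame(index1=c(100, 101, 200),
                            index2=c(101, 102, 201),
                            dup=c(1, 1, 1))
    dup.cluster.func(toy.pairs)
    ## Expected: rows 1 and 2 share one cluster number; row 3 gets a different cluster number.
}
##########################################################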