And here are some homebrewed functions that will be helpful.
Code
# Function libraryharvest.model.likelihoods <-function(workingDir = workingDir,outfileName ="outfile.txt",multilocus = T){# this function harvests model marginal likelihoods for models calculated by# the program migrate-n (Beerli & Felsenstein 2001).# It takes as input a directory full of directories, # each of which contains output from a migrate model, and is named# after that model. #initialize a data frame to take the values modelMarglikes <-data.frame(model=character(),thermodynamic=numeric(),bezier.corrected=numeric(), harmonic=numeric()) # loop through directories in the working directory, each of which is name# after a different modelfor(i inlist.dirs(workingDir, full.names = F)[-1]){ #i<-"stepping.stone" modelDir<-file.path(workingDir,i)print(modelDir)#scan in the outfile, separating at each newline outfile<-scan(file=file.path(modelDir,outfileName),what="character",sep="\n") #find the line with the likelihoods on it and split on runs of spaces marglikeline <-grep("Scaling factor",outfile,value=F)-1 marglikeline <-strsplit(outfile[marglikeline],"\\s+", perl = T)[[1]][3:5]# if(length(marglikeline)==0){next} marglikes <-c(i,marglikeline) modelMarglikes <-rbind(modelMarglikes,marglikes, deparse.level =2) }names(modelMarglikes) <-c("model","thermodynamic","bezier.corrected","harmonic") modelMarglikes[2:4] <-sapply(modelMarglikes[2:4], as.numeric)return(modelMarglikes)}bfcalcs<-function(df,ml="bezier.corrected"){# This calculates log bayes factors on data frames output by# harvest.model.likelihoods(), following Johnson and Omland (2004)# You may choose the likelihood flavor with# ml = "bezier.corrected", "thermodynamic" or "harmonic"#df$thermodynamic <- as.numeric(df$thermodynamic)#df$bezier.corrected <- as.numeric(df$bezier.corrected)#df$harmonic <- as.numeric(df$harmonic) mlcol <- df[[ml]] bmvalue <- mlcol[which.max(mlcol)] lbf <-2*(mlcol-bmvalue) choice <-rank(-mlcol) modelprob <-exp(lbf/2)/sum(exp(lbf/2)) dfall <-cbind(df,lbf,choice,modelprob)return(dfall)} migrants.per.gen<-function(x){#a function for creating Nm vectors out of m and Theta vectors.#x<-x[[1]] m<-names(x)[which(grepl("M_",names(x)))] #names of m columns#theta<-names(x)[which(grepl("Theta_",names(x)))] #names of theta columnsfor(n in m){ t<-paste("Theta",strsplit(n,split="_")[[1]][3],sep="_") x[,paste("Nm",strsplit(n,split="_")[[1]][2],strsplit(n,split="_")[[1]][3],sep="_")]<- x[,which(names(x)==n)]*x[,which(names(x)==t)] #this hairy little statement makes a new column named "Nm_X_Y" and then fills it by multiplying the M_X_Y column by the Theta_Y column }return(x)}######Functions for summing over loci in a bayesallfile###############################remove_prior <-function(densityd,prior,threshold =1e-10){# this function changes values less than a threshold # to zero removes a prior from the # y values of a density distributions (density).# first it zeros values less than a threshold, then # removes the prior, then renormalizes so the density# sums to 1 prior[which(prior <1e-30)] <-1e-30 densityd[which(densityd < threshold)] <-0 new <- (densityd/prior)/sum(densityd/prior)return(new)}sum_over_loci <-function(df,parameter){#this function takes a data frame of probability densities for many loci# that have had the prior removed,# together with with a logged prior named "logPrior",# as well as the name of a parameter (e.g. "Theta_1")# and sums the densities over loci.# Rmpfr package allows quadruple precision for calcs on very small numbers.require(Rmpfr)#add a teeny-tiny amount to all values to avoid zeros df2 <- df %>%mutate(across(starts_with(parameter), .fns=function(x) x +1e-11)) %>%#log all valuesmutate(across(starts_with(c(parameter)),.fns=log)) %>%# convert the df to rowwise so that rows can be summed# and then sum across the row, including the priorrowwise() %>%mutate(sum_param_prior =sum(c_across(starts_with(c(parameter,"logPrior"))))) %>%#convert back to a regular dfungroup()#need to convert to quadruple precision because #these will exponentiate to very small numbers. sum_param_prior_exp <-exp(mpfr(df2$sum_param_prior, precBits =128))# standardize by dividing by the sum of the column sum_param_prior_standardized <- sum_param_prior_exp/sum(sum_param_prior_exp)#drop the intermediate columns (no longer needed), change the standardized# output back to double precision so that it can be incorporated into the df# rename the summed column after the parameter df3 <- df2 %>%select(-c(sum_param_prior)) %>%mutate(summed_over_loci =as.numeric(sum_param_prior_standardized)) %>%rename_with(.fn =~paste(parameter), .cols = summed_over_loci)return(df3)}summarize_posterior <-function(posterior, parameter_type =c("Theta","M","Nm","D"),prior_params =c(min,mean,max),n=16384){# this function takes a Migrate-n posterior "bayesallfile" as a dataframe# as well as one of the parameter types, and the prior on that parameter# as a vector c(min,mean,max). Currently only exponential priors supported# it will create densities for each parameter of the given type,# remove the prior from each, sum across loci, and re-add the prior (once) parameters <-names(posterior) %>%str_subset(parameter_type)# create a tibble with the x values for all density plots # for a particular parameter type getx <- posterior %>%filter(Locus ==1) %>%select(parameters[1])# create a tibble with x values for a density plot# of the chosen number of points dens <-tibble(.rows = n, x =density(getx[[1]], n=n, from = prior_params[1], to = prior_params[3],bw ="nrd0")$x)print("calculating densities")# calculate densities for each parameter of a given type at each locus dens <- posterior %>%select(starts_with(c("Steps","Locus","rep",paste0(parameter_type,"_")))) %>%pivot_wider(names_from ="Locus", values_from =starts_with(paste0(parameter_type,"_")),names_sep ="-") %>%select(starts_with(paste0(parameter_type,"_"))) %>%map_dfc(function(x) density(x, n = n, from = prior_params[1], to = prior_params[3], bw ="nrd0")$y) %>%bind_cols(dens)# create, standardize, log and remove prior dens$prior <-dexp(dens$x, rate =1/prior_params[2], log = F) dens$prior <- dens$prior/sum(dens$prior) dens$logPrior <-log(dens$prior)print("removing prior") dens2 <- dens %>%#remove the prior, standardizemutate(across(starts_with(parameter_type), ~remove_prior(densityd = .x,prior = dens$prior,threshold =1e-10) )) dens3 <- dens2for(p in parameters){print(p) dens3 <-sum_over_loci(df = dens3, parameter = p) }# trying to do the above loop with purrr:map_dfc#dens4 <- parameters %>% # map_dfc(.f = ~ sum_over_loci(df = dens2, parameter = .x))return(dens3)}posterior_stats <-function(df,parameter){require(spatstat.core) p <- df %>%select(x,parameter) %>%as.list(bw =6.33e-05)names(p) <-c("x", "y") p$bw <-6.33e-5attr(p, "class") <-"density" qu <-quantile.density(p, c(0.025, 0.25, 0.5, 0.75, 0.975)) wmo <- p$x[which(p$y==max(p$y))] wme <-weighted.mean(p$x, p$y) wsd <-sqrt(weighted.var(p$x, p$y)) stats <-c(qu,mode = wmo, mean = wme, sd = wsd)return(stats)}
Introduction
Brian Bowen’s student Anne Lee asked me to do a migrate analysis on two species of deepwater snapper, Etelis carbunculus and Etelis coruscans. She wrote:
My thesis focuses on assessing the population structure and origin of deepwater snappers, Etelis carbunculus and Etelis coruscans, that were sampled from Japan, Johnston Atoll, Main Hawaiian Islands, and Northwestern Hawaiian Islands. My data consists of SNPs and I will be using migrate to help me with the analysis of the origin. This will help me examine the patterns of gene flow within both species to compare the Hawaiian populations to Johnston Atoll and Japan populations.
Because these guys are so deep down, and harder to sample, we just have single population samples from the four populations she mentions above.
Etelis carbunculis; photo by Thomas Schreiber
Etelis coruscans; photo by Wiwied Soeparto
Bee-utiful deepwater snappers wondering where they came from, where they are going, and whether they can get back in the water now.
I’m also going to see how much I can learn about quarto notebooks, which Rstudio is apparently using to replace Rmarkdown notebooks. They seem like an improvement so far! Following along here, using the simple option to render to the docs folder.
Anne placed all the data on Google Drive, zipped by population. I will download them with the gdown python package. This involves looking up the google id for each file in the URL of the share link and pasting that into gdown thusly. They were quite large, and unzip and gzip wouldn’t work. I used jar (which I didn’t even know could uncompress things) as suggested here
Need to rename the files into Illumina format in order for Stacks to recognize the paired reads.
cd rawfor f in*.R1.fq.gzdomv$f${f/.R1.fq.gz/_R1_001.fastq.gz}donefor f in*.R2.fq.gzdomv$f${f/.R2.fq.gz/_R2_001.fastq.gz}donecd ..
Haplotypes from Stacks
Clean the data with process_radtags
Same commands for carbunculus and coruscans. Paired end. Look for ecoR1 cutsites on read 1 and mspI cutsites on read 2. Remove low quality reads (based on a sliding window average). Remove reads smaller than 145 bp, and trim others to that length.
BEGIN total_raw_read_counts
Total Sequences 255732606
Reads containing adapter sequence 1957206 0.8%
Barcode Not Found 0 0.0%
Low Quality 6558845 2.6%
RAD Cutsite Not Found 1678118 0.7%
Retained Reads 245538437 96.0%
END total_raw_read_counts
E. coruscans output
BEGIN total_raw_read_counts
Total Sequences 272955592
Reads containing adapter sequence 1919907 0.7%
Barcode Not Found 0 0.0%
Low Quality 8734891 3.2%
RAD Cutsite Not Found 1884115 0.7%
Retained Reads 260416679 95.4%
END total_raw_read_counts
Running the denovo_map.pl pipeline
Hmm… gotta rename the cleaned files too. All code is being run on both species.
More renaming
cd cleanedfor f in*001.1.fq.gzdomv$f${f/_R[12]_001.1.fq.gz/.1.fq.gz}donefor f in*001.2.fq.gzdomv$f${f/_R[12]_001.2.fq.gz/.2.fq.gz}done#rename the "remainder reads" that have had their pair removed for low qualityfor f in*001.rem.1.fq.gz;domv$f${f/_R[12]_001.rem.1.fq.gz/.rem.1.fq.gz};donefor f in*001.rem.2.fq.gz;domv$f${f/_R[12]_001.rem.2.fq.gz/.rem.2.fq.gz};done
The pipeline
Removing extra flags
Oy - so denovo_map.pl crashed with this message:
Error: Failed to find any matching paired-end reads in ‘./cleaned/Etelis_S36.2.fq.gz’.
Aborted.
Julian Catchen gives the solution here. Because these data were already run through process_radtags once to demultiplex them, it attached a \1 to the labels of all primary reads and a \2 to the labels of all paired reads. When I re-ran them in process_radtags, it attached an extra flag to each, which then confused tsv2bam. Julian provides the below code to remove the extra flags.
I’m going to use the denovo_map.pl pipeline. This command will take data from ./cleaned and the popmap provided by Anne and -m 5 reads per stack, -M 4 distance between stacks, -n 4 distance between catalog loci. Running on 8 -T threads and only keeping loci that appear in 80% of individuals in all 4 populations
Now I need to re-run populations to get more statistics. Original populations output is in output1, this will be in output2.
r80
Settings for keeping loci that occur in 80 percent of indivs per population (so 4/5 of the Johnston invivs for coruscans). I’m being intentionally stringent by setting the p-value cutoff for hwe at 0.05.
Removed 98351 loci that did not pass sample/population constraints from 115429 loci.
Kept 17078 loci, composed of 5539395 sites; 17697 of those sites were filtered, 49261 variant sites remained.
Number of loci with PE contig: 17078.00 (100.0%);
Mean length of loci: 314.36bp (stderr 0.38);
Number of loci with SE/PE overlap: 2546.00 (14.9%);
Mean length of overlapping loci: 285.08bp (stderr 0.47); mean overlap: 27.61bp (stderr 0.13);
Mean genotyped sites per locus: 315.85bp (stderr 0.37).
Population summary statistics (more detail in populations.sumstats_summary.tsv):
Japan: 13.333 samples per locus; pi: 0.16058; all/variant/polymorphic sites: 5394107/49261/33997; private alleles: 6596
MHI: 13.55 samples per locus; pi: 0.15487; all/variant/polymorphic sites: 5394107/49261/31541; private alleles: 3166
NWHI: 25.142 samples per locus; pi: 0.15329; all/variant/polymorphic sites: 5394107/49261/38082; private alleles: 7321
Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Japan: 1145
MHI: 1103
NWHI: 1527
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Japan: 3917
MHI: 4185
NWHI: 3407
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)
Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
Japan-MHI: mean Fst: 0.030208; mean Phi_st: 0.023147; mean Fst': 0.025616; mean Dxy: 0.0019853
Japan-NWHI: mean Fst: 0.02638; mean Phi_st: 0.028095; mean Fst': 0.029652; mean Dxy: 0.0019372
MHI-NWHI: mean Fst: 0.014407; mean Phi_st: 0.0018347; mean Fst': 0.0016037; mean Dxy: 0.0018308
E. coruscans
Removed 130579 loci that did not pass sample/population constraints from 141967 loci.
Kept 11388 loci, composed of 3690405 sites; 32281 of those sites were filtered, 59470 variant sites remained.
Number of loci with PE contig: 11388.00 (100.0%);
Mean length of loci: 314.06bp (stderr 0.42);
Number of loci with SE/PE overlap: 1079.00 (9.5%);
Mean length of overlapping loci: 291.05bp (stderr 0.59); mean overlap: 24.47bp (stderr 0.12);
Mean genotyped sites per locus: 315.02bp (stderr 0.42).
Population summary statistics (more detail in populations.sumstats_summary.tsv):
Johnston_Atoll: 4.1993 samples per locus; pi: 0.074411; all/variant/polymorphic sites: 3587423/59470/15449; private alleles: 5208
MHI: 13.68 samples per locus; pi: 0.060304; all/variant/polymorphic sites: 3587423/59470/25569; private alleles: 7241
NWHI: 26.37 samples per locus; pi: 0.060932; all/variant/polymorphic sites: 3587423/59470/38902; private alleles: 15366
Japan: 13.217 samples per locus; pi: 0.054013; all/variant/polymorphic sites: 3587423/59470/22717; private alleles: 6028
Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 7
MHI: 1134
NWHI: 1648
Japan: 700
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 3998
MHI: 2984
NWHI: 2673
Japan: 3395
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)
Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
Johnston_Atoll-MHI: mean Fst: 0.038243; mean Phi_st: 0.012337; mean Fst': 0.0011598; mean Dxy: 0.0014711
Johnston_Atoll-NWHI: mean Fst: 0.022267; mean Phi_st: 0.0056917; mean Fst': -0.0033667; mean Dxy: 0.0013478
Johnston_Atoll-Japan: mean Fst: 0.041778; mean Phi_st: 0.018924; mean Fst': 0.0048455; mean Dxy: 0.0014546
MHI-NWHI: mean Fst: 0.013568; mean Phi_st: 0.0027592; mean Fst': 0.0011102; mean Dxy: 0.0012735
MHI-Japan: mean Fst: 0.021407; mean Phi_st: 0.0058972; mean Fst': 0.0028619; mean Dxy: 0.0013316
NWHI-Japan: mean Fst: 0.013484; mean Phi_st: 0.002933; mean Fst': 0.0018245; mean Dxy: 0.0012352
r100
Settings for keeping loci that occur in 100 percent of indivs per population ::: {.cell}
populations-P ./ -O ./output3_r100 -M ../popmapECO.tsv -t 12 -p 4 -r 100 -H--hwe--fstats--p-value-cutoff 0.05 --fasta-loci--fasta-samples--vcf--genepop--structure--treemix--fasta-samples-raw# and again with single SNPs from each locus so we can test for linkage and selectionpopulations-P ./ -O ./output4_r100.SNP -M ../popmapECA.tsv -t 12 -p 3 -r 100 --hwe--fstats--p-value-cutoff 0.05 --vcf--genepop--write-random-snp#Rerun coruscans after dropping A150populations-P ./ -O ./output5_r100.dropA150 -M ../popmapECO2.tsv -t 12 -p 4 -r 100 -H--hwe--fstats--p-value-cutoff 0.05 --fasta-loci--fasta-samples--vcf--genepop--structure--treemix--fasta-samples-rawpopulations-P ./ -O ./output6_r100.dropA150.SNP -M ../popmapECO2.tsv -t 12 -p 4 -r 100 --hwe--fstats--p-value-cutoff 0.05 --vcf--genepop--write-random-snp
:::
E. carbunculus
Removed 111882 loci that did not pass sample/population constraints from 115429 loci.
Kept 3547 loci, composed of 1134453 sites; 6916 of those sites were filtered, 6537 variant sites remained.
Number of loci with PE contig: 3547.00 (100.0%);
Mean length of loci: 309.83bp (stderr 0.69);
Number of loci with SE/PE overlap: 288.00 (8.1%);
Mean length of overlapping loci: 285.06bp (stderr 1.00); mean overlap: 24.45bp (stderr 0.24);
Mean genotyped sites per locus: 310.68bp (stderr 0.68).
Population summary statistics (more detail in populations.sumstats_summary.tsv):
Japan: 14 samples per locus; pi: 0.10528; all/variant/polymorphic sites: 1101972/6537/3955; private alleles: 1170
MHI: 14 samples per locus; pi: 0.09766; all/variant/polymorphic sites: 1101972/6537/3434; private alleles: 511
NWHI: 27 samples per locus; pi: 0.097747; all/variant/polymorphic sites: 1101972/6537/4642; private alleles: 1340
Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Japan: 101
MHI: 90
NWHI: 110
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Japan: 1197
MHI: 1358
NWHI: 1331
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)
Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
Japan-MHI: mean Fst: 0.025802; mean Phi_st: 0.015567; mean Fst': 0.0094451; mean Dxy: 0.00087231
Japan-NWHI: mean Fst: 0.020607; mean Phi_st: 0.018272; mean Fst': 0.010517; mean Dxy: 0.00082861
MHI-NWHI: mean Fst: 0.012892; mean Phi_st: 0.00097809; mean Fst': 0.00022075; mean Dxy: 0.00081207
E. coruscans
Removed 140925 loci that did not pass sample/population constraints from 141967 loci.
Kept 1042 loci, composed of 334846 sites; 3569 of those sites were filtered, 3654 variant sites remained.
Number of loci with PE contig: 1042.00 (100.0%);
Mean length of loci: 311.35bp (stderr 1.36);
Number of loci with SE/PE overlap: 117.00 (11.2%);
Mean length of overlapping loci: 285.68bp (stderr 1.71); mean overlap: 24.41bp (stderr 0.38);
Mean genotyped sites per locus: 312.62bp (stderr 1.33).
Population summary statistics (more detail in populations.sumstats_summary.tsv):
Johnston_Atoll: 5 samples per locus; pi: 0.060494; all/variant/polymorphic sites: 325749/3654/975; private alleles: 520
MHI: 14 samples per locus; pi: 0.036669; all/variant/polymorphic sites: 325749/3654/1226; private alleles: 470
NWHI: 27 samples per locus; pi: 0.037317; all/variant/polymorphic sites: 325749/3654/2064; private alleles: 1019
Japan: 14 samples per locus; pi: 0.033766; all/variant/polymorphic sites: 325749/3654/1143; private alleles: 463
Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 0
MHI: 32
NWHI: 45
Japan: 19
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 524
MHI: 422
NWHI: 405
Japan: 396
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)
Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
Johnston_Atoll-MHI: mean Fst: 0.039663; mean Phi_st: 0.02039; mean Fst': 0.0012821; mean Dxy: 0.00073676
Johnston_Atoll-NWHI: mean Fst: 0.026268; mean Phi_st: 0.015567; mean Fst': 0.00037957; mean Dxy: 0.0006556
Johnston_Atoll-Japan: mean Fst: 0.041015; mean Phi_st: 0.020409; mean Fst': 0.0021605; mean Dxy: 0.0007176
MHI-NWHI: mean Fst: 0.012433; mean Phi_st: 0.00025697; mean Fst': 4.0397e-05; mean Dxy: 0.00049144
MHI-Japan: mean Fst: 0.019806; mean Phi_st: 0.0039369; mean Fst': 0.00060169; mean Dxy: 0.00050736
NWHI-Japan: mean Fst: 0.012751; mean Phi_st: 0.0020762; mean Fst': 0.00052502; mean Dxy: 0.00047009
After dropping A150
A150 is a bad apple. I dropped it from the popmap and re-ran
Removed 140852 loci that did not pass sample/population constraints from 141967 loci.
Kept 1115 loci, composed of 355817 sites; 3712 of those sites were filtered, 3592 variant sites remained.
Number of loci with PE contig: 1115.00 (100.0%);
Mean length of loci: 309.12bp (stderr 1.35);
Number of loci with SE/PE overlap: 162.00 (14.5%);
Mean length of overlapping loci: 279.02bp (stderr 1.56); mean overlap: 25.85bp (stderr 0.33);
Mean genotyped sites per locus: 310.72bp (stderr 1.32).
Population summary statistics (more detail in populations.sumstats_summary.tsv):
Johnston_Atoll: 4 samples per locus; pi: 0.043022; all/variant/polymorphic sites: 346452/3592/554; private alleles: 160
MHI: 14 samples per locus; pi: 0.041745; all/variant/polymorphic sites: 346452/3592/1347; private alleles: 506
NWHI: 27 samples per locus; pi: 0.042774; all/variant/polymorphic sites: 346452/3592/2295; private alleles: 1180
Japan: 14 samples per locus; pi: 0.037868; all/variant/polymorphic sites: 346452/3592/1241; private alleles: 502
Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 0
MHI: 37
NWHI: 49
Japan: 21
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
Johnston_Atoll: 709
MHI: 446
NWHI: 414
Japan: 422
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)
Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
Johnston_Atoll-MHI: mean Fst: 0.030722; mean Phi_st: -0.0013119; mean Fst': -0.0016144; mean Dxy: 0.00063351
Johnston_Atoll-NWHI: mean Fst: 0.015585; mean Phi_st: -0.014286; mean Fst': -0.0023453; mean Dxy: 0.00053846
Johnston_Atoll-Japan: mean Fst: 0.032038; mean Phi_st: -0.002018; mean Fst': -0.00092502; mean Dxy: 0.00060796
MHI-NWHI: mean Fst: 0.012297; mean Phi_st: -0.00015381; mean Fst': -7.5242e-05; mean Dxy: 0.00051624
MHI-Japan: mean Fst: 0.019801; mean Phi_st: 0.0038246; mean Fst': 0.00053071; mean Dxy: 0.00052919
NWHI-Japan: mean Fst: 0.012569; mean Phi_st: 0.0014773; mean Fst': 0.00052976; mean Dxy: 0.00049003
Download the populations output
scp-r argo:./Etelis/coruscans/pipeline/output3_r100 ./populations_r100scp-r argo:./Etelis/carbunculus/pipeline/output3_r100 ./populations_r100 #copy down files from the coruscans run with bad apples removed.scp-r argo:./Etelis/coruscans/pipeline/output5_r100.dropA150 ./populations_r100.dropA150scp-r argo:./Etelis/coruscans/pipeline/output6_r100.dropA150.SNP/populations.snps.genepop ./populations_r100.dropA150/populations.randsnp.genepop
Evaluate the genomic data
E. coruscans
Adegenet
I am testing differences among 4 a priori defined populations. Thus, following recommendations in (Thia 2022), I will only use k-1 or 3 PCA axes.
Warning in .local(x, ...): Some scaling values are null.
Corresponding alleles are removed.
scatter(ecor_dapc_haps)
ecor_hwe <-hw.test(ecor_haps, B =1000)# playing with cross-validation#xvalDapc(ecor_haps, grp = ecor_haps$pop, result = "overall", n.pca.max = 10, training.set = 0.75)
Rows: 60 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 929
column count: 69
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 929
Character matrix gt cols: 69
skip: 0
nrows: 929
row_num: 0
Processed variant: 929
All variants processed
Filtering
I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).
Initializing...
Filtering loci. Starting loci: 929
Filtering non-biallelic loci...
0 bad loci
Filtering non_polymorphic loci...
0 bad loci
Filtering loci sequenced in few individuals...
0 bad loci
Filtering singletons.
521 bad loci
Filtering loci out of hwe...
7 bad loci
Ending loci: 401
Filtering individuals. Starting individuals: 60
Filtering out individuals sequenced in few kept loci...
No individuals removed.
Individual Stats
snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.
ecor_snps <-calc_hs(ecor_snps, facets ="sampID")ecor_snps <-calc_he(ecor_snps, facets ="sampID")ecor_snps <-calc_ho(ecor_snps, facets ="sampID")# and dang, just like that we can get a PCAp <-plot_clusters(ecor_snps, facets ="pop")
Formatting data...
p$plots$pca
so there’s that one weirdo at Johnston, who is A150… we can identify him like this. He has double the mean \(H_e\) and \(H_o\).
I went back to re-run populations from the stacks pipeline with the “Bad Apple” (sample A150 from Johnston) removed from the popmap (see above). This removal is as suggested by (Cerca et al. 2021) Now to re-do everything.
Adegenet
I am testing differences among 4 a priori defined populations. Thus, following recommendations in (thiaGuidelinesStandardizingApplication?), I will only use k-1 or 3 PCA axes.
Warning in .local(x, ...): Some scaling values are null.
Corresponding alleles are removed.
scatter(ecor_dapc_haps)
ecor_hwe <-hw.test(ecor_haps, B =1000)# playing with cross-validation#xvalDapc(ecor_haps, grp = ecor_haps$pop, result = "overall", n.pca.max = 10, training.set = 0.75)
Rows: 59 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 984
column count: 68
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 984
Character matrix gt cols: 68
skip: 0
nrows: 984
row_num: 0
Processed variant: 984
All variants processed
Filtering
I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).
Initializing...
Filtering loci. Starting loci: 984
Filtering non-biallelic loci...
0 bad loci
Filtering non_polymorphic loci...
0 bad loci
Filtering loci sequenced in few individuals...
0 bad loci
Filtering singletons.
538 bad loci
Filtering loci out of hwe...
6 bad loci
Ending loci: 440
Filtering individuals. Starting individuals: 59
Filtering out individuals sequenced in few kept loci...
No individuals removed.
Individual Stats
snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.
ecor_snps <-calc_hs(ecor_snps, facets ="sampID")ecor_snps <-calc_he(ecor_snps, facets ="sampID")ecor_snps <-calc_ho(ecor_snps, facets ="sampID")# and dang, just like that we can get a PCAp <-plot_clusters(ecor_snps, facets ="pop")
Formatting data...
p$plots$pca
Population Stats
And here are some summary stats about each population
ecor_plink <-read.pcadapt("coruscans/plink/coruscans.plink.bed",type="bed")pca20 <-pcadapt(ecor_plink, K =20)plot(pca20, option ="screeplot")
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the pcadapt package.
Please report the issue at <]8;;https://github.com/bcm-uga/pcadapt/issueshttps://github.com/bcm-uga/pcadapt/issues]8;;>.
Based on the scree plot I think we’ll use 6 PCs.
PCA
pca6 <-pcadapt(ecor_plink, K =6)plot(pca6, option ="scores", pop = popmap_cor$pop)
Warning: Use of `df$Pop` is discouraged.
ℹ Use `Pop` instead.
plot(pca6, option ="manhattan")
#qqplotplot(pca6, option ="qqplot")
#histogramhist(pca6$pvalues, xlab ="p-values", main =NULL, breaks =50, col ="orange")
Outliers
So the p-values (even uncorrected) are all fairly large here. There aren’t really any outliers.
#write to vcfformat_snps(ecor_snps, output ="vcf", facets ="pop", outfile ="coruscans/hwe.pcadapt.filtered.snps.vcf",chr ="CHROM", position ="position" )format_snps(ecor_snps, output ="genepop", facets ="pop", outfile ="coruscans/hwe.pcadapt.filtered.snps.gen",chr ="CHROM", position ="position" )#grab the locus names and write them to a whitelist to be included in the migrate data file.whitelist <- ecor_snps@snp.meta$CHROMwrite.table(sort(as.numeric(ecor_snps@snp.meta$CHROM)), "coruscans/ecor_whitelist.txt", quote=F, row.names = F,col.names =F )ecor_haps2 <- ecor_haps[,loc =locNames(ecor_haps) %in% whitelist]# write them outgenind_to_genepop(ecor_haps2,output ="coruscans/hwe.pcadapt.filtered.haps.txt")
Re-check the DAPC
We didn’t remove any outlier loci, so no need to re-calculate Fst.
Converting data from a Genepop .gen file to a genind object...
File description: Conversion of an object of class 'genind' into a GENEPOP file with the package 'graph4lg'
...done.
Rows: 55 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 2750
column count: 64
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 2750
Character matrix gt cols: 64
skip: 0
nrows: 2750
row_num: 0
Processed variant 1000
Processed variant 2000
Processed variant: 2750
All variants processed
Filtering
I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).
Initializing...
Filtering loci. Starting loci: 2750
Filtering non-biallelic loci...
0 bad loci
Filtering non_polymorphic loci...
0 bad loci
Filtering loci sequenced in few individuals...
0 bad loci
Filtering singletons.
1064 bad loci
Filtering loci out of hwe...
20 bad loci
Ending loci: 1666
Filtering individuals. Starting individuals: 55
Filtering out individuals sequenced in few kept loci...
No individuals removed.
Individual Stats
snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.
ecar_snps <-calc_hs(ecar_snps, facets ="sampID")ecar_snps <-calc_he(ecar_snps, facets ="sampID")ecar_snps <-calc_ho(ecar_snps, facets ="sampID")# and dang, just like that we can get a PCAp <-plot_clusters(ecar_snps, facets ="pop")
Formatting data...
p$plots$pca
The separation between both Hawaiian populations and Japan is pretty evident even in the PCA.
I’m going to write a whitelist with 401 randomly selected SNPs - to be in parity with coruscans. Adding more would take longer.
#write to vcfformat_snps(ecar_snps2, output ="vcf", facets ="pop", outfile ="carbunculus/hwe.pcadapt.filtered.snps.vcf",chr ="CHROM", position ="position" )format_snps(ecar_snps2, output ="genepop", facets ="pop", outfile ="carbunculus/hwe.pcadapt.filtered.snps.genepop",chr ="CHROM", position ="position" )write.csv(pca2_outliers, "carbunculus/pcadapt.outliers.csv")#grab the locus names, pick a random subset of 1000, and write them to a whitelist to be included in the migrate data file.whitelist <- ecar_snps2@snp.meta$CHROM[sample(1:nsnps(ecar_snps2),1000)]write.table(sort(as.numeric(whitelist)), "carbunculus/ecar_whitelist.txt", quote=F, row.names = F,col.names =F )ecar_haps2 <- ecar_haps[,loc =locNames(ecar_haps) %in% whitelist]# write them outgenind_to_genepop(ecar_haps2,output ="carbunculus/hwe.pcadapt.filtered.haps.txt")ecar_snps3 <- ecar_snps2[ecar_snps2@snp.meta$CHROM %in% whitelist,]format_snps(ecar_snps3, output ="genepop", facets ="pop", outfile ="carbunculus/hwe.pcadapt.filtered.snps1000.genepop",chr ="CHROM", position ="position" )
Converting data from a Genepop .gen file to a genind object...
File description: Conversion of an object of class 'genind' into a GENEPOP file with the package 'graph4lg'
...done.
Warning in .sub_and_t_1_2_to_A_C(ac, ncol(gt)): Since allelic identities are not clear, alleles at each locus will be saved as A and C. If this data is later reformatted for use elsewhere, please be aware that this may cause issues for some downstream analyses (since, for example, all SNPs will be nonsensically noted as transversions rather than transitions!).
To prevent this, use a different format that specifies allelic identities.
This is not an issue for any snpR functions save those that rely on ancestral/derived allelic identities.
Warning in validityMethod(object): Some snp metadata columns contain unacceptable special characters. Unaccepted characters: '.', '~', or any whitespace.
These can cause unexpected behaviour if the subect columns are used as facets.
# and here is Fstecar_fst3 <-get.snpR.stats(ecar_snps3, facets ="pop", stats =c("fst"))ecar_fst3$fst.matrix
$pop
$pop$fst
p1 MHI NWHI
1: Japan 0.01413796 0.0165854943
2: MHI NA 0.0003387499
$pop$p
p1 MHI NWHI
1: Japan 0.01198801 0.000999001
2: MHI NA 0.411588412
And compare that to pre-filtering. Some differences, but not as much as when I was overly stringent with selected loci.
ecar_fst$fst.matrix
$pop
$pop$fst
p1 MHI NWHI
1: Japan 0.01746465 0.021466084
2: MHI NA 0.001113613
$pop$p
p1 MHI NWHI
1: Japan 0.002997003 0.000999001
2: MHI NA 0.328671329
Fasta2Genotype
Paul Maier created this script to convert Stacks haplotypes to migrate format.
Install Python2
I had a doozy of a time trying to get this to work this time around: the script was written in python2. which is nearly deprecated. I first tried to use 2to3 to upgrade the script to python3, but there were a few stray tabs I couldn’t figure out. I eventually landed here, and used the following commands to install pyenv, which can handle multiple python installs.
brew install pyenvpyenv install 2.7.18pyenv versionsecho'export PYENV_ROOT="$HOME/.pyenv"'>> ~/.zshrc\necho 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"'>> ~/.zshrc\necho 'eval "$(pyenv init -)"'>> ~/.zshrcexec"$SHELL"pyenv shell 2.7.18 # use python 2 for this shell sessionpip install numpypip install --upgrade pip # need to update pip to be able to install scipypip install scipy
E. coruscans
Run it
I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work. Also, I added a SampleID column and header row to the popmap for use by fasta2genotype.py. I wrote out a whitelist above of all the 440 loci that passed through the filters.
python ../../fasta2genotype/fasta2genotype.py populations_r100.dropA150/populations.samples2.fa ecor_whitelist.txt \popmapECO_f2g.tsv NA Ecoruscans440.mig###################################################################### ###### Fasta2Genotype | Data Conversion | Version 1.10 ###### ###### Cite as follows: ###### ###### Maier P.A., Vandergast A.G., Ostoja S.M., Aguilar A., ###### Bohonak A.J. (2019). Pleistocene glacial cycles drove ###### lineage diversification and fusion in the Yosemite toad ###### (Anaxyrus canorus). Evolution, in press. ###### https://www.doi.org/10.1111/evo.13868 ###### ######################################################################Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1Loci to use? [1] Whitelist [2] All: 1Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2Remove monomorphic loci? [1] Yes [2] No: 1Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2***************************************************************************************************************** ... BEGINNING CONVERSION ... *****************************************************************************************************************Cataloging loci...Counting locus lengths...Cataloging populations...Counting gene copies...Counting alleles for each locus...Identifying loci with excess heterozygosity...Calculating observed heterozygosity and homozygosity...Calculating expected heterozygosity and homozygosity...Flagging loci with excess heterozygosity for removal...Removing loci...Removed 1 overly heterozygous loci.Outputting migrate-n file...# it removed locus 60082 for excess heterozygosity, so I dropped it from the whitelist.
I then re-ran the script, selecting phylip output, with locus name headers, and all other options the same. I need the locus names in the order output by the script in order to run modeltest.
E. carbunculus
Run it
I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work. Also, I added a SampleID column and header row to the popmap for use by fasta2genotype.py. I wrote out a whitelist above of 1000/1666 randomly selected loci that passed through the filters. Used the same options as E. coruscans.
python ../../fasta2genotype/fasta2genotype.py populations_r100/populations.samples2.fa ecar_whitelist_1000.txt \popmapECA_f2g.tsv NA Ecarbunculus999.migOutput type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1Loci to use? [1] Whitelist [2] All: 1Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2Remove monomorphic loci? [1] Yes [2] No: 1Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2***************************************************************************************************************** ... BEGINNING CONVERSION ... *****************************************************************************************************************Cataloging loci...Counting locus lengths...Cataloging populations...Counting gene copies...Counting alleles for each locus...Identifying loci with excess heterozygosity...Calculating observed heterozygosity and homozygosity...Calculating expected heterozygosity and homozygosity...Flagging loci with excess heterozygosity for removal...Removing loci...Removed 1 overly heterozygous loci.Outputting migrate-n file...*** DONE! ***It removed locus 31718 for being overly heterozygous, so I zapped it from the whitelist.
I then re-ran the script, selecting phylip output, with full sequences, haploid, all loci, not separated by ! locus name headers, and all other options the same. I need the locus names in the order output by the script in order to run modeltest. This is only needed to get the locus names in the next step.
I need to figure out an overall mutation model to use with RAD loci. I received this advice one from Peter about how to construct these. I’ll use modelTest in the phangorn package to see where that gets me.
I renamed the FASTA headers in populations.samples.fa with BBEdit to create population.samples3.fa
Find:>CLocus_\d+_Sample_\d+_Locus_(\d+)_Allele_([01])\[Etelis_(.+)\]Replace:>\3_L\1_A\2#first token is sample name, second is locus number(name) third is allele#Also remove bad indiv A150 in coruscansFind:>A150_.+\n.+\nReplace:
Lets load in the data and calculate some statistics for each locus. Previously Migrate-n only implemented the F84 (=HKY) model, with two rates (Transitions and Transversions) and gamma distributed rate variability.
Now it will take these possible models: 1 Jukes-Cantor model 2 Kimura 2-parameter model 3 Felsenstein 1981 (F81) model 4 Felsenstein 1984 (F84) model 5 Hasegawa-Kishino-Yano model 6 Tamura-Nei model
fastadata <-read.FASTA("coruscans/populations_r100.dropA150//populations.samples3.fa")# read in the locus lengths, as optional additional way to make sure the stats# and the migrate infile are ordered the same#migrate_lengths <- read_lines("coruscans/Ecoruscans.mig", skip = 1, n_max=1) %>% # str_split("\\t", simplify = T) %>% t(.) %>% tibble(length=.) %>% na.omit()migrate_loci <-read_lines("coruscans/Ecoruscans439.phy", skip =1, n_max=1) %>%str_split("\\t", simplify = T) %>%t(.) %>%tibble(locus_name=.) %>%filter(locus_name!="")stats <-tibble(locus =character(), length =numeric(),segSites =numeric(),nHaps =numeric(),nucDiv =numeric(),ttRatio =numeric(),model =character(),gammaShape =numeric(),rate1 =numeric(), rate2 =numeric(), rate3 =numeric(),rate4 =numeric(),Q1=numeric(),Q2 =numeric(),Q3 =numeric(), )for(l in migrate_loci$locus_name){ L <-paste0("L",l,"_")print(paste("Now Doing Locus", l, match(l, migrate_loci$locus_name), "out of", length(migrate_loci$locus_name))) locus_dnabin <- fastadata[str_which(names(fastadata),pattern = L)]# convert to package formats locus_dnabin <-as.matrix(locus_dnabin) locus_gtypes <-sequence2gtypes(locus_dnabin) locus_phy <-phyDat(locus_dnabin)#create a haplotype network .. to be continued haps <-haplotype(locus_dnabin) nhaps <-length(dimnames(haps)[[1]])#tcs <- haploNet(haps)#find parameters of HKY (F84) model modeltest <-modelTest(locus_phy, model =c("JC","K80", "F81", "HKY","TrN"), G = T, I = F, k =4, mc.cores =5)# pick out the best model. If multiple models are tied for best, pick the simplest one bestmodel <- modeltest$Model[which(modeltest$BIC ==min(modeltest$BIC))][1]#open the object environment env <-attr(modeltest,"env") bestparams <-get(bestmodel, env) bestparams <-eval(bestparams, env=env)# use this code for v3, which only has F84 (HKY)#HKY <- modelTest(locus_phy, model = c("HKY"), # G = T, I = F, k = 4)#env <- attr(HKY, "env")#HKYG <- get("HKY+G", env)#model <- eval(HKYG, env=env)# calculate TiTv Ratio ttratio <-TiTvRatio(locus_gtypes) stats <-bind_rows(stats, tibble(locus=L, length =length(locus_dnabin[1,]),segSites =length(seg.sites(locus_dnabin)),nHaps =length(dimnames(haps)[[1]]),nucDiv =nuc.div(locus_dnabin),ttRatio = ttratio[3],model = bestmodel,gammaShape = bestparams$shape,rate1 = bestparams$g[1],rate2 = bestparams$g[2],rate3 = bestparams$g[3],rate4 = bestparams$g[4],Q1 =sort(unique(bestparams$Q))[1],Q2 =sort(unique(bestparams$Q))[2],Q3 =sort(unique(bestparams$Q))[3] ))}#stats <- stats[which(stats$length %in% migrate_lengths$length),]#setdiff(stats$length, as.numeric(migrate_lengths$length))# write_csv(stats, "coruscans/migrate_locus_statistics439.csv")
Write a model block
Write a space delimited textfile that can be added to the migrate data file in the format that Peter suggested.
#stats <- read_csv("./carbunculus/migrate_locus_statistics.csv")#kable(stats)# write a space delimited textfile that can be added to the migrate data file# following Peter's suggestion here:#https://groups.google.com/g/migrate-support/c/XjV_4jZW4RI/m/HbRWoGY6AwAJmigrate_mutation_models <-tibble(prefix ="#$",locus=1:length(stats$locus),type ="s",model = stats$model,q1 = stats$Q2,q2 = stats$Q3)#write_delim(migrate_mutation_models,"./coruscans/migrate_modelblock439.txt", na="", delim = " ")
Read them back in so we don’t have to recalculate them every time I knit.
So we have a 0 invariant loci, and the mean overall transition:transversion ratio is NaN. Mean gamma shape parameter is NA, which argues for only one rate.
Metapopulation Models
Columns = departure pop, Rows = arrival pop
* = equilibrium gene flow/drift OR theta (\(N_e\mu\)) parameter (on diagonal)
d = divergence without gene flow
D = divergence with gene flow
0 = parameter not included in model
E. coruscans
divergence_geneflow_1
JA * 0 0 0
JO D * 0 0
MHI 0 D * 0
NWHI 0 0 D *
divergence_geneflow_2
* 0 0 0
D * 0 0
0 0 * D
0 D 0 *
divergence_geneflow_3
* 0 0 0
0 * 0 D
0 0 * D
D 0 0 *
divergence_only_1
* 0 0 0
d * 0 0
0 d * 0
0 0 d *
divergence_only_2
* 0 0 0
d * 0 0
0 0 * d
0 d 0 *
divergence_only_3
* 0 0 0
0 * 0 d
0 0 * d
d 0 0 *
geneflow_only_full
* * * *
* * * *
* * * *
* * * *
geneflow_only_1
* 0 0 0
* * 0 0
0 * * 0
0 0 * *
geneflow_only_2
* 0 0 0
* * 0 0
0 0 * *
0 * 0 *
geneflow_only_3
* 0 0 0
0 * 0 *
0 0 * *
* 0 0 *
n-island
m m m m
m m m m
m m m m
m m m m
panmixia
*
population-relabel={1 1 1 1}
E. carbunculus
divergence_geneflow_1
JA * 0 0
MHI D * 0
NWHI 0 D *
divergence_geneflow_2
* 0 0
0 * D
D 0 *
divergence_only_1
* 0 0
d * 0
0 d *
divergence_only_2
* 0 0
0 * d
d 0 *
geneflow_only_full
* 0 0
* * 0
0 * *
geneflow_only_1
* 0 0
0 * *
* 0 *
geneflow_only_2
* * *
* * *
* * *
n-island
m m m
m m m
m m m
panmixia
*
population-relabel={1 1 1 1}
Parmfile
Here’s the parmfile I am using, based on my parmfile for Palythoa tuberculosa. This one depicts a model of divergence without gene flow. Migration prior is increased to have a maximum of 1e8 (100,000,000), and mean of 1e4 (10,000). This reflects a prior expectation that \(m/\mu\) will not be larger than this value, if m is not larger than 0.1 (10% of population migrates), assuming a nuclear mutation rate of 1e-9. Also on average one out of every 100K individuals will send a migrant (m = 1e-5).
Copy it up, and then make a total of 10 replicate copies of the folder.
scp-r coruscans/migrate/rep_template eric@threadripper.tobolab.org:./Etelis/coruscans/rep1scp-r coruscans/Ecoruscans439.mig eric@threadripper.tobolab.org:./Etelis/coruscans/Ecoruscans2.migscp-r carbunculus/migrate/rep_template eric@threadripper.tobolab.org:./Etelis/carbunculus/rep1scp-r carbunculus/Ecarbunculus999.mig eric@threadripper.tobolab.org:./Etelis/carbunculus/Ecarbunculus.mig#once on server, copy it 9 timesfor a in$(seq 2 10);docp-r rep1 rep$a;done
Bash Script
So we will do 10 replicates of 3 replicates. This will start at r1, and run all models for that before moving on. Pretty sure this will finish one whole model before moving on to the next one (since all threads are being used for different loci). This uses two threads per core as well with the –use-hwthread-cpus flag.
### Bash Script#!for r in*/docd$recho$rdatedate> date.txtfor m in*/docd$mdate> date.txtecho$mdatempirun--use-hwthread-cpus-np 120 ~/migrate-5.0.4/src/migrate-n-mpi parmfilesleep 1cd ..donecd ..done
The way that model marginal likelihood is calculated depends on the scaling constant k, which is an indicator of agreement on parameter values between loci. To get this, migrate uses the same histograms that it uses in its output. Thus, setting number of bins becomes important, as does using a version of migrate that doesn’t have the visualization issues that Peter reported. I revised my bash submission script to:
make a new copy of parmfile called parmfile_rerun
change recover=NO to recover=YES - this will make migrate summarize the current bayesallfile, rather than create a brand new run
Change the number of bins from 3,000 to 2^14 = 16,384
Write the outfile to outfile_rerun.txt
Use version 5.0.4 instead of 4.4.4 (which has viz problems)
Re-running with more bins
#| eval: false
<!-- ### Bash Script -->
<!-- #! -->
<!-- for r in */ -->
<!-- do -->
<!-- echo starting $r at $(date ) >> progress_rerun.txt -->
<!-- cd $r -->
<!-- echo $r -->
<!-- date -->
<!-- echo starting $r at $(date ) >> progress_rerun.txt -->
<!-- for m in */ -->
<!-- do -->
<!-- echo starting $m at $(date )>> progress_rerun.txt -->
<!-- cd $m -->
<!-- date > date_rerun.txt -->
<!-- echo $m -->
<!-- date -->
<!-- # copy the parmfile to parmfile_rerun, and replace the following 3 lines -->
<!-- cp parmfile parmfile_rerun -->
<!-- sed -i 's/recover=NO/recover=YES/g' parmfile_rerun -->
<!-- sed -i 's/outfile=outfile.txt/outfile=outfile_rerun.txt/g' parmfile_rerun -->
<!-- sed -i 's/bayes-posteriorbins= 3000 3000/bayes-posteriorbins= 16384 16384/g' parmfile_rerun -->
<!-- mpirun --use-hwthread-cpus -np 60 ~/migrate-5.0.4/src/migrate-n-mpi parmfile_rerun -->
<!-- sleep 1 -->
<!-- cd .. -->
<!-- done -->
<!-- cd .. -->
<!-- done -->
And back down
And ten replicates finished in about 10 days! Found this guide to download everything except the massive bayesallfiles and pdfs
So, I was using a Ti/Tv ratio in my K80/HKY mutation models that was way out of bounds. I’ve exported the locus-by-locus marginal likelihoods out of the panmixia, full and divergence-with-gene-flow3 models to see what happens when I remove the bad loci.
mlikes <-read_csv("coruscans/model_marglike_comp.csv") %>%filter(!Locus_Index %in% badloci$Locus) %>%filter(!Locus_Index==440) %>%#locus 440 is the migrate sumsummarize(across(.fns = sum))mlikes
With this new back-of-the-envelope analysis, the full model comes 3rd, after divergence-with-gene-flow3. The scaling factor might have a big effect here.
Results
E. coruscans
Model Marginal Likelihoods
runDir_cor <-"coruscans/migrate/output5"likelist_cor <-list()for(r in1:5){ rep =paste0("rep",r)print(rep) likelist_cor[[rep]] <-harvest.model.likelihoods(workingDir=file.path(runDir_cor,rep),outfileName ="outfile_rerun3.txt")}# Model selection for each replicate...likelist_cor %>%map(bfcalcs)
The final model marginal likelihood estimates based on the mean of 10 replicates:
Exact Permutation Test (complete enumeration)
data: GROUP 1 and GROUP 2
p-value = 0.003968
alternative hypothesis: true mean GROUP 1 - mean GROUP 2 is greater than 0
sample estimates:
mean GROUP 1 - mean GROUP 2
5427.85
Model Selection Figures
And a figure summarizing all this
models_cor <-c("Divergence With Gene Flow 1","Divergence With Gene Flow 2","Divergence With Gene Flow 3","Divergence Only 1", "Divergence Only 2","Divergence Only 3","Gene Flow Only Full", "Gene Flow Only 1", "Gene Flow Only 2", "Gene Flow Only 3","n-island", "panmixia")likesPlot_cor <- likelist_cor %>%bind_rows() %>%group_by(model) %>%ggplot(mapping =aes(x=model, y = bezier.corrected)) +geom_violin(draw_quantiles =0.5) +theme(axis.text.x =element_text(angle =90, vjust =0.5, hjust=1)) +scale_x_discrete(labels = models_cor) +labs(x ="Metapopulation Model", y ="Bezier Corrected Marginal Likelihood")likesPlot_cor
Final Parameter Estimates
These are the final estimates, summed over loci.
Here are the priors I’m using for the equilibrium models, as a reminder.
repsDir <-"~/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Etelis/migrate_output/coruscans"# for some reason the bayesallfile has 39 columns, but only 31 are labelled.# read in a population key to translate numbers into names in figurespopkey <-read_csv("coruscans/popkey.csv")names(popkey$Pop) <- popkey$IndexwinningModel <-"full"posteriors <-list()#read in the posterior for one replicate this may take 5 minutesfor(r in1:3){print(paste("Loading Rep",r)) posteriors[[r]] <-read.table(file.path(repsDir,paste0("rep",r),winningModel,"bayesallfile"), header=T) }#combine into one, and then keep only the first 2 reps (based on diagnostics below)posterior <- posteriors %>%bind_rows(.id ="rep") %>%filter(rep <=1) %>%migrants.per.gen()# posterior_long <- posterior %>% # pivot_longer(starts_with(c("Theta_", "M_", "Nm_","D_", "d_")),# names_to = "parameter", # values_to = "value")#make a new labeling keyposterior_parameters <-names(posterior)[str_which(names(posterior),paste(c("Theta_", "M_", "Nm_","D_", "d_"), collapse ="|"))]parameter_key <- posterior_parameters %>%str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%str_replace("M_", "m/mu " ) %>%str_replace("Theta_", "theta ") %>%str_replace_all(popkey$Pop) %>%str_replace("Nm_", "4Nem ") names(parameter_key) <- posterior_parameters
Check Convergence
#thin by taking every other stepposteriors2 <-lapply(posteriors,filter,Steps%%500==0)# select parameters onlyposteriors2 <-lapply(posteriors2, select, starts_with(c("Theta_", "M_", "Nm_","D_","d_")))posteriors.mcmc <-mcmc.list(lapply(posteriors2,mcmc,thin=500))posterior.ggs <-ggs(posteriors.mcmc, burnin = F)effectiveSize(posteriors2[[1]])#ggmcmc(posterior.ggs, param_page = 4)
Took 51 minutes to generate the ggmcmc output, but everything looks really really good, other than the Geweke diagnostics… need to look that up.
New names:
Rows: 4 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
thetastats
Theta Violins
m/mu estimates
parameter_type <-"M"# use this to figure out the granularity of the prior needed for summarize_prior#seq(0,100000, length.out = 2^17)parameters <-names(posterior) %>%str_subset(parameter_type)parameter_key2 <- parametersnames(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]#summarize the posterior for thetaMs <-summarize_posterior(posterior, parameter_type = parameter_type, prior_params =c(0,10000,100000), n =2^17)#convert to long formatMs_long <- Ms %>%select(all_of(c("x","prior",parameters))) %>%pivot_longer(starts_with(c(paste0(parameter_type,"_"))), names_to ="Parameter",values_to ="Density")migration.colors <-c(theta.colors[1],theta.colors.dark[1],theta.colors.light[1], theta.colors[2],theta.colors.dark[2],theta.colors.light[2], theta.colors[3],theta.colors.dark[3],theta.colors.light[3], theta.colors[4],theta.colors.dark[4],theta.colors.light[4]) #plotMs_density <-ggplot(Ms_long, aes(x = x, y = Density, color = Parameter)) +geom_line(size =1) +scale_x_log10(limits =c(1,3000)) +scale_color_discrete(type = migration.colors, labels = parameter_key) +geom_line(aes(x = x, y=prior), color ="grey",linetype="dotted")#plotM_ridges <-ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, fill = Parameter)) +geom_density_ridges(stat ="identity") +#scale_x_log10(limits = c(1,2000)) +scale_fill_discrete(type = migration.colors, labels = parameter_key) +scale_y_discrete(limits = rev, labels =NULL, breaks =NULL) +labs(x =TeX("$\\frac{m}{\\mu}$")) +xlim(0,1200)M_violins <-ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density*20, fill = Parameter)) +geom_violin(stat ="identity") +scale_y_log10(limits =c(1,1200)) +scale_fill_discrete(type = migration.colors, labels = parameter_key) # ggsave("./coruscans/migrate/Mmu_density_full.pdf", plot = Ms_density)# ggsave("./coruscans/migrate/Mmu_ridges_full.pdf", plot = M_ridges)# ggsave("./coruscans/migrate/Mmu_violins_full.pdf", plot = M_violins)# get stats for each summed-over-loci posteriorMs_stats <- parameter_key2 %>%map_dfr(.f =~posterior_stats(df = Ms, parameter = .x),.id ="parameter")#write.csv(Ms_stats,"./coruscans/migrate/Mmu_stats_full.csv")
New names:
Rows: 12 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
ms_stats
M/mu Density
M/mu Ridges
M/mu Violins
4Nem estimates
parameter_type <-"Nm"parameters <-names(posterior) %>%str_subset(parameter_type)parameter_key2 <- parametersnames(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]#summarize the posterior for thetanms <-summarize_posterior(posterior, parameter_type = parameter_type, prior_params =c(0,10,100), n =2^14)#convert to long formatnms_long <- nms %>%select(all_of(c("x","prior",parameters))) %>%pivot_longer(starts_with(c(paste0(parameter_type,"_"))), names_to ="Parameter",values_to ="Density")#plotNms_density <-ggplot(nms_long, aes(x = x, y = Density, color = Parameter)) +geom_line(size =1) +#scale_x_log10(limits = c(5e-4,1e-2)) + scale_color_discrete(type = migration.colors, labels = parameter_key) +geom_line(aes(x = x, y=prior), color ="grey",linetype="dotted") +xlim(0,1.5)#plotnm_ridges <-ggplot(nms_long, aes(x = x, y = Parameter, height = Density, fill = Parameter)) +geom_density_ridges(stat ="identity") +scale_fill_discrete(type = migration.colors, labels = parameter_key) +scale_y_discrete(limits = rev, labels =NULL, breaks =NULL) +labs(x =TeX("$4N_em$")) +xlim(0,1.5)nm_violins <-ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20, fill = Parameter)) +geom_violin(stat ="identity") +scale_fill_discrete(type = migration.colors, labels = parameter_key) +ylim(0,1.5)# get stats for each summed-over-loci posteriornms_stats <- parameter_key2 %>%map_dfr(.f =~posterior_stats(df = nms, parameter = .x),.id ="parameter")ggsave("./coruscans/migrate/Nm_density_full.pdf", plot = Nms_density)ggsave("./coruscans/migrate/Nm_ridges_full.pdf", plot = nm_ridges)ggsave("./coruscans/migrate/Nm_violins_full.pdf", plot = nm_violins)write.csv(nms_stats,"./coruscans/migrate/Nm_stats_full.csv")
New names:
Rows: 12 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
nms_stats
4Nm Density
4Nm Ridges
4Nm Violins
E. carbunculus
Model Marginal Likelihoods
runDir_car <-"carbunculus/migrate/output5"likelist_car <-list()for(r in1:5){ rep =paste0("rep",r)print(rep) likelist_car[[rep]] <-harvest.model.likelihoods(workingDir=file.path(runDir_car,rep),outfileName ="outfile_rerun3.txt")}# Model selection for each replicate...likelist_car %>%map(bfcalcs)
The final model marginal likelihood estimates based on the mean of 5 replicates:
Exact Permutation Test (complete enumeration)
data: GROUP 1 and GROUP 2
p-value = 0.003968
alternative hypothesis: true mean GROUP 1 - mean GROUP 2 is greater than 0
sample estimates:
mean GROUP 1 - mean GROUP 2
5492.338
Model-Selection Figures
And a figure summarizing all this
models <-c("Divergence With Gene Flow 1","Divergence With Gene Flow 2","Divergence Only 1", "Divergence Only 2","Gene Flow Only 1", "Gene Flow Only 2", "Gene Flow Only Full","n-island", "panmixia")likesPlot_car <- likelist_car %>%bind_rows() %>%group_by(model) %>%ggplot(mapping =aes(x=model, y = bezier.corrected)) +geom_violin(draw_quantiles =0.5) +theme(axis.text.x =element_text(angle =90, vjust =0.5, hjust=1)) +scale_x_discrete(labels = models) +labs(x ="Metapopulation Model", y ="Bezier Corrected Marginal Likelihood")likesPlot_car
Final Parameter Estimates
These are the final estimates, summed over loci.
Here are the priors I’m using for the equilibrium models, as a reminder.
repsDir <-"~/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Etelis/migrate_output/carbunculus/output5"# for some reason the bayesallfile has 39 columns, but only 31 are labelled.# read in a population key to translate numbers into names in figurespopkey <-read_csv("carbunculus/popkey.csv")names(popkey$Pop) <- popkey$IndexwinningModel <-"geneflow_only_full"posteriors <-list()#read in the posterior for one replicate this may take 5 minutesfor(r in2:4){print(paste("Loading Rep",r)) posteriors[[r]] <-read_tsv(file.path(repsDir,paste0("rep",r),winningModel,"bayesallfile"), comment ="#") }# This creates an empty element number one, set that to nullposteriors[[1]] <-NULL#combine into one, and then keep only the first 1 rep (based on diagnostics below)#also need to do this to keep "rep" as a column for summarizing over loci function.posterior <- posteriors %>%bind_rows(.id ="rep") %>%filter(rep <=1) %>%migrants.per.gen()# posterior_long <- posterior %>% # pivot_longer(starts_with(c("Theta_", "M_", "Nm_","D_", "d_")),# names_to = "parameter", # values_to = "value")#make a new labeling keyposterior_parameters <-names(posterior)[str_which(names(posterior),paste(c("Theta_", "M_", "Nm_","D_", "d_"), collapse ="|"))]parameter_key <- posterior_parameters %>%str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%str_replace("M_", "m/mu " ) %>%str_replace("Theta_", "theta ") %>%str_replace_all(popkey$Pop) %>%str_replace("Nm_", "4Nem ") names(parameter_key) <- posterior_parameters
Check Convergence
#thin by taking every 5 stepsposteriors2 <-map(posteriors,filter,Steps%%500==0)#posteriors2 <- map(posteriors2, select, -mL_harmonic) #remove mL_harmonic, which took in extra unlabeled columns# select parameters onlyposteriors2 <-lapply(posteriors2, select, starts_with(c("Theta_", "M_", "Nm_","D_","d_")))posteriors.mcmc <-mcmc.list(lapply(posteriors2,mcmc,thin=500))posterior.ggs <-ggs(posteriors.mcmc, burnin = F)effectiveSize(posteriors.mcmc[[1]])#ggmcmc(posterior.ggs, param_page = 4)
Took 51 minutes to generate the ggmcmc output, but everything looks really really good, other than the Geweke diagnostics… need to look that up.
Effective Sizes
Effective sizes are good for one replicate - although this isn’t broken up by locus.
New names:
Rows: 3 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
thetastats
Theta Violins
m/mu estimates
parameter_type <-"M"# use this to figure out the granularity of the prior needed for summarize_prior#seq(0,100000000, length.out = 2^17)parameters <-names(posterior) %>%str_subset(parameter_type)parameter_key2 <- parametersnames(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]#summarize the posterior for thetaMs <-summarize_posterior(posterior, parameter_type = parameter_type, prior_params =c(0,10000,100000), n =2^17)#convert to long formatMs_long <- Ms %>%select(all_of(c("x","prior",parameters))) %>%pivot_longer(starts_with(c(paste0(parameter_type,"_"))), names_to ="Parameter",values_to ="Density")migration.colors <-c(theta.colors[1],theta.colors.dark[1],theta.colors.light[1], theta.colors[2],theta.colors.dark[2],theta.colors.light[2], theta.colors[3],theta.colors.dark[3],theta.colors.light[3])#plotMs_density <-ggplot(Ms_long, aes(x = x, y = Density, color = Parameter)) +geom_line(linewidth =1) +scale_x_log10(limits =c(100,1000)) +scale_color_discrete(type = theta.colors, labels = parameter_key) +geom_line(aes(x = x, y=prior), color ="grey",linetype="dotted")#plotM_ridges <-ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, fill = Parameter)) +geom_density_ridges(stat ="identity") +#scale_x_log10(limits = c(1,2000)) +scale_fill_discrete(type = theta.colors, labels = parameter_key) +scale_y_discrete(limits = rev, labels =NULL, breaks =NULL) +labs(x =TeX("$\\frac{m}{\\mu}$")) +xlim(0,1000)M_violins <-ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density*20, fill = Parameter)) +geom_violin(stat ="identity") +scale_y_log10(limits =c(100,1000)) +scale_fill_discrete(type = theta.colors, labels = parameter_key) # ggsave("./carbunculus/migrate/Mmu_density_full.jpg", plot = Ms_density)# ggsave("./carbunculus/migrate/Mmu_ridges_full.jpg", plot = M_ridges)# ggsave("./carbunculus/migrate/Mmu_violins_full.jpg", plot = M_violins)# get stats for each summed-over-loci posteriorMs_stats <- parameter_key2 %>%map_dfr(.f =~posterior_stats(df = Ms, parameter = .x),.id ="parameter")#write.csv(Ms_stats,"./carbunculus/migrate/Mmu_stats_full.csv")
New names:
Rows: 6 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
ms_stats
M/mu Density
M/mu Ridges
M/mu Violins
4Nem estimates
parameter_type <-"Nm"parameters <-names(posterior) %>%str_subset(parameter_type)parameter_key2 <- parametersnames(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]#summarize the posterior for thetanms <-summarize_posterior(posterior, parameter_type = parameter_type, prior_params =c(0,10,100), n =2^14)#convert to long formatnms_long <- nms %>%select(all_of(c("x","prior",parameters))) %>%pivot_longer(starts_with(c(paste0(parameter_type,"_"))), names_to ="Parameter",values_to ="Density")#plotNms_density <-ggplot(nms_long, aes(x = x, y = Density, color = Parameter)) +geom_line(size =1) +#scale_x_log10(limits = c(5e-4,1e-2)) + scale_color_discrete(type = theta.colors, labels = parameter_key) +geom_line(aes(x = x, y=prior), color ="grey",linetype="dotted") +xlim(0,2.5)#plotnm_ridges <-ggplot(nms_long, aes(x = x, y = Parameter, height = Density, fill = Parameter)) +geom_density_ridges(stat ="identity") +scale_fill_discrete(type = theta.colors, labels = parameter_key) +scale_y_discrete(limits = rev, labels =NULL, breaks =NULL) +labs(x =TeX("$4N_em$")) +xlim(0,2.5)nm_violins <-ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20, fill = Parameter)) +geom_violin(stat ="identity") +scale_fill_discrete(type = theta.colors, labels = parameter_key) +ylim(0,2.5)# get stats for each summed-over-loci posteriornms_stats <- parameter_key2 %>%map_dfr(.f =~posterior_stats(df = nms, parameter = .x),.id ="parameter")ggsave("./carbunculus/migrate/Nm_density_full.jpg", plot = Nms_density)ggsave("./carbunculus/migrate/Nm_ridges_full.jpg", plot = nm_ridges)ggsave("./carbunculus/migrate/Nm_violins_full.jpg", plot = nm_violins)write.csv(nms_stats,"./carbunculus/migrate/Nm_stats_full.csv")
New names:
Rows: 6 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
nms_stats
4Nm Density
4Nm Ridges
4Nm Violins
Sum over loci - Bayesfile
This is code for summing over loci in a Bayesfile. Sticking it down here for now.
Read Bayesfile
These functions are for reconstructing distributions from the bayesfile which consists of distributions for each locus calculated as histograms
Cerca, José, Marius F. Maurstad, Nicolas C. Rochette, Angel G. Rivera-Colón, Niraj Rayamajhi, Julian M. Catchen, and Torsten H. Struck. 2021. “Removing the Bad Apples: A Simple Bioinformatic Method to Improve Loci-Recovery in de Novo RADseq Data for Non-Model Organisms.”Methods in Ecology and Evolution 12 (5): 805–17. https://doi.org/10.1111/2041-210x.13562.
Thia, Joshua A. 2022. “Guidelines for Standardizing the Application of Discriminant Analysis of Principal Components to Genotype Data.”Molecular Ecology Resources n/a (n/a). https://doi.org/10.1111/1755-0998.13706.