Gene Flow Models for Etelis spp. Deepwater Snappers

Author

Eric Crandall

Setup

R Libraries

Here I’m just loading a bunch of R packages that we’ll need.

Code
library(tidyverse)
library(ape)
library(phangorn)
library(pegas)
#library(strataG)
library(coda)
library(ggmcmc)
library(perm)
library(snpR)
library(adegenet)
library(pcadapt)
library(knitr)
library(graph4lg)
library(spatstat.core)
library(Rmpfr)
library(emoji)
library(RColorBrewer)
library(colorspace)
library(latex2exp)
library(ggridges)

Function library

And here are some homebrewed functions that will be helpful.

Code
# Function library
harvest.model.likelihoods <- function(workingDir = workingDir,
                                      outfileName = "outfile.txt",
                                      multilocus = T){
    # this function harvests model marginal likelihoods for models calculated by
    # the program migrate-n (Beerli & Felsenstein 2001).
    # It takes as input a directory full of directories, 
    # each of which contains output from a migrate model, and is named
    # after that model. 
  
    #initialize a data frame to take the values
    modelMarglikes <- data.frame(model=character(),
                             thermodynamic=numeric(),
                             bezier.corrected=numeric(), 
                             harmonic=numeric()) 
    # loop through directories in the working directory, each of which is name
    # after a different model
  for(i in list.dirs(workingDir, full.names = F)[-1]){ #i<-"stepping.stone"
      modelDir<-file.path(workingDir,i)
      print(modelDir)
    #scan in the outfile, separating at each newline
      outfile<-scan(file=file.path(modelDir,outfileName),what="character",sep="\n") 
    #find the line with the likelihoods on it and split on runs of spaces
      marglikeline <- grep("Scaling factor",outfile,value=F)-1
      marglikeline <- strsplit(outfile[marglikeline],
                               "\\s+", perl = T)[[1]][3:5]
    #  if(length(marglikeline)==0){next}
      marglikes <- c(i,marglikeline)
     
      modelMarglikes <- rbind(modelMarglikes,marglikes, deparse.level = 2)
  }
  names(modelMarglikes) <- c("model","thermodynamic","bezier.corrected","harmonic")
  modelMarglikes[2:4] <- sapply(modelMarglikes[2:4], as.numeric)
  return(modelMarglikes)
}

bfcalcs<-function(df,ml="bezier.corrected"){
  # This calculates log bayes factors on data frames output by
  # harvest.model.likelihoods(), following Johnson and Omland (2004)
  # You may choose the likelihood flavor with
  # ml = "bezier.corrected", "thermodynamic" or "harmonic"
  #df$thermodynamic <- as.numeric(df$thermodynamic)
  #df$bezier.corrected <- as.numeric(df$bezier.corrected)
  #df$harmonic <- as.numeric(df$harmonic)
  mlcol <- df[[ml]] 
    bmvalue <- mlcol[which.max(mlcol)]
    lbf <- 2*(mlcol-bmvalue)
    choice <- rank(-mlcol)
    modelprob <- exp(lbf/2)/sum(exp(lbf/2))
    dfall <- cbind(df,lbf,choice,modelprob)
    return(dfall)
}   

migrants.per.gen<-function(x){
  #a function for creating Nm vectors out of m and Theta vectors.
  #x<-x[[1]]
  m<-names(x)[which(grepl("M_",names(x)))] #names of m columns
  #theta<-names(x)[which(grepl("Theta_",names(x)))] #names of theta columns
  for(n in m){
    t<-paste("Theta",strsplit(n,split="_")[[1]][3],sep="_")
    x[,paste("Nm",strsplit(n,split="_")[[1]][2],strsplit(n,split="_")[[1]][3],sep="_")]<- x[,which(names(x)==n)]*x[,which(names(x)==t)] 
    #this hairy little statement makes a new column named "Nm_X_Y" and then fills it by multiplying the M_X_Y column by the Theta_Y column  
  }
  return(x)
}

######Functions for summing over loci in a bayesallfile###############################

remove_prior <- function(densityd,prior,threshold = 1e-10){
  # this function changes values less than a threshold 
  # to zero removes a prior from the 
  # y values of a density distributions (density).
  # first it zeros values less than a threshold, then 
  # removes the prior, then renormalizes so the density
  # sums to 1
  prior[which(prior < 1e-30)] <- 1e-30
  densityd[which(densityd < threshold)] <- 0
  new <- (densityd/prior)/sum(densityd/prior)

  return(new)
}

sum_over_loci <- function(df,parameter){
      #this function takes a data frame of probability densities for many loci
      # that have had the prior removed,
      # together with with a logged prior named "logPrior",
      #  as well as the name of a parameter (e.g. "Theta_1")
      # and sums the densities over loci.
       # Rmpfr package allows quadruple precision for calcs on very small numbers.
    require(Rmpfr)
  
    #add a teeny-tiny amount to all values to avoid zeros
    df2 <- df %>% mutate(across(starts_with(parameter), 
                        .fns= function(x) x + 1e-11))  %>% 
      #log all values
            mutate(across(starts_with(c(parameter)),
                  .fns=log)) %>% 
      # convert the df to rowwise so that rows can be summed
      # and then sum across the row, including the prior
          rowwise() %>% 
          mutate(sum_param_prior = 
          sum(c_across(starts_with(c(parameter,"logPrior"))))) %>% 
    #convert back to a regular df
          ungroup()
    
    #need to convert to quadruple precision because 
    #these will exponentiate to very small numbers.
    sum_param_prior_exp <- exp(mpfr(df2$sum_param_prior, precBits = 128))
    # standardize by dividing by the sum of the column
    sum_param_prior_standardized <-
             sum_param_prior_exp/sum(sum_param_prior_exp)
    

    #drop the intermediate columns (no longer needed), change the standardized
    # output back to double precision so that it can be incorporated into the df
    # rename the summed column after the parameter
      df3 <- df2 %>% select(-c(sum_param_prior)) %>%
          mutate(summed_over_loci =
          as.numeric(sum_param_prior_standardized)) %>% 
          rename_with(.fn = ~ paste(parameter), 
                      .cols = summed_over_loci)
    return(df3)
}



summarize_posterior <- function(posterior, 
                  parameter_type = c("Theta","M","Nm","D"),
                  prior_params = c(min,mean,max),
                  n=16384){
  # this function takes a Migrate-n posterior "bayesallfile" as a dataframe
  # as well as one of the parameter types, and the prior on that parameter
  # as a vector c(min,mean,max). Currently only exponential priors supported
  # it will create densities for each parameter of the given type,
  # remove the prior from each, sum across loci, and re-add the prior (once)
  parameters <- names(posterior) %>%
                        str_subset(parameter_type)
  # create a tibble with the x values for all density plots 
  #  for a particular parameter type
  getx <- posterior %>% filter(Locus == 1) %>% 
        select(parameters[1])

  # create a tibble with x values for a density plot
  #  of the chosen number of points
  dens <- tibble(.rows = n, x = density(getx[[1]],  n=n, 
                           from = prior_params[1], 
                           to = prior_params[3],
                           bw = "nrd0")$x)
  print("calculating densities")
  # calculate densities for each parameter of a given type at each locus
  dens <- posterior %>% 
          select(starts_with(c("Steps","Locus","rep",
                  paste0(parameter_type,"_")))) %>% 
          pivot_wider(names_from = "Locus", values_from = 
                  starts_with(paste0(parameter_type,"_")),
                          names_sep = "-") %>% 
          select(starts_with(paste0(parameter_type,"_"))) %>% 
          map_dfc(function(x) density(x, n = n, from = 
                                    prior_params[1], 
                                    to = prior_params[3], 
                                    bw = "nrd0")$y) %>%
          bind_cols(dens)
  
  # create, standardize, log and remove prior
  dens$prior <- dexp(dens$x, rate = 1/prior_params[2], 
                     log = F)
  
  dens$prior <- dens$prior/sum(dens$prior)
  
  dens$logPrior <- log(dens$prior)
  
  print("removing prior")
  dens2 <- dens %>% 
        #remove the prior, standardize
        mutate(across(starts_with(parameter_type), 
                  ~ remove_prior(densityd = .x,
                                  prior = dens$prior,
                                  threshold = 1e-10) ))

  dens3 <- dens2
    
  for(p in parameters){
    print(p)  
    dens3 <- sum_over_loci(df = dens3, parameter = p)
  }
  # trying to do the above loop with purrr:map_dfc
  #dens4 <- parameters %>% 
  #        map_dfc(.f = ~ sum_over_loci(df = dens2, parameter = .x))
  return(dens3)
}

posterior_stats <- function(df,parameter){
  require(spatstat.core)
  p <- df %>% select(x,parameter) %>% as.list(bw = 6.33e-05)
  names(p) <- c("x", "y")
  p$bw <- 6.33e-5
  attr(p, "class") <- "density"
  qu <- quantile.density(p, c(0.025, 0.25, 0.5, 0.75, 0.975))
  wmo <- p$x[which(p$y==max(p$y))]
  wme <- weighted.mean(p$x, p$y)
  wsd <- sqrt(weighted.var(p$x, p$y))
  stats <- c(qu,mode = wmo, mean = wme, sd = wsd)
  return(stats)
}

Introduction

Brian Bowen’s student Anne Lee asked me to do a migrate analysis on two species of deepwater snapper, Etelis carbunculus and Etelis coruscans. She wrote:

My thesis focuses on assessing the population structure and origin of deepwater snappers, Etelis carbunculus and Etelis coruscans, that were sampled from Japan, Johnston Atoll, Main Hawaiian Islands, and Northwestern Hawaiian Islands. My data consists of SNPs and I will be using migrate to help me with the analysis of the origin. This will help me examine the patterns of gene flow within both species to compare the Hawaiian populations to Johnston Atoll and Japan populations.

Because these guys are so deep down, and harder to sample, we just have single population samples from the four populations she mentions above.

Etelis carbunculis; photo by Thomas Schreiber

Etelis coruscans; photo by Wiwied Soeparto

Bee-utiful deepwater snappers wondering where they came from, where they are going, and whether they can get back in the water now.

I’m also going to see how much I can learn about quarto notebooks, which Rstudio is apparently using to replace Rmarkdown notebooks. They seem like an improvement so far! Following along here, using the simple option to render to the docs folder.

Anne placed all the data on Google Drive, zipped by population. I will download them with the gdown python package. This involves looking up the google id for each file in the URL of the share link and pasting that into gdown thusly. They were quite large, and unzip and gzip wouldn’t work. I used jar (which I didn’t even know could uncompress things) as suggested here

gdown https://drive.google.com/uc?id=142BTkLDRNa6TXk1l4vhxRMxcejOg3rMt --output ECA_Japan.zip
gdown https://drive.google.com/uc?id=1NFI7bH6kmsDcKwB7Y6IkE1MkOAhypCu0 --output ECA_MHI.zip
gdown https://drive.google.com/uc?id=1hufzYMZNUX_q9-QX3JwIbKYOvjsmSwks --output ECA_NWHI.zip
gdown https://drive.google.com/uc?id=1_Z3zJVPVg7cazG_BWrcoJExM36YcFVqU --output ECO_Japan.zip
gdown https://drive.google.com/uc?id=1N50_iwNGLK6bZmjfrm58lgSZgC4RHw74 --output ECO_Johnston.zip
gdown https://drive.google.com/uc?id=1YvWbC3kkY7IJKjmjOQIJIKPAJD-KnExd --output ECO_MHI.zip
gdown https://drive.google.com/uc?id=1VIBPxE08eZQuTWnW3kWmQBTBbp8FUNEZ --output ECO_NWHI.zip

#gdown https://drive.google.com/uc?id=1rIWvFeQqaGOSIiNe385eIrHsGqMls2qD --output Etelis_ECO249.R2.fq.gz
jar xvf ECA_Japan.zip
 # etc.

Need to rename the files into Illumina format in order for Stacks to recognize the paired reads.

cd raw

for f in *.R1.fq.gz
do
mv $f ${f/.R1.fq.gz/_R1_001.fastq.gz}
done

for f in *.R2.fq.gz
do
mv $f ${f/.R2.fq.gz/_R2_001.fastq.gz}
done

cd ..

Haplotypes from Stacks

Clean the data with process_radtags

Same commands for carbunculus and coruscans. Paired end. Look for ecoR1 cutsites on read 1 and mspI cutsites on read 2. Remove low quality reads (based on a sliding window average). Remove reads smaller than 145 bp, and trim others to that length.

process_radtags -p ./raw/ -o ./cleaned/ --rescue --clean --quality --paired --threads 12 --renz-1 ecoRI --renz-2 mspI  \
                  --truncate 145 --len-limit 145 \
                  --adapter-1 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG \
                  --adapter-2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
                  --adapter-mm 2

E. carbunculus output

BEGIN total_raw_read_counts  
Total Sequences 255732606  
Reads containing adapter sequence       1957206 0.8%  
Barcode Not Found       0       0.0%  
Low Quality     6558845 2.6%  
RAD Cutsite Not Found   1678118 0.7%  
Retained Reads  245538437       96.0%  
END total_raw_read_counts  

E. coruscans output

BEGIN total_raw_read_counts  
Total Sequences 272955592  
Reads containing adapter sequence       1919907 0.7%  
Barcode Not Found       0       0.0%  
Low Quality     8734891 3.2%  
RAD Cutsite Not Found   1884115 0.7%  
Retained Reads  260416679       95.4%  
END total_raw_read_counts  

Running the denovo_map.pl pipeline

Hmm… gotta rename the cleaned files too. All code is being run on both species.

More renaming

cd cleaned
for f in *001.1.fq.gz
do
mv $f ${f/_R[12]_001.1.fq.gz/.1.fq.gz}
done

for f in *001.2.fq.gz
do
mv $f ${f/_R[12]_001.2.fq.gz/.2.fq.gz}
done

#rename the "remainder reads" that have had their pair removed for low quality
for f in *001.rem.1.fq.gz; do mv $f ${f/_R[12]_001.rem.1.fq.gz/.rem.1.fq.gz}; done
for f in *001.rem.2.fq.gz; do mv $f ${f/_R[12]_001.rem.2.fq.gz/.rem.2.fq.gz}; done

The pipeline

Removing extra flags

Oy - so denovo_map.pl crashed with this message:

Error: Failed to find any matching paired-end reads in ‘./cleaned/Etelis_S36.2.fq.gz’.
Aborted.

Julian Catchen gives the solution here. Because these data were already run through process_radtags once to demultiplex them, it attached a \1 to the labels of all primary reads and a \2 to the labels of all paired reads. When I re-ran them in process_radtags, it attached an extra flag to each, which then confused tsv2bam. Julian provides the below code to remove the extra flags.

ls  -1 *.1.fq.gz | sed -E 's/\.1\.fq\.gz//' | while read line; do zcat ${line}.1.fq.gz | sed -E 's/\/1$//' > ../cleaned/${line}.1.fq; done

ls  -1 *.2.fq.gz | sed -E 's/\.2\.fq\.gz//' | while read line; do zcat ${line}.2.fq.gz | sed -E 's/\/2$//' > ../cleaned/${line}.2.fq; done

cd ../cleaned

parallel gzip ::: *

Run denovo_map.pl

I’m going to use the denovo_map.pl pipeline. This command will take data from ./cleaned and the popmap provided by Anne and -m 5 reads per stack, -M 4 distance between stacks, -n 4 distance between catalog loci. Running on 8 -T threads and only keeping loci that appear in 80% of individuals in all 4 populations

denovo_map.pl --samples ./cleaned/ --popmap popmapECA.tsv --out-path ./pipeline --paired \
-m 5 -M 4 -n 4  -T 12 -r 80 -p 4 -X "populations: --fasta-samples" -X "populations: --filter-haplotype-wise"

Populations

Now I need to re-run populations to get more statistics. Original populations output is in output1, this will be in output2.

r80

Settings for keeping loci that occur in 80 percent of indivs per population (so 4/5 of the Johnston invivs for coruscans). I’m being intentionally stringent by setting the p-value cutoff for hwe at 0.05.

populations -P ./ -O ./output2_r80 -M ../popmapECO.tsv -t 12 -p 4 -r 80 -H --hwe --fstats --p-value-cutoff 0.05 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw 

E. carbunculus

Removed 98351 loci that did not pass sample/population constraints from 115429 loci.
Kept 17078 loci, composed of 5539395 sites; 17697 of those sites were filtered, 49261 variant sites remained.
Number of loci with PE contig: 17078.00 (100.0%);
  Mean length of loci: 314.36bp (stderr 0.38);
Number of loci with SE/PE overlap: 2546.00 (14.9%);
  Mean length of overlapping loci: 285.08bp (stderr 0.47); mean overlap: 27.61bp (stderr 0.13);
Mean genotyped sites per locus: 315.85bp (stderr 0.37).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Japan: 13.333 samples per locus; pi: 0.16058; all/variant/polymorphic sites: 5394107/49261/33997; private alleles: 6596
  MHI: 13.55 samples per locus; pi: 0.15487; all/variant/polymorphic sites: 5394107/49261/31541; private alleles: 3166
  NWHI: 25.142 samples per locus; pi: 0.15329; all/variant/polymorphic sites: 5394107/49261/38082; private alleles: 7321

Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Japan: 1145
  MHI: 1103
  NWHI: 1527
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Japan: 3917
  MHI: 4185
  NWHI: 3407
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  Japan-MHI: mean Fst: 0.030208; mean Phi_st: 0.023147; mean Fst': 0.025616; mean Dxy: 0.0019853
  Japan-NWHI: mean Fst: 0.02638; mean Phi_st: 0.028095; mean Fst': 0.029652; mean Dxy: 0.0019372
  MHI-NWHI: mean Fst: 0.014407; mean Phi_st: 0.0018347; mean Fst': 0.0016037; mean Dxy: 0.0018308

E. coruscans

Removed 130579 loci that did not pass sample/population constraints from 141967 loci.
Kept 11388 loci, composed of 3690405 sites; 32281 of those sites were filtered, 59470 variant sites remained.
Number of loci with PE contig: 11388.00 (100.0%);
  Mean length of loci: 314.06bp (stderr 0.42);
Number of loci with SE/PE overlap: 1079.00 (9.5%);
  Mean length of overlapping loci: 291.05bp (stderr 0.59); mean overlap: 24.47bp (stderr 0.12);
Mean genotyped sites per locus: 315.02bp (stderr 0.42).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Johnston_Atoll: 4.1993 samples per locus; pi: 0.074411; all/variant/polymorphic sites: 3587423/59470/15449; private alleles: 5208
  MHI: 13.68 samples per locus; pi: 0.060304; all/variant/polymorphic sites: 3587423/59470/25569; private alleles: 7241
  NWHI: 26.37 samples per locus; pi: 0.060932; all/variant/polymorphic sites: 3587423/59470/38902; private alleles: 15366
  Japan: 13.217 samples per locus; pi: 0.054013; all/variant/polymorphic sites: 3587423/59470/22717; private alleles: 6028

Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 7
  MHI: 1134
  NWHI: 1648
  Japan: 700
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 3998
  MHI: 2984
  NWHI: 2673
  Japan: 3395
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  Johnston_Atoll-MHI: mean Fst: 0.038243; mean Phi_st: 0.012337; mean Fst': 0.0011598; mean Dxy: 0.0014711
  Johnston_Atoll-NWHI: mean Fst: 0.022267; mean Phi_st: 0.0056917; mean Fst': -0.0033667; mean Dxy: 0.0013478
  Johnston_Atoll-Japan: mean Fst: 0.041778; mean Phi_st: 0.018924; mean Fst': 0.0048455; mean Dxy: 0.0014546
  MHI-NWHI: mean Fst: 0.013568; mean Phi_st: 0.0027592; mean Fst': 0.0011102; mean Dxy: 0.0012735
  MHI-Japan: mean Fst: 0.021407; mean Phi_st: 0.0058972; mean Fst': 0.0028619; mean Dxy: 0.0013316
  NWHI-Japan: mean Fst: 0.013484; mean Phi_st: 0.002933; mean Fst': 0.0018245; mean Dxy: 0.0012352

r100

Settings for keeping loci that occur in 100 percent of indivs per population ::: {.cell}

populations -P ./ -O ./output3_r100 -M ../popmapECO.tsv -t 12 -p 4 -r 100 -H --hwe --fstats --p-value-cutoff 0.05 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw 

# and again with single SNPs from each locus so we can test for linkage and selection
populations -P ./ -O ./output4_r100.SNP -M ../popmapECA.tsv -t 12 -p 3 -r 100 --hwe --fstats --p-value-cutoff 0.05  --vcf --genepop  --write-random-snp

#Rerun coruscans after dropping A150
populations -P ./ -O ./output5_r100.dropA150 -M ../popmapECO2.tsv -t 12 -p 4 -r 100 -H --hwe --fstats --p-value-cutoff 0.05 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw 

populations -P ./ -O ./output6_r100.dropA150.SNP -M ../popmapECO2.tsv -t 12 -p 4 -r 100 --hwe --fstats --p-value-cutoff 0.05  --vcf --genepop  --write-random-snp

:::

E. carbunculus

Removed 111882 loci that did not pass sample/population constraints from 115429 loci.
Kept 3547 loci, composed of 1134453 sites; 6916 of those sites were filtered, 6537 variant sites remained.
Number of loci with PE contig: 3547.00 (100.0%);
  Mean length of loci: 309.83bp (stderr 0.69);
Number of loci with SE/PE overlap: 288.00 (8.1%);
  Mean length of overlapping loci: 285.06bp (stderr 1.00); mean overlap: 24.45bp (stderr 0.24);
Mean genotyped sites per locus: 310.68bp (stderr 0.68).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Japan: 14 samples per locus; pi: 0.10528; all/variant/polymorphic sites: 1101972/6537/3955; private alleles: 1170
  MHI: 14 samples per locus; pi: 0.09766; all/variant/polymorphic sites: 1101972/6537/3434; private alleles: 511
  NWHI: 27 samples per locus; pi: 0.097747; all/variant/polymorphic sites: 1101972/6537/4642; private alleles: 1340

Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Japan: 101
  MHI: 90
  NWHI: 110
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Japan: 1197
  MHI: 1358
  NWHI: 1331
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  Japan-MHI: mean Fst: 0.025802; mean Phi_st: 0.015567; mean Fst': 0.0094451; mean Dxy: 0.00087231
  Japan-NWHI: mean Fst: 0.020607; mean Phi_st: 0.018272; mean Fst': 0.010517; mean Dxy: 0.00082861
  MHI-NWHI: mean Fst: 0.012892; mean Phi_st: 0.00097809; mean Fst': 0.00022075; mean Dxy: 0.00081207

E. coruscans

Removed 140925 loci that did not pass sample/population constraints from 141967 loci.
Kept 1042 loci, composed of 334846 sites; 3569 of those sites were filtered, 3654 variant sites remained.
Number of loci with PE contig: 1042.00 (100.0%);
  Mean length of loci: 311.35bp (stderr 1.36);
Number of loci with SE/PE overlap: 117.00 (11.2%);
  Mean length of overlapping loci: 285.68bp (stderr 1.71); mean overlap: 24.41bp (stderr 0.38);
Mean genotyped sites per locus: 312.62bp (stderr 1.33).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Johnston_Atoll: 5 samples per locus; pi: 0.060494; all/variant/polymorphic sites: 325749/3654/975; private alleles: 520
  MHI: 14 samples per locus; pi: 0.036669; all/variant/polymorphic sites: 325749/3654/1226; private alleles: 470
  NWHI: 27 samples per locus; pi: 0.037317; all/variant/polymorphic sites: 325749/3654/2064; private alleles: 1019
  Japan: 14 samples per locus; pi: 0.033766; all/variant/polymorphic sites: 325749/3654/1143; private alleles: 463

Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 0
  MHI: 32
  NWHI: 45
  Japan: 19
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 524
  MHI: 422
  NWHI: 405
  Japan: 396
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  Johnston_Atoll-MHI: mean Fst: 0.039663; mean Phi_st: 0.02039; mean Fst': 0.0012821; mean Dxy: 0.00073676
  Johnston_Atoll-NWHI: mean Fst: 0.026268; mean Phi_st: 0.015567; mean Fst': 0.00037957; mean Dxy: 0.0006556
  Johnston_Atoll-Japan: mean Fst: 0.041015; mean Phi_st: 0.020409; mean Fst': 0.0021605; mean Dxy: 0.0007176
  MHI-NWHI: mean Fst: 0.012433; mean Phi_st: 0.00025697; mean Fst': 4.0397e-05; mean Dxy: 0.00049144
  MHI-Japan: mean Fst: 0.019806; mean Phi_st: 0.0039369; mean Fst': 0.00060169; mean Dxy: 0.00050736
  NWHI-Japan: mean Fst: 0.012751; mean Phi_st: 0.0020762; mean Fst': 0.00052502; mean Dxy: 0.00047009
After dropping A150

A150 is a bad apple. I dropped it from the popmap and re-ran

Removed 140852 loci that did not pass sample/population constraints from 141967 loci.
Kept 1115 loci, composed of 355817 sites; 3712 of those sites were filtered, 3592 variant sites remained.
Number of loci with PE contig: 1115.00 (100.0%);
  Mean length of loci: 309.12bp (stderr 1.35);
Number of loci with SE/PE overlap: 162.00 (14.5%);
  Mean length of overlapping loci: 279.02bp (stderr 1.56); mean overlap: 25.85bp (stderr 0.33);
Mean genotyped sites per locus: 310.72bp (stderr 1.32).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Johnston_Atoll: 4 samples per locus; pi: 0.043022; all/variant/polymorphic sites: 346452/3592/554; private alleles: 160
  MHI: 14 samples per locus; pi: 0.041745; all/variant/polymorphic sites: 346452/3592/1347; private alleles: 506
  NWHI: 27 samples per locus; pi: 0.042774; all/variant/polymorphic sites: 346452/3592/2295; private alleles: 1180
  Japan: 14 samples per locus; pi: 0.037868; all/variant/polymorphic sites: 346452/3592/1241; private alleles: 502

Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 0
  MHI: 37
  NWHI: 49
  Japan: 21
Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<0.05):
  Johnston_Atoll: 709
  MHI: 446
  NWHI: 414
  Japan: 422
(more detail in populations.sumstats.tsv and populations.hapstats.tsv)

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  Johnston_Atoll-MHI: mean Fst: 0.030722; mean Phi_st: -0.0013119; mean Fst': -0.0016144; mean Dxy: 0.00063351
  Johnston_Atoll-NWHI: mean Fst: 0.015585; mean Phi_st: -0.014286; mean Fst': -0.0023453; mean Dxy: 0.00053846
  Johnston_Atoll-Japan: mean Fst: 0.032038; mean Phi_st: -0.002018; mean Fst': -0.00092502; mean Dxy: 0.00060796
  MHI-NWHI: mean Fst: 0.012297; mean Phi_st: -0.00015381; mean Fst': -7.5242e-05; mean Dxy: 0.00051624
  MHI-Japan: mean Fst: 0.019801; mean Phi_st: 0.0038246; mean Fst': 0.00053071; mean Dxy: 0.00052919
  NWHI-Japan: mean Fst: 0.012569; mean Phi_st: 0.0014773; mean Fst': 0.00052976; mean Dxy: 0.00049003

Download the populations output

scp -r argo:./Etelis/coruscans/pipeline/output3_r100 ./populations_r100
scp -r argo:./Etelis/carbunculus/pipeline/output3_r100 ./populations_r100 
#copy down files from the coruscans run with bad apples removed.
scp -r argo:./Etelis/coruscans/pipeline/output5_r100.dropA150 ./populations_r100.dropA150
scp -r argo:./Etelis/coruscans/pipeline/output6_r100.dropA150.SNP/populations.snps.genepop  ./populations_r100.dropA150/populations.randsnp.genepop

Evaluate the genomic data

E. coruscans

Adegenet

I am testing differences among 4 a priori defined populations. Thus, following recommendations in (Thia 2022), I will only use k-1 or 3 PCA axes.

ecor_haps <- read.genepop("coruscans/populations_r100/populations.haps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  Stacks v2.61; GenePop v4.1.3; September 30, 2022 

...done.
levels(ecor_haps$pop) <- c("JA","MHI","NWHI","JP")

ecor_dapc_haps <- dapc(ecor_haps,pop =  ecor_haps$pop, scale = T, pca.select="nbEig", n.pca = 3, n.da=3)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecor_dapc_haps)

ecor_hwe <- hw.test(ecor_haps, B = 1000)

# playing with cross-validation
#xvalDapc(ecor_haps, grp = ecor_haps$pop, result = "overall", n.pca.max = 10, training.set = 0.75)

Let’s take a look with the randomly selected SNPs

ecor_snps.genind <- read.genepop("coruscans/populations_r100/populations.randsnp.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  # Stacks v2.61; GenePop v4.1.3; September 30, 2022 

...done.
levels(ecor_snps.genind$pop) <- c("JA","MHI","NWHI","JP")

ecor_dapc_snps <- dapc(ecor_snps.genind,pop =  ecor_snps.genind$pop, scale = T,
                       pca.select="nbEig", n.pca = 3, n.da=3)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecor_dapc_snps)

SNPr

I will use Will Hemstrom’s new package SNPr to further evaluate and filter the SNPs that I randomly selected from each locus.

popmap_cor <- read_tsv("coruscans/popmapECO.tsv",col_names = c("sampID","pop"))
Rows: 60 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ecor_snps <- read_vcf(file = "coruscans/populations_r100/populations.randsnp.vcf",
                      sample.meta = popmap_cor)
Scanning file to determine attributes.
File attributes:
  meta lines: 14
  header_line: 15
  variant count: 929
  column count: 69

Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
  Character matrix gt rows: 929
  Character matrix gt cols: 69
  skip: 0
  nrows: 929
  row_num: 0

Processed variant: 929
All variants processed

Filtering

I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).

ecor_snps <- filter_snps(ecor_snps, min_ind = 0.9999, min_loci = 0.9999, re_run = "full", 
                         hwe = 0.05, fwe_method = "BY", hwe_facets = "pop", singletons = T,
                         verbose = T)
Initializing...
Filtering loci. Starting loci: 929 
Filtering non-biallelic loci...
    0 bad loci
Filtering non_polymorphic loci...
    0 bad loci
Filtering loci sequenced in few individuals...
    0 bad loci
Filtering singletons.
    521 bad loci
Filtering loci out of hwe...
     7  bad loci
    Ending loci: 401 
Filtering individuals. Starting individuals: 60 
Filtering out individuals sequenced in few kept loci...
No individuals removed.

Individual Stats

snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.

ecor_snps <- calc_hs(ecor_snps, facets = "sampID")
ecor_snps <- calc_he(ecor_snps, facets = "sampID")
ecor_snps <- calc_ho(ecor_snps, facets = "sampID")

# and dang, just like that we can get a PCA
p <- plot_clusters(ecor_snps, facets = "pop")
Formatting data...
p$plots$pca

so there’s that one weirdo at Johnston, who is A150… we can identify him like this. He has double the mean \(H_e\) and \(H_o\).

ecor_snps <- calc_genetic_distances(ecor_snps, facets = "sampID",method = "Nei")
stats.ind <- get.snpR.stats(ecor_snps, facets="sampID", stats = c("hs","ho","he"))
ind.dists <- get.snpR.stats(ecor_snps,facets="sampID", stats = "genetic_distance")                  
heatmap(as.matrix(ind.dists$sampID$.base$Nei))

# drop'im
ecor_snps <- ecor_snps[,-which("Etelis_A150" %in% names(ecor_snps))]

p2 <- plot_clusters(ecor_snps, facets = "pop")
Formatting data...
p2$plots$pca

Re-running after dropping the Bad Apple

I went back to re-run populations from the stacks pipeline with the “Bad Apple” (sample A150 from Johnston) removed from the popmap (see above). This removal is as suggested by (Cerca et al. 2021) Now to re-do everything.

Adegenet

I am testing differences among 4 a priori defined populations. Thus, following recommendations in (thiaGuidelinesStandardizingApplication?), I will only use k-1 or 3 PCA axes.

ecor_haps <- read.genepop("coruscans/populations_r100.dropA150/populations.haps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  Stacks v2.61; GenePop v4.1.3; November 09, 2022 

...done.
levels(ecor_haps$pop) <- c("JA","MHI","NWHI","JP")

ecor_dapc_haps <- dapc(ecor_haps,pop =  ecor_haps$pop, scale = T, pca.select="nbEig", n.pca = 3, n.da=3)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecor_dapc_haps)

ecor_hwe <- hw.test(ecor_haps, B = 1000)

# playing with cross-validation
#xvalDapc(ecor_haps, grp = ecor_haps$pop, result = "overall", n.pca.max = 10, training.set = 0.75)

Let’s take a look with the randomly selected SNPs

ecor_snps.genind <- read.genepop("coruscans/populations_r100/populations.randsnp.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  # Stacks v2.61; GenePop v4.1.3; September 30, 2022 

...done.
levels(ecor_snps.genind$pop) <- c("JA","MHI","NWHI","JP")

ecor_dapc_snps <- dapc(ecor_snps.genind,pop =  ecor_snps.genind$pop, scale = T,
                       pca.select="nbEig", n.pca = 3, n.da=3)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecor_dapc_snps)

SNPr

popmap_cor <- read_tsv("coruscans/popmapECO.dropA150.tsv",col_names = c("sampID","pop"))
Rows: 59 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ecor_snps <- read_vcf(file = "coruscans/populations_r100.dropA150/populations.randsnp.vcf",
                      sample.meta = popmap_cor)
Scanning file to determine attributes.
File attributes:
  meta lines: 14
  header_line: 15
  variant count: 984
  column count: 68

Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
  Character matrix gt rows: 984
  Character matrix gt cols: 68
  skip: 0
  nrows: 984
  row_num: 0

Processed variant: 984
All variants processed

Filtering

I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).

ecor_snps <- filter_snps(ecor_snps, min_ind = 0.9999, min_loci = 0.9999, re_run = "full", 
                         hwe = 0.05, fwe_method = "BY", hwe_facets = "pop", singletons = T,
                         verbose = T)
Initializing...
Filtering loci. Starting loci: 984 
Filtering non-biallelic loci...
    0 bad loci
Filtering non_polymorphic loci...
    0 bad loci
Filtering loci sequenced in few individuals...
    0 bad loci
Filtering singletons.
    538 bad loci
Filtering loci out of hwe...
     6  bad loci
    Ending loci: 440 
Filtering individuals. Starting individuals: 59 
Filtering out individuals sequenced in few kept loci...
No individuals removed.

Individual Stats

snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.

ecor_snps <- calc_hs(ecor_snps, facets = "sampID")
ecor_snps <- calc_he(ecor_snps, facets = "sampID")
ecor_snps <- calc_ho(ecor_snps, facets = "sampID")

# and dang, just like that we can get a PCA
p <- plot_clusters(ecor_snps, facets = "pop")
Formatting data...
p$plots$pca

Population Stats

And here are some summary stats about each population

ecor_snps <- calc_ho(ecor_snps, facets = "pop")
ecor_snps <- calc_he(ecor_snps, facets = "pop")
ecor_snps <- calc_maf(ecor_snps, facets = "pop")
ecor_snps <- calc_pi(ecor_snps, facets = "pop")
ecor_snps <- calc_private(ecor_snps, facets = "pop")
ecor_snps <- calc_hwe(ecor_snps, facets = "pop")
ecor_snps <- calc_fis(ecor_snps, facets = "pop")
ecor_snps <- calc_pairwise_fst(ecor_snps, facets = "pop",boot = 1000, boot_par = 5)

stats <- get.snpR.stats(ecor_snps, facets = "pop", stats = c("maf", "ho","he","hwe", "fis","pi", "private"))
stats$weighted.means
# and here is Fst

ecor_fst <- get.snpR.stats(ecor_snps, facets = "pop", stats = c("fst"))
ecor_fst$fst.matrix
$pop
$pop$fst
               p1 Johnston_Atoll          MHI          NWHI
1:          Japan   -0.003445356  0.004937154  0.0032788133
2: Johnston_Atoll             NA -0.011144848 -0.0212128337
3:            MHI             NA           NA -0.0009330938

$pop$p
               p1 Johnston_Atoll       MHI      NWHI
1:          Japan      0.3606394 0.1358641 0.1948052
2: Johnston_Atoll             NA 0.6013986 0.7982018
3:            MHI             NA        NA 0.4945055

Linkage Disequilibrium

A little stuck on LD. But there are so few markers (intentionally) that it really shouldn’t be an issue.

ecor_snps <- calc_pairwise_ld(ecor_snps, facets ="CHROM", verbose = T)
ldstats <- get.snpR.stats(ecor_snps, stats = "ld", facets = "CHROM")
ld <- plot_pairwise_ld_heatmap(ecor_snps, facets=c("CHROM"))

Remove loci under selection using pcadapt

Following along [here](https://bcm-uga.github.io/pcadapt/articles/pcadapt.html.

First, write out the data into plink format

format_snps(ecor_snps, output = "plink", facets = "pop", 
            outfile = "coruscans/plink/coruscans.plink",chr = "CHROM", 
            position = "position" )

Scree Plot

ecor_plink <- read.pcadapt("coruscans/plink/coruscans.plink.bed",type="bed")

pca20 <- pcadapt(ecor_plink, K = 20)

plot(pca20, option = "screeplot")
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the pcadapt package.
  Please report the issue at <]8;;https://github.com/bcm-uga/pcadapt/issueshttps://github.com/bcm-uga/pcadapt/issues]8;;>.

Based on the scree plot I think we’ll use 6 PCs.

PCA

pca6 <- pcadapt(ecor_plink, K = 6)
plot(pca6, option = "scores", pop = popmap_cor$pop)
Warning: Use of `df$Pop` is discouraged.
ℹ Use `Pop` instead.

plot(pca6, option = "manhattan")

#qqplot
plot(pca6, option = "qqplot")

#histogram
hist(pca6$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange")

Outliers

So the p-values (even uncorrected) are all fairly large here. There aren’t really any outliers.

padj <- p.adjust(pca6$pvalues, method = "BY")
alpha <- 0.001
pca6_outliers <- which(padj < alpha)

#remove outliers
#ecor_snps2 <- ecor_snps[-pca5_outliers,]

Write out filtered SNPs and haplotypes

#write to vcf
format_snps(ecor_snps, output = "vcf", facets = "pop", 
            outfile = "coruscans/hwe.pcadapt.filtered.snps.vcf",chr = "CHROM", 
            position = "position" )
format_snps(ecor_snps, output = "genepop", facets = "pop", 
            outfile = "coruscans/hwe.pcadapt.filtered.snps.gen",chr = "CHROM", 
            position = "position" )

#grab the locus names and write them to a whitelist to be included in the migrate data file.
whitelist <- ecor_snps@snp.meta$CHROM
write.table(sort(as.numeric(ecor_snps@snp.meta$CHROM)), "coruscans/ecor_whitelist.txt", quote=F, row.names = F,col.names =F )

ecor_haps2 <- ecor_haps[,loc = locNames(ecor_haps) %in% whitelist]
# write them out
genind_to_genepop(ecor_haps2,output = "coruscans/hwe.pcadapt.filtered.haps.txt")

Re-check the DAPC

We didn’t remove any outlier loci, so no need to re-calculate Fst.

Haplotypes
ecor_haps2 <- read.genepop("coruscans/hwe.pcadapt.filtered.haps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  Conversion of an object of class 'genind' into a GENEPOP file with the package 'graph4lg'                                                                                                                                                                                                                                                                                                                                                                                                                                                         

...done.
levels(ecor_haps2$pop) <- c("JA","JP", "MHI","NWHI")

ecor_dapc2 <- dapc(ecor_haps2,pop =  ecor_haps2$pop, scale = T, pca.select="nbEig", n.pca = 3, n.da=3)

scatter(ecor_dapc2)

SNPs
ecor_snps2.genind <- read.genepop("coruscans/hwe.pcadapt.filtered.snps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  coruscans/hwe_genepop 

...done.
levels(ecor_snps2.genind$pop) <- c("JP","JA","MHI","NWHI")

ecor_dapc2_snps <- dapc(ecor_snps2.genind,pop =  ecor_snps2.genind$pop, scale = T,
                       pca.select="nbEig", n.pca = 3, n.da=3)

scatter(ecor_dapc2_snps)

E. carbunculus

Adegenet

ecar_haps <- read.genepop("carbunculus/populations_r100/populations.haps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  Stacks v2.61; GenePop v4.1.3; September 30, 2022 

...done.
levels(ecar_haps$pop) <- c("JP","MHI","NWHI")

ecar_dapc <- dapc(ecar_haps,pop =  ecar_haps$pop, scale = T, pca.select="nbEig", n.pca = 2, n.da=2)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecar_dapc)

Let’s take a look with the randomly selected SNPs

ecar_snps.genind <- read.genepop("carbunculus/populations_r100/populations.randsnp.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  # Stacks v2.61; GenePop v4.1.3; September 30, 2022 

...done.
levels(ecar_snps.genind$pop) <- c("JP","MHI","NWHI")

ecar_dapc_snps <- dapc(ecar_snps.genind,pop =  ecar_snps.genind$pop, scale = T,
                       pca.select="nbEig", n.pca = 2, n.da=2)
Warning in .local(x, ...): Some scaling values are null.
 Corresponding alleles are removed.
scatter(ecar_dapc_snps)

SNPr

I will use Will Hemstrom’s new package SNPr to further evaluate and filter the SNPs that I randomly selected from each locus.

ecar_popmap <- read_tsv("carbunculus/popmapECA.tsv",col_names = c("sampID","pop"))
Rows: 55 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): sampID, pop

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ecar_snps <- read_vcf(file = "carbunculus/populations_r100/populations.randsnp.vcf",
                      sample.meta = ecar_popmap)
Scanning file to determine attributes.
File attributes:
  meta lines: 14
  header_line: 15
  variant count: 2750
  column count: 64

Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
  Character matrix gt rows: 2750
  Character matrix gt cols: 64
  skip: 0
  nrows: 2750
  row_num: 0

Processed variant 1000
Processed variant 2000
Processed variant: 2750
All variants processed

Filtering

I’ve already implemented stringent filters in Stacks to only include individuals with all loci and loci found in all individuals. Now going to implement a strict HWE filter - if a locus is out of HWE in any population (hwe_facets = "pop") after a Bonferroni correction, then it will be removed. Also going to remove non-variant loci, and loci with only a single minor allele (“singletons”).

ecar_snps <- filter_snps(ecar_snps, min_ind = 0.9999, min_loci = 0.9999, re_run = "full", 
                         hwe = 0.05, fwe_method = "BY", hwe_facets = "pop", singletons = T,
                         verbose = T)
Initializing...
Filtering loci. Starting loci: 2750 
Filtering non-biallelic loci...
    0 bad loci
Filtering non_polymorphic loci...
    0 bad loci
Filtering loci sequenced in few individuals...
    0 bad loci
Filtering singletons.
    1064 bad loci
Filtering loci out of hwe...
     20  bad loci
    Ending loci: 1666 
Filtering individuals. Starting individuals: 55 
Filtering out individuals sequenced in few kept loci...
No individuals removed.

Individual Stats

snpR uses facets to subset data on either sample metadata (such as sample ID, or population) or SNP metadata. Here we will look at per-individual stats. Also, note that the calculations are just attached to the data object, so we don’t lose track of them. Nice.

ecar_snps <- calc_hs(ecar_snps, facets = "sampID")
ecar_snps <- calc_he(ecar_snps, facets = "sampID")
ecar_snps <- calc_ho(ecar_snps, facets = "sampID")

# and dang, just like that we can get a PCA
p <- plot_clusters(ecar_snps, facets = "pop")
Formatting data...
p$plots$pca

The separation between both Hawaiian populations and Japan is pretty evident even in the PCA.

ecar_snps <- calc_genetic_distances(ecar_snps, facets = "sampID",method = "Nei")
stats.ind <- get.snpR.stats(ecar_snps, facets="sampID", stats = c("hs","ho","he"))
ind.dists <- get.snpR.stats(ecar_snps,facets="sampID", stats = "genetic_distance")                  
heatmap(as.matrix(ind.dists$sampID$.base$Nei))

And there doesn’t seem to be any reason to drop any individuals.

Population Stats

And here are some summary stats about each population

ecar_snps <- calc_ho(ecar_snps, facets = "pop")
ecar_snps <- calc_he(ecar_snps, facets = "pop")
ecar_snps <- calc_maf(ecar_snps, facets = "pop")
ecar_snps <- calc_pi(ecar_snps, facets = "pop")
ecar_snps <- calc_private(ecar_snps, facets = "pop")
ecar_snps <- calc_hwe(ecar_snps, facets = "pop")
ecar_snps <- calc_fis(ecar_snps, facets = "pop")
ecar_snps <- calc_pairwise_fst(ecar_snps, facets = "pop", boot = 1000, boot_par = 5)

stats <- get.snpR.stats(ecar_snps, facets = "pop", stats = c("maf", "ho","he","hwe", "fis","pi", "private"))
stats$weighted.means
# and here is Fst

ecar_fst <- get.snpR.stats(ecar_snps, facets = "pop", stats = c("fst"))
ecar_fst$fst.matrix
$pop
$pop$fst
      p1        MHI        NWHI
1: Japan 0.01746465 0.021466084
2:   MHI         NA 0.001113613

$pop$p
      p1         MHI        NWHI
1: Japan 0.002997003 0.000999001
2:   MHI          NA 0.328671329

Linkage Disequilibrium

A little stuck on LD. But there are so few markers (intentionally) that it really shouldn’t be an issue.

ecar_snps <- calc_pairwise_ld(ecar_snps, facets ="CHROM", verbose = T)
ldstats <- get.snpR.stats(ecar_snps, stats = "ld", facets = "CHROM")
ld <- plot_pairwise_ld_heatmap(ecar_snps, facets=c("CHROM"))

Remove loci under selection using pcadapt

Following along [here](https://bcm-uga.github.io/pcadapt/articles/pcadapt.html.

First, write out the data into plink format

format_snps(ecar_snps, output = "plink", facets = "pop", 
            outfile = "carbunculus/plink/carbunculus.plink",chr = "CHROM", 
            position = "position" )

Scree Plot

ecar_plink <- read.pcadapt("carbunculus/plink/carbunculus.plink.bed",type="bed")

pca20 <- pcadapt(ecar_plink, K = 20)

plot(pca20, option = "screeplot")

Based on the scree plot I think we’ll use 2 PCs.

PCA

pca2 <- pcadapt(ecar_plink, K = 2)
plot(pca2, option = "scores", pop = ecar_popmap$pop)
Warning: Use of `df$Pop` is discouraged.
ℹ Use `Pop` instead.

plot(pca2, option = "manhattan")

#qqplot
plot(pca2, option = "qqplot")

#histogram
hist(pca2$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange")

Outliers

In this species, there are some clear outliers. I am going to correct to Benjamini-Yekuteli False Discovery Rate, with a 1 in 1000 FDR.

padj <- p.adjust(pca2$pvalues, method = "BY")
alpha <- 0.001
pca2_outliers <- which(padj < alpha)

#remove outliers
ecar_snps2 <- ecar_snps[-pca2_outliers,]

Write out filtered SNPs and haplotypes

I’m going to write a whitelist with 401 randomly selected SNPs - to be in parity with coruscans. Adding more would take longer.

#write to vcf
format_snps(ecar_snps2, output = "vcf", facets = "pop", 
            outfile = "carbunculus/hwe.pcadapt.filtered.snps.vcf",chr = "CHROM", 
            position = "position" )

format_snps(ecar_snps2, output = "genepop", facets = "pop", 
            outfile = "carbunculus/hwe.pcadapt.filtered.snps.genepop",chr = "CHROM", 
            position = "position" )

write.csv(pca2_outliers, "carbunculus/pcadapt.outliers.csv")
#grab the locus names, pick a random subset of 1000, and write them to a whitelist to be included in the migrate data file.
whitelist <- ecar_snps2@snp.meta$CHROM[sample(1:nsnps(ecar_snps2),1000)]
write.table(sort(as.numeric(whitelist)), 
            "carbunculus/ecar_whitelist.txt", quote=F, row.names = F,col.names =F )

ecar_haps2 <- ecar_haps[,loc = locNames(ecar_haps) %in% whitelist]
# write them out
genind_to_genepop(ecar_haps2,output = "carbunculus/hwe.pcadapt.filtered.haps.txt")

ecar_snps3 <- ecar_snps2[ecar_snps2@snp.meta$CHROM %in% whitelist,]
format_snps(ecar_snps3, output = "genepop", facets = "pop", 
            outfile = "carbunculus/hwe.pcadapt.filtered.snps1000.genepop",chr = "CHROM", 
            position = "position" )

Re-check the DAPC

Haplotypes
ecar_haps2 <- read.genepop("carbunculus/hwe.pcadapt.filtered.haps.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  Conversion of an object of class 'genind' into a GENEPOP file with the package 'graph4lg'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

...done.
levels(ecar_haps2$pop) <- c("JP","MHI","NWHI")

ecar_dapc2 <- dapc(ecar_haps2,pop =  ecar_haps2$pop, scale = T, pca.select="nbEig", n.pca = 2, n.da=2)

scatter(ecar_dapc2)

SNPs
ecar_snps2.genind <- read.genepop("carbunculus/hwe.pcadapt.filtered.snps1000.gen")

 Converting data from a Genepop .gen file to a genind object... 


File description:  carbunculus/hwe_genepop 

...done.
levels(ecar_snps2.genind$pop) <- c("JP","MHI","NWHI")

ecar_dapc2_snps <- dapc(ecar_snps2.genind,pop =  ecar_snps2.genind$pop, scale = T,
                       pca.select="nbEig", n.pca = 2, n.da=2)

scatter(ecar_dapc2_snps)

So the structure as depicted by DAPC still seems intact in this species.

popmap2 <- tibble(pop=c(rep("Japan",14),rep("MHI",14),rep("NWHI",27)))
ecar_snps3 <- read_genepop(file = "carbunculus/hwe.pcadapt.filtered.snps.genepop",sample.meta = popmap2)
Warning in .sub_and_t_1_2_to_A_C(ac, ncol(gt)): Since allelic identities are not clear, alleles at each locus will be saved as A and C. If this data is later reformatted for use elsewhere, please be aware that this may cause issues for some downstream analyses (since, for example, all SNPs will be nonsensically noted as transversions rather than transitions!).
 To prevent this, use a different format that specifies allelic identities.
This is not an issue for any snpR functions save those that rely on ancestral/derived allelic identities.
Warning in validityMethod(object): Some snp metadata columns contain unacceptable special characters. Unaccepted characters: '.', '~', or any whitespace.
These can cause unexpected behaviour if the subect columns are used as facets.
ecar_snps3 <- calc_ho(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_he(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_maf(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_pi(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_private(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_hwe(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_fis(ecar_snps3, facets = "pop")
ecar_snps3 <- calc_pairwise_fst(ecar_snps3, facets = "pop", boot= 1000, boot_par = 5)

stats <- get.snpR.stats(ecar_snps3, facets = "pop", stats = c("maf", "ho","he","hwe", "fis","pi", "private"))
stats$weighted.means
# and here is Fst

ecar_fst3 <- get.snpR.stats(ecar_snps3, facets = "pop", stats = c("fst"))
ecar_fst3$fst.matrix
$pop
$pop$fst
      p1        MHI         NWHI
1: Japan 0.01413796 0.0165854943
2:   MHI         NA 0.0003387499

$pop$p
      p1        MHI        NWHI
1: Japan 0.01198801 0.000999001
2:   MHI         NA 0.411588412

And compare that to pre-filtering. Some differences, but not as much as when I was overly stringent with selected loci.

ecar_fst$fst.matrix
$pop
$pop$fst
      p1        MHI        NWHI
1: Japan 0.01746465 0.021466084
2:   MHI         NA 0.001113613

$pop$p
      p1         MHI        NWHI
1: Japan 0.002997003 0.000999001
2:   MHI          NA 0.328671329

Fasta2Genotype

Paul Maier created this script to convert Stacks haplotypes to migrate format.

Install Python2

I had a doozy of a time trying to get this to work this time around: the script was written in python2. which is nearly deprecated. I first tried to use 2to3 to upgrade the script to python3, but there were a few stray tabs I couldn’t figure out. I eventually landed here, and used the following commands to install pyenv, which can handle multiple python installs.

brew install pyenv
pyenv install 2.7.18
pyenv versions
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.zshrc\necho 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.zshrc\necho 'eval "$(pyenv init -)"' >> ~/.zshrc
exec "$SHELL"
pyenv shell 2.7.18 # use python 2 for this shell session

 pip install numpy
 pip install --upgrade pip # need to update pip to be able to install scipy
 pip install scipy

E. coruscans

Run it

I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work. Also, I added a SampleID column and header row to the popmap for use by fasta2genotype.py. I wrote out a whitelist above of all the 440 loci that passed through the filters.

python ../../fasta2genotype/fasta2genotype.py populations_r100.dropA150/populations.samples2.fa ecor_whitelist.txt \
popmapECO_f2g.tsv NA Ecoruscans440.mig


###################################################################
###                                                             ###
###       Fasta2Genotype | Data Conversion | Version 1.10       ###
###                                                             ###
###                        Cite as follows:                     ###
###                                                             ###
###   Maier P.A., Vandergast A.G., Ostoja S.M., Aguilar A.,     ###
###   Bohonak A.J. (2019). Pleistocene glacial cycles drove     ###
###   lineage diversification and fusion in the Yosemite toad   ###
###   (Anaxyrus canorus). Evolution, in press.                  ###
###   https://www.doi.org/10.1111/evo.13868                     ###
###                                                             ###
###################################################################

Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1
Loci to use? [1] Whitelist [2] All: 1
Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2
Remove monomorphic loci? [1] Yes [2] No: 1
Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1
Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5
Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2
Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2
 
**************************************************************************************************************
***                                       ... BEGINNING CONVERSION ...                                     ***
**************************************************************************************************************
 
Cataloging loci...
Counting locus lengths...
Cataloging populations...
Counting gene copies...
Counting alleles for each locus...
Identifying loci with excess heterozygosity...
     Calculating observed heterozygosity and homozygosity...
     Calculating expected heterozygosity and homozygosity...
     Flagging loci with excess heterozygosity for removal...
     Removing loci...
     Removed 1 overly heterozygous loci.
Outputting migrate-n file...


# it removed locus 60082 for excess heterozygosity, so I dropped it from the whitelist.

I then re-ran the script, selecting phylip output, with locus name headers, and all other options the same. I need the locus names in the order output by the script in order to run modeltest.

E. carbunculus

Run it

I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work. Also, I added a SampleID column and header row to the popmap for use by fasta2genotype.py. I wrote out a whitelist above of 1000/1666 randomly selected loci that passed through the filters. Used the same options as E. coruscans.

python ../../fasta2genotype/fasta2genotype.py populations_r100/populations.samples2.fa ecar_whitelist_1000.txt \
popmapECA_f2g.tsv NA Ecarbunculus999.mig

Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1
Loci to use? [1] Whitelist [2] All: 1
Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2
Remove monomorphic loci? [1] Yes [2] No: 1
Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1
Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5
Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2
Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2
 
**************************************************************************************************************
***                                       ... BEGINNING CONVERSION ...                                     ***
**************************************************************************************************************
 
Cataloging loci...
Counting locus lengths...
Cataloging populations...
Counting gene copies...
Counting alleles for each locus...
Identifying loci with excess heterozygosity...
     Calculating observed heterozygosity and homozygosity...
     Calculating expected heterozygosity and homozygosity...
     Flagging loci with excess heterozygosity for removal...
     Removing loci...
     Removed 1 overly heterozygous loci.
Outputting migrate-n file...
*** DONE! ***


It removed locus 31718 for being overly heterozygous, so I zapped it from the whitelist.

I then re-ran the script, selecting phylip output, with full sequences, haploid, all loci, not separated by ! locus name headers, and all other options the same. I need the locus names in the order output by the script in order to run modeltest. This is only needed to get the locus names in the next step.

Migrate

I installed Migrate 5.0.4 as described here.

Mutation Models (code used for both species)

I need to figure out an overall mutation model to use with RAD loci. I received this advice one from Peter about how to construct these. I’ll use modelTest in the phangorn package to see where that gets me.

I renamed the FASTA headers in populations.samples.fa with BBEdit to create population.samples3.fa

Find: >CLocus_\d+_Sample_\d+_Locus_(\d+)_Allele_([01]) \[Etelis_(.+)\]
Replace: >\3_L\1_A\2

#first token is sample name, second is locus number(name) third is allele

#Also remove bad indiv A150 in coruscans
Find: >A150_.+\n.+\n
Replace:

Lets load in the data and calculate some statistics for each locus. Previously Migrate-n only implemented the F84 (=HKY) model, with two rates (Transitions and Transversions) and gamma distributed rate variability.

Now it will take these possible models: 1 Jukes-Cantor model 2 Kimura 2-parameter model 3 Felsenstein 1981 (F81) model 4 Felsenstein 1984 (F84) model 5 Hasegawa-Kishino-Yano model 6 Tamura-Nei model

fastadata <- read.FASTA("coruscans/populations_r100.dropA150//populations.samples3.fa")

#  read in the locus lengths, as optional additional way to make sure the stats
# and the migrate infile are ordered the same
#migrate_lengths <- read_lines("coruscans/Ecoruscans.mig", skip = 1, n_max=1) %>% 
#  str_split("\\t", simplify = T) %>%  t(.) %>% tibble(length=.) %>% na.omit()

migrate_loci <- read_lines("coruscans/Ecoruscans439.phy", skip = 1, n_max=1) %>% 
  str_split("\\t", simplify = T) %>%  t(.) %>% tibble(locus_name=.) %>% filter(locus_name!="")

stats <- tibble(locus = character(), 
                length = numeric(),
                segSites = numeric(),
                nHaps = numeric(),
                nucDiv = numeric(),
                ttRatio = numeric(),
                model = character(),
                gammaShape = numeric(),
                rate1 = numeric(), 
                rate2 = numeric(), 
                rate3 = numeric(),
                rate4 = numeric(),
                Q1= numeric(),
                Q2 = numeric(),
                Q3 = numeric(),
                )
                
for(l in migrate_loci$locus_name){
  L <- paste0("L",l,"_")
  print(paste("Now Doing Locus", l, match(l,  migrate_loci$locus_name), "out of", length(migrate_loci$locus_name)))
  locus_dnabin <- fastadata[str_which(names(fastadata),pattern = L)]
  # convert to package formats
  locus_dnabin <- as.matrix(locus_dnabin)
  locus_gtypes <- sequence2gtypes(locus_dnabin)
  locus_phy <- phyDat(locus_dnabin)
  #create a haplotype network .. to be continued
  haps <- haplotype(locus_dnabin)
  nhaps <- length(dimnames(haps)[[1]])
  #tcs <- haploNet(haps)
  #find parameters of HKY (F84) model
  modeltest <- modelTest(locus_phy, model = c("JC","K80", "F81", "HKY","TrN"), 
                         G = T, I = F, k = 4, mc.cores = 5)
  # pick out the best model. If multiple models are tied for best, pick the simplest one
  bestmodel <- modeltest$Model[which(modeltest$BIC == min(modeltest$BIC))][1]
  #open the object environment
  env <- attr(modeltest,"env")
  bestparams <- get(bestmodel, env)
  bestparams <- eval(bestparams, env=env)
  # use this code for v3, which only has F84 (HKY)
  #HKY <- modelTest(locus_phy, model = c("HKY"), 
  #                       G = T, I = F, k = 4)
  #env <- attr(HKY, "env")
  #HKYG <- get("HKY+G", env)
  #model <- eval(HKYG, env=env)
  # calculate TiTv Ratio
  ttratio <- TiTvRatio(locus_gtypes)
  
  stats <- bind_rows(stats, tibble(locus=L, 
                          length = length(locus_dnabin[1,]),
                          segSites = length(seg.sites(locus_dnabin)),
                          nHaps = length(dimnames(haps)[[1]]),
                          nucDiv = nuc.div(locus_dnabin),
                          ttRatio =  ttratio[3],
                          model = bestmodel,
                          gammaShape = bestparams$shape,
                          rate1 = bestparams$g[1],
                          rate2 = bestparams$g[2],
                          rate3 = bestparams$g[3],
                          rate4 = bestparams$g[4],
                          Q1 = sort(unique(bestparams$Q))[1],
                          Q2 = sort(unique(bestparams$Q))[2],
                          Q3 = sort(unique(bestparams$Q))[3]
                          ))
                         
                        
}

#stats <- stats[which(stats$length %in% migrate_lengths$length),]
#setdiff(stats$length, as.numeric(migrate_lengths$length))
# write_csv(stats, "coruscans/migrate_locus_statistics439.csv")

Write a model block

Write a space delimited textfile that can be added to the migrate data file in the format that Peter suggested.

#stats <- read_csv("./carbunculus/migrate_locus_statistics.csv")
#kable(stats)


# write a space delimited textfile that can be added to the migrate data file
# following Peter's suggestion here:
#https://groups.google.com/g/migrate-support/c/XjV_4jZW4RI/m/HbRWoGY6AwAJ
migrate_mutation_models <- tibble(prefix = "#$",
                                 locus=1:length(stats$locus),
                                 type = "s",
                                 model = stats$model,
                                 q1 = stats$Q2,
                                 q2 = stats$Q3)

#write_delim(migrate_mutation_models,"./coruscans/migrate_modelblock439.txt", na="", delim = " ")

Read them back in so we don’t have to recalculate them every time I knit.

So we have a 0 invariant loci, and the mean overall transition:transversion ratio is NaN. Mean gamma shape parameter is NA, which argues for only one rate.

Metapopulation Models

Columns = departure pop, Rows = arrival pop
* = equilibrium gene flow/drift OR theta (\(N_e\mu\)) parameter (on diagonal)
d = divergence without gene flow
D = divergence with gene flow
0 = parameter not included in model

E. coruscans

divergence_geneflow_1

JA    * 0   0   0
JO    D *   0   0
MHI   0 D   *   0
NWHI  0 0   D   *

divergence_geneflow_2

*   0   0   0
D   *   0   0
0   0   *   D
0   D   0   *

divergence_geneflow_3

*   0   0   0
0   *   0   D
0   0   *   D
D   0   0   *

divergence_only_1

*   0   0   0
d   *   0   0
0   d   *   0
0   0   d   *

divergence_only_2

*   0   0   0
d   *   0   0
0   0   *   d
0   d   0   *

divergence_only_3

*   0   0   0
0   *   0   d
0   0   *   d
d   0   0   *

geneflow_only_full

*   *   *   *
*   *   *   *
*   *   *   *
*   *   *   *

geneflow_only_1

*   0   0   0
*   *   0   0
0   *   *   0
0   0   *   *

geneflow_only_2

*   0   0   0
*   *   0   0
0   0   *   *
0   *   0   *

geneflow_only_3

*   0   0   0
0   *   0   *
0   0   *   *
*   0   0   *

n-island

m   m   m   m
m   m   m   m
m   m   m   m
m   m   m   m

panmixia

*


population-relabel={1 1 1 1}

E. carbunculus

divergence_geneflow_1

JA    * 0   0
MHI   D *   0
NWHI  0 D   *

divergence_geneflow_2

*   0   0
0   *   D
D   0   *

divergence_only_1

*   0   0
d   *   0
0   d   *

divergence_only_2

*   0   0
0   *   d
d   0   *

geneflow_only_full

*   0   0
*   *   0
0   *   *

geneflow_only_1

*   0   0
0   *   *
*   0   *

geneflow_only_2

*   *   *
*   *   *
*   *   *

n-island

m   m   m
m   m   m
m   m   m

panmixia

*


population-relabel={1 1 1 1}

Parmfile

Here’s the parmfile I am using, based on my parmfile for Palythoa tuberculosa. This one depicts a model of divergence without gene flow. Migration prior is increased to have a maximum of 1e8 (100,000,000), and mean of 1e4 (10,000). This reflects a prior expectation that \(m/\mu\) will not be larger than this value, if m is not larger than 0.1 (10% of population migrates), assuming a nuclear mutation rate of 1e-9. Also on average one out of every 100K individuals will send a migrant (m = 1e-5).

################################################################################
# Parmfile for Migrate 4.4.4(git:v4-series-26-ge85c6ff)-June-1-2019 [do not remove these first TWO lines]
# generated automatically on
# Fri Feb 4 2022
menu=NO
nmlength=10
datatype=SequenceData
datamodel=HKY
ttratio=1.000000

freqs-from-data=YES 

seqerror-rate={0.0001,0.0001,0.0001,0.0001}
categories=1 #no categories file specified
rates=1: 1.000000 
prob-rates=1: 1.000000 
autocorrelation=NO
weights=NO
recover=NO
fast-likelihood=NO
inheritance-scalars={1.00000000000000000000}
haplotyping=YES:no-report
population-relabel={1 2 3 4}
infile=../../Ecarbunculus.mig
random-seed=AUTO #OWN:410568459
title= Etelis coruscans
progress=YES
logfile=YES:logfile.txt
print-data=NO
outfile=outfile.txt
pdf-outfile=outfile.pdf
pdf-terse=YES
use-M=YES
print-tree=NONE
mig-histogram=MIGRATIONEVENTSONLY
skyline=NO #needs mig-histogram=ALL:...
theta=PRIOR:50
migration=PRIOR:10
rate=PRIOR:50
split=PRIOR:10
splitstd=PRIOR:10
mutation=CONSTANT
analyze-loci=A
divergence-distrib=E
custom-migration={
*   0   0   0
d   *   0   0
0   d   *   0
0 0 d *
}
geo=NO

updatefreq=0.200000 0.200000 0.200000 0.200000 #tree, parameter haplotype, timeparam updates
bayes-posteriorbins= 2000 2000
bayes-posteriormaxtype=TOTAL
bayes-file=YES:bayesfile
bayes-allfile=YES:bayesallfile
bayes-all-posteriors=YES:bayesallposterior
bayes-proposals= THETA METROPOLIS-HASTINGS Sampler
bayes-proposals= MIG SLICE Sampler
bayes-proposals= DIVERGENCE METROPOLIS-HASTINGS Sampler
bayes-proposals= DIVERGENCESTD METROPOLIS-HASTINGS Sampler
bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.0100 10000.000000 100000000 10000
bayes-priors= SPLIT * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-hyperpriors=NO
long-chains=1
long-inc=100
long-sample=10000
burn-in=2000  
auto-tune=YES:0.440000
assign=NO
heating=YES:1:{1.000000,1.500000,3.000000,1000000.000000}
heated-swap=YES
moving-steps=NO
gelman-convergence=No
replicate=YES:3
end

Copy It Up

Copy it up, and then make a total of 10 replicate copies of the folder.

scp -r coruscans/migrate/rep_template eric@threadripper.tobolab.org:./Etelis/coruscans/rep1
scp -r coruscans/Ecoruscans439.mig eric@threadripper.tobolab.org:./Etelis/coruscans/Ecoruscans2.mig

scp -r carbunculus/migrate/rep_template eric@threadripper.tobolab.org:./Etelis/carbunculus/rep1
scp -r carbunculus/Ecarbunculus999.mig eric@threadripper.tobolab.org:./Etelis/carbunculus/Ecarbunculus.mig

#once on server, copy it 9 times
for a in $(seq 2 10); do cp -r rep1 rep$a; done

Bash Script

So we will do 10 replicates of 3 replicates. This will start at r1, and run all models for that before moving on. Pretty sure this will finish one whole model before moving on to the next one (since all threads are being used for different loci). This uses two threads per core as well with the –use-hwthread-cpus flag.

### Bash Script
#!
for r in */
        do
                cd $r
                echo $r
                date
                date > date.txt
                        for m in */
                          do
                                cd $m
                                  date > date.txt
                                  echo $m
                                  date
                                  mpirun --use-hwthread-cpus -np 120 ~/migrate-5.0.4/src/migrate-n-mpi parmfile
                                  sleep 1
                                cd ..
                          done
                cd ..
        done

The way that model marginal likelihood is calculated depends on the scaling constant k, which is an indicator of agreement on parameter values between loci. To get this, migrate uses the same histograms that it uses in its output. Thus, setting number of bins becomes important, as does using a version of migrate that doesn’t have the visualization issues that Peter reported. I revised my bash submission script to:

  1. make a new copy of parmfile called parmfile_rerun
  2. change recover=NO to recover=YES - this will make migrate summarize the current bayesallfile, rather than create a brand new run
  3. Change the number of bins from 3,000 to 2^14 = 16,384
  4. Write the outfile to outfile_rerun.txt
  5. Use version 5.0.4 instead of 4.4.4 (which has viz problems)

Re-running with more bins

#| eval: false

<!-- ### Bash Script -->
<!-- #! -->
<!-- for r in */ -->
<!--    do -->
<!--     echo starting $r at $(date ) >> progress_rerun.txt -->
<!--          cd $r -->
<!--        echo $r -->
<!--        date -->
<!--        echo starting $r at $(date ) >> progress_rerun.txt -->
<!--            for m in */ -->
<!--              do -->
<!--              echo starting $m at $(date )>> progress_rerun.txt -->
<!--                cd $m -->
<!--                  date > date_rerun.txt -->
<!--                  echo $m -->
<!--                  date -->
<!--                  # copy the parmfile to parmfile_rerun, and replace the following 3 lines -->
<!--                  cp parmfile parmfile_rerun -->
<!--                  sed -i 's/recover=NO/recover=YES/g' parmfile_rerun -->
<!--                  sed -i 's/outfile=outfile.txt/outfile=outfile_rerun.txt/g' parmfile_rerun -->
<!--                  sed -i 's/bayes-posteriorbins= 3000 3000/bayes-posteriorbins= 16384 16384/g' parmfile_rerun -->

<!--                  mpirun --use-hwthread-cpus -np 60 ~/migrate-5.0.4/src/migrate-n-mpi parmfile_rerun -->
<!--                  sleep 1 -->
<!--                cd .. -->
<!--              done -->
<!--        cd .. -->
<!--    done -->

And back down

And ten replicates finished in about 10 days! Found this guide to download everything except the massive bayesallfiles and pdfs

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" --exclude="*.mig" eric@threadripper.tobolab.org:~/Etelis/coruscans ./coruscans/migrate/output2

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" --exclude="*.mig" eric@moneta.tobolab.org:~/Etelis/coruscans ./coruscans/migrate/output

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" --exclude="*.mig" eric@threadripper.tobolab.org:~/Etelis/carbunculus ./carbunculus/migrate/output

Checking the effects of a bad Ti/Tv ratio

So, I was using a Ti/Tv ratio in my K80/HKY mutation models that was way out of bounds. I’ve exported the locus-by-locus marginal likelihoods out of the panmixia, full and divergence-with-gene-flow3 models to see what happens when I remove the bad loci.

mlikes <- read_csv("coruscans/model_marglike_comp.csv") %>% 
            filter(!Locus_Index %in% badloci$Locus) %>%
              filter(!Locus_Index==440) %>%  #locus 440 is the migrate sum
              summarize(across(.fns = sum))

mlikes

With this new back-of-the-envelope analysis, the full model comes 3rd, after divergence-with-gene-flow3. The scaling factor might have a big effect here.

Results

E. coruscans

Model Marginal Likelihoods
runDir_cor <- "coruscans/migrate/output5"

likelist_cor <- list()
for(r in 1:5){
  rep = paste0("rep",r)
  print(rep)
  likelist_cor[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir_cor,rep),
                                                   outfileName = "outfile_rerun3.txt")
}

# Model selection for each replicate...
likelist_cor %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 10 replicates:

like.df_cor <-  likelist_cor %>% bind_rows() %>% group_by(model)

means_cor <- like.df_cor %>% summarize(bezier.corrected = mean(bezier.corrected))
  

final_model_cor <- bfcalcs(means_cor)

final_model_cor
T-Test

Difference between panmixia and the second-place full model is very significant.

top.choice <- final_model_cor$model[which(final_model_cor$choice ==1)]
second.choice <- final_model_cor$model[which(final_model_cor$choice ==2)]

permTS(like.df_cor$bezier.corrected[which(like.df_cor$model == top.choice)],
       like.df_cor$bezier.corrected[which(like.df_cor$model == second.choice)],
       alternative = "greater", method = "exact.ce")

    Exact Permutation Test (complete enumeration)

data:  GROUP 1 and GROUP 2
p-value = 0.003968
alternative hypothesis: true mean GROUP 1 - mean GROUP 2 is greater than 0
sample estimates:
mean GROUP 1 - mean GROUP 2 
                    5427.85 
Model Selection Figures

And a figure summarizing all this

models_cor <- c("Divergence With Gene Flow 1","Divergence With Gene Flow 2","Divergence With Gene Flow 3",
            "Divergence Only 1", "Divergence Only 2","Divergence Only 3",
            "Gene Flow Only Full", "Gene Flow Only 1", "Gene Flow Only 2", "Gene Flow Only 3",
            "n-island", "panmixia")

likesPlot_cor <- likelist_cor %>% bind_rows() %>% group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
              scale_x_discrete(labels = models_cor) +
              labs(x = "Metapopulation Model", y = "Bezier Corrected Marginal Likelihood")
likesPlot_cor

Final Parameter Estimates

These are the final estimates, summed over loci.

Here are the priors I’m using for the equilibrium models, as a reminder.

bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.0100 10000.000000 100000000 10000
Read in the posterior and make a labeling key
repsDir <- "~/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Etelis/migrate_output/coruscans"

# for some reason the bayesallfile has 39 columns, but only 31 are labelled.

# read in a population key to translate numbers into names in figures
popkey <- read_csv("coruscans/popkey.csv")
names(popkey$Pop) <- popkey$Index


winningModel <- "full"

posteriors <- list()

#read in the posterior for one replicate this may take 5 minutes
for(r in 1:3){
  print(paste("Loading Rep",r))
  posteriors[[r]] <- read.table(file.path(repsDir,paste0("rep",r),winningModel,
                                      "bayesallfile"), header=T) 
}

#combine into one, and then keep only the first 2 reps (based on diagnostics below)
posterior <- posteriors %>% bind_rows(.id = "rep") %>% 
                                filter(rep <= 1) %>% migrants.per.gen()


# posterior_long <- posterior %>% 
#                   pivot_longer(starts_with(c("Theta_", "M_", "Nm_","D_", "d_")),
#                                names_to = "parameter", 
#                                values_to = "value")
#make a new labeling key
posterior_parameters <- names(posterior)[str_which(names(posterior),
                                                   paste(c("Theta_", "M_", 
                                                           "Nm_","D_", "d_"), 
                                                         collapse = "|"))]

parameter_key <- posterior_parameters %>% 
                                  str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
                                  str_replace("M_", "m/mu " ) %>% 
                                  str_replace("Theta_", "theta ") %>% 
                                  str_replace_all(popkey$Pop) %>% 
                                  str_replace("Nm_", "4Nem ") 
                                  

names(parameter_key) <- posterior_parameters
Check Convergence
#thin by taking every other step
posteriors2 <- lapply(posteriors,filter,Steps%%500==0)
# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_", 
                                                          "Nm_","D_","d_")))

                        
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)
effectiveSize(posteriors2[[1]])
#ggmcmc(posterior.ggs, param_page = 4)

Took 51 minutes to generate the ggmcmc output, but everything looks really really good, other than the Geweke diagnostics… need to look that up.

Effective Sizes

Effective sizes are good for one replicate

effectiveSize(posteriors2[[1]]) (first replicate)

Theta_1    Theta_2    Theta_3    Theta_4      M_2_1      M_3_1      M_4_1      M_1_2      M_3_2 
  56487.86 1122681.80   75221.67   22195.94  395848.98 2059638.34  996418.98  356999.16  493093.13 
     M_4_2      M_1_3      M_2_3      M_4_3      M_1_4      M_2_4      M_3_4 
 428010.34  865528.01  330380.72 1714705.09 1181245.08  537577.93 1897040.34
Gelman-Rubin Diagnostic

Gelman-Rubin diagnostic great, both transformed and untransformed!


gelman.diag(posteriors.mcmc)

# Potential scale reduction factors:
# 
#         Point est. Upper C.I.
# Theta_1       1.00       1.00
# Theta_2       1.00       1.00
# Theta_3       1.00       1.00
# Theta_4       1.00       1.00
# M_2_1         1.00       1.01
# M_3_1         1.00       1.01
# M_4_1         1.00       1.00
# M_1_2         1.00       1.00
# M_3_2         1.00       1.00
# M_4_2         1.00       1.00
# M_1_3         1.00       1.01
# M_2_3         1.00       1.01
# M_4_3         1.00       1.00
# M_1_4         1.01       1.01
# M_2_4         1.00       1.00
# M_3_4         1.01       1.01
# 
# Multivariate psrf
# 
# 1

gelman.diag(posteriors.mcmc, transform = T)
Theta Estimates

And here I run the code for the \(\theta\) estimates

parameter_type <- "Theta"

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
thetas <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,0.001,0.1))

#convert to long format
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

theta.colors <- brewer.pal(4,"Spectral")
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)

#plot
thetas_density <-  ggplot(thetas_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = theta.colors, 
                             labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.125) + xlim(0,0.0025)

thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\Theta$")) + xlim(0,0.0025)

thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x, 
                                          violinwidth = Density*10, 
                                          fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(y = TeX("$\\Theta$")) + ylim(0,0.0025)

ggsave("./coruscans/migrate/theta_density_full.jpg", plot = thetas_density)
ggsave("./coruscans/migrate/theta_ridges_full.jpg", plot = thetas_ridges)
ggsave("./coruscans/migrate/theta_violins_full.jpg", plot = thetas_violins)

# get stats for each summed-over-loci posterior

theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas, 
                                                             parameter = .x),
                                      .id = "parameter")
#write.csv(theta_stats,"./coruscans/migrate/theta_stats_full.csv")

Theta estimates:

thetastats <- read_csv("./coruscans/migrate/theta_stats_full.csv")
New names:
Rows: 4 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
thetastats

Theta Density Theta Ridges

Theta Violins

m/mu estimates
parameter_type <- "M"

# use this to figure out the granularity of the prior needed for summarize_prior
#seq(0,100000, length.out = 2^17)

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
Ms <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,10000,100000), n = 2^17)

#convert to long format
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")


migration.colors <- c(theta.colors[1],theta.colors.dark[1],theta.colors.light[1],
                      theta.colors[2],theta.colors.dark[2],theta.colors.light[2],
                      theta.colors[3],theta.colors.dark[3],theta.colors.light[3],
                      theta.colors[4],theta.colors.dark[4],theta.colors.light[4])           
#plot
Ms_density <-  ggplot(Ms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(size = 1) + 
        scale_x_log10(limits = c(1,3000)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted")
  
           
#plot
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        #scale_x_log10(limits = c(1,2000)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\frac{m}{\\mu}$")) + xlim(0,1200)

  
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") +
        scale_y_log10(limits = c(1,1200)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) 

# ggsave("./coruscans/migrate/Mmu_density_full.pdf", plot = Ms_density)
# ggsave("./coruscans/migrate/Mmu_ridges_full.pdf", plot = M_ridges)
# ggsave("./coruscans/migrate/Mmu_violins_full.pdf", plot = M_violins)

# get stats for each summed-over-loci posterior

Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms, 
                                                             parameter = .x),
                                      .id = "parameter")

#write.csv(Ms_stats,"./coruscans/migrate/Mmu_stats_full.csv")

M/mu estimates:

ms_stats <- read_csv("./coruscans/migrate/Mmu_stats_full.csv")
New names:
Rows: 12 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
ms_stats

M/mu Density

M/mu Ridges

M/mu Violins

4Nem estimates
parameter_type <- "Nm"

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
nms <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,10,100), n = 2^14)

#convert to long format
nms_long <- nms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

          
#plot
Nms_density <-  ggplot(nms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(size = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
         xlim(0,1.5)
  
           
#plot
nm_ridges <- ggplot(nms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$4N_em$")) +
        xlim(0,1.5)
  
nm_violins <- ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        ylim(0,1.5)
# get stats for each summed-over-loci posterior

nms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = nms, 
                                                             parameter = .x),
                                      .id = "parameter")

ggsave("./coruscans/migrate/Nm_density_full.pdf", plot = Nms_density)
ggsave("./coruscans/migrate/Nm_ridges_full.pdf", plot = nm_ridges)
ggsave("./coruscans/migrate/Nm_violins_full.pdf", plot = nm_violins)
write.csv(nms_stats,"./coruscans/migrate/Nm_stats_full.csv")

\(4N_em\) estimates:

nms_stats <- read_csv("./coruscans/migrate/Nm_stats_full.csv")
New names:
Rows: 12 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
nms_stats

4Nm Density

4Nm Ridges

4Nm Violins

E. carbunculus

Model Marginal Likelihoods

runDir_car <- "carbunculus/migrate/output5"

likelist_car <- list()
for(r in 1:5){
  rep = paste0("rep",r)
  print(rep)
  likelist_car[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir_car,rep),
                                                   outfileName = "outfile_rerun3.txt")
}

# Model selection for each replicate...
likelist_car %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 5 replicates:

like.df_car <-  likelist_car %>% bind_rows() %>% group_by(model)

means_car <- like.df_car %>% summarize(bezier.corrected = mean(bezier.corrected))
  

final_model_car <- bfcalcs(means_car)

final_model_car

T-Test

Difference between panmixia and the second-place full model is very significant.

top.choice <- final_model_car$model[which(final_model_car$choice ==1)]
second.choice <- final_model_car$model[which(final_model_car$choice ==2)]

permTS(like.df_car$bezier.corrected[which(like.df_car$model == top.choice)],
       like.df_car$bezier.corrected[which(like.df_car$model == second.choice)],
       alternative = "greater", method = "exact.ce")

    Exact Permutation Test (complete enumeration)

data:  GROUP 1 and GROUP 2
p-value = 0.003968
alternative hypothesis: true mean GROUP 1 - mean GROUP 2 is greater than 0
sample estimates:
mean GROUP 1 - mean GROUP 2 
                   5492.338 

Model-Selection Figures

And a figure summarizing all this

models <- c("Divergence With Gene Flow 1","Divergence With Gene Flow 2",
            "Divergence Only 1", "Divergence Only 2",
            "Gene Flow Only 1", "Gene Flow Only 2", "Gene Flow Only Full",
            "n-island", "panmixia")

likesPlot_car <- likelist_car %>% bind_rows() %>% group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
              scale_x_discrete(labels = models) +
              labs(x = "Metapopulation Model", y = "Bezier Corrected Marginal Likelihood")
likesPlot_car

Final Parameter Estimates

These are the final estimates, summed over loci.

Here are the priors I’m using for the equilibrium models, as a reminder.

bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.0100 10000.000000 100000000 10000
Read in the posterior and make a labeling key
repsDir <- "~/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Etelis/migrate_output/carbunculus/output5"

# for some reason the bayesallfile has 39 columns, but only 31 are labelled.

# read in a population key to translate numbers into names in figures
popkey <- read_csv("carbunculus/popkey.csv")
names(popkey$Pop) <- popkey$Index


winningModel <- "geneflow_only_full"

posteriors <- list()

#read in the posterior for one replicate this may take 5 minutes
for(r in 2:4){
  print(paste("Loading Rep",r))
  posteriors[[r]] <- read_tsv(file.path(repsDir,paste0("rep",r),winningModel,
                                      "bayesallfile"), comment = "#") 
}

# This creates an empty element number one, set that to null
posteriors[[1]] <- NULL


#combine into one, and then keep only the first 1 rep (based on diagnostics below)
#also need to do this to keep "rep" as a column for summarizing over loci function.
posterior <- posteriors %>% bind_rows(.id = "rep") %>%  
                                filter(rep <= 1) %>% migrants.per.gen()

# posterior_long <- posterior %>% 
#                   pivot_longer(starts_with(c("Theta_", "M_", "Nm_","D_", "d_")),
#                                names_to = "parameter", 
#                                values_to = "value")
#make a new labeling key
posterior_parameters <- names(posterior)[str_which(names(posterior),
                                                   paste(c("Theta_", "M_", 
                                                           "Nm_","D_", "d_"), 
                                                         collapse = "|"))]

parameter_key <- posterior_parameters %>% 
                                  str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
                                  str_replace("M_", "m/mu " ) %>% 
                                  str_replace("Theta_", "theta ") %>% 
                                  str_replace_all(popkey$Pop) %>% 
                                  str_replace("Nm_", "4Nem ") 
                                  

names(parameter_key) <- posterior_parameters
Check Convergence
#thin by taking every 5 steps
posteriors2 <- map(posteriors,filter,Steps%%500==0)
#posteriors2 <- map(posteriors2, select, -mL_harmonic) #remove mL_harmonic, which took in extra unlabeled columns
# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_", 
                                                          "Nm_","D_","d_")))

                        
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)
effectiveSize(posteriors.mcmc[[1]])
#ggmcmc(posterior.ggs, param_page = 4)

Took 51 minutes to generate the ggmcmc output, but everything looks really really good, other than the Geweke diagnostics… need to look that up.

Effective Sizes

Effective sizes are good for one replicate - although this isn’t broken up by locus.

effectiveSize(posteriors2[[2]]) #same results achieved for effectiveSize(posteriors.mcmc[[2]])
Theta_1 Theta_2 Theta_3   M_2_1   M_3_1   M_1_2   M_3_2   M_1_3   M_2_3 
4725164 4925675 4731644 4285552 4023338 4464066 4439156 4072776 5068390 
Gelman-Rubin Diagnostic

Gelman-Rubin diagnostic perfect, both transformed and untransformed!


gelman.diag(posteriors.mcmc)

Potential scale reduction factors:

        Point est. Upper C.I.
Theta_1          1          1
Theta_2          1          1
Theta_3          1          1
M_3_2            1          1
M_1_3            1          1
D_3_2            1          1
D_1_3            1          1

Multivariate psrf

1

gelman.diag(posteriors.mcmc, transform = T)
Theta Estimates

And here I run the code for the \(\theta\) estimates

parameter_type <- "Theta"

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
thetas <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,0.001,0.1))

#convert to long format
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

theta.colors <- brewer.pal(3,"Spectral")
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)

#plot
thetas_density <-  ggplot(thetas_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = theta.colors, 
                             labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.06) + xlim(0,0.0125)

thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\Theta$")) + xlim(0,0.0125)

thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x, 
                                          violinwidth = Density*10, 
                                          fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(y = TeX("$\\Theta$")) + ylim(0,0.0125)

#ggsave("./carbunculus/migrate/theta_density_full.jpg", plot = thetas_density)
#ggsave("./carbunculus/migrate/theta_ridges_full.jpg", plot = thetas_ridges)
#ggsave("./carbunculus/migrate/theta_violins_full.jpg", plot = thetas_violins)

# get stats for each summed-over-loci posterior

theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas, 
                                                             parameter = .x),
                                      .id = "parameter")
#write.csv(theta_stats,"./carbunculus/migrate/theta_stats_full.csv")

Theta estimates:

thetastats <- read_csv("./carbunculus/migrate/theta_stats_full.csv")
New names:
Rows: 3 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
thetastats

Theta Density Theta Ridges

Theta Violins

m/mu estimates
parameter_type <- "M"

# use this to figure out the granularity of the prior needed for summarize_prior
#seq(0,100000000, length.out = 2^17)

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
Ms <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,10000,100000), n = 2^17)

#convert to long format
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")


migration.colors <- c(theta.colors[1],theta.colors.dark[1],theta.colors.light[1],
                      theta.colors[2],theta.colors.dark[2],theta.colors.light[2],
                      theta.colors[3],theta.colors.dark[3],theta.colors.light[3])
                                 
#plot
Ms_density <-  ggplot(Ms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        scale_x_log10(limits = c(100,1000)) + 
        scale_color_discrete(type = theta.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted")
  
           
#plot
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        #scale_x_log10(limits = c(1,2000)) +
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\frac{m}{\\mu}$")) + xlim(0,1000)

  
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") +
        scale_y_log10(limits = c(100,1000)) +
        scale_fill_discrete(type = theta.colors, labels = parameter_key) 

# ggsave("./carbunculus/migrate/Mmu_density_full.jpg", plot = Ms_density)
# ggsave("./carbunculus/migrate/Mmu_ridges_full.jpg", plot = M_ridges)
# ggsave("./carbunculus/migrate/Mmu_violins_full.jpg", plot = M_violins)

# get stats for each summed-over-loci posterior

Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms, 
                                                             parameter = .x),
                                      .id = "parameter")

#write.csv(Ms_stats,"./carbunculus/migrate/Mmu_stats_full.csv")

M/mu estimates:

ms_stats <- read_csv("./carbunculus/migrate/Mmu_stats_full.csv")
New names:
Rows: 6 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
ms_stats

M/mu Density

M/mu Ridges

M/mu Violins

4Nem estimates
parameter_type <- "Nm"

parameters <- names(posterior) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
nms <- summarize_posterior(posterior, parameter_type = parameter_type, 
                              prior_params = c(0,10,100), n = 2^14)

#convert to long format
nms_long <- nms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

          
#plot
Nms_density <-  ggplot(nms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(size = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = theta.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
         xlim(0,2.5)
  
           
#plot
nm_ridges <- ggplot(nms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$4N_em$")) +
        xlim(0,2.5)
  
nm_violins <- ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        ylim(0,2.5)
# get stats for each summed-over-loci posterior

nms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = nms, 
                                                             parameter = .x),
                                      .id = "parameter")

ggsave("./carbunculus/migrate/Nm_density_full.jpg", plot = Nms_density)
ggsave("./carbunculus/migrate/Nm_ridges_full.jpg", plot = nm_ridges)
ggsave("./carbunculus/migrate/Nm_violins_full.jpg", plot = nm_violins)
write.csv(nms_stats,"./carbunculus/migrate/Nm_stats_full.csv")

\(4N_em\) estimates:

nms_stats <- read_csv("./carbunculus/migrate/Nm_stats_full.csv")
New names:
Rows: 6 Columns: 10
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): parameter dbl (9): ...1, 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
nms_stats

4Nm Density

4Nm Ridges

4Nm Violins

Sum over loci - Bayesfile

This is code for summing over loci in a Bayesfile. Sticking it down here for now.

Read Bayesfile

These functions are for reconstructing distributions from the bayesfile which consists of distributions for each locus calculated as histograms

bayesfilenames <- c("Locus","pnumber","HPC50","HPC95","value","count",
                    "freq","cumFreq","priorFreq")

bf <- read_table(file = file.path(winningModelDir,"bayesfile"), 
                             col_types = "ifiiddddd", col_names = bayesfilenames,
                             comment = "#", skip_empty_rows = T)

parameter_number_table <- read_table(file = file.path(winningModelDir,"bayesfile"), 
                             col_types = "cic", 
                             col_names = c("drop","pnumber","parameter"),
                             skip=9, n_max = 16) %>% 
                      select(-drop) %>%
                      mutate(parameter = str_replace_all(parameter, ",","_")) %>% 
                      mutate(parameter = str_remove_all(parameter,"\\(")) %>% 
                      mutate(parameter = str_remove_all(parameter,"\\)")) %>% 
                      left_join(tibble(parameter_name =
                                        parameter_key,
                                       parameter=names(parameter_key)),
                                by = "parameter")

parameter_number_key <- parameter_number_table$parameter_name 
names(parameter_number_key) <- parameter_number_table$pnumber

#bf %>% ggplot(aes(x=value, y = freq, color = Locus)) + 
#  geom_line() + scale_color_continuous() + 
#  facet_wrap(vars(Parameter), scales = "free")

# bf %>% filter(pnumber %in% 1:9) %>% filter(Locus %in% 1:109) %>% 
#       ggplot(aes(x=value, y = freq, color = Locus)) + geom_line() +
#       facet_wrap(vars(pnumber), scales = "free")

Theta

bf %>% filter(pnumber %in% 1:4) %>% filter(Locus == 440) %>% 
      ggplot(aes(x = value, y = freq, color = pnumber)) + geom_line() +
      scale_color_brewer(palette = "YlGnBu", labels = parameter_number_key) +
      ylim(0,0.02) + xlim(0,0.015) + geom_line(aes(x = value, y = priorFreq),
                                             color = "grey", linetype="dotted")

M/mu

bf %>% filter(pnumber %in% 6:16) %>% filter(Locus == 440) %>% 
      ggplot(aes(x = value, y = freq, color = pnumber)) + geom_line() +
      #scale_color_brewer(palette = "YlGnBu", labels = parameter_number_key) +
      geom_line(aes(x = value, y = priorFreq),
                                             color = "grey", linetype="dotted") +
      scale_x_log10(limits = c(1e4,1e6)) + ylim(0,0.05)

References

Cerca, José, Marius F. Maurstad, Nicolas C. Rochette, Angel G. Rivera-Colón, Niraj Rayamajhi, Julian M. Catchen, and Torsten H. Struck. 2021. “Removing the Bad Apples: A Simple Bioinformatic Method to Improve Loci-Recovery in de Novo RADseq Data for Non-Model Organisms.” Methods in Ecology and Evolution 12 (5): 805–17. https://doi.org/10.1111/2041-210x.13562.
Thia, Joshua A. 2022. “Guidelines for Standardizing the Application of Discriminant Analysis of Principal Components to Genotype Data.” Molecular Ecology Resources n/a (n/a). https://doi.org/10.1111/1755-0998.13706.