##################Some R scripts for (1) linkage disequilibrium, (2) Fst and (3) Qst-Fst analysis on the basis of genomic scale of SNP data ############Author: Zitong Li ###########Contact information: zitong.li@helsinki.fi ###########First, please download our SNP data 'data.vcf.gz' from DRYAD. ###########original data were saved in VCF file #For the biallelic SNP datasets, it will be good to use VCFtools to convert the data into a .012 matrix, with each row representing individuals and each column representing a SNP. The genotype AA, AB, and BB are coded as 0, 1 and 2, respectively. #In linux terminal (assuming the VCFtools have been installed), type vcftools --gzvcf data.vcf.gz --012 --out data #to convert .VCF data to .012 data. #Three files are generated: "data.012" is the genotype matrix, "data.pos" contains the information of SNPs on the genome (e.g. chromosome, physical position), and "data.indv" are the individual ID. #For multiallelic microsatellite data: micro_data.vcf.gz, in terminal, type vcffile=data.vcf.gz outfile=data.csv echo -n "locus," > $outfile bcftools query -l $vcffile |tr '\n' ',' |sed 's/,$//' >> $outfile echo >> $outfile bcftools query -f '%CHROM\t%POS[\t%GT]\n' $vcffile | \ perl -ne 's/\./-1/g;@r=split /\s+/,$_;$r[0]=~s/\|.+//;print $r[0]."_".$r[1];for($i=2;$i<42;$i++){($a,$b)=split /\//,$r[$i];$a++;$b++;printf(",%.2i%.2i",$a,$b);}print "\n"' >> $outfile #to convert the VCF data to a numerical genotype matrix saved in data.csv. The data format of a genotype is like "0101", where '01' and 01' represents the numerical value of the first and second alleles. #load the SNP data into R geno <- read.table('data.012',head=FALSE) pos <- read.table('data.012.pos',head=FALSE) ID <- read.table('data.012.012.indv',head=FALSE) #load microsatellite data #geno=read.table("data.csv",sep=",",colClasses='character',row.names=1,header=T) #look at how the genotype data look like geno[1:10,1:10] #First we conduct LD analysis, and evaluate its impact on the Fst baseline estimation. #Load R packages 'SNPRelate' and 'gdsfmt', which can be installed from Bioconductor library(gdsfmt) library(SNPRelate) #extract the chromosome information of each SNP chr <- pos[,1] chr <- as.numeric(substr(chr, start=3, stop=4)) #extract the physical distance information of each SNP pdis <- pos[,2] #prepare the data into the SNPRelate format U <- t(geno[,-1]) snpgdsCreateGeno("data.gds", genmat = U, sample.id = 1:40, snp.id = 1:dim(U)[1], snp.chromosome = chr, snp.position = pdis, snpfirstdim=TRUE) #Now the genotype data is saved in "file" file <- snpgdsOpen("data.gds") #Evaluate the Fst baseline #First specify the labels of the populations for each individual. (This should be adjusted according to the users' own data.) pop <- as.factor(c(rep('HEL',10),rep('PYO',10),'BYN',rep('LEV',10),rep('BYN',9))) #Calculate the Fst using the Weir&Cockerham (1984) approach result <- snpgdsFst(file,snp.id=snpset.id,population=pop, method=c("W&C84")) result #Do LD pruning to choose a subset of unlinked SNPs from the data snpset <- snpgdsLDpruning(file, method = c("composite"), ld.threshold=0.2) snpset.id <- unlist(snpset) #Evaluate the impact of the LD pruning on the Fst estimation, try different threshold for LD pruning result <- snpgdsFst(file,snp.id=snpset.id,population=pop, method=c("W&C84")) ######################QST-FST comparison studies #######################Driftsel analysis on the SNP data (microsatellite data can be prepared in a very similar fasion) ###First load R package Driftsel and RAFM. RAFM is first used to calculate the population level coancestry matrix on the basis of the genotype data. ###Driftsel is next applied to calculate the QST-FST statistic on the basis of quantitative traits and the coancestry matrix library(driftsel) library(RAFM) ######First convert the genotype data into the format that Driftsel can read n <- dim(geno)[1] #get the number of individuals p <- dim(geno)[2] #get the number of SNPs #create a n*2p matrix to save the two alleles of a locus seperately geno2 <- matrix(NA,nrow=n,ncol= p*2) for (i in 1:n){ for (j in 1:p) { if (geno[i,j]==0){ geno2[i,2*j-1]=100 geno2[i,2*j]=100 } if (geno[i,j]==1){ geno2[i,2*j-1]=100 geno2[i,2*j]=200 } if (geno[i,j]==2){ geno2[i,2*j-1]=200 geno2[i,2*j]=200 } } } #Specify names for each allele colnames(geno2) <- paste(paste('loc',sep='',rep(1:p, each=2)),sep='.',rep(1:2,p)) #specify the population strucure for the genotype data. The population labels have to be numerical values pop2 <- c(rep(1,10),rep(2,10),4,rep(3,10),rep(4,9)) #Combine the population structure and the genotype data geno2 <- cbind(pop2,geno2) colnames(geno2)[1] <- 'subpop' #Run the RAFM analysis ptm <- proc.time() afm <- do.all(geno, 15000, 5000, 10) # 15000 iterations, first 5000 as burn-in, the rest saved in every 10th iteractions proc.time() - ptm #check the mean Fst estimated by RAFM mean(afm$fst) #Read phenotype data into R, which can be downloaded from DRYAD: 10.5061/dryad.sg546 Y <- read.table('KarhunenEtAl_Evolution_phenotypes.txt',head=TRUE) #Collect pedigree data ped <- cbind(Y$id,Y$sire,Y$dam,Y$sire.pop,Y$dam.pop) colnames(ped) <- c('id','sire','dam','sire.pop','dam.pop') #Match the labels of the populations in the phenotype data to the genotypes. Note that this does not need to be done in your own data, if the population labels in genotypes and phenotypes are matched. ped[ped[,5]==4,4]=2 ped[ped[,5]==2,4]=4 ped[ped[,5]==3,4]=3 ped[,5]=ped[,4] #Extract covariates info, which is sex in this case. covars <- cbind(Y$id,Y$sex) colnames(covars) <- c('id','sex') #Extract the trait 'standard length' traits1 <- cbind(Y$id,Y$SL) traits1 <- traits1[ID,] colnames(traits1) <- c('id','trait.1') #Extract five morphorlogy traits for multivariate analysis. Data for other traits can be prepared similarly. traits2 <- cbind(Y$id,Y$SL,Y$body.depth,Y$head.length,Y$ pg.length,Y$cp.length) traits2 <- traits2[ID,] colnames(traits2) <- c('id','trait.1','trait.2','trait.3','trait.4','trait.5') #Run the driftsel analysis on standard length samp1 <- MH(afm$theta, ped, covars, traits1, 15000, 5000, 10, alt=T) #Run the driftsel analysis simultaneously on the five morphological traits samp2 <- MH(afm$theta, ped, covars, traits2, 15000, 5000, 10, alt=T) #Specify the habitat information for 4 populations, 1: Marine and 0:Pond env = cbind(1:4,c(1,0,0,1)) #Calculate the S test statistic. S>0.95 implies divergent selection; S<0.05 implies stabilising selection; S=0.5 implies perfect genetic drift S1 <- neut.test(samp1$pop.ef, samp1$G, samp1$theta, silent=T) S2 <- neut.test(samp2$pop.ef, samp2$G, samp2$theta, silent=T) #Calculate H test statistic. Interpretation is the same as the S statistic H1 <- H.test(samp1$pop.ef, samp1$G, samp1$theta, env, silent = T) H2 <- H.test(samp2$pop.ef, samp2$G, samp2$theta, env, silent = T)