##################Some R scripts for (1) linkage disequilibrium, (2) Fst and  (3) Qst-Fst analysis on the basis of genomic scale of SNP data
############Author: Zitong Li
###########Contact information: zitong.li@helsinki.fi

###########First, please download our SNP data 'data.vcf.gz' from DRYAD.
###########original data were saved in VCF file
#For the biallelic SNP datasets, it will be good to use VCFtools to convert the data into a .012 matrix, with each row representing individuals and each column representing a SNP. The genotype AA, AB, and BB are coded as 0, 1 and 2, respectively.
#In linux terminal (assuming the VCFtools have been installed), type 
vcftools --gzvcf  data.vcf.gz  --012 --out data
#to convert .VCF data to .012 data. 
#Three files are generated: "data.012" is the genotype matrix, "data.pos" contains the information of SNPs on the genome (e.g. chromosome, physical position), and "data.indv" are the individual ID.

#For multiallelic microsatellite data: micro_data.vcf.gz, in terminal, type

vcffile=data.vcf.gz 
outfile=data.csv
echo -n "locus," > $outfile
bcftools query -l $vcffile |tr '\n' ',' |sed 's/,$//' >> $outfile
echo >> $outfile

bcftools query -f '%CHROM\t%POS[\t%GT]\n' $vcffile | \
perl -ne 's/\./-1/g;@r=split /\s+/,$_;$r[0]=~s/\|.+//;print 
$r[0]."_".$r[1];for($i=2;$i<42;$i++){($a,$b)=split 
/\//,$r[$i];$a++;$b++;printf(",%.2i%.2i",$a,$b);}print "\n"' >> $outfile

#to convert the VCF data to a numerical genotype matrix saved in data.csv. The data format of a genotype is like "0101", where '01' and 01' represents the numerical value of the first and second alleles. 

#load the SNP data into R
geno <- read.table('data.012',head=FALSE)
pos <- read.table('data.012.pos',head=FALSE)
ID <- read.table('data.012.012.indv',head=FALSE)

#load microsatellite data
#geno=read.table("data.csv",sep=",",colClasses='character',row.names=1,header=T)

#look at how the genotype data look like
geno[1:10,1:10]

#First we conduct LD analysis, and evaluate its impact on the Fst baseline estimation.
#Load R packages 'SNPRelate' and 'gdsfmt', which can be installed from Bioconductor  
library(gdsfmt)
library(SNPRelate)


#extract the chromosome information of each SNP
chr <- pos[,1]
chr <- as.numeric(substr(chr, start=3, stop=4))

#extract the physical distance information of each SNP
pdis <- pos[,2]


#prepare the data into the SNPRelate format
U <- t(geno[,-1])
snpgdsCreateGeno("data.gds", genmat = U,
    sample.id = 1:40, snp.id = 1:dim(U)[1],
    snp.chromosome = chr,
    snp.position = pdis, snpfirstdim=TRUE)
#Now the genotype data is saved in "file"
file <- snpgdsOpen("data.gds")

#Evaluate the Fst baseline 
#First specify the labels of the populations for each individual. (This should be adjusted according to the users' own data.)  
pop <- as.factor(c(rep('HEL',10),rep('PYO',10),'BYN',rep('LEV',10),rep('BYN',9)))
#Calculate the Fst using the Weir&Cockerham (1984) approach

result <- snpgdsFst(file,snp.id=snpset.id,population=pop, method=c("W&C84"))
result


#Do LD pruning to choose a subset of unlinked SNPs from the data
snpset <- snpgdsLDpruning(file, method = c("composite"), ld.threshold=0.2)
snpset.id <- unlist(snpset)
#Evaluate the impact of the LD pruning on the Fst estimation, try different threshold for LD pruning 

result <- snpgdsFst(file,snp.id=snpset.id,population=pop, method=c("W&C84"))


######################QST-FST comparison studies
#######################Driftsel analysis on the SNP data (microsatellite data can be prepared in a very similar fasion)
###First load R package Driftsel and RAFM. RAFM is first used to calculate the population level coancestry matrix on the basis of the genotype data.
###Driftsel is next applied to calculate the QST-FST statistic on the basis of quantitative traits and the coancestry matrix

library(driftsel)
library(RAFM)
 
######First convert the genotype data into the format that Driftsel can read
n <- dim(geno)[1] #get the number of individuals
p <- dim(geno)[2] #get the number of SNPs

#create a n*2p matrix to save the two alleles of a locus seperately

geno2 <- matrix(NA,nrow=n,ncol= p*2)

for (i in 1:n){
for (j in 1:p) {
if (geno[i,j]==0){
geno2[i,2*j-1]=100
geno2[i,2*j]=100
}
if (geno[i,j]==1){
geno2[i,2*j-1]=100
geno2[i,2*j]=200
}
if (geno[i,j]==2){
geno2[i,2*j-1]=200
geno2[i,2*j]=200
}
}
}

#Specify names for each allele
colnames(geno2) <- paste(paste('loc',sep='',rep(1:p, each=2)),sep='.',rep(1:2,p))

#specify the population strucure for the genotype data. The population labels have to be numerical values

pop2 <-  c(rep(1,10),rep(2,10),4,rep(3,10),rep(4,9))

#Combine the population structure and the genotype data

geno2 <- cbind(pop2,geno2)

colnames(geno2)[1] <- 'subpop'

#Run the RAFM analysis

ptm <- proc.time()
afm <- do.all(geno, 15000, 5000, 10)   # 15000 iterations, first 5000 as burn-in, the rest saved in every 10th iteractions
proc.time() - ptm

#check the mean Fst estimated by RAFM
mean(afm$fst)

#Read phenotype data into R, which can be downloaded from DRYAD: 10.5061/dryad.sg546

Y <- read.table('KarhunenEtAl_Evolution_phenotypes.txt',head=TRUE)

#Collect pedigree data

ped <- cbind(Y$id,Y$sire,Y$dam,Y$sire.pop,Y$dam.pop)
colnames(ped) <- c('id','sire','dam','sire.pop','dam.pop')

#Match the labels of the populations in the phenotype data to the genotypes. Note that this does not need to be done in your own data, if the population labels in genotypes and phenotypes are matched.



 ped[ped[,5]==4,4]=2
 ped[ped[,5]==2,4]=4
 ped[ped[,5]==3,4]=3
 ped[,5]=ped[,4]

#Extract covariates info, which is sex in this case. 

covars <- cbind(Y$id,Y$sex)

colnames(covars) <- c('id','sex')

#Extract the trait 'standard length' 

traits1 <- cbind(Y$id,Y$SL)
traits1 <- traits1[ID,]
colnames(traits1) <- c('id','trait.1')

#Extract five morphorlogy traits for multivariate analysis. Data for other traits can be prepared similarly.
traits2 <- cbind(Y$id,Y$SL,Y$body.depth,Y$head.length,Y$ pg.length,Y$cp.length)
traits2 <- traits2[ID,]
colnames(traits2) <- c('id','trait.1','trait.2','trait.3','trait.4','trait.5')

#Run the driftsel analysis on standard length
samp1 <- MH(afm$theta, ped, covars, traits1, 15000, 5000, 10, alt=T) 

#Run the driftsel analysis simultaneously on the five morphological traits
samp2 <- MH(afm$theta, ped, covars, traits2, 15000, 5000, 10, alt=T) 

#Specify the habitat information for 4 populations, 1: Marine and 0:Pond
env = cbind(1:4,c(1,0,0,1))

#Calculate the S test statistic. S>0.95 implies divergent selection; S<0.05 implies stabilising selection; S=0.5 implies perfect genetic drift

S1 <- neut.test(samp1$pop.ef, samp1$G, samp1$theta, silent=T)
S2 <- neut.test(samp2$pop.ef, samp2$G, samp2$theta, silent=T)  

#Calculate H test statistic. Interpretation is the same as the S statistic

H1 <- H.test(samp1$pop.ef, samp1$G, samp1$theta, env, silent = T)
H2 <- H.test(samp2$pop.ef, samp2$G, samp2$theta, env, silent = T)