--- title: "Intraspecific sequence variation and gene expression contribute little to venom diversity in Sidewinder Rattlesnakes (*Crotalus cerastes*)" author: "Rhett M. Rautsaw, Erich P. Hofmann, Mark J. Margres, Matthew L. Holding, Jason L. Strickland, Andrew J. Mason, Darin R. Rokyta, and Christopher L. Parkinson" date: "`r format(Sys.time(), '%Y %B %d')`" output: html_notebook: theme: journal toc: true toc_float: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # SNP Processing ## Preprocess ```{bash,eval=F} tabix -fp vcf *_phased.vcf # Merge SNPs into a single file vcf-merge -R 0/0 *_phased.vcf.gz > Combined.vcf bgzip -c Combined.vcf > Combined.vcf.gz tabix -fp vcf Combined.vcf.gz mkdir Analysis mv Combined.* Analysis_filtered/ cd Analysis_filtered ``` ## Tajima's D and Nucleotide Diversity per Gene ```{bash,eval=F} mkdir Genes NDiversity TajimasD_perGene RESULTS for i in `cat Genes.txt` do echo $i tabix -h Combined.vcf.gz $i > Genes/"${i}".vcf vcftools --vcf Genes/"${i}".vcf --TajimaD 8000 --out TajimasD_perGene/"${i}" vcftools --vcf Genes/"${i}".vcf --site-pi --out NDiversity/"${i}" tail +2 TajimasD_perGene/"${i}".Tajima.D >> RESULTS/TajimaD_perGene.txt tail +2 NDiversity/"${i}".sites.pi >> RESULTS/Pi_perSite.txt done # TajimaD per site mkdir TajimaD_perSite vcftools --vcf Combined.vcf --TajimaD 1 --out TajimaD_perSite/TajimaD_perSite cp TajimaD_perSite/TajimaD_perSite.Tajima.D RESULTS/TajimaD_perSite.txt ``` ## Fst Between Populations ```{bash,eval=F} # Fst per site between populations (mean per gene in R) tabix -h Combined.vcf.gz `cat Toxins.txt` > CombinedToxins.vcf tabix -h Combined.vcf.gz `cat Nontoxins.txt` > CombinedNontoxins.vcf mkdir Fst vcftools --vcf CombinedToxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop N_Mojave.txt --out Fst/Toxin_SMvNM vcftools --vcf CombinedToxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop Colorado.txt --out Fst/Toxin_SMvC vcftools --vcf CombinedToxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop Sonora.txt --out Fst/Toxin_SMvS vcftools --vcf CombinedToxins.vcf --weir-fst-pop N_Mojave.txt --weir-fst-pop Colorado.txt --out Fst/Toxin_NMvC vcftools --vcf CombinedToxins.vcf --weir-fst-pop N_Mojave.txt --weir-fst-pop Sonora.txt --out Fst/Toxin_NMvS vcftools --vcf CombinedToxins.vcf --weir-fst-pop Colorado.txt --weir-fst-pop Sonora.txt --out Fst/Toxin_CvS vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop N_Mojave.txt --out Fst/Nontoxin_SMvNM vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop Colorado.txt --out Fst/Nontoxin_SMvC vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop S_Mojave.txt --weir-fst-pop Sonora.txt --out Fst/Nontoxin_SMvS vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop N_Mojave.txt --weir-fst-pop Colorado.txt --out Fst/Nontoxin_NMvC vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop N_Mojave.txt --weir-fst-pop Sonora.txt --out Fst/Nontoxin_NMvS vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop Colorado.txt --weir-fst-pop Sonora.txt --out Fst/Nontoxin_CvS # Fst between all populations vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop Colorado.txt --weir-fst-pop Sonora.txt --weir-fst-pop N_Mojave.txt --weir-fst-pop S_Mojave.txt --out Fst/Nontoxin_All vcftools --vcf CombinedToxins.vcf --weir-fst-pop Colorado.txt --weir-fst-pop Sonora.txt --weir-fst-pop N_Mojave.txt --weir-fst-pop S_Mojave.txt --out Fst/Toxins_All # Fst between Phylogenetic East-West clades vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop PhyloEast.txt --weir-fst-pop PhyloWest.txt --out Fst/Nontoxin_PEvPW vcftools --vcf CombinedToxins.vcf --weir-fst-pop PhyloEast.txt --weir-fst-pop PhyloWest.txt --out Fst/Toxin_PEvPW # Fst between Geographic East-West clades vcftools --vcf CombinedNontoxins.vcf --weir-fst-pop East.txt --weir-fst-pop West.txt --out Fst/Nontoxin_EvW vcftools --vcf CombinedToxins.vcf --weir-fst-pop East.txt --weir-fst-pop West.txt --out Fst/Toxin_EvW for i in `cat Comparisons.txt` do echo $i perl -p -i -e "s/\n/\t ${i}\n/g" Fst/*_"${i}".weir.fst tail +2 Fst/*_"${i}".weir.fst >> Fst.txt done for i in `cat Comparisons.txt` do A="$(grep "mean Fst" Fst/Nontoxin_"${i}".log)" B="$(grep "mean Fst" Fst/Toxin_"${i}".log)" C="$(grep "weighted Fst" Fst/Nontoxin_"${i}".log)" D="$(grep "weighted Fst" Fst/Toxin_"${i}".log)" printf "$i \t $A \t $B \t $C \t $D \n" >> RESULTS/Fst_PopMeans.txt done grep -v "==" Fst.txt | grep -v '^$' > Fst_perSite.txt rm Fst.txt ``` ## Determine Nonsynonymous & Synonymous Mutations ```{bash,eval=F} snpEff Ccera Combined.vcf.gz > Annotated.vcf ``` ```{bash,eval=F,echo=F} ## TajimaD & NDiversity per population mkdir PopulationAnalyses cd PopulationAnalyses mkdir N_Mojave S_Mojave Colorado Sonora cd .. for i in `cat N_Mojave.txt` do cp ../"${i}"* PopulationAnalyses/N_Mojave/ done for i in `cat S_Mojave.txt` do cp ../"${i}"* PopulationAnalyses/S_Mojave/ done for i in `cat Colorado.txt` do cp ../"${i}"* PopulationAnalyses/Colorado/ done for i in `cat Sonora.txt` do cp ../"${i}"* PopulationAnalyses/Sonora/ done cd PopulationAnalyses for i in N_Mojave S_Mojave Colorado Sonora do cd $i vcf-merge -R 0/0 *_phased.vcf.gz > Combined.vcf bgzip -c Combined.vcf > Combined.vcf.gz tabix -fp vcf Combined.vcf.gz mkdir Analysis mv Combined.* Analysis/ cp ../../Genes.txt Analysis/ cd Analysis/ mkdir Genes NDiversity TajimasD for j in `cat Genes.txt` do echo $j tabix -h Combined.vcf.gz $j > Genes/"${j}".vcf vcftools --vcf Genes/"${j}".vcf --TajimaD 8000 --out TajimasD/"${j}" vcftools --vcf Genes/"${j}".vcf --site-pi --out NDiversity/"${j}" tail +2 TajimasD/"${j}".Tajima.D >> CombinedTajimaD.txt tail +2 NDiversity/"${j}".sites.pi >> CombinedPi.txt done cd ../.. done ``` ## Tajima's D Synonymous vs. Nonsynonymous Mutations ```{bash,eval=F} grep "synonymous_variant" Annotated.vcf > Synonymous.vcf grep -v "synonymous_variant" Annotated.vcf > Nonsynonymous.vcf # Manually edit the Synonymous.vcf to add header back into document bgzip -c Synonymous.vcf > Synonymous.vcf.gz tabix -fp vcf Synonymous.vcf.gz bgzip -c Nonsynonymous.vcf > Nonsynonymous.vcf.gz tabix -fp vcf Nonsynonymous.vcf.gz for i in `cat Genes.txt` do echo $i tabix -h Synonymous.vcf.gz $i > Genes_Synonymous/"${i}".vcf vcftools --vcf Genes_Synonymous/"${i}".vcf --TajimaD 8000 --out TajimasD_Synonymous/"${i}" tail +2 TajimasD_Synonymous/"${i}".Tajima.D >> RESULTS/TajimasD_Synonymous.txt tabix -h Nonsynonymous.vcf.gz $i > Genes_Nonsynonymous/"${i}".vcf vcftools --vcf Genes_Nonsynonymous/"${i}".vcf --TajimaD 8000 --out TajimasD_Nonsynonymous/"${i}" tail +2 TajimasD_Nonsynonymous/"${i}".Tajima.D >> RESULTS/TajimasD_Nonsynonymous.txt done ``` # R Analysis ```{r,echo=F} setwd("~/Dropbox/Projects/2018_Ccera/RESULTS/PhasedSNPs/Analysis_filtered/RESULTS/") ``` ## Loading Libraries and Data ``` {r,warning=F,error=F,message=F} library(readxl) library(ggplot2) library(dplyr) library(cowplot) library(kableExtra) library(tidyr) Data<-read_excel("./FinalData.xlsx") Data$ID<-as.factor(Data$ID) Data$class<-as.factor(Data$class) Data$toxin_class<-as.factor(Data$toxin_class) ```
## snpEff Analysis ```{r} snpEff<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","length", "Total_Nonsynonymous","Synonymous","Total_variants")] snpEff<-snpEff[complete.cases(snpEff), ] snpEff$total_snpKB<-(snpEff$Total_variants/snpEff$length)*1000 snpEff$syn_snpKB<-(snpEff$Synonymous/snpEff$length)*1000 snpEff$nonsyn_snpKB<-(snpEff$Total_Nonsynonymous/snpEff$length)*1000 snpEff_tox<-droplevels.data.frame(subset(snpEff,snpEff$class=="Toxin")) snpEff_nontox<-droplevels.data.frame(subset(snpEff,snpEff$class=="Nontoxin")) x<-snpEff %>% group_by(class) %>% summarize(n=n(), perKB_all=mean(total_snpKB), perKB_all_sd=sd(total_snpKB), Total_Nonsynonymous=sum(Total_Nonsynonymous), Total_Synonymous=sum(Synonymous), Total_Variants=sum(Total_variants), Prop_nonsyn=(Total_Nonsynonymous/Total_Variants), Prop_syn=(Total_Synonymous/Total_Variants)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant association between the type of mutation (nonsynonymous vs. synonymous) and the transcript class (toxin vs. nontoxin)? ```{r} chisq.test(x[,5:6]) ```
#### Is there a significant difference in the number of SNPs per kilobase between toxin and nontoxins? ```{r} (m<-summary(lm(snpEff$total_snpKB~snpEff$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(snpEff_nontox$total_snpKB,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","total_snpKB") data<-snpEff_tox[,c("class","total_snpKB")] data<-rbind(data,samp) z<-summary(lm(data$total_snpKB~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="SNP's per kb") b<-round(pct[2]*0.01,2) ```
## Nucleotide Diversity ```{r} NDiv<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","pi")] NDiv<-NDiv[complete.cases(NDiv), ] NDiv_tox<-droplevels.data.frame(subset(NDiv,NDiv$class=="Toxin")) NDiv_nontox<-droplevels.data.frame(subset(NDiv,NDiv$class=="Nontoxin")) x<-NDiv %>% group_by(class) %>% summarize(mean=mean(pi),sd=sd(pi),n=length(pi)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in nucleotide diversity between toxins and nontoxins? ```{r} (m<-summary(lm(NDiv$pi~NDiv$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(NDiv_nontox$pi,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","pi") data<-NDiv_tox[,c("class","pi")] data<-rbind(data,samp) z<-summary(lm(data$pi~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="Nucleotide Diversity") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(NDiv_nontox$pi,c(0.025,0.975)) NDiv_tox$sig<-NA for (i in 1:nrow(NDiv_tox)){ if (NDiv_tox$pi[i] < CI[1] | NDiv_tox$pi[i] > CI[2]){NDiv_tox$sig[i]<-TRUE} else {NDiv_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(NDiv_tox,NDiv_tox$sig=="TRUE")) ggplot(NDiv, aes(x=class, y=pi)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.25*max(NDiv$pi))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=pi),data=sig,fill="white",colour="black",size=3,pch=23) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=pi),label.size = 0,size=5,inherit.aes = F) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("R2=",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1*max(NDiv$pi)),size=10) + guides(fill=F) ```
#### Does nucleotide diversity correlate with average expression levels? ```{r,fig.width=5,fig.height=5,echo=F,eval=F} x<-NDiv %>% group_by(diff_exprs) %>% summarize(mean=mean(pi),sd=sd(pi),n=length(pi)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NDiv$pi~NDiv$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NDiv, aes(x=diff_exprs, y=pi)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.25*max(NDiv$pi))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p),x=1.5,y=(1*max(NDiv$pi)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NDiv$pi~log(NDiv$average_tpm)) (m<-summary(all_lm)) r2<-round(m$r.squared,3) p<-round(m$coefficients[2,4],3) ggplot(NDiv,aes(x=log(average_tpm),y=pi)) + xlab("Average TPM") + ylab("Pi") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NDiv),"; R2=",r2,"; p =",p),x=7.5,y=0,size=10) ```
#### Does nucleotide diversity correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,echo=F,eval=F} x<-NDiv_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(pi),sd=sd(pi),n=length(pi)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NDiv_tox$pi~NDiv_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NDiv_tox, aes(x=diff_exprs, y=pi)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.25*max(NDiv$pi))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.2*max(NDiv$pi)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p),x=1.5,y=(1*max(NDiv$pi)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NDiv_tox$pi~log(NDiv_tox$average_tpm)) (m<-summary(all_lm)) r2<-round(m$r.squared,3) p<-round(m$coefficients[2,4],3) ggplot(NDiv_tox,aes(x=log(average_tpm),y=pi)) + xlab("Average TPM") + ylab("Pi") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NDiv_tox),"; R2=",r2,"; p =",p),x=7.5,y=0.4,size=10) ```
## Tajima's D ```{r} Tajima<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","tajimasD")] Tajima<-Tajima[complete.cases(Tajima), ] Tajima_tox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Toxin")) Tajima_nontox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Nontoxin")) x<-Tajima %>% group_by(class) %>% summarize(mean=mean(tajimasD),sd=sd(tajimasD),n=length(tajimasD)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Are estimates of Tajima's D significantly different than 0 (suggesting selection) for toxins or nontoxins? ```{r} t.test(Tajima_nontox$tajimasD) t.test(Tajima_tox$tajimasD) ```
#### Is there a significant difference in Tajima's D between toxins and nontoxins? ```{r} (m<-summary(lm(Tajima$tajimasD~Tajima$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(Tajima_nontox$tajimasD,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","tajimasD") data<-Tajima_tox[,c("class","tajimasD")] data<-rbind(data,samp) z<-summary(lm(data$tajimasD~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="Tajima's D") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(Tajima_nontox$tajimasD,c(0.025,0.975)) Tajima_tox$sig<-NA for (i in 1:nrow(Tajima_tox)){ if (Tajima_tox$tajimasD[i] < CI[1] | Tajima_tox$tajimasD[i] > CI[2]){Tajima_tox$sig[i]<-TRUE} else {Tajima_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(Tajima_tox,Tajima_tox$sig=="TRUE")) ``` ### Figure 2A ```{r,fig.width=5,fig.height=5} (Fig2A <- ggplot(Tajima, aes(x=class, y=tajimasD)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.5*max(Tajima$tajimasD))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=tajimasD),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=tajimasD),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(Tajima$tajimasD)),size=10) + guides(fill=F)) ```
#### Does Tajima's D (All, Positive, and Negative values) correlate with average expression levels? ```{r,fig.width=5,fig.height=5,echo=F,eval=F} x<-Tajima %>% group_by(diff_exprs) %>% summarize(mean=mean(tajimasD),sd=sd(tajimasD),n=length(tajimasD)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(Tajima$tajimasD~Tajima$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(Tajima, aes(x=diff_exprs, y=tajimasD)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(Tajima$tajimasD))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p),x=1.5,y=(1.25*max(Tajima$tajimasD)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(Tajima$tajimasD~log(Tajima$average_tpm)) (m<-summary(all_lm)) lt0<-droplevels.data.frame(subset(Tajima,Tajima$tajimasD<0)) lt0_lm<-lm(lt0$tajimasD~log(lt0$average_tpm)) (m_lt0<-summary(lt0_lm)) gt0<-droplevels.data.frame(subset(Tajima,Tajima$tajimasD>0)) gt0_lm<-lm(gt0$tajimasD~log(gt0$average_tpm)) (m_gt0<-summary(gt0_lm)) ggplot(Tajima,aes(x=log(average_tpm),y=tajimasD)) + xlab("Average TPM") + ylab("Tajima's D") + geom_point() + geom_hline(yintercept=0) + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + geom_abline(intercept = coef(lt0_lm)[1],slope=coef(lt0_lm)[2],lty=3) + geom_abline(intercept = coef(gt0_lm)[1],slope=coef(gt0_lm)[2],lty=2) + draw_label(paste("n =",nrow(Tajima),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=0,size=10) + draw_label(paste("n =",nrow(lt0),"; R2=",round(m_lt0$r.squared,2),"; p =",round(m_lt0$coefficients[2,4],2)),x=7.5,y=-1,size=10) + draw_label(paste("n =",nrow(gt0),"; R2=",round(m_gt0$r.squared,2),"; p =",round(m_gt0$coefficients[2,4],2)),x=7.5,y=1,size=10) ```
#### Does Tajima's D (All, Positive, and Negative values) correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,echo=F,eval=F} x<-Tajima_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(tajimasD),sd=sd(tajimasD),n=length(tajimasD)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(Tajima_tox$tajimasD~Tajima_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(Tajima_tox, aes(x=diff_exprs, y=tajimasD)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(Tajima_tox$tajimasD))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Tajima_tox$tajimasD)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Tajima_tox$tajimasD)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(Tajima_tox$tajimasD)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(Tajima_tox$tajimasD~log(Tajima_tox$average_tpm)) (m<-summary(all_lm)) lt0<-droplevels.data.frame(subset(Tajima_tox,Tajima_tox$tajimasD<0)) lt0_lm<-lm(lt0$tajimasD~log(lt0$average_tpm)) (m_lt0<-summary(lt0_lm)) gt0<-droplevels.data.frame(subset(Tajima_tox,Tajima_tox$tajimasD>0)) gt0_lm<-lm(gt0$tajimasD~log(gt0$average_tpm)) (m_gt0<-summary(gt0_lm)) ``` ### Figure 3A ```{r,fig.width=5,fig.height=5} (Fig3A<-ggplot(Tajima_tox,aes(x=log(average_tpm),y=tajimasD)) + xlab("Average TPM") + ylab("Tajima's D") + geom_point() + #geom_hline(yintercept=0) + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + geom_abline(intercept = coef(lt0_lm)[1],slope=coef(lt0_lm)[2],lty=3) + geom_abline(intercept = coef(gt0_lm)[1],slope=coef(gt0_lm)[2],lty=2) + draw_label(paste("n =",nrow(Tajima_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=0,size=10) + draw_label(paste("n =",nrow(lt0),"; R2=",round(m_lt0$r.squared,2),"; p =",round(m_lt0$coefficients[2,4],2)),x=7.5,y=-1,size=10) + draw_label(paste("n =",nrow(gt0),"; R2=",round(m_gt0$r.squared,2),"; p =",round(m_gt0$coefficients[2,4],2)),x=7.5,y=1,size=10)) ```
```{r,eval=F,echo=F} ## PAML PAML<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","M12_LRT")] PAML<-PAML[complete.cases(PAML), ] PAML_tox<-droplevels.data.frame(subset(PAML,PAML$class=="Toxin")) PAML_nontox<-droplevels.data.frame(subset(PAML,PAML$class=="Nontoxin")) x<-PAML %>% group_by(class) %>% summarize(mean=mean(M12_LRT),sd=sd(M12_LRT),n=length(M12_LRT)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ``` ```{r,eval=F,echo=F} #### Is there a significant difference in PAML between toxins and nontoxins? (m<-summary(lm(PAML$M12_LRT~PAML$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(PAML_nontox$M12_LRT,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","M12_LRT") data<-PAML_tox[,c("class","M12_LRT")] data<-rbind(data,samp) z<-summary(lm(data$M12_LRT~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="M1-M2 Likelihood Ratio") b<-round(pct[2]*0.01,2) ``` ```{r,fig.width=5,fig.height=5,eval=F,echo=F} #Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(PAML_nontox$M12_LRT,c(0.975)) PAML_tox$sig<-NA for (i in 1:nrow(PAML_tox)){ if (PAML_tox$M12_LRT[i] > CI){PAML_tox$sig[i]<-TRUE} else {PAML_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(PAML_tox,PAML_tox$sig=="TRUE")) ``` ```{r,fig.width=5,fig.height=5,eval=F,echo=F} ### Figure 2B (Fig2B <- ggplot(PAML, aes(x=class, y=M12_LRT)) + geom_violin(aes(fill=class),trim=F,scale = "width") + ylim(NA,(1.25*max(PAML$M12_LRT))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=M12_LRT),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=M12_LRT),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.25*max(PAML$M12_LRT)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.25*max(PAML$M12_LRT)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(max(PAML$M12_LRT)),size=10) + guides(fill=F)) ``` ```{r,fig.width=5,fig.height=5,eval=F,echo=F} #### Do PAML Likelihood Ratios correlate with average expression levels? x<-PAML %>% group_by(diff_exprs) %>% summarize(mean=mean(M12_LRT),sd=sd(M12_LRT),n=length(M12_LRT)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(PAML$M12_LRT~PAML$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(PAML, aes(x=diff_exprs, y=M12_LRT)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(PAML$M12_LRT))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(PAML$M12_LRT)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(PAML$M12_LRT)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(PAML$M12_LRT)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(PAML$M12_LRT~log(PAML$average_tpm)) (m<-summary(all_lm)) ggplot(PAML,aes(x=log(average_tpm),y=M12_LRT)) + xlab("Average TPM") + ylab("M1-M2 Likelihood Ratio") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(PAML),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-10,size=10) ``` ```{r,fig.width=5,fig.height=5,eval=F,echo=F} #### Do PAML Likelihood Ratios correlate with average expression levels **in toxins**? x<-PAML_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(M12_LRT),sd=sd(M12_LRT),n=length(M12_LRT)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(PAML_tox$M12_LRT~PAML_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(PAML_tox, aes(x=diff_exprs, y=M12_LRT)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(PAML_tox$M12_LRT))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(PAML_tox$M12_LRT)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(PAML_tox$M12_LRT)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(PAML_tox$M12_LRT)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(PAML_tox$M12_LRT~log(PAML_tox$average_tpm)) (m<-summary(all_lm)) ``` ```{r,fig.width=5,fig.height=5,eval=F,echo=F} ### Figure 3B (Fig3B<-ggplot(PAML_tox,aes(x=log(average_tpm),y=M12_LRT)) + xlab("Average TPM") + ylab("M1-M2 Likelihood Ratio") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(PAML_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=50,size=10)) ```
## Global Fst ```{r} Global<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","Global")] Global<-Global[complete.cases(Global), ] Global_tox<-droplevels.data.frame(subset(Global,class=="Toxin")) Global_nontox<-droplevels.data.frame(subset(Global,class=="Nontoxin")) x<-Global %>% group_by(class) %>% summarize(mean=mean(Global),sd=sd(Global),n=length(Global)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in Global Fst between toxins and nontoxins? ```{r} (m<-summary(lm(Global$Global~Global$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(Global_nontox$Global,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","Global") data<-Global_tox[,c("class","Global")] data<-rbind(data,samp) z<-summary(lm(data$Global~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="Global Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(Global_nontox$Global,c(0.975)) Global_tox$sig<-NA for (i in 1:nrow(Global_tox)){ if (Global_tox$Global[i] > CI){Global_tox$sig[i]<-TRUE} else {Global_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(Global_tox,Global_tox$sig=="TRUE")) ``` ### Figure 2B ```{r,fig.width=5,fig.height=5} (Fig2B <- ggplot(Global, aes(x=class, y=Global)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.25*max(Global$Global))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=Global),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=Global),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.25*max(Global$Global)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.25*max(Global$Global)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(max(Global$Global)),size=10) + guides(fill=F)) ```
#### Does Global Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-Global %>% group_by(diff_exprs) %>% summarize(mean=mean(Global),sd=sd(Global),n=length(Global)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(Global$Global~Global$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(Global, aes(x=diff_exprs, y=Global)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(Global$Global))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Global$Global)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Global$Global)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(Global$Global)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(Global$Global~log(Global$average_tpm)) (m<-summary(all_lm)) ggplot(Global,aes(x=log(average_tpm),y=Global)) + xlab("Average TPM") + ylab("Global Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(Global),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does Global Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-Global_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(Global),sd=sd(Global),n=length(Global)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(Global_tox$Global~Global_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(Global_tox, aes(x=diff_exprs, y=Global)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(Global_tox$Global))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Global_tox$Global)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Global_tox$Global)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(Global_tox$Global)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(Global_tox$Global~log(Global_tox$average_tpm)) (m<-summary(all_lm)) ``` ### Figure 3B ```{r,fig.width=5,fig.height=5} (Fig3B<-ggplot(Global_tox,aes(x=log(average_tpm),y=Global)) + xlab("Average TPM") + ylab("Global Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(Global_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10)) ```
## North Mojave vs. South Mojave Fst ```{r} NMvSM<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","NMvSM")] NMvSM<-NMvSM[complete.cases(NMvSM), ] NMvSM_tox<-droplevels.data.frame(subset(NMvSM,class=="Toxin")) NMvSM_nontox<-droplevels.data.frame(subset(NMvSM,class=="Nontoxin")) x<-NMvSM %>% group_by(class) %>% summarize(mean=mean(NMvSM),sd=sd(NMvSM),n=length(NMvSM)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in NMvSM Fst between toxins and nontoxins? ```{r} (m<-summary(lm(NMvSM$NMvSM~NMvSM$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(NMvSM_nontox$NMvSM,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","NMvSM") data<-NMvSM_tox[,c("class","NMvSM")] data<-rbind(data,samp) z<-summary(lm(data$NMvSM~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="NMvSM Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(NMvSM_nontox$NMvSM,c(0.975)) NMvSM_tox$sig<-NA for (i in 1:nrow(NMvSM_tox)){ if (NMvSM_tox$NMvSM[i] > CI){NMvSM_tox$sig[i]<-TRUE} else {NMvSM_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(NMvSM_tox,NMvSM_tox$sig=="TRUE")) ``` ### Figure 4A ```{r,fig.width=5,fig.height=5} (Fig4A <- ggplot(NMvSM, aes(x=class, y=NMvSM)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.25*max(NMvSM$NMvSM))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=NMvSM),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=NMvSM),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.25*max(NMvSM$NMvSM)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.25*max(NMvSM$NMvSM)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(max(NMvSM$NMvSM)),size=10) + guides(fill=F)) ```
#### Does NMvSM Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-NMvSM %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvSM),sd=sd(NMvSM),n=length(NMvSM)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvSM$NMvSM~NMvSM$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvSM, aes(x=diff_exprs, y=NMvSM)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvSM$NMvSM))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvSM$NMvSM)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvSM$NMvSM)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvSM$NMvSM)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvSM$NMvSM~log(NMvSM$average_tpm)) (m<-summary(all_lm)) ggplot(NMvSM,aes(x=log(average_tpm),y=NMvSM)) + xlab("Average TPM") + ylab("NMvSM Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvSM),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-1,size=10) ```
#### Does NMvSM Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-NMvSM_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvSM),sd=sd(NMvSM),n=length(NMvSM)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvSM_tox$NMvSM~NMvSM_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvSM_tox, aes(x=diff_exprs, y=NMvSM)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvSM_tox$NMvSM))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvSM_tox$NMvSM)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvSM_tox$NMvSM)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvSM_tox$NMvSM)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvSM_tox$NMvSM~log(NMvSM_tox$average_tpm)) (m<-summary(all_lm)) ggplot(NMvSM_tox,aes(x=log(average_tpm),y=NMvSM)) + xlab("Average TPM") + ylab("NMvSM Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvSM_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10) ```
## North Mojave vs. Colorado Fst ```{r} NMvC<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","NMvC")] NMvC<-NMvC[complete.cases(NMvC), ] NMvC_tox<-droplevels.data.frame(subset(NMvC,class=="Toxin")) NMvC_nontox<-droplevels.data.frame(subset(NMvC,class=="Nontoxin")) x<-NMvC %>% group_by(class) %>% summarize(mean=mean(NMvC),sd=sd(NMvC),n=length(NMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in NMvC Fst between toxins and nontoxins? ```{r} (m<-summary(lm(NMvC$NMvC~NMvC$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(NMvC_nontox$NMvC,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","NMvC") data<-NMvC_tox[,c("class","NMvC")] data<-rbind(data,samp) z<-summary(lm(data$NMvC~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="NMvC Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(NMvC_nontox$NMvC,c(0.975)) NMvC_tox$sig<-NA for (i in 1:nrow(NMvC_tox)){ if (NMvC_tox$NMvC[i] > CI){NMvC_tox$sig[i]<-TRUE} else {NMvC_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(NMvC_tox,NMvC_tox$sig=="TRUE")) ``` ### Figure 4B ```{r,fig.width=5,fig.height=5} (Fig4B <- ggplot(NMvC, aes(x=class, y=NMvC)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(2.25*max(NMvC$NMvC))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=NMvC),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + #geom_label(data=sig,aes(x=2.25,label=ID,y=NMvC),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(2*max(NMvC$NMvC)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(2*max(NMvC$NMvC)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.75*max(NMvC$NMvC)),size=10) + guides(fill=F)) ```
#### Does NMvC Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-NMvC %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvC),sd=sd(NMvC),n=length(NMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvC$NMvC~NMvC$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvC, aes(x=diff_exprs, y=NMvC)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvC$NMvC))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvC$NMvC)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvC$NMvC)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvC$NMvC)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvC$NMvC~log(NMvC$average_tpm)) (m<-summary(all_lm)) ggplot(NMvC,aes(x=log(average_tpm),y=NMvC)) + xlab("Average TPM") + ylab("NMvC Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvC),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does NMvC Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-NMvC_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvC),sd=sd(NMvC),n=length(NMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvC_tox$NMvC~NMvC_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvC_tox, aes(x=diff_exprs, y=NMvC)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvC_tox$NMvC))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvC_tox$NMvC)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvC_tox$NMvC)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvC_tox$NMvC)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvC_tox$NMvC~log(NMvC_tox$average_tpm)) (m<-summary(all_lm)) ggplot(NMvC_tox,aes(x=log(average_tpm),y=NMvC)) + xlab("Average TPM") + ylab("NMvC Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvC_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10) ```
## North Mojave vs. Sonora Fst ```{r} NMvS<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","NMvS")] NMvS<-NMvS[complete.cases(NMvS), ] NMvS_tox<-droplevels.data.frame(subset(NMvS,class=="Toxin")) NMvS_nontox<-droplevels.data.frame(subset(NMvS,class=="Nontoxin")) x<-NMvS %>% group_by(class) %>% summarize(mean=mean(NMvS),sd=sd(NMvS),n=length(NMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in NMvS Fst between toxins and nontoxins? ```{r} (m<-summary(lm(NMvS$NMvS~NMvS$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(NMvS_nontox$NMvS,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","NMvS") data<-NMvS_tox[,c("class","NMvS")] data<-rbind(data,samp) z<-summary(lm(data$NMvS~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="NMvS Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(NMvS_nontox$NMvS,c(0.975)) NMvS_tox$sig<-NA for (i in 1:nrow(NMvS_tox)){ if (NMvS_tox$NMvS[i] > CI){NMvS_tox$sig[i]<-TRUE} else {NMvS_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(NMvS_tox,NMvS_tox$sig=="TRUE")) ``` ### Figure 4C ```{r,fig.width=5,fig.height=5} (Fig4C <- ggplot(NMvS, aes(x=class, y=NMvS)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.75*max(NMvS$NMvS))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=NMvS),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=NMvS),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.5*max(NMvS$NMvS)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.5*max(NMvS$NMvS)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(NMvS$NMvS)),size=10) + guides(fill=F)) ```
#### Does NMvS Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,echo=F,eval=F} x<-NMvS %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvS),sd=sd(NMvS),n=length(NMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvS$NMvS~NMvS$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvS, aes(x=diff_exprs, y=NMvS)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvS$NMvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvS$NMvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvS$NMvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvS$NMvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvS$NMvS~log(NMvS$average_tpm)) (m<-summary(all_lm)) ggplot(NMvS,aes(x=log(average_tpm),y=NMvS)) + xlab("Average TPM") + ylab("NMvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvS),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does NMvS Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-NMvS_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(NMvS),sd=sd(NMvS),n=length(NMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(NMvS_tox$NMvS~NMvS_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(NMvS_tox, aes(x=diff_exprs, y=NMvS)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(NMvS_tox$NMvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(NMvS_tox$NMvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(NMvS_tox$NMvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(NMvS_tox$NMvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(NMvS_tox$NMvS~log(NMvS_tox$average_tpm)) (m<-summary(all_lm)) ggplot(NMvS_tox,aes(x=log(average_tpm),y=NMvS)) + xlab("Average TPM") + ylab("NMvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(NMvS_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10) ```
## South Mojave vs. Colorado Fst ```{r} SMvC<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","SMvC")] SMvC<-SMvC[complete.cases(SMvC), ] SMvC_tox<-droplevels.data.frame(subset(SMvC,class=="Toxin")) SMvC_nontox<-droplevels.data.frame(subset(SMvC,class=="Nontoxin")) x<-SMvC %>% group_by(class) %>% summarize(mean=mean(SMvC),sd=sd(SMvC),n=length(SMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in SMvC Fst between toxins and nontoxins? ```{r} (m<-summary(lm(SMvC$SMvC~SMvC$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(SMvC_nontox$SMvC,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","SMvC") data<-SMvC_tox[,c("class","SMvC")] data<-rbind(data,samp) z<-summary(lm(data$SMvC~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="SMvC Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(SMvC_nontox$SMvC,c(0.975)) SMvC_tox$sig<-NA for (i in 1:nrow(SMvC_tox)){ if (SMvC_tox$SMvC[i] > CI){SMvC_tox$sig[i]<-TRUE} else {SMvC_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(SMvC_tox,SMvC_tox$sig=="TRUE")) ``` ### Figure 4D ```{r,fig.width=5,fig.height=5} (Fig4D <- ggplot(SMvC, aes(x=class, y=SMvC)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(2.25*max(SMvC$SMvC))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=SMvC),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + #geom_label(data=sig,aes(x=2.25,label=ID,y=SMvC),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(2*max(SMvC$SMvC)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(2*max(SMvC$SMvC)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.75*max(SMvC$SMvC)),size=10) + guides(fill=F)) ```
#### Does SMvC Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-SMvC %>% group_by(diff_exprs) %>% summarize(mean=mean(SMvC),sd=sd(SMvC),n=length(SMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(SMvC$SMvC~SMvC$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(SMvC, aes(x=diff_exprs, y=SMvC)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(SMvC$SMvC))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(SMvC$SMvC)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(SMvC$SMvC)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(SMvC$SMvC)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(SMvC$SMvC~log(SMvC$average_tpm)) (m<-summary(all_lm)) ggplot(SMvC,aes(x=log(average_tpm),y=SMvC)) + xlab("Average TPM") + ylab("SMvC Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(SMvC),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does SMvC Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-SMvC_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(SMvC),sd=sd(SMvC),n=length(SMvC)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(SMvC_tox$SMvC~SMvC_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(SMvC_tox, aes(x=diff_exprs, y=SMvC)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(SMvC_tox$SMvC))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(SMvC_tox$SMvC)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(SMvC_tox$SMvC)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(SMvC_tox$SMvC)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(SMvC_tox$SMvC~log(SMvC_tox$average_tpm)) (m<-summary(all_lm)) ggplot(SMvC_tox,aes(x=log(average_tpm),y=SMvC)) + xlab("Average TPM") + ylab("SMvC Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(SMvC_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10) ```
## South Mojave vs. Sonora Fst ```{r} SMvS<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","SMvS")] SMvS<-SMvS[complete.cases(SMvS), ] SMvS_tox<-droplevels.data.frame(subset(SMvS,class=="Toxin")) SMvS_nontox<-droplevels.data.frame(subset(SMvS,class=="Nontoxin")) x<-SMvS %>% group_by(class) %>% summarize(mean=mean(SMvS),sd=sd(SMvS),n=length(SMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in SMvS Fst between toxins and nontoxins? ```{r} (m<-summary(lm(SMvS$SMvS~SMvS$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(SMvS_nontox$SMvS,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","SMvS") data<-SMvS_tox[,c("class","SMvS")] data<-rbind(data,samp) z<-summary(lm(data$SMvS~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="SMvS Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(SMvS_nontox$SMvS,c(0.975)) SMvS_tox$sig<-NA for (i in 1:nrow(SMvS_tox)){ if (SMvS_tox$SMvS[i] > CI){SMvS_tox$sig[i]<-TRUE} else {SMvS_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(SMvS_tox,SMvS_tox$sig=="TRUE")) ``` ### Figure 4E ```{r,fig.width=5,fig.height=5} (Fig4E <- ggplot(SMvS, aes(x=class, y=SMvS)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.75*max(SMvS$SMvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=SMvS),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=SMvS),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.5*max(SMvS$SMvS)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.5*max(SMvS$SMvS)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(SMvS$SMvS)),size=10) + guides(fill=F)) ```
#### Does SMvS Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-SMvS %>% group_by(diff_exprs) %>% summarize(mean=mean(SMvS),sd=sd(SMvS),n=length(SMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(SMvS$SMvS~SMvS$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(SMvS, aes(x=diff_exprs, y=SMvS)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(SMvS$SMvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(SMvS$SMvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(SMvS$SMvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(SMvS$SMvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(SMvS$SMvS~log(SMvS$average_tpm)) (m<-summary(all_lm)) ggplot(SMvS,aes(x=log(average_tpm),y=SMvS)) + xlab("Average TPM") + ylab("SMvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(SMvS),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does SMvS Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-SMvS_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(SMvS),sd=sd(SMvS),n=length(SMvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(SMvS_tox$SMvS~SMvS_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(SMvS_tox, aes(x=diff_exprs, y=SMvS)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(SMvS_tox$SMvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(SMvS_tox$SMvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(SMvS_tox$SMvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(SMvS_tox$SMvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(SMvS_tox$SMvS~log(SMvS_tox$average_tpm)) (m<-summary(all_lm)) ggplot(SMvS_tox,aes(x=log(average_tpm),y=SMvS)) + xlab("Average TPM") + ylab("SMvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(SMvS_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.15,size=10) ```
## Colorado vs. Sonora Fst ```{r} CvS<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","CvS")] CvS<-CvS[complete.cases(CvS), ] CvS_tox<-droplevels.data.frame(subset(CvS,class=="Toxin")) CvS_nontox<-droplevels.data.frame(subset(CvS,class=="Nontoxin")) x<-CvS %>% group_by(class) %>% summarize(mean=mean(CvS),sd=sd(CvS),n=length(CvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in CvS Fst between toxins and nontoxins? ```{r} (m<-summary(lm(CvS$CvS~CvS$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(CvS_nontox$CvS,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","CvS") data<-CvS_tox[,c("class","CvS")] data<-rbind(data,samp) z<-summary(lm(data$CvS~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="CvS Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(CvS_nontox$CvS,c(0.975)) CvS_tox$sig<-NA for (i in 1:nrow(CvS_tox)){ if (CvS_tox$CvS[i] > CI){CvS_tox$sig[i]<-TRUE} else {CvS_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(CvS_tox,CvS_tox$sig=="TRUE")) ``` ### Figure 4F ```{r,fig.width=5,fig.height=5} (Fig4F <- ggplot(CvS, aes(x=class, y=CvS)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.75*max(CvS$CvS))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=CvS),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + #geom_label(data=sig,aes(x=2.25,label=ID,y=CvS),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.5*max(CvS$CvS)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.5*max(CvS$CvS)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(CvS$CvS)),size=10) + guides(fill=F)) ```
#### Does CvS Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-CvS %>% group_by(diff_exprs) %>% summarize(mean=mean(CvS),sd=sd(CvS),n=length(CvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(CvS$CvS~CvS$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(CvS, aes(x=diff_exprs, y=CvS)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(CvS$CvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(CvS$CvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(CvS$CvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(CvS$CvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(CvS$CvS~log(CvS$average_tpm)) (m<-summary(all_lm)) ggplot(CvS,aes(x=log(average_tpm),y=CvS)) + xlab("Average TPM") + ylab("CvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(CvS),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does CvS Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-CvS_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(CvS),sd=sd(CvS),n=length(CvS)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(CvS_tox$CvS~CvS_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(CvS_tox, aes(x=diff_exprs, y=CvS)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(CvS_tox$CvS))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(CvS_tox$CvS)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(CvS_tox$CvS)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(CvS_tox$CvS)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(CvS_tox$CvS~log(CvS_tox$average_tpm)) (m<-summary(all_lm)) ggplot(CvS_tox,aes(x=log(average_tpm),y=CvS)) + xlab("Average TPM") + ylab("CvS Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(CvS_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=-0.5,size=10) ```
## East vs. West Fst ```{r} EvW<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","EvW")] EvW<-EvW[complete.cases(EvW), ] EvW_tox<-droplevels.data.frame(subset(EvW,class=="Toxin")) EvW_nontox<-droplevels.data.frame(subset(EvW,class=="Nontoxin")) x<-EvW %>% group_by(class) %>% summarize(mean=mean(EvW),sd=sd(EvW),n=length(EvW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in EvW Fst between toxins and nontoxins? ```{r} (m<-summary(lm(EvW$EvW~EvW$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(EvW_nontox$EvW,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","EvW") data<-EvW_tox[,c("class","EvW")] data<-rbind(data,samp) z<-summary(lm(data$EvW~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="EvW Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(EvW_nontox$EvW,c(0.975)) EvW_tox$sig<-NA for (i in 1:nrow(EvW_tox)){ if (EvW_tox$EvW[i] > CI){EvW_tox$sig[i]<-TRUE} else {EvW_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(EvW_tox,EvW_tox$sig=="TRUE")) ``` ### Figure 5A ```{r,fig.width=5,fig.height=5} (Fig5A <- ggplot(EvW, aes(x=class, y=EvW)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.75*max(EvW$EvW))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=EvW),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=EvW),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.5*max(EvW$EvW)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.5*max(EvW$EvW)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(EvW$EvW)),size=10) + guides(fill=F)) ```
#### Does EvW Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-EvW %>% group_by(diff_exprs) %>% summarize(mean=mean(EvW),sd=sd(EvW),n=length(EvW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(EvW$EvW~EvW$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(EvW, aes(x=diff_exprs, y=EvW)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(EvW$EvW))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(EvW$EvW)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(EvW$EvW)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(EvW$EvW)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(EvW$EvW~log(EvW$average_tpm)) (m<-summary(all_lm)) ggplot(EvW,aes(x=log(average_tpm),y=EvW)) + xlab("Average TPM") + ylab("EvW Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(EvW),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does EvW Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-EvW_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(EvW),sd=sd(EvW),n=length(EvW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(EvW_tox$EvW~EvW_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(EvW_tox, aes(x=diff_exprs, y=EvW)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(EvW_tox$EvW))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(EvW_tox$EvW)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(EvW_tox$EvW)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(EvW_tox$EvW)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(EvW_tox$EvW~log(EvW_tox$average_tpm)) (m<-summary(all_lm)) ggplot(EvW_tox,aes(x=log(average_tpm),y=EvW)) + xlab("Average TPM") + ylab("EvW Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(EvW_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=0.5,size=10) ``` ## Geographic East vs. West Fst ```{r} GeoEvGeoW<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","GeoEvGeoW")] GeoEvGeoW<-GeoEvGeoW[complete.cases(GeoEvGeoW), ] GeoEvGeoW_tox<-droplevels.data.frame(subset(GeoEvGeoW,class=="Toxin")) GeoEvGeoW_nontox<-droplevels.data.frame(subset(GeoEvGeoW,class=="Nontoxin")) x<-GeoEvGeoW %>% group_by(class) %>% summarize(mean=mean(GeoEvGeoW),sd=sd(GeoEvGeoW),n=length(GeoEvGeoW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Is there a significant difference in GeoEvGeoW Fst between toxins and nontoxins? ```{r} (m<-summary(lm(GeoEvGeoW$GeoEvGeoW~GeoEvGeoW$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(GeoEvGeoW_nontox$GeoEvGeoW,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","GeoEvGeoW") data<-GeoEvGeoW_tox[,c("class","GeoEvGeoW")] data<-rbind(data,samp) z<-summary(lm(data$GeoEvGeoW~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="GeoEvGeoW Fst") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(GeoEvGeoW_nontox$GeoEvGeoW,c(0.975)) GeoEvGeoW_tox$sig<-NA for (i in 1:nrow(GeoEvGeoW_tox)){ if (GeoEvGeoW_tox$GeoEvGeoW[i] > CI){GeoEvGeoW_tox$sig[i]<-TRUE} else {GeoEvGeoW_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(GeoEvGeoW_tox,GeoEvGeoW_tox$sig=="TRUE")) ``` ### Figure 5B ```{r,fig.width=5,fig.height=5} (Fig5B <- ggplot(GeoEvGeoW, aes(x=class, y=GeoEvGeoW)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.75*max(GeoEvGeoW$GeoEvGeoW))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=GeoEvGeoW),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=GeoEvGeoW),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.5*max(GeoEvGeoW$GeoEvGeoW)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.5*max(GeoEvGeoW$GeoEvGeoW)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(GeoEvGeoW$GeoEvGeoW)),size=10) + guides(fill=F)) ```
#### Does GeoEvGeoW Fst correlate with average expression levels? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-GeoEvGeoW %>% group_by(diff_exprs) %>% summarize(mean=mean(GeoEvGeoW),sd=sd(GeoEvGeoW),n=length(GeoEvGeoW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(GeoEvGeoW$GeoEvGeoW~GeoEvGeoW$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(GeoEvGeoW, aes(x=diff_exprs, y=GeoEvGeoW)) + geom_violin(aes(fill=diff_exprs),trim=F,scale="width") + xlab("Differentially Expressed") + ylim(NA,(1.5*max(GeoEvGeoW$GeoEvGeoW))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(GeoEvGeoW$GeoEvGeoW)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(GeoEvGeoW$GeoEvGeoW)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(GeoEvGeoW$GeoEvGeoW)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(GeoEvGeoW$GeoEvGeoW~log(GeoEvGeoW$average_tpm)) (m<-summary(all_lm)) ggplot(GeoEvGeoW,aes(x=log(average_tpm),y=GeoEvGeoW)) + xlab("Average TPM") + ylab("GeoEvGeoW Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(GeoEvGeoW),"; R2=",round(m$r.squared,3),"; p =",round(m$coefficients[2,4],3)),x=7.5,y=-0.5,size=10) ```
#### Does GeoEvGeoW Fst correlate with average expression levels **in toxins**? ```{r,fig.width=5,fig.height=5,eval=F,echo=F} x<-GeoEvGeoW_tox %>% group_by(diff_exprs) %>% summarize(mean=mean(GeoEvGeoW),sd=sd(GeoEvGeoW),n=length(GeoEvGeoW)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) (m<-summary(lm(GeoEvGeoW_tox$GeoEvGeoW~GeoEvGeoW_tox$diff_exprs))) mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) ggplot(GeoEvGeoW_tox, aes(x=diff_exprs, y=GeoEvGeoW)) + geom_violin(aes(fill=diff_exprs),trim=F) + xlab("Differentially Expressed") + ylim(NA,(1.5*max(GeoEvGeoW_tox$GeoEvGeoW))) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + draw_label(paste("FALSE",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(GeoEvGeoW_tox$GeoEvGeoW)),size=10) + draw_label(paste("TRUE",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(GeoEvGeoW_tox$GeoEvGeoW)),size=10) + draw_label(paste("R2=",r2,"; p =",p),x=1.5,y=(1.25*max(GeoEvGeoW_tox$GeoEvGeoW)),size=10) + guides(fill=F) ``` ```{r,fig.width=5,fig.height=5} all_lm<-lm(GeoEvGeoW_tox$GeoEvGeoW~log(GeoEvGeoW_tox$average_tpm)) (m<-summary(all_lm)) ggplot(GeoEvGeoW_tox,aes(x=log(average_tpm),y=GeoEvGeoW)) + xlab("Average TPM") + ylab("GeoEvGeoW Fst") + geom_point() + geom_abline(intercept = coef(all_lm)[1],slope=coef(all_lm)[2],lty=2) + draw_label(paste("n =",nrow(GeoEvGeoW_tox),"; R2=",round(m$r.squared,2),"; p =",round(m$coefficients[2,4],2)),x=7.5,y=0.5,size=10) ``` ## Fang Morphology ```{r} library(ggpubr) fangs<-read.csv("Ccerastes_Fang2.csv") x<-fangs %>% group_by(Lineage) %>% summarize(mean=mean(FL_HL),sd=sd(FL_HL),n=length(FL_HL)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) compare_means(FL_HL ~ Lineage, data = fangs, method = "anova") my_comparisons<-list( c("North Mojave","South Mojave"), c("North Mojave","Colorado"), c("North Mojave","Sonora"), c("South Mojave","Colorado"), c("South Mojave", "Sonora"),c("Colorado","Sonora")) ggviolin(data=fangs,x="Lineage",y="FL_HL", order=c("North Mojave","South Mojave", "Colorado", "Sonora"), fill="Lineage", palette = "grey", legend="none", ylab="Fang:Head Length\n",xlab="\nLineage", add="boxplot",add.params = list(fill = "white")) + stat_compare_means(comparisons = my_comparisons,label="p.signif") + stat_compare_means(label.y = 0.35, method="anova") ``` ```{r} DIFF_df<-read.csv("DIFF_df.csv") ggscatter(DIFF_df,x="Fang_Diff",y="Fst", add = "reg.line", xlab="\nDifference in Fang:Head Length", ylab="FST\n", conf.int = F, color = "class", palette = c("blue","red")) + stat_cor(aes(color=class,label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), label.x = 0.01, label.y=c(0.35,0.30)) m1<-lm(Fst~Fang_Diff,data=DIFF_df[DIFF_df$class=="Toxin",]) m2<-lm(Fst~Fang_Diff,data=DIFF_df[DIFF_df$class=="Nontoxin",]) anova(m1,m2) ``` ## Tajima's D Synonymous Mutations ```{r} Tajima<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","tajimasD_synon")] colnames(Tajima)[7]<-"tajimasD" Tajima<-Tajima[complete.cases(Tajima), ] Tajima_tox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Toxin")) Tajima_nontox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Nontoxin")) x<-Tajima %>% group_by(class) %>% summarize(mean=mean(tajimasD),sd=sd(tajimasD), n=length(tajimasD)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Are estimates of Tajima's D for synonymous mutations significantly different than 0 (suggesting selection) for toxins or nontoxins? ```{r} t.test(Tajima_nontox$tajimasD) t.test(Tajima_tox$tajimasD) ```
#### Is there a significant difference in Tajima's D for synonymous mutations between toxins and nontoxins? ```{r} (m<-summary(lm(Tajima$tajimasD~Tajima$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(Tajima_nontox$tajimasD,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","tajimasD") data<-Tajima_tox[,c("class","tajimasD")] data<-rbind(data,samp) z<-summary(lm(data$tajimasD~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="Tajima's D") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(Tajima_nontox$tajimasD,c(0.025,0.975)) Tajima_tox$sig<-NA for (i in 1:nrow(Tajima_tox)){ if (Tajima_tox$tajimasD[i] < CI[1] | Tajima_tox$tajimasD[i] > CI[2]){Tajima_tox$sig[i]<-TRUE} else {Tajima_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(Tajima_tox,Tajima_tox$sig=="TRUE")) ``` ```{r,fig.width=5,fig.height=5} ggplot(Tajima, aes(x=class, y=tajimasD)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.5*max(Tajima$tajimasD))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=tajimasD),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=tajimasD),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(Tajima$tajimasD)),size=10) + guides(fill=F) ``` ## Tajima's D Nonsynonymous Mutations ```{r} Tajima<-Data[,c("ID","class","average_tpm","stdev_tpm","bcv_tpm","diff_exprs","tajimasD_nonsyn")] colnames(Tajima)[7]<-"tajimasD" Tajima<-Tajima[complete.cases(Tajima), ] Tajima_tox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Toxin")) Tajima_nontox<-droplevels.data.frame(subset(Tajima,Tajima$class=="Nontoxin")) x<-Tajima %>% group_by(class) %>% summarize(mean=mean(tajimasD),sd=sd(tajimasD), n=length(tajimasD)) kable(x) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", font_size = 10), full_width = F) ```
#### Are estimates of Tajima's D for nonsynonymous mutations significantly different than 0 (suggesting selection) for toxins or nontoxins? ```{r} t.test(Tajima_nontox$tajimasD) t.test(Tajima_tox$tajimasD) ```
#### Is there a significant difference in Tajima's D for nonsynonymous mutations between toxins and nontoxins? ```{r} (m<-summary(lm(Tajima$tajimasD~Tajima$class))) results<-matrix(nrow=1000,ncol=2) set.seed(2019) for(i in 1:1000) { samp<-sample(Tajima_nontox$tajimasD,x$n[2], replace=T) samp<-cbind(rep("Nontoxin",x$n[2]),samp) colnames(samp)<-c("class","tajimasD") data<-Tajima_tox[,c("class","tajimasD")] data<-rbind(data,samp) z<-summary(lm(data$tajimasD~data$class)) results[i,1]<-round(z$r.squared,2) results[i,2]<-round(z$coefficients[2,4],2) } pie_results<-unlist(as.data.frame(table(results[,2] < 0.05),row.names = c("FALSE","TRUE"))[2]) lbls<-c("False","True") pct<-as.vector(round(pie_results/sum(pie_results)*100,2)) lbls<-paste(lbls,pct) lbls<-paste(lbls,"%",sep="") pie(pie_results,labels=lbls,col=c("red","green"),main="Tajima's D") b<-round(pct[2]*0.01,2) ``` Save the results for plotting and subset toxins which fall outside the 95th percentile of nontoxin estimates. ```{r,fig.width=5,fig.height=5} mean_1<-round(x$mean[1],2) mean_2<-round(x$mean[2],2) sd_1<-round(x$sd[1],2) sd_2<-round(x$sd[2],2) n_1<-x$n[1] n_2<-x$n[2] p<-round(m$coefficients[2,4],2) r2<-round(m$r.squared,2) CI<-quantile(Tajima_nontox$tajimasD,c(0.025,0.975)) Tajima_tox$sig<-NA for (i in 1:nrow(Tajima_tox)){ if (Tajima_tox$tajimasD[i] < CI[1] | Tajima_tox$tajimasD[i] > CI[2]){Tajima_tox$sig[i]<-TRUE} else {Tajima_tox$sig[i]<-FALSE} } sig<-droplevels.data.frame(subset(Tajima_tox,Tajima_tox$sig=="TRUE")) ``` ```{r,fig.width=5,fig.height=5} ggplot(Tajima, aes(x=class, y=tajimasD)) + geom_violin(aes(fill=class),trim=F) + ylim(NA,(1.5*max(Tajima$tajimasD))) + #scale_fill_manual(values=c("#00BFC4","#F8766D")) + scale_fill_manual(values=c("dodgerblue2","firebrick2")) + stat_summary(fun.data=mean_sdl, geom="pointrange", color="black") + geom_point(aes(x=class,y=tajimasD),data=sig,fill="white",colour="black",size=3,pch=23) + #,position=position_jitter(width=0.1,seed=1)) + geom_hline(yintercept=CI,linetype="longdash",color="darkgray") + geom_label(data=sig,aes(x=2.25,label=ID,y=tajimasD),label.size = 0,size=4,inherit.aes = F) + #, position=position_jitter(width=0,height=0.4,seed=1)) + draw_label(paste("Nontoxins",paste(mean_1,"±",sd_1),paste("n =",n_1),sep='\n'),x=1,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("Toxins",paste(mean_2,"±",sd_2),paste("n =",n_2),sep='\n'),x=2,y=(1.45*max(Tajima$tajimasD)),size=10) + draw_label(paste("R2 =",r2,"\n p =",p,"\n b =",b),x=1.5,y=(1.25*max(Tajima$tajimasD)),size=10) + guides(fill=F) ``` ## COWPLOTS ```{r,fig.width=5,fig.height=5} plot_grid(Fig2A,Fig2B,align="hv",nrow=1,ncol=3,labels="AUTO") #3240 x 1080 plot_grid(Fig3A,Fig3B,align="hv",nrow=1,ncol=3,labels="AUTO") #3240 x 1080 plot_grid(Fig4A,NULL,NULL, Fig4B,Fig4D,NULL, Fig4C,Fig4E,Fig4F, align="hv",nrow=3,ncol=3) plot_grid(Fig5A,Fig5B,align="hv",nrow=1,ncol=2,labels="AUTO") #3240 x 1080 ```