############################################################################################################
#
# This file contains two bash scripts used for phasing whole-genome data into Z and W chromosome gene sequences
# Submitted with Manuscript "Repeated sex chromosome evolution in vertebrates supported by expanded avian sex
# chromosomes" in Proceedings B as Supplementary Code S1
# Contact: hanna.sigeman@biol.lu.se
#
# Each script is wrapped in a function and is only executed when the file is called with that script's
# own number of arguments (two for script 1, three for script 2). Called with any other number of
# arguments the corresponding section does nothing.
#
############################################################################################################

# Genotype phasing script 1 (of 2):

#!/bin/bash
####################################
# Author: Hanna Sigeman, 2018
# Contact: hanna.sigeman@biol.lu.se
#
# This script is the first of two used to phase whole-genome sequence data into Z and W chromosome gene
# sequences using one female and one male sample. The script uses a VCF file where genotypes from one
# female and one male have been called for each genomic position within exon regions of specified genes.
#
# Usage: ./genotype_phasing_script_I.sh <input.vcf> <output.vcf>
#
# Comment: ($1) should specify a VCF file where the first sample (in column 10) is a male sample (having a
# ZZ genotype) and the second sample (in column 11) is a female (having a ZW genotype). ($2) is the chosen
# output name for the modified VCF file (which will be used as input in the second script).
#
# Output: A modified, header-less, tab-separated VCF file where each position is marked with the allele
# corresponding to the Z and W chromosome. Columns of the output:
#    1-4   CHROM, POS, ID, REF
#    5-6   first and second ALT allele (the first ALT is repeated when there is only one)
#    7-12  QUAL, FILTER, INFO, FORMAT, male sample, female sample
#    13    chromosome the row applies to: Z, W, or ZW (both)
#    14    allele state: ref, alt1, alt2, N, same, missing, qual20, depth20, multiN or Un
#    15    length of REF
#    16    a run of "N" as long as REF (used by script 2 to mask the position)
#
#####################################

#######################################
# Annotate every VCF record with the allele carried by the Z and by the W.
# Arguments: $1 - input VCF (male sample first, female sample second)
#            $2 - output path for the annotated table
# Outputs:   writes the annotated table to $2
#######################################
genotype_phasing_I() {
  local vcf_in=$1
  local vcf_out=$2

  # 1. grep: drop VCF header lines.
  # 2. sed:  turn the first "ALT1,ALT2" comma on the line into a tab, so the
  #          two alleles land in separate columns (needs a sed that reads \t
  #          as a tab in the replacement, e.g. GNU sed).
  # 3. awk:  normalise to 12 columns and classify (see inline comments).
  # 4. grep: drop every row that contains a ">" anywhere.
  grep -v '^#' -- "$vcf_in" \
    | sed -E 's/([ATCG]+),([ATCG]+)/\1\t\2/1' \
    | awk '
        BEGIN {
          OFS = "\t"
          # phase[male GT, female GT] = "Z-state W-state".
          # Genotype pairs that are not listed end up as "Un".
          phase["0/0", "0/0"] = "ref ref"
          phase["0/1", "0/0"] = "N N"
          phase["0/0", "0/1"] = "ref alt1"
          phase["0/1", "0/1"] = "N N"
          phase["1/1", "0/1"] = "alt1 ref"
          phase["0/1", "1/1"] = "N N"
          phase["1/1", "1/1"] = "alt1 alt1"
          phase["1/2", "1/1"] = "N alt1"
          phase["1/1", "1/2"] = "alt1 alt2"
          phase["1/2", "1/2"] = "N N"
          phase["2/2", "1/2"] = "alt2 alt1"
          phase["1/2", "2/2"] = "N N"
          phase["2/2", "2/2"] = "alt2 alt2"
        }

        # Return the leading "a/b" genotype of a sample column, or "" when the
        # column does not start with an unphased genotype followed by ":".
        function genotype_of(sample) {
          return (sample ~ /^[0-2]\/[0-2]:/) ? substr(sample, 1, 3) : ""
        }

        # Print the current record followed by columns 13-16.
        function emit(chrom_tag, allele_state,    len, i, mask) {
          len = length($4)
          mask = ""
          for (i = 0; i < len; i++) mask = mask "N"
          print $0, chrom_tag, allele_state, len, mask
        }

        {
          # More than two ALT alleles: after the sed split, column 6 still
          # holds a comma separated list. Such sites are masked on Z and W.
          if ($6 ~ /^[A-Z]+,[A-Z]+/) {
            $1 = $1
            emit("ZW", "multiN")
            next
          }

          # A single ALT allele (column 6 is then QUAL, not an allele):
          # repeat ALT so every record has the same 12 column layout.
          if (!($6 ~ /^[A-Z]+/ && $5 != $6))
            $0 = $1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9 OFS $10 OFS $11
          $1 = $1   # rebuild the record so that it is joined by tabs

          male = genotype_of($11)
          female = genotype_of($12)

          if ($5 == ".")
            emit("ZW", "same")                    # no ALT allele at all
          else if ($11 == "." || $12 == ".")
            emit("ZW", "missing")                 # a sample without a call
          else if ($7 <= 20)
            emit("ZW", "qual20")                  # QUAL of 20 or lower
          else if (/DP=1?[0-9];/)
            emit("ZW", "depth20")                 # "DP=" value of 0-19 followed by ";"
          else if ((male, female) in phase) {
            split(phase[male, female], zw, " ")
            emit("Z", zw[1])
            emit("W", zw[2])
          }
          else
            emit("ZW", "Un")
        }
      ' \
    | grep -v '>' > "$vcf_out"
}

if (( $# == 2 )); then
  genotype_phasing_I "$1" "$2"
fi

############################################################################################################

# Genotype phasing script 2 (of 2):

#!/bin/bash
####################################
# Author: Hanna Sigeman, 2018
# Contact: hanna.sigeman@biol.lu.se
#
# This script is the second of two used to phase whole-genome sequence data into Z and W chromosome gene
# sequences using one female and one male sample. The script uses a modified VCF file which is the output
# of script number 1. It also requires a 4 column bed file (0-based positions) where the first three
# columns specify the genome ranges of the exons and the fourth column contains the name of the gene.
#
# Usage: ./genotype_phasing_script_II.sh <exons.bed> <modified.vcf> <output.fasta>
#
# Comment: ($1) is the bed file containing genomic ranges for exons. ($2) is the output from script 1.
# ($3) is the chosen output name for the fasta file containing the phased gene sequences.
#
# Output: A fasta file containing phased Z and W gene sequences where the sequence headers correspond to
# the gene names specified in the bed file ($1). All Z-linked records come first, followed by all W-linked
# records; every record is one header line followed by one sequence line.
#
#####################################

#######################################
# List one span per (contig, gene) pair of a bed file.
# The span runs from the start of the exon with the lowest start to the end
# of the exon with the highest start.
# Arguments: $1 - 4 column bed file
# Outputs:   "contig<TAB>start<TAB>end<TAB>gene" lines on stdout, in the order
#            produced by sort on the contig and gene columns
#######################################
_gene_spans() {
  local bed=$1
  local cont gene exons first_exon last_exon

  cut -f 1,4 -- "$bed" | sort | uniq | while read -r cont gene; do
    # Appending "" forces string comparison, so that e.g. contig "1" never
    # equals "01"; the gene has to match column 4 exactly.
    exons=$(awk -v c="$cont" -v g="$gene" \
      '($1 "") == (c "") && ($4 "") == (g "")' "$bed")
    first_exon=$(sort -k2,2n <<< "$exons" | head -n 1)
    last_exon=$(sort -k2,2nr <<< "$exons" | head -n 1)
    printf '%s\t%s\n' "$first_exon" "$last_exon"
  done | awk -v OFS='\t' '{ print $1, $2, $7, $4 }'   # $7: end of the last exon
}

#######################################
# Print one fasta record per gene span for one sex chromosome.
# Arguments: $1 - chromosome to extract: Z or W
#            $2 - span lines as printed by _gene_spans
#            $3 - annotated table written by script 1
# Outputs:   fasta records on stdout
#######################################
_linked_sequences() {
  local chrom=$1
  local spans=$2
  local phased_vcf=$3
  local cont start end gene

  while read -r cont start end gene; do
    [[ -n "$cont" ]] || continue   # no spans at all: nothing to print
    printf '>%slinked_%s_%s\n' "$chrom" "$gene" "$cont"
    awk -v contig="$cont" -v lo="$start" -v hi="$end" -v chrom="$chrom" '
      /^#/ { next }
      ($1 "") != (contig "") { next }
      # "+ 0" forces numeric comparison of the positions.
      ($2 + 0 < lo + 0) || ($2 + 0 > hi + 0) { next }
      # Keep rows for this chromosome and rows shared by Z and W.
      $13 != chrom && $13 != "ZW" { next }
      $14 == "ref" || $14 == "same"  { printf "%s", $4; next }
      $14 == "alt" || $14 == "alt1"  { printf "%s", $5; next }
      $14 == "alt2"                  { printf "%s", $6; next }
      # Every masked state contributes the run of N stored in column 16.
      $14 ~ /^(N|sizediffN|missing|qual20|depth20|multiN|Un|covDiff)$/ { printf "%s", $16 }
    ' "$phased_vcf"
    printf '\n'
  done <<< "$spans"
}

#######################################
# Build the phased Z and W gene sequences.
# Arguments: $1 - 4 column bed file with exon ranges and gene names
#            $2 - annotated table written by script 1
#            $3 - output path for the fasta file
# Outputs:   writes the fasta file to $3
#######################################
genotype_phasing_II() {
  local bed=$1
  local phased_vcf=$2
  local fasta_out=$3
  local spans

  spans=$(_gene_spans "$bed")
  {
    _linked_sequences Z "$spans" "$phased_vcf"
    # Extract W gene sequences
    _linked_sequences W "$spans" "$phased_vcf"
  } > "$fasta_out"
}

if (( $# == 3 )); then
  genotype_phasing_II "$1" "$2" "$3"
fi