#Electronic supplementary material from: #Complete loss of the MHC II pathway in an anglerfish, Lophius piscatorius #Arseny Dubin, Tor Erik Jørgensen, Truls Moum, Steinar Daae Johansen and Lars Martin Jakt Genomics group, Faculty of Biosciences and Aquaculture, Nord University, 8049 Bodø, Norway #Author for correspondence: Lars Martin Jakt lars.m.jakt@nord.no ##This file contains scripts used to produce the data ####################### ####contig_unfolder.py ####################### #!/usr/bin/env python with open('./MONOCENTRIS_ASSEMBLY.utg.fasta', 'r') as inp, open('./MONOCENTRIS_ASSEMBLY.impr.fasta', 'w') as outp: seq = "" for line in inp: if line.startswith(">"): if seq != "": outp.write(seq) seq="" seq += '\n' + line else: seq += line.strip().replace('\n', '') else: if seq != "": outp.write(seq) seq="" ##################### ####manyfish_blast.py ##################### #!/usr/bin/env python ### ###TIMER from datetime import datetime startTime = datetime.now() import os import re import shutil import subprocess ### ### os.system("tblastn \ -query /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/breiflabb_project/Complete_genelist.fas \ -db /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/BLAST_databases/GADUS_ASSEMBLY \ -out ./GADUS_ASSEMBLY_ctg_blastout \ -evalue 1 \ -outfmt 6 \ -max_target_seqs 50 \ -num_threads 16") ### ### ### ### os.system("tblastn \ -query /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/breiflabb_project/Complete_genelist.fas \ -db /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/BLAST_databases/MONOCENTRIS_ASSEMBLY \ -out ./MONOCENTRIS_ASSEMBLY_ctg_blastout \ -evalue 1 \ -outfmt 6 \ -max_target_seqs 50 \ -num_threads 16") ### ### ### ### os.system("tblastn \ -query /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/breiflabb_project/Complete_genelist.fas \ -db /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/BLAST_databases/PERCA_ASSEMBLY \ -out ./PERCA_ASSEMBLY_ctg_blastout \ -evalue 1 \ -outfmt 6 \ -max_target_seqs 50 \ -num_threads 16") ### ### ### ### os.system("tblastn \ -query /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/breiflabb_project/Complete_genelist.fas \ -db /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/BLAST_databases/piscatorius \ -out ./hippocampus_ctg_blastout \ -evalue 1 \ -outfmt 6 \ -max_target_seqs 50 \ -num_threads 16") ### ### ### ### ##################### ####forward_blast.sh ###################### #!/usr/bin/bash for f in ./*.peptide; do \ blastp -query $f -db /home/adu/data/anglerfish_project/l_piscatorius_data/DOVETAIL_DATA/BLASTING/databases/Complete_genelist -out ${f%.peptide}"_blastout" -evalue 1 -outfmt '6 qseqid sseqid pident length qlen slen mismatch gapopen qstart qend sstart send evalue bitscore' -max_target_seqs 50 -num_threads 16; done ##################### ####reverse_blast.sh ###################### #!/usr/bin/bash for f in ./*.peptide; do \ blastp \ -query ./$f \ -db /home/adu/data/anglerfish_project/l_piscatorius_data/tblastn/BLAST_databases/uniprot_stuff/UNIPROT \ -out ./${f%.peptide}'_blastout_reverse_score' \ -evalue 1 \ -outfmt '6 qseqid sseqid pident length qlen slen mismatch gapopen qstart qend sstart send evalue bitscore' \ -max_target_seqs 50 \ -num_threads 16; done ##################### ####filter_forward_blast.py ###################### #!/usr/bin/env python with open('./gene_names', 'r') as inp: for line in inp: L = line.strip() print "for f in ./" + L + "_*_blastout; do grep '" + L + "'" + "$f > ${f%_blastout}'_forward_score'; done" #This script prints everything to stdout. I redirected the output to a separate file (filter_forward_blast.sh) #and then called it with bash filter_forward_blast.sh #That way "forward score" files would contain matches only to a gene of interest ##################### ####filter_forward_blast.sh ###################### for f in ./AID*_blastout; do grep 'AID' $f > ${f%_blastout}'_forward_score'; done for f in ./AIRE*_blastout; do grep 'AIRE' $f > ${f%_blastout}'_forward_score'; done for f in ./AP1M2*_blastout; do grep 'AP1M2' $f > ${f%_blastout}'_forward_score'; done for f in ./AP2M1*_blastout; do grep 'AP2M1' $f > ${f%_blastout}'_forward_score'; done for f in ./AP3M2*_blastout; do grep 'AP3M2' $f > ${f%_blastout}'_forward_score'; done for f in ./B2m*_blastout; do grep 'B2m' $f > ${f%_blastout}'_forward_score'; done for f in ./BATF*_blastout; do grep 'BATF' $f > ${f%_blastout}'_forward_score'; done for f in ./CD4*_blastout; do grep 'CD4' $f > ${f%_blastout}'_forward_score'; done for f in ./CD74a*_blastout; do grep 'CD74a' $f > ${f%_blastout}'_forward_score'; done for f in ./CD74b*_blastout; do grep 'CD74b' $f > ${f%_blastout}'_forward_score'; done for f in ./CD8a*_blastout; do grep 'CD8a' $f > ${f%_blastout}'_forward_score'; done for f in ./CD8b*_blastout; do grep 'CD8b' $f > ${f%_blastout}'_forward_score'; done for f in ./CIITA*_blastout; do grep 'CIITA' $f > ${f%_blastout}'_forward_score'; done for f in ./CTSS1*_blastout; do grep 'CTSS1' $f > ${f%_blastout}'_forward_score'; done for f in ./CTSS2*_blastout; do grep 'CTSS2' $f > ${f%_blastout}'_forward_score'; done for f in ./ERAP1*_blastout; do grep 'ERAP1' $f > ${f%_blastout}'_forward_score'; done for f in ./ERAP2*_blastout; do grep 'ERAP2' $f > ${f%_blastout}'_forward_score'; done for f in ./HSP90*_blastout; do grep 'HSP90' $f > ${f%_blastout}'_forward_score'; done for f in ./LNPEP*_blastout; do grep 'LNPEP' $f > ${f%_blastout}'_forward_score'; done for f in ./MHCI*_blastout; do grep 'MHCI' $f > ${f%_blastout}'_forward_score'; done for f in ./MHCIIa*_blastout; do grep 'MHCIIa' $f > ${f%_blastout}'_forward_score'; done for f in ./MHCIIb*_blastout; do grep 'MHCIIb' $f > ${f%_blastout}'_forward_score'; done for f in ./RAG1*_blastout; do grep 'RAG1' $f > ${f%_blastout}'_forward_score'; done for f in ./RAG2*_blastout; do grep 'RAG2' $f > ${f%_blastout}'_forward_score'; done for f in ./SEC61A1-2*_blastout; do grep 'SEC61A1-2' $f > ${f%_blastout}'_forward_score'; done for f in ./SEC61A1*_blastout; do grep 'SEC61A1' $f > ${f%_blastout}'_forward_score'; done for f in ./SEC61G*_blastout; do grep 'SEC61G' $f > ${f%_blastout}'_forward_score'; done for f in ./SSR3*_blastout; do grep 'SSR3' $f > ${f%_blastout}'_forward_score'; done for f in ./TAP1*_blastout; do grep 'TAP1' $f > ${f%_blastout}'_forward_score'; done for f in ./TAP2*_blastout; do grep 'TAP2' $f > ${f%_blastout}'_forward_score'; done for f in ./TAPBP*_blastout; do grep 'TAPBP' $f > ${f%_blastout}'_forward_score'; done ##################### ####process_forward_blastout.sh ###################### #!/usr/bin/bash for f in ./cd4_*_blastout; do grep 'CD4' $f > ${f%_blastout}'_forward_score'; done for f in ./cd74_*_blastout; do grep 'CD74' $f > ${f%_blastout}'_forward_score'; done for f in ./mhci_*_blastout; do grep 'MHCI_' $f > ${f%_blastout}'_forward_score'; done for f in ./cd8_*_blastout; do grep 'CD8' $f > ${f%_blastout}'_forward_score'; done for f in ./mhcii_*_blastout; do grep 'MHCII' $f > ${f%_blastout}'_forward_score'; done ##################### ####process_blast.py ###################### #!/usr/bin/env python ### from datetime import datetime startTime = datetime.now() import os import re import shutil import subprocess ### #names_blast contains the list of blastout files ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() with open('./%s' %NAME, 'r') as inp, open('./%s_blast_hits' %NAME, 'w') as outp: for line in inp: List = line.split('\t') hits = [List[0], List[1]] hits = str(hits) + '\n' outp.write(hits) ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() output = subprocess.check_output('cat ./%s_blast_hits | sort -u' %NAME, shell=True) with open('./%s_blast_hits_unique' %NAME, 'w') as outp: outp.write(output) ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() with open('./%s_blast_hits_unique' %NAME, 'r') as inp, open('./%s_gene_hits_only' %NAME, 'w') as outp: for line in inp: List = line.split('\t') genes = List[0] sstring = r"^\['(.*?)_" search = re.search(sstring, genes) match = search.group(1) match = str(match) + '\n' outp.write(match) ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() output = subprocess.check_output('cat ./%s_gene_hits_only | sort -u' %NAME, shell=True) with open('./%s_gene_hits_only' %NAME, 'w') as outp: outp.write(output) ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() outputlist = '' with open('./%s_gene_hits_only' %NAME, 'r') as Genes: names = [line.strip() for line in Genes] for name in names: with open('./%s.gene_%s' % (name, NAME), 'w') as output: with open('./%s_blast_hits_unique' %NAME, 'r') as Hits: for line in Hits: List = line.split('\t') gene = List[0] sstring = r"^\['(.*?)_" search = re.search(sstring, gene) match = search.group(1) if match in name: outputlist = outputlist + line output.write(outputlist) outputlist = '' else: outputlist = '' ### #with open('./names_blast', 'r') as names: # for name in names: # NAME = name # os.system("mkdir ./%s_GENE_HITS", %NAME) # os.system("mv ./*.gene ./GENE_HITS", %NAME) ### with open('./names_blast', 'r') as names: for name in names: NAME = name.strip() output = '' with open('./%s_gene_hits_only' %NAME, 'r') as gene_hits: names = [line.strip() for line in gene_hits] for name in names: with open('./%s.gene_%s' % (name, NAME), 'r') as genes: with open('./%s.contig_hits_%s' % (name, NAME), 'w') as contig_hits: for line in genes: List = line.split() gene_contig = List[1] sstring = r"^'(.*?)'\]" search = re.search(sstring, gene_contig) match = search.group(1) output = match + '\n' contig_hits.write(output) output = '' ## #os.system('for f in ./*.contig_hits*; do cat $f | sort -u > $f"_unique" ; done')