Bioinformatiha 11 - Firenze 14-15 Ottobre 2024

#### SOFTWARE NEEDED ####
- SPADES - https://github.com/ablab/spades - DOI: 10.1002/cpbi.102
- FragGeneScan - https://sourceforge.net/projects/fraggenescan/ DOI:10.1093/nar/gkq747
- HMMER - http://hmmer.org/ DOI: 10.1371/journal.pcbi.1000069
- ALAN - https://github.com/mpdunne/alan

#### FOLDER STRUCTURE ####
- reads: original reads in fastq.gz
- assembled: combined output of metaSpades and FragGeneScan
- build: material for building hmms
- hmms: precompiled hmms
- userXXX: students must stay there

#### CONNECTION to the REMOTE MACHINE ####
ssh -X bioinfo11@150.217.159.17
PASSWORD: #14ott24#

######################### LAB START #########################

#switch on the conda environment
conda activate functional

######################################
#### ASSEMBLY AND PROTEIN FINDING ####
######################################

#assemble the short reads (1 million reads per sample, 6 samples)
metaspades.py -1 ../reads/sample01_1.fastq.gz -2 ../reads/sample01_2.fastq.gz -o sample01_ctrl
run_FragGeneScan.pl -genome=sample01_ctrl/scaffolds.fasta -o sample01_ctrl/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10
#metaspades.py -1 ../reads/sample02_1.fastq.gz -2 ../reads/sample02_2.fastq.gz -o sample02_ctrl
#run_FragGeneScan.pl -genome=sample02_ctrl/scaffolds.fasta -o sample02_ctrl/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10
#metaspades.py -1 ../reads/sample03_1.fastq.gz -2 ../reads/sample03_2.fastq.gz -o sample03_ctrl
#run_FragGeneScan.pl -genome=sample03_ctrl/scaffolds.fasta -o sample03_ctrl/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10
#metaspades.py -1 ../reads/sample04_1.fastq.gz -2 ../reads/sample04_2.fastq.gz -o sample04_compost
#run_FragGeneScan.pl -genome=sample04_compost/scaffolds.fasta -o sample04_compost/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10
#metaspades.py -1 ../reads/sample05_1.fastq.gz -2 ../reads/sample05_2.fastq.gz -o sample05_compost
#run_FragGeneScan.pl -genome=sample05_compost/scaffolds.fasta -o sample05_compost/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10
#metaspades.py -1 ../reads/sample06_1.fastq.gz -2 ../reads/sample06_2.fastq.gz -o sample06_compost
#run_FragGeneScan.pl -genome=sample06_compost/scaffolds.fasta -o sample06_compost/scaffolds.fasta.prot -complete=0 -train=illumina_10 -thread=10

#########################################
#### ANTIBIOTIC RESISTANCE MODELLING ####
#########################################

### VIOMYCIN PHOSPHOTRANSFERASE ###
# go to www.uniprot.org
# viomycin resistance has been studied in Streptomyces vinaceus
# go to https://www.uniprot.org/uniprotkb/P18623/entry
# last section is "similar_proteins"
# click on 90% identity
# we see the preformed uniref_cluster_90:UniRef90_P18623 with 15 proteins
# click on View all
# tick the selector to select all proteins
# click on tools -> align
# we notice a warning, indicating to remove some proteins because they are identical
# go back and remove protein 8(A0A1V0UK02),11(A0A1G6NDG2),12(A0A6G3PQN2),13(D3K8V0)
# click on tools -> align. No more warning, we can go on with 11 proteins
# scroll and evaluate the alignment to study variations in similar organisms. Seems good.
# click on download, choose to save in "raw submitted sequence".
# Open the downloaded text file (or copy if text is shown on screen).
# type cat > viomycin.fasta [enter]
# paste the copied text in the terminal
# press [enter], then type ctrl-d (end of input) to close the file
# now you should have a file named viomycin.fasta in your space
# let's realign sequences using a local clustalo
clustalo -i viomycin.fasta -o viomycin.aln
# inspect the alignment (press q to exit!)
../bin/alan viomycin.aln
# build the hidden Markov model from the viomycin alignment
# (-n sets the name stored inside the .hmm file)
hmmbuild -n viomycin viomycin.hmm viomycin.aln
# stamp the model in binary format (makes calculations faster)
# NOTE(review): the HMMER guide describes hmmpress indexes as input for
# hmmscan; hmmsearch reads the plain .hmm file - confirm this step is needed.
hmmpress viomycin.hmm
# to search in sequences use: hmmsearch viomycin.hmm target
# For every assembled sample, print how many report lines contain ">>"
# (hmmsearch prints one ">>" header per target sequence with a reported hit).
for i in ../assembled/sample*/scaffolds.fasta.prot.faa; do
  # an unmatched glob stays literal: skip it instead of calling hmmsearch on it
  [ -e "$i" ] || continue
  hmmsearch viomycin.hmm "$i" | grep -c ">>"
done

### RIFAMPICIN MONOOXYGENASE ###
# go to www.uniprot.org
# rifampicin resistance has been studied in Nocardia farcinica
# go to https://www.uniprot.org/uniprotkb/Q5YTV5/entry
# using the strategy seen before (90% similarity) just returns 4 proteins from different strains of the same genus
# we can then switch to the query "rifampicin monooxygenase": we obtain 16 valid proteins, with no redundancy
# select all entries and click Download: this time click on "Generate URL for API"
# this returns a link that can be used to directly download proteins to our terminal
# (the URL must stay quoted: it contains "&", which the shell would otherwise
# treat as "run in background")
wget -O rifampicin.fasta "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%22rifampicin+monooxygenase%22%29"
# we now have all 16 proteins in our space, ready to be aligned and checked
clustalo -i rifampicin.fasta -o rifampicin.aln
# inspect the rifampicin alignment (press q to exit!)
../bin/alan rifampicin.aln
# it seems consistent, let's then build the hidden Markov model
hmmbuild -n rifampicin rifampicin.hmm rifampicin.aln
hmmpress rifampicin.hmm
for i in ../assembled/sample*/scaffolds.fasta.prot.faa; do
  [ -e "$i" ] || continue
  hmmsearch rifampicin.hmm "$i" | grep -c ">>"
done

### QUINOLONE ###
#go to www.uniprot.org
# download the UniRef90 cluster of P0A0J4 straight to the terminal, as done
# for rifampicin (this creates the quinolone.fasta that clustalo reads below)
wget -O quinolone.fasta "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28uniref_cluster_90%3AUniRef90_P0A0J4%29"
clustalo -i quinolone.fasta -o quinolone.aln
# inspect the quinolone alignment (press q to exit!)
../bin/alan quinolone.aln
# it seems consistent, let's then build the hidden Markov model
# (the model is named after its own family, not "rifampicin")
hmmbuild -n quinolone quinolone.hmm quinolone.aln
hmmpress quinolone.hmm
for i in ../assembled/sample*/scaffolds.fasta.prot.faa; do
  [ -e "$i" ] || continue
  hmmsearch quinolone.hmm "$i" | grep -c ">>"
done

### AUTOSCANNER FOR NCBI HMM COLLECTION #########
#from user's home
# For each model of the NCBI HMM collection: copy it locally as "tmp", show its
# description, print "<sample>: <number of '>>' lines>" for every assembled
# sample, clean up, then wait for Enter before moving to the next model.
# NOTE(review): the folder list at the top of this handout calls the
# precompiled-HMM folder "hmms", while this path uses "../hmm" - confirm.
for j in ../hmm/db/hmm_NCBI/HMM/*; do
  [ -e "$j" ] || continue
  cp -- "$j" tmp
  # the copy may inherit a read-only mode; owner write access is all that is
  # needed here (the original used a world-writable 777)
  chmod u+w tmp
  hmmpress -f tmp
  # show the model's DESC line together with the line just before it
  grep -B1 DESC tmp
  for i in ../assembled/sample*/scaffolds.fasta.prot.faa; do
    [ -e "$i" ] || continue
    # label the count with the sample folder name, e.g. "sample01_ctrl: "
    sample_dir=${i%/*}
    printf '%s: ' "${sample_dir##*/}"
    hmmsearch tmp "$i" | grep -c ">>"
  done
  # remove only the copied model and the hmmpress index files (tmp.h3f/i/m/p),
  # not every file whose name starts with "tmp"
  rm -f -- tmp tmp.h3?
  # pause: press Enter to scan the next model
  read -r
done