Exam_cheatsheet
March 22, 2025
1 Exam cheatsheet
1.1 Prepare data
[ ]: from Bio.Blast import NCBIWWW, NCBIXML
from Bio import Entrez, SeqIO, Phylo, AlignIO
import pandas as pd
import time, os, alv, re
from IPython.display import Image #optional
[ ]: %load_ext rpy2.ipython
[ ]: %%R
library("ape")
library("adephylo")
library("phytools")
[ ]: baseName = ""
# Additional base names can be added; number them and remember to add them in the downstream cells too (see the SARS exercise).
folderName = "exercise_"
# Set baseName to the name of the input file without the ".fasta" extension.
# Set folderName to "exercise_" followed by that base name,
# e.g. baseName = "HIV_virus", folderName = "exercise_HIV_virus".
[ ]: %Rpush baseName folderName
[ ]: %%bash -s "$folderName"
# Create the exercise folder. -p makes the cell safe to re-run: it does not
# fail if the folder already exists, and existing contents are left untouched.
# The argument is quoted so folder names containing spaces still work.
mkdir -p "${1}"
Move the uploaded file into the folder for this exercise. You can also directly upload the sequence
file to today’s exercise folder. In that case, skip the next cell.
[ ]: %%bash -s "$baseName" "$folderName"
# Copy the uploaded FASTA file into the exercise folder
# ($1 = baseName, $2 = folderName).
cp "${1}.fasta" "${2}/${1}.fasta"
# If more files need to be moved, uncomment the line below and replace
# <fileFormat> with the real file extension:
# cp "${1}.<fileFormat>" "${2}/${1}.<fileFormat>"
[ ]: os.chdir(folderName)  # work inside the exercise folder from here on; the cells below use relative file names
1.2 BlastN search (session01)
[ ]: inputFile = baseName+'.fasta'
query = SeqIO.read(inputFile, format="fasta")
search_type = "blastn"
database = "nt"
max_nr_hits = 100
before = round(time.time(), ndigits=0)
blast_stream = NCBIWWW.qblast(program = search_type,
database = database,
sequence = query.seq,
alignments = 1,
hitlist_size = max_nr_hits)
after = round(time.time(), ndigits=0)
text = f"The BLAST search took {after - before} seconds."
print(text)
Save result BlastN
[ ]: blastn_result_file = baseName+".blastn.result.xml"
with open(blastn_result_file, 'w') as file:
file.write(blast_stream.read())
[ ]: blastn_result_file = baseName+".blastn.result.xml"  # same path as in the save step; NOTE(review): presumably repeated so the notebook can resume from a saved result without re-running BLAST — confirm
Extract the sequences in fasta format from the BLAST search results
[ ]: all_data_fasta = baseName+".blastn.result.fasta"
with open(blastn_result_file) as result_handle:
blast_records = NCBIXML.parse(result_handle)
with open(all_data_fasta, 'w') as fasta_handle:
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
db_seq = hsp.sbjct
db_id = alignment.accession
fasta_handle.write(f">{db_id}\n{db_seq}\n")
2
March 22, 2025
1 Exam cheatsheet
1.1 Prepare data
[ ]: from Bio.Blast import NCBIWWW, NCBIXML
from Bio import Entrez, SeqIO, Phylo, AlignIO
import pandas as pd
import time, os, alv, re
from IPython.display import Image #optional
[ ]: %load_ext rpy2.ipython
[ ]: %%R
library("ape")
library("adephylo")
library("phytools")
[ ]: baseName = ""
# Additional base names can be added; number them and remember to add them in the downstream cells too (see the SARS exercise).
folderName = "exercise_"
# Set baseName to the name of the input file without the ".fasta" extension.
# Set folderName to "exercise_" followed by that base name,
# e.g. baseName = "HIV_virus", folderName = "exercise_HIV_virus".
[ ]: %Rpush baseName folderName
[ ]: %%bash -s "$folderName"
# Create the exercise folder. -p makes the cell safe to re-run: it does not
# fail if the folder already exists, and existing contents are left untouched.
# The argument is quoted so folder names containing spaces still work.
mkdir -p "${1}"
Move the uploaded file into the folder for this exercise. You can also directly upload the sequence
file to today’s exercise folder. In that case, skip the next cell.
[ ]: %%bash -s "$baseName" "$folderName"
# Copy the uploaded FASTA file into the exercise folder
# ($1 = baseName, $2 = folderName).
cp "${1}.fasta" "${2}/${1}.fasta"
# If more files need to be moved, uncomment the line below and replace
# <fileFormat> with the real file extension:
# cp "${1}.<fileFormat>" "${2}/${1}.<fileFormat>"
[ ]: os.chdir(folderName)  # work inside the exercise folder from here on; the cells below use relative file names
1.2 BlastN search (session01)
[ ]: inputFile = baseName+'.fasta'
query = SeqIO.read(inputFile, format="fasta")
search_type = "blastn"
database = "nt"
max_nr_hits = 100
before = round(time.time(), ndigits=0)
blast_stream = NCBIWWW.qblast(program = search_type,
database = database,
sequence = query.seq,
alignments = 1,
hitlist_size = max_nr_hits)
after = round(time.time(), ndigits=0)
text = f"The BLAST search took {after - before} seconds."
print(text)
Save result BlastN
[ ]: blastn_result_file = baseName+".blastn.result.xml"
with open(blastn_result_file, 'w') as file:
file.write(blast_stream.read())
[ ]: blastn_result_file = baseName+".blastn.result.xml"  # same path as in the save step; NOTE(review): presumably repeated so the notebook can resume from a saved result without re-running BLAST — confirm
Extract the sequences in fasta format from the BLAST search results
[ ]: all_data_fasta = baseName+".blastn.result.fasta"
with open(blastn_result_file) as result_handle:
blast_records = NCBIXML.parse(result_handle)
with open(all_data_fasta, 'w') as fasta_handle:
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
db_seq = hsp.sbjct
db_id = alignment.accession
fasta_handle.write(f">{db_id}\n{db_seq}\n")
2