Skip to content

Commit

Permalink
Factor out blastn invocations into externals.py
Browse files Browse the repository at this point in the history
Also improve typing in externals.py and fasta_to_csv.py
  • Loading branch information
Donaim committed Dec 7, 2024
1 parent 4ff8eec commit 6de7d17
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 81 deletions.
2 changes: 1 addition & 1 deletion micall/drivers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def run_denovo(self, excluded_seeds):
with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \
open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \
open(self.blast_csv, 'w') as blast_csv:
fasta_to_csv(self.unstitched_contigs_fasta,
fasta_to_csv(Path(self.unstitched_contigs_fasta),
unstitched_contigs_csv,
merged_contigs_csv,
blast_csv=blast_csv,
Expand Down
22 changes: 11 additions & 11 deletions micall/tests/test_fasta_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def DEFAULT_DATABASE():

@pytest.fixture(scope='session', name='hcv_db')
def check_hcv_db(DEFAULT_DATABASE):
db_path = Path(DEFAULT_DATABASE)
db_path = DEFAULT_DATABASE
index_path = db_path.parent / "refs.fasta.nin"
build_needed = not index_path.exists()
if not build_needed:
Expand All @@ -32,7 +32,7 @@ def check_hcv_db(DEFAULT_DATABASE):


def test_make_blast_db_excludes_hivgha(hcv_db, DEFAULT_DATABASE):
fasta_path = Path(DEFAULT_DATABASE)
fasta_path = DEFAULT_DATABASE
with fasta_path.open() as f:
for reference in SeqIO.parse(f, 'fasta'):
# Exclude the Ghana project, because they're recombinant.
Expand All @@ -55,7 +55,7 @@ def test_genotype(tmpdir, hcv_db):
1,HCV-1a,41,1.0,100,1,41,8187,8227
"""

genotype(str(contigs_fasta), blast_csv=blast_csv)
genotype(contigs_fasta, blast_csv=blast_csv)

assert expected_blast_csv == blast_csv.getvalue()

Expand All @@ -75,7 +75,7 @@ def test_fasta_to_csv_two_sequences(tmpdir, hcv_db):
HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -98,7 +98,7 @@ def test_fasta_to_csv_two_groups(tmpdir, hcv_db):
HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -115,7 +115,7 @@ def test_fasta_to_csv_not_found(tmpdir, hcv_db):
unknown,0,,CATCACATAGGAGA
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -135,7 +135,7 @@ def test_fasta_to_csv_partial_match(tmpdir, hcv_db):
HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -156,7 +156,7 @@ def test_fasta_to_csv_reversed_match(tmpdir, hcv_db):
HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -183,7 +183,7 @@ def test_fasta_to_csv(tmpdir, hcv_db):
1,HCV-1a,41,1.0,100,1,41,8187,8227
"""

fasta_to_csv(str(contigs_fasta), contigs_csv, blast_csv=blast_csv)
fasta_to_csv(contigs_fasta, contigs_csv, blast_csv=blast_csv)

assert expected_contigs_csv == contigs_csv.getvalue()
assert expected_blast_csv == blast_csv.getvalue()
Expand All @@ -198,7 +198,7 @@ def test_fasta_to_csv_none(tmpdir, hcv_db):
ref,match,group_ref,contig
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -220,7 +220,7 @@ def test_merged_contig(tmpdir, hcv_db):
"""

with merged_contigs_path.open() as merged_contigs_csv:
fasta_to_csv(str(contigs_fasta),
fasta_to_csv(contigs_fasta,
contigs_csv,
merged_contigs_csv=merged_contigs_csv)

Expand Down
3 changes: 2 additions & 1 deletion micall/utils/contig_blaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from itertools import groupby
from operator import itemgetter
from tempfile import NamedTemporaryFile
from pathlib import Path

from micall.utils.fasta_to_csv import fasta_to_csv

Expand Down Expand Up @@ -44,7 +45,7 @@ def main():
fasta_file.flush()
new_contigs_csv = StringIO()
blast_csv = StringIO()
fasta_to_csv(fasta_file.name, new_contigs_csv, blast_csv=blast_csv)
fasta_to_csv(Path(fasta_file.name), new_contigs_csv, blast_csv=blast_csv)
blast_csv.seek(0)
for source_contig_num, contig_rows in groupby(DictReader(blast_csv),
itemgetter('contig_num')):
Expand Down
31 changes: 6 additions & 25 deletions micall/utils/contig_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,16 @@
from csv import DictReader
from io import StringIO
from pathlib import Path
import subprocess

from micall.utils.fasta_to_csv import default_database
from micall.utils.externals import Blastn

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt # noqa

DEFAULT_SCRATCH_PATH = (Path(__file__).parent.parent / "tests" / "working" /
"basespace_linked_data" / "scratch")
BLAST_COLUMNS = ['qaccver',
'saccver',
'qlen',
'pident',
'score',
'qcovhsp',
'qstart',
'qend',
'sstart',
'send']


def parse_args():
Expand All @@ -39,7 +29,6 @@ def main():
sample_dirs = [d for d in args.scratch.iterdir() if d.is_dir()]
contig_plots_path = sample_dirs[0].parent / 'contig_plots'
contig_plots_path.mkdir(exist_ok=True)
blast_format = f'"10 {" ".join(BLAST_COLUMNS)}"'
sample_dirs.sort()
incomplete_count = empty_count = 0
for sample_dir in sample_dirs:
Expand All @@ -61,18 +50,10 @@ def main():
continue
contigs_fasta_path, = contigs_fasta_paths
with default_database() as DEFAULT_DATABASE:
program = ["blastn",
"-outfmt", blast_format,
"-query", str(contigs_fasta_path),
"-db", str(DEFAULT_DATABASE),
"-evalue", "0.0001",
"-max_target_seqs", "5000",
"-gapopen", "5",
"-gapextend", "2",
"-penalty", "-3",
"-reward", "1",
]
stdout = subprocess.check_output(program).decode()
stdout = Blastn().genotype(
contigs_fasta=contigs_fasta_path,
database=DEFAULT_DATABASE,
)
plot_contigs(sample_dir, stdout)
plot_path = contig_plots_path / (sample_dir.name + '.png')
plt.savefig(str(plot_path))
Expand All @@ -85,7 +66,7 @@ def main():


def plot_contigs(sample_dir, contigs_csv):
reader = DictReader(StringIO(contigs_csv), BLAST_COLUMNS)
reader = DictReader(StringIO(contigs_csv), Blastn.BLAST_COLUMNS)
rows = list(reader)
rows.sort(reverse=True, key=lambda row: int(row['score']))
contig_names = sorted({row['qaccver'] for row in rows})
Expand Down
35 changes: 35 additions & 0 deletions micall/utils/externals.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,38 @@ def buffered_count(self, filename):
buf = read_f(buf_size)

return lines


class Blastn(CommandWrapper):
BLAST_COLUMNS = ['qaccver',
'saccver',
'qlen',
'pident',
'score',
'qcovhsp',
'qstart',
'qend',
'sstart',
'send',
]

def __init__(self, version=None, execname='blastn', logger=None, **kwargs):
super(Blastn, self).__init__(version, execname, logger, **kwargs)
stdout = self.check_output(['-version'], stderr=subprocess.STDOUT)
version_found = stdout.split('\n')[0].split()[-1]
self.validate_version(version_found)

def genotype(self, database: Path, contigs_fasta: Path) -> str:
blast_format = '10 ' + " ".join(Blastn.BLAST_COLUMNS)
program = ["-outfmt", blast_format,
"-query", str(contigs_fasta),
"-db", str(database),
"-evalue", "0.0001",
"-max_target_seqs", "5000",
"-gapopen", "5",
"-gapextend", "2",
"-penalty", "-3",
"-reward", "1",
]
stdout = self.check_output(program)
return stdout
Loading

0 comments on commit 6de7d17

Please sign in to comment.