Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove usage of biopython application #1209

Merged
merged 8 commits into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion micall/drivers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def run_denovo(self, excluded_seeds):
with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \
open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \
open(self.blast_csv, 'w') as blast_csv:
fasta_to_csv(self.unstitched_contigs_fasta,
fasta_to_csv(Path(self.unstitched_contigs_fasta),
unstitched_contigs_csv,
merged_contigs_csv,
blast_csv=blast_csv,
Expand Down
22 changes: 11 additions & 11 deletions micall/tests/test_fasta_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def DEFAULT_DATABASE():

@pytest.fixture(scope='session', name='hcv_db')
def check_hcv_db(DEFAULT_DATABASE):
db_path = Path(DEFAULT_DATABASE)
db_path = DEFAULT_DATABASE
index_path = db_path.parent / "refs.fasta.nin"
build_needed = not index_path.exists()
if not build_needed:
Expand All @@ -32,7 +32,7 @@ def check_hcv_db(DEFAULT_DATABASE):


def test_make_blast_db_excludes_hivgha(hcv_db, DEFAULT_DATABASE):
fasta_path = Path(DEFAULT_DATABASE)
fasta_path = DEFAULT_DATABASE
with fasta_path.open() as f:
for reference in SeqIO.parse(f, 'fasta'):
# Exclude the Ghana project, because they're recombinant.
Expand All @@ -55,7 +55,7 @@ def test_genotype(tmpdir, hcv_db):
1,HCV-1a,41,1.0,100,1,41,8187,8227
"""

genotype(str(contigs_fasta), blast_csv=blast_csv)
genotype(contigs_fasta, blast_csv=blast_csv)

assert expected_blast_csv == blast_csv.getvalue()

Expand All @@ -75,7 +75,7 @@ def test_fasta_to_csv_two_sequences(tmpdir, hcv_db):
HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -98,7 +98,7 @@ def test_fasta_to_csv_two_groups(tmpdir, hcv_db):
HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -115,7 +115,7 @@ def test_fasta_to_csv_not_found(tmpdir, hcv_db):
unknown,0,,CATCACATAGGAGA
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -135,7 +135,7 @@ def test_fasta_to_csv_partial_match(tmpdir, hcv_db):
HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -156,7 +156,7 @@ def test_fasta_to_csv_reversed_match(tmpdir, hcv_db):
HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -183,7 +183,7 @@ def test_fasta_to_csv(tmpdir, hcv_db):
1,HCV-1a,41,1.0,100,1,41,8187,8227
"""

fasta_to_csv(str(contigs_fasta), contigs_csv, blast_csv=blast_csv)
fasta_to_csv(contigs_fasta, contigs_csv, blast_csv=blast_csv)

assert expected_contigs_csv == contigs_csv.getvalue()
assert expected_blast_csv == blast_csv.getvalue()
Expand All @@ -198,7 +198,7 @@ def test_fasta_to_csv_none(tmpdir, hcv_db):
ref,match,group_ref,contig
"""

fasta_to_csv(str(contigs_fasta), contigs_csv)
fasta_to_csv(contigs_fasta, contigs_csv)

assert expected_contigs_csv == contigs_csv.getvalue()

Expand All @@ -220,7 +220,7 @@ def test_merged_contig(tmpdir, hcv_db):
"""

with merged_contigs_path.open() as merged_contigs_csv:
fasta_to_csv(str(contigs_fasta),
fasta_to_csv(contigs_fasta,
contigs_csv,
merged_contigs_csv=merged_contigs_csv)

Expand Down
3 changes: 2 additions & 1 deletion micall/utils/contig_blaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from itertools import groupby
from operator import itemgetter
from tempfile import NamedTemporaryFile
from pathlib import Path

from micall.utils.fasta_to_csv import fasta_to_csv

Expand Down Expand Up @@ -44,7 +45,7 @@ def main():
fasta_file.flush()
new_contigs_csv = StringIO()
blast_csv = StringIO()
fasta_to_csv(fasta_file.name, new_contigs_csv, blast_csv=blast_csv)
fasta_to_csv(Path(fasta_file.name), new_contigs_csv, blast_csv=blast_csv)
blast_csv.seek(0)
for source_contig_num, contig_rows in groupby(DictReader(blast_csv),
itemgetter('contig_num')):
Expand Down
30 changes: 6 additions & 24 deletions micall/utils/contig_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,15 @@
from io import StringIO
from pathlib import Path

from Bio.Blast.Applications import NcbiblastnCommandline

from micall.utils.fasta_to_csv import default_database
from micall.utils.externals import Blastn

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt # noqa

DEFAULT_SCRATCH_PATH = (Path(__file__).parent.parent / "tests" / "working" /
"basespace_linked_data" / "scratch")
BLAST_COLUMNS = ['qaccver',
'saccver',
'qlen',
'pident',
'score',
'qcovhsp',
'qstart',
'qend',
'sstart',
'send']


def parse_args():
Expand All @@ -40,7 +29,6 @@ def main():
sample_dirs = [d for d in args.scratch.iterdir() if d.is_dir()]
contig_plots_path = sample_dirs[0].parent / 'contig_plots'
contig_plots_path.mkdir(exist_ok=True)
blast_format = f'"10 {" ".join(BLAST_COLUMNS)}"'
sample_dirs.sort()
incomplete_count = empty_count = 0
for sample_dir in sample_dirs:
Expand All @@ -62,16 +50,10 @@ def main():
continue
contigs_fasta_path, = contigs_fasta_paths
with default_database() as DEFAULT_DATABASE:
cline = NcbiblastnCommandline(query=str(contigs_fasta_path),
db=DEFAULT_DATABASE,
outfmt=blast_format,
evalue=0.0001,
gapopen=5,
gapextend=2,
penalty=-3,
reward=1,
max_target_seqs=5000)
stdout, _ = cline(stderr=False)
stdout = Blastn().genotype(
contigs_fasta=contigs_fasta_path,
database=DEFAULT_DATABASE,
)
plot_contigs(sample_dir, stdout)
plot_path = contig_plots_path / (sample_dir.name + '.png')
plt.savefig(str(plot_path))
Expand All @@ -84,7 +66,7 @@ def main():


def plot_contigs(sample_dir, contigs_csv):
reader = DictReader(StringIO(contigs_csv), BLAST_COLUMNS)
reader = DictReader(StringIO(contigs_csv), Blastn.BLAST_COLUMNS)
rows = list(reader)
rows.sort(reverse=True, key=lambda row: int(row['score']))
contig_names = sorted({row['qaccver'] for row in rows})
Expand Down
Loading