cfe-lab · Donaim · Dec 7, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 7, 2024
diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py
@@ -429,7 +429,7 @@ def run_denovo(self, excluded_seeds):
         with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \
              open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \
              open(self.blast_csv, 'w') as blast_csv:
-            fasta_to_csv(self.unstitched_contigs_fasta,
+            fasta_to_csv(Path(self.unstitched_contigs_fasta),
                          unstitched_contigs_csv,
                          merged_contigs_csv,
                          blast_csv=blast_csv,

diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py
@@ -16,7 +16,7 @@ def DEFAULT_DATABASE():
 
 @pytest.fixture(scope='session', name='hcv_db')
 def check_hcv_db(DEFAULT_DATABASE):
-    db_path = Path(DEFAULT_DATABASE)
+    db_path = DEFAULT_DATABASE
     index_path = db_path.parent / "refs.fasta.nin"
     build_needed = not index_path.exists()
     if not build_needed:
@@ -32,7 +32,7 @@ def check_hcv_db(DEFAULT_DATABASE):
 
 
 def test_make_blast_db_excludes_hivgha(hcv_db, DEFAULT_DATABASE):
-    fasta_path = Path(DEFAULT_DATABASE)
+    fasta_path = DEFAULT_DATABASE
     with fasta_path.open() as f:
         for reference in SeqIO.parse(f, 'fasta'):
             # Exclude the Ghana project, because they're recombinant.
@@ -55,7 +55,7 @@ def test_genotype(tmpdir, hcv_db):
 1,HCV-1a,41,1.0,100,1,41,8187,8227
 """
 
-    genotype(str(contigs_fasta), blast_csv=blast_csv)
+    genotype(contigs_fasta, blast_csv=blast_csv)
 
     assert expected_blast_csv == blast_csv.getvalue()
 
@@ -75,7 +75,7 @@ def test_fasta_to_csv_two_sequences(tmpdir, hcv_db):
 HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -98,7 +98,7 @@ def test_fasta_to_csv_two_groups(tmpdir, hcv_db):
 HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -115,7 +115,7 @@ def test_fasta_to_csv_not_found(tmpdir, hcv_db):
 unknown,0,,CATCACATAGGAGA
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -135,7 +135,7 @@ def test_fasta_to_csv_partial_match(tmpdir, hcv_db):
 HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -156,7 +156,7 @@ def test_fasta_to_csv_reversed_match(tmpdir, hcv_db):
 HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -183,7 +183,7 @@ def test_fasta_to_csv(tmpdir, hcv_db):
 1,HCV-1a,41,1.0,100,1,41,8187,8227
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv, blast_csv=blast_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv, blast_csv=blast_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
     assert expected_blast_csv == blast_csv.getvalue()
@@ -198,7 +198,7 @@ def test_fasta_to_csv_none(tmpdir, hcv_db):
 ref,match,group_ref,contig
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -220,7 +220,7 @@ def test_merged_contig(tmpdir, hcv_db):
 """
 
     with merged_contigs_path.open() as merged_contigs_csv:
-        fasta_to_csv(str(contigs_fasta),
+        fasta_to_csv(contigs_fasta,
                      contigs_csv,
                      merged_contigs_csv=merged_contigs_csv)
 

diff --git a/micall/utils/contig_blaster.py b/micall/utils/contig_blaster.py
@@ -4,6 +4,7 @@
 from itertools import groupby
 from operator import itemgetter
 from tempfile import NamedTemporaryFile
+from pathlib import Path
 
 from micall.utils.fasta_to_csv import fasta_to_csv
 
@@ -44,7 +45,7 @@ def main():
     fasta_file.flush()
     new_contigs_csv = StringIO()
     blast_csv = StringIO()
-    fasta_to_csv(fasta_file.name, new_contigs_csv, blast_csv=blast_csv)
+    fasta_to_csv(Path(fasta_file.name), new_contigs_csv, blast_csv=blast_csv)
     blast_csv.seek(0)
     for source_contig_num, contig_rows in groupby(DictReader(blast_csv),
                                                   itemgetter('contig_num')):

diff --git a/micall/utils/contig_summary.py b/micall/utils/contig_summary.py
@@ -3,26 +3,15 @@
 from io import StringIO
 from pathlib import Path
 
-from Bio.Blast.Applications import NcbiblastnCommandline
-
 from micall.utils.fasta_to_csv import default_database
+from micall.utils.externals import Blastn
 
 import matplotlib
 matplotlib.use('Agg')
 from matplotlib import pyplot as plt  # noqa
 
 DEFAULT_SCRATCH_PATH = (Path(__file__).parent.parent / "tests" / "working" /
                         "basespace_linked_data" / "scratch")
-BLAST_COLUMNS = ['qaccver',
-                 'saccver',
-                 'qlen',
-                 'pident',
-                 'score',
-                 'qcovhsp',
-                 'qstart',
-                 'qend',
-                 'sstart',
-                 'send']
 
 
 def parse_args():
@@ -40,7 +29,6 @@ def main():
     sample_dirs = [d for d in args.scratch.iterdir() if d.is_dir()]
     contig_plots_path = sample_dirs[0].parent / 'contig_plots'
     contig_plots_path.mkdir(exist_ok=True)
-    blast_format = f'"10 {" ".join(BLAST_COLUMNS)}"'
     sample_dirs.sort()
     incomplete_count = empty_count = 0
     for sample_dir in sample_dirs:
@@ -62,16 +50,10 @@ def main():
                 continue
             contigs_fasta_path, = contigs_fasta_paths
             with default_database() as DEFAULT_DATABASE:
-                cline = NcbiblastnCommandline(query=str(contigs_fasta_path),
-                                              db=DEFAULT_DATABASE,
-                                              outfmt=blast_format,
-                                              evalue=0.0001,
-                                              gapopen=5,
-                                              gapextend=2,
-                                              penalty=-3,
-                                              reward=1,
-                                              max_target_seqs=5000)
-                stdout, _ = cline(stderr=False)
+                stdout = Blastn().genotype(
+                    contigs_fasta=contigs_fasta_path,
+                    database=DEFAULT_DATABASE,
+                )
             plot_contigs(sample_dir, stdout)
             plot_path = contig_plots_path / (sample_dir.name + '.png')
             plt.savefig(str(plot_path))
@@ -84,7 +66,7 @@ def main():
 
 
 def plot_contigs(sample_dir, contigs_csv):
-    reader = DictReader(StringIO(contigs_csv), BLAST_COLUMNS)
+    reader = DictReader(StringIO(contigs_csv), Blastn.BLAST_COLUMNS)
     rows = list(reader)
     rows.sort(reverse=True, key=lambda row: int(row['score']))
     contig_names = sorted({row['qaccver'] for row in rows})