From 6de7d1768245644bb28fdbe571f667285bc5881d Mon Sep 17 00:00:00 2001
From: Vitaliy Mysak <vmysak@bccfe.ca>
Date: Fri, 6 Dec 2024 16:12:52 -0800
Subject: [PATCH] Factor out blastn invocations into externals.py

Also improve typing in externals.py and fasta_to_csv.py
---
 micall/drivers/sample.py          |  2 +-
 micall/tests/test_fasta_to_csv.py | 22 +++++-----
 micall/utils/contig_blaster.py    |  3 +-
 micall/utils/contig_summary.py    | 31 +++-----------
 micall/utils/externals.py         | 35 ++++++++++++++++
 micall/utils/fasta_to_csv.py      | 68 ++++++++++++-------------------
 6 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py
index c63c3353c..20e64fd09 100644
--- a/micall/drivers/sample.py
+++ b/micall/drivers/sample.py
@@ -429,7 +429,7 @@ def run_denovo(self, excluded_seeds):
         with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \
              open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \
              open(self.blast_csv, 'w') as blast_csv:
-            fasta_to_csv(self.unstitched_contigs_fasta,
+            fasta_to_csv(Path(self.unstitched_contigs_fasta),
                          unstitched_contigs_csv,
                          merged_contigs_csv,
                          blast_csv=blast_csv,
diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py
index 4ab951b6d..70edbf9d6 100644
--- a/micall/tests/test_fasta_to_csv.py
+++ b/micall/tests/test_fasta_to_csv.py
@@ -16,7 +16,7 @@ def DEFAULT_DATABASE():
 
 @pytest.fixture(scope='session', name='hcv_db')
 def check_hcv_db(DEFAULT_DATABASE):
-    db_path = Path(DEFAULT_DATABASE)
+    db_path = DEFAULT_DATABASE
     index_path = db_path.parent / "refs.fasta.nin"
     build_needed = not index_path.exists()
     if not build_needed:
@@ -32,7 +32,7 @@ def check_hcv_db(DEFAULT_DATABASE):
 
 
 def test_make_blast_db_excludes_hivgha(hcv_db, DEFAULT_DATABASE):
-    fasta_path = Path(DEFAULT_DATABASE)
+    fasta_path = DEFAULT_DATABASE
     with fasta_path.open() as f:
         for reference in SeqIO.parse(f, 'fasta'):
             # Exclude the Ghana project, because they're recombinant.
@@ -55,7 +55,7 @@ def test_genotype(tmpdir, hcv_db):
 1,HCV-1a,41,1.0,100,1,41,8187,8227
 """
 
-    genotype(str(contigs_fasta), blast_csv=blast_csv)
+    genotype(contigs_fasta, blast_csv=blast_csv)
 
     assert expected_blast_csv == blast_csv.getvalue()
 
@@ -75,7 +75,7 @@ def test_fasta_to_csv_two_sequences(tmpdir, hcv_db):
 HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -98,7 +98,7 @@ def test_fasta_to_csv_two_groups(tmpdir, hcv_db):
 HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -115,7 +115,7 @@ def test_fasta_to_csv_not_found(tmpdir, hcv_db):
 unknown,0,,CATCACATAGGAGA
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -135,7 +135,7 @@ def test_fasta_to_csv_partial_match(tmpdir, hcv_db):
 HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -156,7 +156,7 @@ def test_fasta_to_csv_reversed_match(tmpdir, hcv_db):
 HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -183,7 +183,7 @@ def test_fasta_to_csv(tmpdir, hcv_db):
 1,HCV-1a,41,1.0,100,1,41,8187,8227
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv, blast_csv=blast_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv, blast_csv=blast_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
     assert expected_blast_csv == blast_csv.getvalue()
@@ -198,7 +198,7 @@ def test_fasta_to_csv_none(tmpdir, hcv_db):
 ref,match,group_ref,contig
 """
 
-    fasta_to_csv(str(contigs_fasta), contigs_csv)
+    fasta_to_csv(contigs_fasta, contigs_csv)
 
     assert expected_contigs_csv == contigs_csv.getvalue()
 
@@ -220,7 +220,7 @@ def test_merged_contig(tmpdir, hcv_db):
 """
 
     with merged_contigs_path.open() as merged_contigs_csv:
-        fasta_to_csv(str(contigs_fasta),
+        fasta_to_csv(contigs_fasta,
                      contigs_csv,
                      merged_contigs_csv=merged_contigs_csv)
 
diff --git a/micall/utils/contig_blaster.py b/micall/utils/contig_blaster.py
index ccdbacd0a..20403bb9c 100644
--- a/micall/utils/contig_blaster.py
+++ b/micall/utils/contig_blaster.py
@@ -4,6 +4,7 @@
 from itertools import groupby
 from operator import itemgetter
 from tempfile import NamedTemporaryFile
+from pathlib import Path
 
 from micall.utils.fasta_to_csv import fasta_to_csv
 
@@ -44,7 +45,7 @@ def main():
     fasta_file.flush()
     new_contigs_csv = StringIO()
     blast_csv = StringIO()
-    fasta_to_csv(fasta_file.name, new_contigs_csv, blast_csv=blast_csv)
+    fasta_to_csv(Path(fasta_file.name), new_contigs_csv, blast_csv=blast_csv)
     blast_csv.seek(0)
     for source_contig_num, contig_rows in groupby(DictReader(blast_csv),
                                                   itemgetter('contig_num')):
diff --git a/micall/utils/contig_summary.py b/micall/utils/contig_summary.py
index 95f3b6f96..7b8529b99 100644
--- a/micall/utils/contig_summary.py
+++ b/micall/utils/contig_summary.py
@@ -2,9 +2,9 @@
 from csv import DictReader
 from io import StringIO
 from pathlib import Path
-import subprocess
 
 from micall.utils.fasta_to_csv import default_database
+from micall.utils.externals import Blastn
 
 import matplotlib
 matplotlib.use('Agg')
@@ -12,16 +12,6 @@
 
 DEFAULT_SCRATCH_PATH = (Path(__file__).parent.parent / "tests" / "working" /
                         "basespace_linked_data" / "scratch")
-BLAST_COLUMNS = ['qaccver',
-                 'saccver',
-                 'qlen',
-                 'pident',
-                 'score',
-                 'qcovhsp',
-                 'qstart',
-                 'qend',
-                 'sstart',
-                 'send']
 
 
 def parse_args():
@@ -39,7 +29,6 @@ def main():
     sample_dirs = [d for d in args.scratch.iterdir() if d.is_dir()]
     contig_plots_path = sample_dirs[0].parent / 'contig_plots'
     contig_plots_path.mkdir(exist_ok=True)
-    blast_format = f'"10 {" ".join(BLAST_COLUMNS)}"'
     sample_dirs.sort()
     incomplete_count = empty_count = 0
     for sample_dir in sample_dirs:
@@ -61,18 +50,10 @@ def main():
                 continue
             contigs_fasta_path, = contigs_fasta_paths
             with default_database() as DEFAULT_DATABASE:
-                program = ["blastn",
-                           "-outfmt", blast_format,
-                           "-query", str(contigs_fasta_path),
-                           "-db", str(DEFAULT_DATABASE),
-                           "-evalue", "0.0001",
-                           "-max_target_seqs", "5000",
-                           "-gapopen", "5",
-                           "-gapextend", "2",
-                           "-penalty", "-3",
-                           "-reward", "1",
-                           ]
-                stdout = subprocess.check_output(program).decode()
+                stdout = Blastn().genotype(
+                    contigs_fasta=contigs_fasta_path,
+                    database=DEFAULT_DATABASE,
+                )
             plot_contigs(sample_dir, stdout)
             plot_path = contig_plots_path / (sample_dir.name + '.png')
             plt.savefig(str(plot_path))
@@ -85,7 +66,7 @@ def main():
 
 
 def plot_contigs(sample_dir, contigs_csv):
-    reader = DictReader(StringIO(contigs_csv), BLAST_COLUMNS)
+    reader = DictReader(StringIO(contigs_csv), Blastn.BLAST_COLUMNS)
     rows = list(reader)
     rows.sort(reverse=True, key=lambda row: int(row['score']))
     contig_names = sorted({row['qaccver'] for row in rows})
diff --git a/micall/utils/externals.py b/micall/utils/externals.py
index fed306d68..b9739b994 100644
--- a/micall/utils/externals.py
+++ b/micall/utils/externals.py
@@ -253,3 +253,38 @@ def buffered_count(self, filename):
                 buf = read_f(buf_size)
 
         return lines
+
+
+class Blastn(CommandWrapper):
+    BLAST_COLUMNS = ['qaccver',
+                     'saccver',
+                     'qlen',
+                     'pident',
+                     'score',
+                     'qcovhsp',
+                     'qstart',
+                     'qend',
+                     'sstart',
+                     'send',
+                     ]
+
+    def __init__(self, version=None, execname='blastn', logger=None, **kwargs):
+        super(Blastn, self).__init__(version, execname, logger, **kwargs)
+        stdout = self.check_output(['-version'], stderr=subprocess.STDOUT)
+        version_found = stdout.split('\n')[0].split()[-1]
+        self.validate_version(version_found)
+
+    def genotype(self, database: Path, contigs_fasta: Path) -> str:
+        blast_format = '10 ' + " ".join(Blastn.BLAST_COLUMNS)
+        program = ["-outfmt", blast_format,
+                   "-query", str(contigs_fasta),
+                   "-db", str(database),
+                   "-evalue", "0.0001",
+                   "-max_target_seqs", "5000",
+                   "-gapopen", "5",
+                   "-gapextend", "2",
+                   "-penalty", "-3",
+                   "-reward", "1",
+                   ]
+        stdout = self.check_output(program)
+        return stdout
diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py
index 5595ced23..6c52465fe 100644
--- a/micall/utils/fasta_to_csv.py
+++ b/micall/utils/fasta_to_csv.py
@@ -9,7 +9,6 @@
 from operator import itemgetter
 from pathlib import Path
 import contextlib
-import subprocess
 
 from io import StringIO
 import importlib.resources as resources
@@ -18,6 +17,7 @@
 
 from micall.core.project_config import ProjectConfig
 from micall.utils.contig_stitcher_contigs import GenotypedContig
+from micall.utils.externals import Blastn
 
 
 @contextlib.contextmanager
@@ -55,27 +55,27 @@ def reference_dir() -> Iterator[Path]:
 
 
 @contextlib.contextmanager
-def default_database() -> Iterator[str]:
+def default_database() -> Iterator[Path]:
     with reference_dir() as blast_db:
-        yield str(blast_db / "refs.fasta")
+        yield blast_db / "refs.fasta"
 
 
 def read_assembled_contigs(group_refs: Dict[str, str],
                            genotypes: Dict[str, typing.Tuple[str, float]],
-                           contigs_fasta_path: str) -> Iterable[GenotypedContig]:
+                           contigs_fasta: Path) -> Iterable[GenotypedContig]:
     """Read assembled contigs and generate GenotypedContig objects.
 
     Args:
         group_refs (Dict[str, str]): Mapping of reference names to group references.
         genotypes (Dict[str, Tuple[str, float]]): Mapping of contig names to (reference name, match fraction).
-        contigs_fasta_path (str): Path to the FASTA file containing contig sequences.
+        contigs_fasta (Path): Path to the FASTA file containing contig sequences.
 
     Returns:
         Iterable[GenotypedContig]: An iterable of GenotypedContig objects.
     """
     projects = ProjectConfig.loadDefault()
 
-    for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
+    for i, record in enumerate(SeqIO.parse(contigs_fasta, "fasta")):
         (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0))
         seq = record.seq
         if match_fraction < 0:
@@ -118,29 +118,29 @@ def init_contigs_refs(contigs_csv: TextIO) -> DictWriter:
 def write_contigs(writer: DictWriter,
                   group_refs: Dict[str, str],
                   genotypes: Dict[str, typing.Tuple[str, float]],
-                  contigs_fasta_path: str):
+                  contigs_fasta: Path):
     """Write contigs to a CSV file.
 
     Args:
         writer (DictWriter): CSV writer to write contigs.
         group_refs (Dict[str, str]): Mapping of reference names to group references.
         genotypes (Dict[str, Tuple[str, float]]): Mapping of contig names to (reference name, match fraction).
-        contigs_fasta_path (str): Path to the FASTA file containing contig sequences.
+        contigs_fasta (Path): Path to the FASTA file containing contig sequences.
     """
-    for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta_path):
+    for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta):
         writer.writerow(dict(ref=contig.ref_name,
                              match=contig.match_fraction,
                              group_ref=contig.group_ref,
                              contig=contig.seq))
 
 
-def genotype(fasta: str, db: Optional[str] = None,
+def genotype(contigs_fasta: Path, db: Optional[Path] = None,
              blast_csv: Optional[TextIO] = None,
              group_refs: Optional[Dict[str, str]] = None) -> Dict[str, typing.Tuple[str, float]]:
     """Use Blastn to search for the genotype of a set of reference sequences.
 
     Args:
-        fasta (str): File path of the FASTA file containing the query sequences.
+        fasta (Path): File path of the FASTA file containing the query sequences.
         db (Optional[str]): File path of the database to search for matches.
         blast_csv (Optional[TextIO]): Open file to write the blast matches to, or None.
         group_refs (Optional[Dict[str, str]]): Dictionary to fill with the mapping from
@@ -151,35 +151,17 @@ def genotype(fasta: str, db: Optional[str] = None,
     """
 
     contig_nums: Dict[str, int] = {}  # {contig_name: contig_num}
-    with open(fasta) as f:
+    with open(contigs_fasta) as f:
         for line in f:
             if line.startswith('>'):
                 contig_name = line[1:-1]
                 contig_nums[contig_name] = len(contig_nums) + 1
-    blast_columns = ['qaccver',
-                     'saccver',
-                     'pident',
-                     'score',
-                     'qcovhsp',
-                     'qstart',
-                     'qend',
-                     'sstart',
-                     'send']
-
-    def invoke_blast(db: str) -> str:
-        program = ["blastn",
-                   "-outfmt", "10 " + " ".join(blast_columns),
-                   "-query", str(fasta),
-                   "-db", str(db),
-                   "-evalue", "0.0001",
-                   "-max_target_seqs", "5000",
-                   "-gapopen", "5",
-                   "-gapextend", "2",
-                   "-penalty", "-3",
-                   "-reward", "1",
-                   ]
-        stdout = subprocess.check_output(program)
-        return stdout.decode()
+
+    def invoke_blast(db: Path) -> str:
+        return Blastn().genotype(
+            contigs_fasta=contigs_fasta,
+            database=db,
+        )
 
     if db is None:
         with default_database() as db:
@@ -188,7 +170,7 @@ def invoke_blast(db: str) -> str:
         stdout = invoke_blast(db)
 
     samples = {}  # {query_name: (subject_name, matched_fraction)}
-    matches = sorted(DictReader(StringIO(stdout), blast_columns),
+    matches = sorted(DictReader(StringIO(stdout), Blastn.BLAST_COLUMNS),
                      key=lambda row: (row['qaccver'], float(row['score'])))
     if not blast_csv:
         blast_writer = None
@@ -247,32 +229,32 @@ def invoke_blast(db: str) -> str:
     return samples
 
 
-def fasta_to_csv(contigs_fasta_path: str,
+def fasta_to_csv(contigs_fasta: Path,
                  contigs_csv: TextIO,
                  merged_contigs_csv: Optional[TextIO] = None,
                  blast_csv: Optional[TextIO] = None) -> None:
     """Run BLAST search to identify contig sequences and write them to CSV.
 
     Args:
-        contigs_fasta_path (str): Path to the FASTA file containing contig sequences.
+        contigs_fasta (Path): Path to the FASTA file containing contig sequences.
         contigs_csv (TextIO): Open file to write assembled contigs to.
         merged_contigs_csv: open file to read contigs that were merged from amplicon reads.
         blast_csv (Optional[TextIO]): Open file to write BLAST search results for each contig.
     """
 
-    with open(contigs_fasta_path, 'a') as contigs_fasta:
+    with open(contigs_fasta, 'a') as contigs_fasta_writer:
         if merged_contigs_csv is not None:
             contig_reader = DictReader(merged_contigs_csv)
             for i, row in enumerate(contig_reader, 1):
                 contig_name = f'merged-contig-{i}'
-                contigs_fasta.write(f">{contig_name}\n{row['contig']}\n")
+                contigs_fasta_writer.write(f">{contig_name}\n{row['contig']}\n")
 
     writer = init_contigs_refs(cast(TextIO, contigs_csv))
     group_refs: Dict[str, str] = {}
 
-    genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs)
+    genotypes = genotype(contigs_fasta, blast_csv=blast_csv, group_refs=group_refs)
 
-    write_contigs(writer, group_refs, genotypes, contigs_fasta_path)
+    write_contigs(writer, group_refs, genotypes, contigs_fasta)
     contigs_csv.flush()