From 56b0b10b4a5ecb72a0b79ff611671864a48e3f0c Mon Sep 17 00:00:00 2001
From: Mikhail Karasikov <mike-karas@ya.ru>
Date: Fri, 12 Feb 2021 11:22:47 +0100
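Subject: [PATCH] cleanup in tests

Share the command-building helpers of TestingBase across the
integration tests instead of duplicating them in every test.

* base.py: make _get_stats, _build_graph and _annotate_graph static
  methods, add an `extra_params` argument to _build_graph, add a
  _clean helper, and pass `-p NUM_THREADS` to the build, transform,
  clean and annotate commands.
* test_api.py: stop passing `cls` explicitly to the now-static helpers.
* test_clean.py: derive the test classes from TestingBase, import
  METAGRAPH and TEST_DATA_DIR from base, and replace the hand-written
  build/clean/stats commands with the shared helpers.

Note: _get_stats returns the subprocess result without checking its
return code, so the explicit return-code assertions that followed the
`stats` calls in test_clean.py are dropped by this change.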
---
 metagraph/integration_tests/base.py       |  35 ++-
 metagraph/integration_tests/test_api.py   |   4 +-
 metagraph/integration_tests/test_clean.py | 259 ++++++----------------
 3 files changed, 103 insertions(+), 195 deletions(-)

diff --git a/metagraph/integration_tests/base.py b/metagraph/integration_tests/base.py
index c1fecc7a5d..c6264baffb 100644
--- a/metagraph/integration_tests/base.py
+++ b/metagraph/integration_tests/base.py
@@ -29,15 +29,19 @@ class TestingBase(unittest.TestCase):
     def setUpClass(cls):
         cls.tempdir = TemporaryDirectory()
 
-    def _get_stats(self, graph_filename):
+    @staticmethod
+    def _get_stats(graph_filename):
         stats_command = METAGRAPH + ' stats ' + graph_filename
         res = subprocess.run(stats_command.split(), stdout=PIPE, stderr=PIPE)
         return res
 
-    def _build_graph(self, input, output, k, repr, canonical=False, primary=False):
-        construct_command = '{exe} build {canonical} \
+    @staticmethod
+    def _build_graph(input, output, k, repr, canonical=False, primary=False, extra_params=""):
+        construct_command = '{exe} build -p {num_threads} {canonical} {extra_params} \
                 --graph {repr} -k {k} -o {outfile} {input}'.format(
             exe=METAGRAPH,
+            num_threads=NUM_THREADS,
+            extra_params=extra_params,
             k=k,
             repr=repr,
             canonical='--canonical' if canonical else '',
@@ -50,9 +54,10 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False):
         assert res.returncode == 0
 
         if primary:
-            transform_command = '{exe} transform --to-fasta --primary-kmers \
+            transform_command = '{exe} transform -p {num_threads} --to-fasta --primary-kmers \
                     -o {outfile} {input}'.format(
                 exe=METAGRAPH,
+                num_threads=NUM_THREADS,
                 k=k,
                 repr=repr,
                 outfile='{}.fasta.gz'.format(output),
@@ -63,9 +68,11 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False):
                                  stderr=PIPE)
             assert res.returncode == 0
 
-            construct_command = '{exe} build \
+            construct_command = '{exe} build -p {num_threads} {extra_params} \
                     --graph {repr} -k {k} -o {outfile} {input}'.format(
                 exe=METAGRAPH,
+                num_threads=NUM_THREADS,
+                extra_params=extra_params,
                 k=k,
                 repr=repr,
                 outfile=output,
@@ -76,11 +83,25 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False):
                                  stderr=PIPE)
             assert res.returncode == 0
 
+    @staticmethod
+    def _clean(graph, output, extra_params=""):
+        clean_command = '{exe} clean -p {num_threads} \
+                --to-fasta -o {outfile} {extra_params} {input}'.format(
+            exe=METAGRAPH,
+            num_threads=NUM_THREADS,
+            outfile=output,
+            extra_params=extra_params,
+            input=graph
+        )
+        res = subprocess.run([clean_command], shell=True)
+        assert res.returncode == 0
 
-    def _annotate_graph(self, input, graph_path, output, anno_repr, primary=False):
+    @staticmethod
+    def _annotate_graph(input, graph_path, output, anno_repr, primary=False):
         annotate_command = '{exe} annotate {fwd_and_rev} --anno-header -i {graph} \
-                --anno-type {anno_repr} -o {outfile} {input}'.format(
+                --anno-type {anno_repr} -o {outfile} -p {num_threads} {input}'.format(
             exe=METAGRAPH,
+            num_threads=NUM_THREADS,
             fwd_and_rev='--canonical' if primary else '',
             graph=graph_path,
             anno_repr=anno_repr,
diff --git a/metagraph/integration_tests/test_api.py b/metagraph/integration_tests/test_api.py
index cc530687a6..c26ac86c4e 100644
--- a/metagraph/integration_tests/test_api.py
+++ b/metagraph/integration_tests/test_api.py
@@ -23,9 +23,9 @@ def setUpClass(cls, fasta_path, canonical=False, primary=False):
         graph_path = cls.tempdir.name + '/graph.dbg'
         annotation_path = cls.tempdir.name + '/annotation.column.annodbg'
 
-        cls._build_graph(cls, fasta_path, graph_path, 6, 'succinct',
+        cls._build_graph(fasta_path, graph_path, 6, 'succinct',
                          canonical=canonical, primary=primary)
-        cls._annotate_graph(cls, fasta_path, graph_path, annotation_path, 'column',
+        cls._annotate_graph(fasta_path, graph_path, annotation_path, 'column',
                             primary=primary)
 
         cls.host = socket.gethostbyname(socket.gethostname())
diff --git a/metagraph/integration_tests/test_clean.py b/metagraph/integration_tests/test_clean.py
index 2d2716b2be..b7107ecaf1 100644
--- a/metagraph/integration_tests/test_clean.py
+++ b/metagraph/integration_tests/test_clean.py
@@ -7,13 +7,12 @@
 import glob
 import os
 import gzip
+from base import TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS
 
 
 """Test graph construction"""
 
-METAGRAPH = './metagraph'
 PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")
-TEST_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../tests/data'
 
 graph_file_extension = {'succinct': '.dbg',
                         'bitmap': '.bitmapdbg',
@@ -24,30 +23,19 @@
 GRAPH_TYPES = [graph_type for graph_type, _ in graph_file_extension.items()]
 
 
-class TestCleanWeighted(unittest.TestCase):
+class TestCleanWeighted(TestingBase):
     def setUp(self):
-        self.tempdir = TemporaryDirectory()
+        super().setUpClass()
 
     @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)])
     def test_no_cleaning_contigs(self, representation):
 
-        construct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph',
-            input=TEST_DATA_DIR + '/transcripts_1000.fa'
-        )
-
-        res = subprocess.run([construct_command], shell=True, stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa',
+                          output=self.tempdir.name + '/graph',
+                          k=20, repr=representation,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 20', params_str[0])
         self.assertEqual('nodes (k): 591997', params_str[1])
@@ -55,34 +43,17 @@ def test_no_cleaning_contigs(self, representation):
         self.assertEqual('nnz weights: 591997', params_str[3])
         self.assertEqual('avg weight: 2.48587', params_str[4])
 
-        clean_command = '{exe} clean \
-                --to-fasta -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/contigs.fasta.gz',
-            input=self.tempdir.name + '/graph' + graph_file_extension[representation]
-        )
-
-        res = subprocess.run([clean_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        reconstruct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph_clean',
-            input=self.tempdir.name + '/contigs.fasta.gz'
-        )
-
-        res = subprocess.run([reconstruct_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        clean_fasta = self.tempdir.name + '/contigs.fasta.gz'
+        self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation],
+                    output=clean_fasta,
+                    extra_params='')  # no cleaning
+
+        self._build_graph(input=clean_fasta,
+                          output=self.tempdir.name + '/graph_clean',
+                          k=20, repr=representation,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 20', params_str[0])
         self.assertEqual('nodes (k): 591997', params_str[1])
@@ -93,23 +64,12 @@ def test_no_cleaning_contigs(self, representation):
     @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)])
     def test_no_cleaning_contigs_2bit_counts(self, representation):
 
-        construct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 20 --count-kmers --count-width 2 -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph',
-            input=TEST_DATA_DIR + '/transcripts_1000.fa'
-        )
-
-        res = subprocess.run([construct_command], shell=True, stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa',
+                          output=self.tempdir.name + '/graph',
+                          k=20, repr=representation,
+                          extra_params="--mask-dummy --count-kmers --count-width 2")
+
+        res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 20', params_str[0])
         self.assertEqual('nodes (k): 591997', params_str[1])
@@ -117,34 +77,17 @@ def test_no_cleaning_contigs_2bit_counts(self, representation):
         self.assertEqual('nnz weights: 591997', params_str[3])
         self.assertEqual('avg weight: 1.73589', params_str[4])
 
-        clean_command = '{exe} clean \
-                --to-fasta -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/contigs.fasta.gz',
-            input=self.tempdir.name + '/graph' + graph_file_extension[representation]
-        )
-
-        res = subprocess.run([clean_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        reconstruct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph_clean',
-            input=self.tempdir.name + '/contigs.fasta.gz'
-        )
-
-        res = subprocess.run([reconstruct_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        clean_fasta = self.tempdir.name + '/contigs.fasta.gz'
+        self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation],
+                    output=clean_fasta,
+                    extra_params='')  # no cleaning
+
+        self._build_graph(input=clean_fasta,
+                          output=self.tempdir.name + '/graph_clean',
+                          k=20, repr=representation,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 20', params_str[0])
         self.assertEqual('nodes (k): 591997', params_str[1])
@@ -154,31 +97,20 @@ def test_no_cleaning_contigs_2bit_counts(self, representation):
 
 
 @unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets")
-class TestCleanWeightedCanonical(unittest.TestCase):
+class TestCleanWeightedCanonical(TestingBase):
     def setUp(self):
-        self.tempdir = TemporaryDirectory()
+        super().setUpClass()
 
     # TODO: add 'hashstr' once the canonical mode is implemented for it
     @parameterized.expand(['succinct', 'bitmap', 'hash'])  # , 'hashstr']:
     def test_no_cleaning_contigs(self, representation):
 
-        construct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph',
-            input=TEST_DATA_DIR + '/transcripts_1000.fa'
-        )
-
-        res = subprocess.run([construct_command], shell=True, stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa',
+                          output=self.tempdir.name + '/graph',
+                          k=31, repr=representation, canonical=True,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 31', params_str[0])
         self.assertEqual('nodes (k): 1185814', params_str[1])
@@ -186,34 +118,17 @@ def test_no_cleaning_contigs(self, representation):
         self.assertEqual('nnz weights: 1185814', params_str[3])
         self.assertEqual('avg weight: 2.4635', params_str[4])
 
-        clean_command = '{exe} clean \
-                --to-fasta -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/contigs.fasta.gz',
-            input=self.tempdir.name + '/graph' + graph_file_extension[representation]
-        )
-
-        res = subprocess.run([clean_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        reconstruct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph_clean',
-            input=self.tempdir.name + '/contigs.fasta.gz'
-        )
-
-        res = subprocess.run([reconstruct_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        clean_fasta = self.tempdir.name + '/contigs.fasta.gz'
+        self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation],
+                    output=clean_fasta,
+                    extra_params='')  # no cleaning
+
+        self._build_graph(input=clean_fasta,
+                          output=self.tempdir.name + '/graph_clean',
+                          k=31, repr=representation, canonical=True,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 31', params_str[0])
         self.assertEqual('nodes (k): 1185814', params_str[1])
@@ -225,23 +140,12 @@ def test_no_cleaning_contigs(self, representation):
     @parameterized.expand(['succinct', 'bitmap', 'hash'])  # , 'hashstr']:
     def test_no_cleaning_contigs_2bit_counts(self, representation):
 
-        construct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 31 --canonical --count-kmers --count-width 2 -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph',
-            input=TEST_DATA_DIR + '/transcripts_1000.fa'
-        )
-
-        res = subprocess.run([construct_command], shell=True, stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa',
+                          output=self.tempdir.name + '/graph',
+                          k=31, repr=representation, canonical=True,
+                          extra_params="--mask-dummy --count-kmers --count-width 2")
+
+        res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 31', params_str[0])
         self.assertEqual('nodes (k): 1185814', params_str[1])
@@ -249,34 +153,17 @@ def test_no_cleaning_contigs_2bit_counts(self, representation):
         self.assertEqual('nnz weights: 1185814', params_str[3])
         self.assertEqual('avg weight: 1.72792', params_str[4])
 
-        clean_command = '{exe} clean \
-                --to-fasta -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/contigs.fasta.gz',
-            input=self.tempdir.name + '/graph' + graph_file_extension[representation]
-        )
-
-        res = subprocess.run([clean_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        reconstruct_command = '{exe} build --mask-dummy \
-                --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format(
-            exe=METAGRAPH,
-            repr=representation,
-            outfile=self.tempdir.name + '/graph_clean',
-            input=self.tempdir.name + '/contigs.fasta.gz'
-        )
-
-        res = subprocess.run([reconstruct_command], shell=True)
-        self.assertEqual(res.returncode, 0)
-
-        stats_command = '{exe} stats {graph}'.format(
-            exe=METAGRAPH,
-            graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation],
-        )
-        res = subprocess.run(stats_command.split(), stdout=PIPE)
-        self.assertEqual(res.returncode, 0)
+        clean_fasta = self.tempdir.name + '/contigs.fasta.gz'
+        self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation],
+                    output=clean_fasta,
+                    extra_params='')  # no cleaning
+
+        self._build_graph(input=clean_fasta,
+                          output=self.tempdir.name + '/graph_clean',
+                          k=31, repr=representation, canonical=True,
+                          extra_params="--mask-dummy --count-kmers")
+
+        res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation])
         params_str = res.stdout.decode().split('\n')[2:]
         self.assertEqual('k: 31', params_str[0])
         self.assertEqual('nodes (k): 1185814', params_str[1])