From 56b0b10b4a5ecb72a0b79ff611671864a48e3f0c Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Fri, 12 Feb 2021 11:22:47 +0100 Subject: [PATCH] cleanup in tests --- metagraph/integration_tests/base.py | 35 ++- metagraph/integration_tests/test_api.py | 4 +- metagraph/integration_tests/test_clean.py | 259 ++++++---------------- 3 files changed, 103 insertions(+), 195 deletions(-) diff --git a/metagraph/integration_tests/base.py b/metagraph/integration_tests/base.py index c1fecc7a5d..c6264baffb 100644 --- a/metagraph/integration_tests/base.py +++ b/metagraph/integration_tests/base.py @@ -29,15 +29,19 @@ class TestingBase(unittest.TestCase): def setUpClass(cls): cls.tempdir = TemporaryDirectory() - def _get_stats(self, graph_filename): + @staticmethod + def _get_stats(graph_filename): stats_command = METAGRAPH + ' stats ' + graph_filename res = subprocess.run(stats_command.split(), stdout=PIPE, stderr=PIPE) return res - def _build_graph(self, input, output, k, repr, canonical=False, primary=False): - construct_command = '{exe} build {canonical} \ + @staticmethod + def _build_graph(input, output, k, repr, canonical=False, primary=False, extra_params=""): + construct_command = '{exe} build -p {num_threads} {canonical} {extra_params} \ --graph {repr} -k {k} -o {outfile} {input}'.format( exe=METAGRAPH, + num_threads=NUM_THREADS, + extra_params=extra_params, k=k, repr=repr, canonical='--canonical' if canonical else '', @@ -50,9 +54,10 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False): assert res.returncode == 0 if primary: - transform_command = '{exe} transform --to-fasta --primary-kmers \ + transform_command = '{exe} transform -p {num_threads} --to-fasta --primary-kmers \ -o {outfile} {input}'.format( exe=METAGRAPH, + num_threads=NUM_THREADS, k=k, repr=repr, outfile='{}.fasta.gz'.format(output), @@ -63,9 +68,11 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False): stderr=PIPE) assert res.returncode == 0 - construct_command = '{exe} build \ + construct_command = '{exe} build -p {num_threads} {extra_params} \ --graph {repr} -k {k} -o {outfile} {input}'.format( exe=METAGRAPH, + num_threads=NUM_THREADS, + extra_params=extra_params, k=k, repr=repr, outfile=output, @@ -76,11 +83,25 @@ def _build_graph(self, input, output, k, repr, canonical=False, primary=False): stderr=PIPE) assert res.returncode == 0 + @staticmethod + def _clean(graph, output, extra_params=""): + clean_command = '{exe} clean -p {num_threads} \ + --to-fasta -o {outfile} {extra_params} {input}'.format( + exe=METAGRAPH, + num_threads=NUM_THREADS, + outfile=output, + extra_params=extra_params, + input=graph + ) + res = subprocess.run([clean_command], shell=True) + assert res.returncode == 0 - def _annotate_graph(self, input, graph_path, output, anno_repr, primary=False): + @staticmethod + def _annotate_graph(input, graph_path, output, anno_repr, primary=False): annotate_command = '{exe} annotate {fwd_and_rev} --anno-header -i {graph} \ - --anno-type {anno_repr} -o {outfile} {input}'.format( + --anno-type {anno_repr} -o {outfile} -p {num_threads} {input}'.format( exe=METAGRAPH, + num_threads=NUM_THREADS, fwd_and_rev='--canonical' if primary else '', graph=graph_path, anno_repr=anno_repr, diff --git a/metagraph/integration_tests/test_api.py b/metagraph/integration_tests/test_api.py index cc530687a6..c26ac86c4e 100644 --- a/metagraph/integration_tests/test_api.py +++ b/metagraph/integration_tests/test_api.py @@ -23,9 +23,9 @@ def setUpClass(cls, fasta_path, canonical=False, primary=False): graph_path = cls.tempdir.name + '/graph.dbg' annotation_path = cls.tempdir.name + '/annotation.column.annodbg' - cls._build_graph(cls, fasta_path, graph_path, 6, 'succinct', + cls._build_graph(fasta_path, graph_path, 6, 'succinct', canonical=canonical, primary=primary) - cls._annotate_graph(cls, fasta_path, graph_path, annotation_path, 'column', + cls._annotate_graph(fasta_path, graph_path, annotation_path, 'column', primary=primary) cls.host = socket.gethostbyname(socket.gethostname()) diff --git a/metagraph/integration_tests/test_clean.py b/metagraph/integration_tests/test_clean.py index 2d2716b2be..b7107ecaf1 100644 --- a/metagraph/integration_tests/test_clean.py +++ b/metagraph/integration_tests/test_clean.py @@ -7,13 +7,12 @@ import glob import os import gzip +from base import TestingBase, METAGRAPH, TEST_DATA_DIR, NUM_THREADS """Test graph construction""" -METAGRAPH = './metagraph' PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein") -TEST_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../tests/data' graph_file_extension = {'succinct': '.dbg', 'bitmap': '.bitmapdbg', @@ -24,30 +23,19 @@ GRAPH_TYPES = [graph_type for graph_type, _ in graph_file_extension.items()] -class TestCleanWeighted(unittest.TestCase): +class TestCleanWeighted(TestingBase): def setUp(self): - self.tempdir = TemporaryDirectory() + super().setUpClass() @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_no_cleaning_contigs(self, representation): - construct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph', - input=TEST_DATA_DIR + '/transcripts_1000.fa' - ) - - res = subprocess.run([construct_command], shell=True, stdout=PIPE) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa', + output=self.tempdir.name + '/graph', + k=20, repr=representation, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 20', params_str[0]) self.assertEqual('nodes (k): 591997', params_str[1]) @@ -55,34 +43,17 @@ def test_no_cleaning_contigs(self, representation): self.assertEqual('nnz weights: 591997', params_str[3]) self.assertEqual('avg weight: 2.48587', params_str[4]) - clean_command = '{exe} clean \ - --to-fasta -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/contigs.fasta.gz', - input=self.tempdir.name + '/graph' + graph_file_extension[representation] - ) - - res = subprocess.run([clean_command], shell=True) - self.assertEqual(res.returncode, 0) - - reconstruct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph_clean', - input=self.tempdir.name + '/contigs.fasta.gz' - ) - - res = subprocess.run([reconstruct_command], shell=True) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + clean_fasta = self.tempdir.name + '/contigs.fasta.gz' + self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], + output=clean_fasta, + extra_params='') # no cleaning + + self._build_graph(input=clean_fasta, + output=self.tempdir.name + '/graph_clean', + k=20, repr=representation, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 20', params_str[0]) self.assertEqual('nodes (k): 591997', params_str[1]) @@ -93,23 +64,12 @@ def test_no_cleaning_contigs(self, representation): @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_no_cleaning_contigs_2bit_counts(self, representation): - construct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 20 --count-kmers --count-width 2 -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph', - input=TEST_DATA_DIR + '/transcripts_1000.fa' - ) - - res = subprocess.run([construct_command], shell=True, stdout=PIPE) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa', + output=self.tempdir.name + '/graph', + k=20, repr=representation, + extra_params="--mask-dummy --count-kmers --count-width 2") + + res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 20', params_str[0]) self.assertEqual('nodes (k): 591997', params_str[1]) @@ -117,34 +77,17 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): self.assertEqual('nnz weights: 591997', params_str[3]) self.assertEqual('avg weight: 1.73589', params_str[4]) - clean_command = '{exe} clean \ - --to-fasta -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/contigs.fasta.gz', - input=self.tempdir.name + '/graph' + graph_file_extension[representation] - ) - - res = subprocess.run([clean_command], shell=True) - self.assertEqual(res.returncode, 0) - - reconstruct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 20 --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph_clean', - input=self.tempdir.name + '/contigs.fasta.gz' - ) - - res = subprocess.run([reconstruct_command], shell=True) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + clean_fasta = self.tempdir.name + '/contigs.fasta.gz' + self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], + output=clean_fasta, + extra_params='') # no cleaning + + self._build_graph(input=clean_fasta, + output=self.tempdir.name + '/graph_clean', + k=20, repr=representation, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 20', params_str[0]) self.assertEqual('nodes (k): 591997', params_str[1]) @@ -154,31 +97,20 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): @unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets") -class TestCleanWeightedCanonical(unittest.TestCase): +class TestCleanWeightedCanonical(TestingBase): def setUp(self): - self.tempdir = TemporaryDirectory() + super().setUpClass() # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_no_cleaning_contigs(self, representation): - construct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph', - input=TEST_DATA_DIR + '/transcripts_1000.fa' - ) - - res = subprocess.run([construct_command], shell=True, stdout=PIPE) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa', + output=self.tempdir.name + '/graph', + k=31, repr=representation, canonical=True, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 31', params_str[0]) self.assertEqual('nodes (k): 1185814', params_str[1]) @@ -186,34 +118,17 @@ def test_no_cleaning_contigs(self, representation): self.assertEqual('nnz weights: 1185814', params_str[3]) self.assertEqual('avg weight: 2.4635', params_str[4]) - clean_command = '{exe} clean \ - --to-fasta -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/contigs.fasta.gz', - input=self.tempdir.name + '/graph' + graph_file_extension[representation] - ) - - res = subprocess.run([clean_command], shell=True) - self.assertEqual(res.returncode, 0) - - reconstruct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph_clean', - input=self.tempdir.name + '/contigs.fasta.gz' - ) - - res = subprocess.run([reconstruct_command], shell=True) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + clean_fasta = self.tempdir.name + '/contigs.fasta.gz' + self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], + output=clean_fasta, + extra_params='') # no cleaning + + self._build_graph(input=clean_fasta, + output=self.tempdir.name + '/graph_clean', + k=31, repr=representation, canonical=True, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 31', params_str[0]) self.assertEqual('nodes (k): 1185814', params_str[1]) @@ -225,23 +140,12 @@ def test_no_cleaning_contigs(self, representation): @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_no_cleaning_contigs_2bit_counts(self, representation): - construct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 31 --canonical --count-kmers --count-width 2 -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph', - input=TEST_DATA_DIR + '/transcripts_1000.fa' - ) - - res = subprocess.run([construct_command], shell=True, stdout=PIPE) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + self._build_graph(input=TEST_DATA_DIR + '/transcripts_1000.fa', + output=self.tempdir.name + '/graph', + k=31, repr=representation, canonical=True, + extra_params="--mask-dummy --count-kmers --count-width 2") + + res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 31', params_str[0]) self.assertEqual('nodes (k): 1185814', params_str[1]) @@ -249,34 +153,17 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): self.assertEqual('nnz weights: 1185814', params_str[3]) self.assertEqual('avg weight: 1.72792', params_str[4]) - clean_command = '{exe} clean \ - --to-fasta -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/contigs.fasta.gz', - input=self.tempdir.name + '/graph' + graph_file_extension[representation] - ) - - res = subprocess.run([clean_command], shell=True) - self.assertEqual(res.returncode, 0) - - reconstruct_command = '{exe} build --mask-dummy \ - --graph {repr} -k 31 --canonical --count-kmers -o {outfile} {input}'.format( - exe=METAGRAPH, - repr=representation, - outfile=self.tempdir.name + '/graph_clean', - input=self.tempdir.name + '/contigs.fasta.gz' - ) - - res = subprocess.run([reconstruct_command], shell=True) - self.assertEqual(res.returncode, 0) - - stats_command = '{exe} stats {graph}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/graph_clean' + graph_file_extension[representation], - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) + clean_fasta = self.tempdir.name + '/contigs.fasta.gz' + self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], + output=clean_fasta, + extra_params='') # no cleaning + + self._build_graph(input=clean_fasta, + output=self.tempdir.name + '/graph_clean', + k=31, repr=representation, canonical=True, + extra_params="--mask-dummy --count-kmers") + + res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 31', params_str[0]) self.assertEqual('nodes (k): 1185814', params_str[1])