diff --git a/metagraph/integration_tests/test_align.py b/metagraph/integration_tests/test_align.py index dc0a092479..fa789f5e70 100644 --- a/metagraph/integration_tests/test_align.py +++ b/metagraph/integration_tests/test_align.py @@ -41,37 +41,7 @@ def test_simple_align_all_graphs(self, representation): self.assertEqual('nodes (k): 16438', params_str[1]) self.assertEqual('mode: basic', params_str[2]) - stats_command = '{exe} align -i {graph} --align-vertical-bandwidth 1000000 --align-min-exact-match 0.0 {reads}'.format( - exe=METAGRAPH, - graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], - reads=TEST_DATA_DIR + '/genome_MT1.fq', - ) - res = subprocess.run(stats_command.split(), stdout=PIPE) - self.assertEqual(res.returncode, 0) - params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) - self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tTAGAATCTTAG\t22\t11\t19S11=120S\t0') - self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') - self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') - self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t*\t*\t0\t*\t*\t*') - self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - self.assertEqual(params_str[5], 'MT-11/1\tAACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA\t+\tTTCACTGATTT\t22\t11\t62S11=77S\t0') - - @parameterized.expand(GRAPH_TYPES) - def test_simple_align_banded_all_graphs(self, representation): - - self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa', - output=self.tempdir.name + '/genome.MT', - k=11, repr=representation, - extra_params="--mask-dummy") - - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) - - stats_command = '{exe} align -i {graph} --align-vertical-bandwidth 10 --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', @@ -85,7 +55,10 @@ def test_simple_align_banded_all_graphs(self, representation): self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t*\t*\t0\t*\t*\t*') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - self.assertEqual(params_str[5], 'MT-11/1\tAACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA\t+\tTTCACTGATTT\t22\t11\t62S11=77S\t0') + last_split = params_str[5].split("\t"); + self.assertEqual(last_split[0], "MT-11/1") + self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") + self.assertEqual(last_split[4], "22") @parameterized.expand(['succinct']) def test_simple_align_json_all_graphs(self, representation): @@ -139,7 +112,10 @@ def test_simple_align_fwd_rev_comp_all_graphs(self, representation): self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t-\tATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACT\t295\t149\t95=1X54=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - self.assertEqual(params_str[5], 'MT-11/1\tAACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA\t+\tTTCACTGATTT\t22\t11\t62S11=77S\t0') + last_split = params_str[5].split("\t"); + self.assertEqual(last_split[0], "MT-11/1") + self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") + self.assertEqual(last_split[4], "22") @parameterized.expand(GRAPH_TYPES) def test_simple_align_canonical_all_graphs(self, representation): @@ -170,7 +146,10 @@ def test_simple_align_canonical_all_graphs(self, representation): self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t+\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTGGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t295\t149\t54=1X95=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - self.assertEqual(params_str[5], 'MT-11/1\tAACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA\t+\tTTCACTGATTT\t22\t11\t62S11=77S\t0') + last_split = params_str[5].split("\t"); + self.assertEqual(last_split[0], "MT-11/1") + self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") + self.assertEqual(last_split[4], "22") @parameterized.expand(['succinct']) def test_simple_align_canonical_subk_succinct(self, representation): @@ -230,7 +209,10 @@ def test_simple_align_primary_all_graphs(self, representation): self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t+\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTGGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t295\t149\t54=1X95=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - self.assertEqual(params_str[5], 'MT-11/1\tAACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA\t+\tTTCACTGATTT\t22\t11\t62S11=77S\t0') + last_split = params_str[5].split("\t"); + self.assertEqual(last_split[0], "MT-11/1") + self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") + self.assertEqual(last_split[4], "22") @parameterized.expand(['succinct']) def test_simple_align_primary_subk_succinct(self, representation): diff --git a/metagraph/src/cli/align.cpp b/metagraph/src/cli/align.cpp index c35bc7ebe8..6f5fd8cc40 100644 --- a/metagraph/src/cli/align.cpp +++ b/metagraph/src/cli/align.cpp @@ -29,8 +29,6 @@ DBGAlignerConfig initialize_aligner_config(size_t k, const Config &config) { DBGAlignerConfig aligner_config; - aligner_config.queue_size = config.alignment_queue_size; - aligner_config.bandwidth = config.alignment_vertical_bandwidth; aligner_config.num_alternative_paths = config.alignment_num_alternative_paths; aligner_config.min_seed_length = config.alignment_min_seed_length; aligner_config.max_seed_length = config.alignment_max_seed_length; @@ -57,7 +55,6 @@ DBGAlignerConfig initialize_aligner_config(size_t k, const Config &config) { logger->trace("Alignment settings:"); logger->trace("\t Alignments to report: {}", aligner_config.num_alternative_paths); - logger->trace("\t Priority queue size: {}", aligner_config.queue_size); logger->trace("\t Min seed length: {}", aligner_config.min_seed_length); logger->trace("\t Max seed length: {}", aligner_config.max_seed_length); logger->trace("\t Max num seeds per locus: {}", aligner_config.max_num_seeds_per_locus); @@ -67,7 +64,6 @@ DBGAlignerConfig initialize_aligner_config(size_t k, const Config &config) { logger->trace("\t Gap extension penalty: {}", int64_t(aligner_config.gap_extension_penalty)); logger->trace("\t Min DP table cell score: {}", int64_t(aligner_config.min_cell_score)); logger->trace("\t Min alignment score: {}", aligner_config.min_path_score); - logger->trace("\t Bandwidth: {}", aligner_config.bandwidth); logger->trace("\t X drop-off: {}", aligner_config.xdrop); logger->trace("\t Exact nucleotide match threshold: {}", aligner_config.min_exact_match); diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp index ad2c5c22b4..ed2cad4a46 100644 --- a/metagraph/src/cli/config/config.cpp +++ b/metagraph/src/cli/config/config.cpp @@ -201,10 +201,6 @@ Config::Config(int argc, char *argv[]) { batch_align = true; } else if (!strcmp(argv[i], "--align-length")) { alignment_length = atoi(get_value(i++)); - } else if (!strcmp(argv[i], "--align-queue-size")) { - alignment_queue_size = atoi(get_value(i++)); - } else if (!strcmp(argv[i], "--align-vertical-bandwidth")) { - alignment_vertical_bandwidth = atoi(get_value(i++)); } else if (!strcmp(argv[i], "--align-match-score")) { alignment_match_score = atoi(get_value(i++)); } else if (!strcmp(argv[i], "--align-mm-transition-penalty")) { @@ -901,7 +897,7 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) { fprintf(stderr, "\n"); fprintf(stderr, "\t --query-presence \t\ttest sequences for presence, report as 0 or 1 [off]\n"); fprintf(stderr, "\t --filter-present \t\treport only present input sequences as FASTA [off]\n"); - fprintf(stderr, "\t --batch-size \tquery batch size (number of base pairs) [100000000]\n"); + fprintf(stderr, "\t --batch-size \t\tquery batch size (number of base pairs) [100000000]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Available options for alignment:\n"); fprintf(stderr, "\t-o --outfile-base [STR]\t\t\t\tbasename of output file []\n"); @@ -910,10 +906,8 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) { fprintf(stderr, "\t --align-alternative-alignments \t\tthe number of alternative paths to report per seed [1]\n"); fprintf(stderr, "\t --align-min-path-score [INT]\t\t\tthe minimum score that a reported path can have [0]\n"); fprintf(stderr, "\t --align-edit-distance \t\t\tuse unit costs for scoring matrix [off]\n"); - fprintf(stderr, "\t --align-queue-size [INT]\t\t\tmaximum size of the priority queue for alignment [20]\n"); - fprintf(stderr, "\t --align-vertical-bandwidth [INT]\t\tmaximum width of a window to consider in alignment step [inf]\n"); - fprintf(stderr, "\t --align-max-nodes-per-seq-char [FLOAT]\t\tmaximum number of nodes to consider per sequence character [10.0]\n"); - fprintf(stderr, "\t --align-max-ram [FLOAT]\t\tmaximum amount of RAM used per alignment in MB [200.0]\n"); + fprintf(stderr, "\t --align-max-nodes-per-seq-char [FLOAT]\tmaximum number of nodes to consider per sequence character [12.0]\n"); + fprintf(stderr, "\t --align-max-ram [FLOAT]\t\t\tmaximum amount of RAM used per alignment in MB [200.0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Advanced options for scoring:\n"); fprintf(stderr, "\t --align-match-score [INT]\t\t\tpositive match score [2]\n"); @@ -927,7 +921,7 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) { fprintf(stderr, "Advanced options for seeding:\n"); fprintf(stderr, "\t --align-min-seed-length [INT]\t\tthe minimum length of a seed [graph k]\n"); fprintf(stderr, "\t --align-max-seed-length [INT]\t\tthe maximum length of a seed [graph k]\n"); - fprintf(stderr, "\t --align-min-exact-match [FLOAT] fraction of matching nucleotides required to align sequence [0.7]\n"); + fprintf(stderr, "\t --align-min-exact-match [FLOAT] \t\tfraction of matching nucleotides required to align sequence [0.7]\n"); fprintf(stderr, "\t --align-max-num-seeds-per-locus [INT]\tthe maximum number of allowed inexact seeds per locus [inf]\n"); } break; case COMPARE: { @@ -1142,9 +1136,7 @@ void Config::print_usage(const std::string &prog_name, IdentityType identity) { // fprintf(stderr, "\t --align-alternative-alignments \tthe number of alternative paths to report per seed [1]\n"); fprintf(stderr, "\t --align-min-path-score [INT]\t\t\tthe minimum score that a reported path can have [0]\n"); fprintf(stderr, "\t --align-edit-distance \t\t\tuse unit costs for scoring matrix [off]\n"); - fprintf(stderr, "\t --align-queue-size [INT]\t\t\tmaximum size of the priority queue for alignment [20]\n"); - fprintf(stderr, "\t --align-vertical-bandwidth [INT]\t\tmaximum width of a window to consider in alignment step [inf]\n"); - fprintf(stderr, "\t --align-max-nodes-per-seq-char [FLOAT]\tmaximum number of nodes to consider per sequence character [10.0]\n"); + fprintf(stderr, "\t --align-max-nodes-per-seq-char [FLOAT]\tmaximum number of nodes to consider per sequence character [12.0]\n"); fprintf(stderr, "\t --align-max-ram [FLOAT]\t\tmaximum amount of RAM used per alignment in MB [200.0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "\t --batch-align \t\talign against query graph [off]\n"); diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp index 44c8194d93..af46e37806 100644 --- a/metagraph/src/cli/config/config.hpp +++ b/metagraph/src/cli/config/config.hpp @@ -109,8 +109,6 @@ class Config { int32_t alignment_min_path_score = 0; int32_t alignment_xdrop = 27; - size_t alignment_queue_size = 20; - size_t alignment_vertical_bandwidth = std::numeric_limits::max(); size_t alignment_num_alternative_paths = 1; size_t alignment_min_seed_length = 0; size_t alignment_max_seed_length = std::numeric_limits::max(); @@ -124,7 +122,7 @@ class Config { double max_count_quantile = 1.; double bloom_fpp = 1.0; double bloom_bpk = 4.0; - double alignment_max_nodes_per_seq_char = 10.0; + double alignment_max_nodes_per_seq_char = 12.0; double alignment_max_ram = 200; double alignment_min_exact_match = 0.7; std::vector count_slice_quantiles; diff --git a/metagraph/src/cli/server.cpp b/metagraph/src/cli/server.cpp index 6924cf4631..68b9c3642d 100644 --- a/metagraph/src/cli/server.cpp +++ b/metagraph/src/cli/server.cpp @@ -5,6 +5,7 @@ #include "common/unix_tools.hpp" #include "common/utils/string_utils.hpp" #include "common/utils/file_utils.hpp" +#include "common/utils/template_utils.hpp" #include "graph/alignment/dbg_aligner.hpp" #include "graph/annotated_dbg.hpp" #include "seq_io/sequence_io.hpp" diff --git a/metagraph/src/common/logger.hpp b/metagraph/src/common/logger.hpp index f767c9d853..387a0b2b6c 100644 --- a/metagraph/src/common/logger.hpp +++ b/metagraph/src/common/logger.hpp @@ -12,4 +12,11 @@ void set_verbose(bool verbose); extern std::shared_ptr logger; } // namespace common + +#ifndef NDEBUG +#define DEBUG_LOG(...) common::logger->trace(__VA_ARGS__) +#else +#define DEBUG_LOG(...) (void)0 +#endif + } // namespace mtg diff --git a/metagraph/src/common/utils/simd_utils.hpp b/metagraph/src/common/utils/simd_utils.hpp index 98b9c0cf33..b0c2f6632b 100644 --- a/metagraph/src/common/utils/simd_utils.hpp +++ b/metagraph/src/common/utils/simd_utils.hpp @@ -238,10 +238,7 @@ inline uint64_t haddall_epi64(__m256i v) { } -/** - * Helpers for aligner - */ - +// Helper for Bloom filter // Bit reduce packed 64-bit numbers to 32 bits inline __m128i cvtepi64_epi32(__m256i v) { return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32( @@ -250,12 +247,25 @@ inline __m128i cvtepi64_epi32(__m256i v) { )); } -inline __m256i rshiftpushback_epi32(__m256i v, uint32_t a) { - return _mm256_insert_epi32( - _mm256_permutevar8x32_epi32(v, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0)), - a, - 7 - ); + +/** + * Helpers for aligner + */ + +// Drop-in replacement for _mm_loadu_si64 +inline __m128i mm_loadu_si64(const void *mem_addr) { + return _mm_loadl_epi64((const __m128i*)mem_addr); +} + +// Drop-in replacement for _mm_storeu_si64 +inline void mm_storeu_si64(void *mem_addr, __m128i a) { + _mm_storel_epi64((__m128i*)mem_addr, a); +} + +inline void mm_maskstorel_epi8(int8_t *mem_addr, __m128i mask, __m128i a) { + __m128i orig = mm_loadu_si64((__m128i*)mem_addr); + a = _mm_blendv_epi8(orig, a, mask); + mm_storeu_si64(mem_addr, a); } #if defined(__AVX512VL__) && defined(__AVX512F__) @@ -307,6 +317,6 @@ inline __m256d fmafast_pd(__m256d a, __m256d b, __m256d c) { return _mm256_add_pd(_mm256_mul_pd(a, b), c); } -#endif +#endif // __AVX2__ #endif // __SIMD_UTILS_HPP__ diff --git a/metagraph/src/graph/alignment/aligner_aggregator.hpp b/metagraph/src/graph/alignment/aligner_aggregator.hpp index 608b02c3c7..654006254c 100644 --- a/metagraph/src/graph/alignment/aligner_aggregator.hpp +++ b/metagraph/src/graph/alignment/aligner_aggregator.hpp @@ -50,7 +50,7 @@ inline void AlignmentAggregator ::add_alignment(DBGAlignment&& alignment) { if (path_queue_.size() < config_.num_alternative_paths) { path_queue_.emplace(std::move(alignment)); - } else if (alignment.get_score() > path_queue_.minimum().get_score()) { + } else if (!AlignmentCompare()(alignment, path_queue_.minimum())) { path_queue_.update(path_queue_.begin(), std::move(alignment)); } } diff --git a/metagraph/src/graph/alignment/aligner_alignment.cpp b/metagraph/src/graph/alignment/aligner_alignment.cpp index 298a297827..758844e535 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.cpp +++ b/metagraph/src/graph/alignment/aligner_alignment.cpp @@ -43,201 +43,6 @@ Alignment::Alignment(std::string_view query, cigar_.append(Cigar::DELETION, sequence_.size() - min_length); } -template -Alignment::Alignment(const DPTable &dp_table, - const DBGAlignerConfig &config, - std::string_view query_view, - typename DPTable::const_iterator column, - size_t start_pos, - size_t offset, - NodeType *start_node, - const Alignment &seed) - : orientation_(seed.get_orientation()), offset_(offset) { - assert(start_node); - - auto i = start_pos; - size_t shift = column->second.start_index; - assert(i >= shift); - assert(i - shift < column->second.scores.size()); - score_ = column->second.scores.at(i - shift); - Cigar::Operator op = column->second.ops.at(i - shift); - NodeType prev_node; - switch (column->second.prev_nodes.at(i - shift)) { - case 0: { prev_node = SequenceGraph::npos; } break; - case 0xFF: { prev_node = column->first; } break; - default: { - prev_node = column->second.select_prev_node(column->second.prev_nodes.at(i - shift)); - } - } - - NodeType prev_gap_node; - switch (column->second.gap_prev_nodes.at(i - shift)) { - case 0: { prev_gap_node = SequenceGraph::npos; } break; - case 0xFF: { prev_gap_node = column->first; } break; - default: { - prev_gap_node = column->second.select_prev_node(column->second.gap_prev_nodes.at(i - shift)); - } - } - - if (op == Cigar::DELETION) - prev_node = prev_gap_node; - - uint32_t gap_count = op == Cigar::DELETION ? column->second.gap_count.at(i - shift) - 1 : 0; - - if (!i && prev_node == SequenceGraph::npos) - return; - - // use config to recompute CIGAR score in DEBUG mode - score_t score_track = score_; - Cigar::Operator last_op = Cigar::CLIPPED; - - score_t gap_diff = config.gap_opening_penalty - config.gap_extension_penalty; - - // TODO: If there is a cyclic part of the graph in which the optimal - // alignment involves an insertion, then a score which was previously - // from a insertion may be replaced with a match. This will cause - // subsequent insertion scores to be wrong since they're no longer - // extensions. The only way to fix this is to store a separate vector - // to keep partial alignments ending in insertions. - // Until this is fixed, the score checking asserts have been commented out. - - std::vector::const_iterator> out_columns; - while (prev_node != SequenceGraph::npos) { - auto prev_column = dp_table.find(prev_node); - assert(prev_column != dp_table.end()); - assert(i || op == Cigar::DELETION); - - switch (op) { - case Cigar::MATCH: - case Cigar::MISMATCH: { - --i; - out_columns.emplace_back(column); - - if (last_op == Cigar::DELETION) - score_track -= gap_diff; - - // assert(column->second.scores.at(i + 1) >= score_track); - score_track -= config.get_row(column->second.last_char)[query_view[i]]; - // assert(prev_column->second.scores.at(i) >= score_track); - - } break; - case Cigar::DELETION: { - out_columns.emplace_back(column); - - score_track -= config.gap_extension_penalty; - - } break; - case Cigar::INSERTION: { - assert(column == prev_column); - --i; - - assert(i >= shift); - assert(i - shift < column->second.scores.size()); - - // assert(column->second.prev_nodes.at(i + 1) == 0xFF); - // assert(column->second.scores.at(i + 1) >= score_track); - - if (column->second.ops.at(i - shift) == Cigar::DELETION) { - logger->error("INSERTION after DELETION: {}", query_view); - exit(1); - } - - score_track -= column->second.ops.at(i - shift) == Cigar::INSERTION - ? config.gap_extension_penalty - : config.gap_opening_penalty; - // assert(column->second.scores.at(i) >= score_track); - - } break; - case Cigar::CLIPPED: { assert(false); } - } - - cigar_.append(op); - - last_op = op; - - column = prev_column; - shift = prev_column->second.start_index; - assert(i >= shift); - assert(i - shift < column->second.scores.size()); - if (gap_count) { - --gap_count; - } else { - op = column->second.ops.at(i - shift); - - if (op == Cigar::DELETION) - gap_count = column->second.gap_count.at(i - shift) - 1; - } - switch (column->second.prev_nodes.at(i - shift)) { - case 0: { prev_node = SequenceGraph::npos; } break; - case 0xFF: { prev_node = column->first; } break; - default: { - prev_node = column->second.select_prev_node(column->second.prev_nodes.at(i - shift)); - } - } - - switch (column->second.gap_prev_nodes.at(i - shift)) { - case 0: { prev_gap_node = SequenceGraph::npos; } break; - case 0xFF: { prev_gap_node = column->first; } break; - default: { - prev_gap_node = column->second.select_prev_node(column->second.gap_prev_nodes.at(i - shift)); - } - } - - if (op == Cigar::DELETION) - prev_node = prev_gap_node; - } - - const auto &score_col = column->second.scores; - - if (last_op == Cigar::DELETION) - score_track -= gap_diff; - - score_t correction = score_col.at(i - shift) - score_track; - - // assert(correction >= 0); - - if (correction > 0) - logger->trace("Fixing outdated score: {} -> {}", score_, score_ + correction); - - score_ -= score_col.at(i - shift) - correction; - - *start_node = column->first; - - if (i > std::numeric_limits::max()) - throw std::runtime_error("Error: clipping length can't be stored in CIGAR"); - - cigar_.append(Cigar::CLIPPED, i); - assert(cigar_.size()); - - query_ = { query_view.data() + i, start_pos - i }; - - std::reverse(cigar_.begin(), cigar_.end()); - - nodes_.resize(out_columns.size()); - std::transform(out_columns.rbegin(), - out_columns.rend(), - nodes_.begin(), - [](const auto &iter) { return iter->first; }); - - sequence_.assign(out_columns.size(), 'N'); - std::transform(out_columns.rbegin(), - out_columns.rend(), - sequence_.begin(), - [](const auto &iter) { return iter->second.last_char; }); - - if (correction < 0) { - logger->warn( - "Correcting score: {} -> {}\nQuery: {}\nTarget: {}\nSeed: {}\nExtension: {}", - score_, - score_ + correction, - seed.get_sequence() + std::string(query_view), - seed.get_sequence() + sequence_, - seed.get_cigar().to_string(), - cigar_.to_string() - ); - } -} - template void Alignment::append(Alignment&& other) { assert(query_.data() + query_.size() == other.query_.data()); @@ -779,11 +584,10 @@ ::load_from_json(const Json::Value &alignment, const DeBruijnGraph &graph) { return query_sequence; } -template bool spell_path(const DeBruijnGraph &graph, - const std::vector &path, + const std::vector &path, std::string &seq, - size_t offset = 0) { + size_t offset) { assert(offset < graph.get_k()); if (path.empty()) @@ -792,7 +596,7 @@ bool spell_path(const DeBruijnGraph &graph, if (std::find(path.begin(), path.end(), DeBruijnGraph::npos) != path.end()) { std::cerr << "ERROR: path has invalid nodes\n"; - for (NodeType node : path) { + for (DeBruijnGraph::node_index node : path) { std::cerr << node << " "; } @@ -889,7 +693,6 @@ QueryAlignment::QueryAlignment(std::string_view query, template class Alignment<>; -template struct LocalAlignmentLess<>; template class QueryAlignment<>; } // namespace align diff --git a/metagraph/src/graph/alignment/aligner_alignment.hpp b/metagraph/src/graph/alignment/aligner_alignment.hpp index bd270e39bf..28831a10e7 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.hpp +++ b/metagraph/src/graph/alignment/aligner_alignment.hpp @@ -11,7 +11,6 @@ #include "aligner_cigar.hpp" #include "aligner_config.hpp" -#include "aligner_dp_table.hpp" namespace mtg { @@ -30,6 +29,22 @@ class Alignment { typedef NodeType node_index; typedef DBGAlignerConfig::score_t score_t; + Alignment(std::string_view query, + std::vector&& nodes = {}, + std::string&& sequence = "", + score_t score = 0, + Cigar&& cigar = Cigar(), + size_t clipping = 0, + bool orientation = false, + size_t offset = 0) + : query_(query), + nodes_(std::move(nodes)), + sequence_(std::move(sequence)), + score_(score), + cigar_(Cigar::CLIPPED, clipping), + orientation_(orientation), + offset_(offset) { cigar_.append(std::move(cigar)); } + // Used for constructing seeds Alignment(std::string_view query = {}, std::vector&& nodes = {}, @@ -48,6 +63,7 @@ class Alignment { assert(nodes.empty() || clipping || is_exact_match()); } + // Used for constructing exact match seeds Alignment(std::string_view query, std::vector&& nodes, std::string&& sequence, @@ -56,16 +72,6 @@ class Alignment { bool orientation = false, size_t offset = 0); - // TODO: construct multiple alignments from the same starting point - Alignment(const DPTable &dp_table, - const DBGAlignerConfig &config, - std::string_view query_view, - typename DPTable::const_iterator column, - size_t start_pos, - size_t offset, - NodeType *start_node, - const Alignment &seed); - void append(Alignment&& other); size_t size() const { return nodes_.size(); } @@ -160,22 +166,6 @@ class Alignment { bool is_valid(const DeBruijnGraph &graph, const DBGAlignerConfig *config = nullptr) const; private: - Alignment(std::string_view query, - std::vector&& nodes = {}, - std::string&& sequence = "", - score_t score = 0, - Cigar&& cigar = Cigar(), - size_t clipping = 0, - bool orientation = false, - size_t offset = 0) - : query_(query), - nodes_(std::move(nodes)), - sequence_(std::move(sequence)), - score_(score), - cigar_(Cigar::CLIPPED, clipping), - orientation_(orientation), - offset_(offset) { cigar_.append(std::move(cigar)); } - Json::Value path_json(size_t node_size, std::string_view label = {}) const; std::string_view query_; @@ -199,11 +189,36 @@ std::ostream& operator<<(std::ostream& out, const Alignment &alignment return out; } -template +bool spell_path(const DeBruijnGraph &graph, + const std::vector &path, + std::string &seq, + size_t offset = 0); + struct LocalAlignmentLess { + template + bool operator()(const Alignment &a, const Alignment &b) { + // 1) score is less, or + // 2) more of the query is covered, or + // 3) if it is in the reverse orientation, or + // 4) if the starting point is later in the query + return std::make_tuple(b.get_score(), a.get_query().size(), + a.get_orientation(), a.get_clipping()) + > std::make_tuple(a.get_score(), b.get_query().size(), + b.get_orientation(), b.get_clipping()); + } +}; + +struct LocalAlignmentGreater { + template bool operator()(const Alignment &a, const Alignment &b) { - return std::make_pair(-a.get_score(), a.get_query().size()) - > std::make_pair(-b.get_score(), b.get_query().size()); + // 1) score is higher, or + // 2) less of the query is covered, or + // 3) if it is in the forward orientation, or + // 4) if the starting point is earlier in the query + return std::make_tuple(a.get_score(), b.get_query().size(), + b.get_orientation(), b.get_clipping()) + > std::make_tuple(b.get_score(), a.get_query().size(), + a.get_orientation(), a.get_clipping()); } }; diff --git a/metagraph/src/graph/alignment/aligner_cigar.cpp b/metagraph/src/graph/alignment/aligner_cigar.cpp index 06b4573a0e..322029e72c 100644 --- a/metagraph/src/graph/alignment/aligner_cigar.cpp +++ b/metagraph/src/graph/alignment/aligner_cigar.cpp @@ -2,43 +2,12 @@ #include "kmer/alphabets.hpp" - namespace mtg { namespace graph { namespace align { -Cigar::Cigar(const std::string &cigar_str) { - std::string op_count; - for (auto c : cigar_str) { - switch (c) { - case '=': - cigar_.emplace_back(Cigar::MATCH, std::stol(op_count)); - op_count.clear(); - break; - case 'X': - cigar_.emplace_back(Cigar::MISMATCH, std::stol(op_count)); - op_count.clear(); - break; - case 'I': - cigar_.emplace_back(Cigar::INSERTION, std::stol(op_count)); - op_count.clear(); - break; - case 'D': - cigar_.emplace_back(Cigar::DELETION, std::stol(op_count)); - op_count.clear(); - break; - case 'S': - cigar_.emplace_back(Cigar::CLIPPED, std::stol(op_count)); - op_count.clear(); - break; - default: - op_count += c; - } - } -} - -Cigar::OperatorTable Cigar::initialize_opt_table() { +OperatorTable initialize_opt_table() { OperatorTable char_to_op; // TODO: fix this when alphabets are no longer set at compile time @@ -61,7 +30,7 @@ Cigar::OperatorTable Cigar::initialize_opt_table() { ); #endif - for (auto& row : char_to_op) { + for (auto &row : char_to_op) { row.fill(Cigar::MISMATCH); } @@ -81,7 +50,35 @@ Cigar::OperatorTable Cigar::initialize_opt_table() { return char_to_op; } -Cigar::OperatorTable Cigar::char_to_op = Cigar::initialize_opt_table(); +Cigar::Cigar(std::string_view cigar_str) { + std::string op_count; + for (char c : cigar_str) { + switch (c) { + case '=': + cigar_.emplace_back(Cigar::MATCH, std::stol(op_count)); + op_count.clear(); + break; + case 'X': + cigar_.emplace_back(Cigar::MISMATCH, std::stol(op_count)); + op_count.clear(); + break; + case 'I': + cigar_.emplace_back(Cigar::INSERTION, std::stol(op_count)); + op_count.clear(); + break; + case 'D': + cigar_.emplace_back(Cigar::DELETION, std::stol(op_count)); + op_count.clear(); + break; + case 'S': + cigar_.emplace_back(Cigar::CLIPPED, std::stol(op_count)); + op_count.clear(); + break; + default: + op_count += c; + } + } +} std::string Cigar::to_string() const { std::string cigar_string; diff --git a/metagraph/src/graph/alignment/aligner_cigar.hpp b/metagraph/src/graph/alignment/aligner_cigar.hpp index 8b5dbb5f21..bc82f859df 100644 --- a/metagraph/src/graph/alignment/aligner_cigar.hpp +++ b/metagraph/src/graph/alignment/aligner_cigar.hpp @@ -24,20 +24,15 @@ class Cigar { }; typedef uint32_t LengthType; - typedef std::array OperatorTableRow; - typedef std::array OperatorTable; typedef std::pair value_type; - static OperatorTable char_to_op; - static const OperatorTableRow& get_op_row(char a) { return char_to_op[a]; } - Cigar(Operator op = Operator::CLIPPED, LengthType num = 0) : cigar_(num ? 1 : 0, std::make_pair(op, num)) { } // See section 1.4 in https://samtools.github.io/hts-specs/SAMv1.pdf for // a specification of the CIGAR string format. // e.g., 3=1X2I3D for 3 matches, 1 mismatch, 2 insertions, 3 deletions - explicit Cigar(const std::string &cigar_str); + Cigar(std::string_view cigar_str); size_t size() const { return cigar_.size(); } bool empty() const { return cigar_.empty(); } @@ -104,15 +99,19 @@ class Cigar { // character of the reference sequence after clipping is trimmed bool is_valid(std::string_view reference, std::string_view query) const; - static char opt_to_char(Cigar::Operator op) { return op_str_[op]; } + static constexpr char opt_to_char(Cigar::Operator op) { return op_str_[op]; } private: static constexpr char op_str_[] = "SX=DIN"; std::vector cigar_; - - static OperatorTable initialize_opt_table(); }; + +typedef std::array, 128> OperatorTable; +OperatorTable initialize_opt_table(); +static const OperatorTable kCharToOp = initialize_opt_table(); + + } // namespace align } // namespace graph } // namespace mtg diff --git a/metagraph/src/graph/alignment/aligner_config.cpp b/metagraph/src/graph/alignment/aligner_config.cpp index 5e407967b0..52946e0be2 100644 --- a/metagraph/src/graph/alignment/aligner_config.cpp +++ b/metagraph/src/graph/alignment/aligner_config.cpp @@ -2,19 +2,36 @@ #include "aligner_cigar.hpp" #include "kmer/alphabets.hpp" +#include "common/logger.hpp" namespace mtg { namespace graph { namespace align { +using mtg::common::logger; + // check to make sure the current scoring system won't underflow bool DBGAlignerConfig::check_config_scores() const { - int8_t min_penalty_score = std::min(gap_opening_penalty, gap_extension_penalty); + int8_t min_penalty_score = std::numeric_limits::max(); for (const auto &row : score_matrix_) { - min_penalty_score = std::min(min_penalty_score, *std::min_element(row.begin(), row.end())); + min_penalty_score = std::min(min_penalty_score, + *std::min_element(row.begin(), row.end())); + } + + if (gap_opening_penalty * 2 >= min_penalty_score) { + common::logger->error( + "|gap_opening_penalty| * 2 should be greater than greatest mismatch penalty: " + "{} >= {}", + gap_opening_penalty * 2, (score_t)min_penalty_score + ); + return false; } + min_penalty_score = std::min({ min_penalty_score, + gap_opening_penalty, + gap_extension_penalty }); + assert(min_penalty_score < 0 && "min scores must be negative"); if (min_cell_score >= std::numeric_limits::min() - min_penalty_score) diff --git a/metagraph/src/graph/alignment/aligner_config.hpp b/metagraph/src/graph/alignment/aligner_config.hpp index d5a0faad21..afabe31a1c 100644 --- a/metagraph/src/graph/alignment/aligner_config.hpp +++ b/metagraph/src/graph/alignment/aligner_config.hpp @@ -51,8 +51,6 @@ class DBGAlignerConfig { return score_matrix_[char_in_query]; } - size_t queue_size = std::numeric_limits::max(); - size_t bandwidth = std::numeric_limits::max(); size_t num_alternative_paths = 1; size_t min_seed_length = 1; size_t max_seed_length = std::numeric_limits::max(); diff --git a/metagraph/src/graph/alignment/aligner_dp_table.cpp b/metagraph/src/graph/alignment/aligner_dp_table.cpp deleted file mode 100644 index fee7ac0a12..0000000000 --- a/metagraph/src/graph/alignment/aligner_dp_table.cpp +++ /dev/null @@ -1,219 +0,0 @@ -#include "aligner_dp_table.hpp" - -#include - -#include "aligner_alignment.hpp" -#include "graph/representation/base/sequence_graph.hpp" - -namespace mtg { -namespace graph { -namespace align { - - -template -bool DPTable::add_seed(const Alignment &seed, - const DBGAlignerConfig &config, - size_t size, - size_t start_pos, - size_t query_offset) { - query_offset_ = query_offset; - char start_char = seed.get_query().back(); - score_t last_char_score = config.get_row(start_char)[seed.get_sequence().back()]; - - iterator column_it = dp_table_.find(seed.back()); - if (column_it == dp_table_.end()) { - column_it = emplace(seed.back(), Column(size, config.min_cell_score, - start_char, start_pos)).first; - } else { - expand_to_cover(column_it, 0, size); - } - - auto &table_init = column_it.value(); - - bool update = false; - - if (table_init.best_score() < seed.get_score()) { - auto last_op = seed.get_cigar().back().first; - table_init.scores[start_pos] = seed.get_score(); - table_init.ops[start_pos] = last_op; - table_init.prev_nodes[start_pos] = 0; - table_init.gap_prev_nodes[start_pos] = 0; - - if (last_op != Cigar::DELETION && last_op != Cigar::MATCH - && last_op != Cigar::MISMATCH) { - throw std::runtime_error("Seeds must end in DELETION, MATCH, or MISMATCH"); - } - - // This vector stores the best scores for partial alignments ending in a - // delete operation. If the last operation in the seed is MATCH or MISMATCH, - // then we replace it with a score corresponding to INSERTION followed by DELETION - table_init.gap_scores[start_pos] = std::max( - last_op == Cigar::DELETION - ? table_init.scores[start_pos] - : table_init.scores[start_pos] - last_char_score - + config.gap_opening_penalty - + config.gap_opening_penalty, - config.min_cell_score - ); - - table_init.gap_count[start_pos] = 1; - update = true; - } - - return update; -} - -template -void DPTable -::extract_alignments(const DeBruijnGraph &graph, - const DBGAlignerConfig &config, - std::string_view query_view, - std::function&&, NodeType)> callback, - score_t min_path_score, - const Alignment &seed, - NodeType *node) { - NodeType start_node; - if (config.num_alternative_paths == 1 && node) { - // avoid sorting column iterators if we're only interested in the top path - start_node = DeBruijnGraph::npos; - auto column_it = dp_table_.find(*node); - assert(column_it != dp_table_.end()); - Alignment alignment(*this, - config, - query_view, - column_it, - column_it->second.best_pos, - graph.get_k() - 1, - &start_node, - seed); - - if (alignment.empty() && !alignment.get_query().data()) - return; - - assert(alignment.is_valid(graph, &config)); - - callback(std::move(alignment), start_node); - - return; - } - - // store visited nodes in paths to avoid returning subalignments - tsl::hopscotch_set visited_nodes; - - std::vector starts; - starts.reserve(size()); - for (const_iterator it = dp_table_.begin(); it != dp_table_.end(); ++it) { - if (it->second.best_score() > min_path_score - && it->second.best_op() == Cigar::MATCH) - starts.emplace_back(it); - } - - if (starts.empty()) - return; - - std::sort(starts.begin(), starts.end(), - [](const auto &a, const auto &b) { - return a->second.best_score() > b->second.best_score(); - }); - - size_t num_paths = 0; - for (const_iterator column_it : starts) { - if (num_paths >= config.num_alternative_paths) - return; - - // ignore if the current point is a subalignment of one already visited - if (visited_nodes.find(column_it->first) != visited_nodes.end()) - continue; - - start_node = DeBruijnGraph::npos; - Alignment next(*this, - config, - query_view, - column_it, - column_it->second.best_pos, - graph.get_k() - 1, - &start_node, - seed); - - if (next.empty() && !next.get_query().data()) - continue; - - assert(next.is_valid(graph, &config)); - visited_nodes.insert(next.begin(), next.end()); - - callback(std::move(next), start_node); - - ++num_paths; - } -} - -template -void DPTable::Column::expand_to_cover(size_t begin, size_t end) { - assert(best_pos >= start_index); - assert(best_pos - start_index < scores.size()); - assert(last_priority_pos >= start_index); - assert(last_priority_pos - start_index < scores.size()); - - if (begin >= start_index) { - // the current range already covers [begin, end) - if (end <= start_index + scores.size() - 8) - return; - - // extend the range to the right to reach end - scores.resize(end + 8 - start_index, min_score_); - gap_scores.resize(end + 8 - start_index, min_score_); - ops.resize(end + 8 - start_index, Cigar::CLIPPED); - prev_nodes.resize(end + 8 - start_index, 0); - gap_prev_nodes.resize(end + 8 - start_index, 0); - gap_count.resize(end + 8 - start_index, 0); - } else if (end <= start_index + scores.size() - 8) { - // extend the range to the left to reach begin - size_t shift = start_index - begin; - start_index = begin; - scores.insert(scores.begin(), shift, min_score_); - gap_scores.insert(gap_scores.begin(), shift, min_score_); - ops.insert(ops.begin(), shift, Cigar::CLIPPED); - prev_nodes.insert(prev_nodes.begin(), shift, 0); - gap_prev_nodes.insert(gap_prev_nodes.begin(), shift, 0); - gap_count.insert(gap_count.begin(), shift, 0); - } else { - // extend the range in both directions - size_t shift = start_index - begin; - start_index = begin; - end += 8; - - size_t new_size = end - begin; - - scores.reserve(new_size); - gap_scores.reserve(new_size); - ops.reserve(new_size); - prev_nodes.reserve(new_size); - gap_prev_nodes.reserve(new_size); - gap_count.reserve(new_size); - - scores.insert(scores.begin(), shift, min_score_); - gap_scores.insert(gap_scores.begin(), shift, min_score_); - ops.insert(ops.begin(), shift, Cigar::CLIPPED); - prev_nodes.insert(prev_nodes.begin(), shift, 0); - gap_prev_nodes.insert(gap_prev_nodes.begin(), shift, 0); - gap_count.insert(gap_count.begin(), shift, 0); - - scores.resize(new_size, min_score_); - gap_scores.resize(new_size, min_score_); - ops.resize(new_size, Cigar::CLIPPED); - prev_nodes.resize(new_size, 0); - gap_prev_nodes.resize(new_size, 0); - gap_count.resize(new_size, 0); - } - - assert(best_pos >= start_index); - assert(best_pos - start_index < scores.size()); - assert(last_priority_pos >= start_index); - assert(last_priority_pos - start_index < scores.size()); -} - -template class DPTable<>; - -} // namespace align -} // namespace graph -} // namespace mtg diff --git a/metagraph/src/graph/alignment/aligner_dp_table.hpp b/metagraph/src/graph/alignment/aligner_dp_table.hpp deleted file mode 100644 index 6c0669ef65..0000000000 --- a/metagraph/src/graph/alignment/aligner_dp_table.hpp +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef __ALIGNER_DP_TABLE_HPP__ -#define __ALIGNER_DP_TABLE_HPP__ - -#include - -#include "aligner_config.hpp" -#include "aligner_cigar.hpp" -#include "common/aligned_vector.hpp" -#include "common/utils/template_utils.hpp" - - -namespace mtg { -namespace graph { - -class DeBruijnGraph; - -namespace align { - -template -class Alignment; - -// dynamic programming table stores score columns and steps needed to reconstruct paths -template -class DPTable { - public: - typedef DBGAlignerConfig::score_t score_t; - - struct Column { - Column() = default; - - Column(size_t size, - score_t min_score, - char start_char, - size_t pos = 0, - size_t priority_pos = 0, - size_t start = 0, - size_t end = std::numeric_limits::max()) - : size_(size), - min_score_(min_score), - scores(std::min(end, size) - start + 8, min_score), - gap_scores(scores.size(), min_score), - ops(scores.size(), Cigar::CLIPPED), - prev_nodes(scores.size(), 0), - gap_prev_nodes(scores.size(), 0), - gap_count(scores.size(), 0), - last_char(start_char), - best_pos(std::min(std::max(pos, start), start + scores.size() - (size_t)9)), - last_priority_pos(std::min(std::max(priority_pos, start), start + scores.size() - (size_t)9)), - start_index(start) {} - - size_t size_; - score_t min_score_; - AlignedVector scores; - AlignedVector gap_scores; - AlignedVector ops; - AlignedVector prev_nodes; - AlignedVector gap_prev_nodes; - AlignedVector gap_count; - mutable std::vector incoming_nodes; - char last_char; - size_t best_pos; - size_t last_priority_pos; - size_t start_index; - - void expand_to_cover(size_t begin, size_t end); - - const score_t& best_score() const { return scores.at(best_pos - start_index); } - const score_t& last_priority_value() const { return scores.at(last_priority_pos - start_index); } - const Cigar::Operator& best_op() const { return ops.at(best_pos - start_index); } - - bool operator<(const Column &other) const { - return best_score() < other.best_score(); - } - - size_t cell_size() const { - return sizeof(score_t) * 2 + sizeof(Cigar::Operator) - + sizeof(uint8_t) * 2 + sizeof(int32_t); - } - - size_t bytes_taken() const { - return sizeof(Column) - + sizeof(score_t) * scores.capacity() - + sizeof(score_t) * gap_scores.capacity() - + sizeof(Cigar::Operator) * ops.capacity() - + sizeof(uint8_t) * prev_nodes.capacity() - + sizeof(uint8_t) * gap_prev_nodes.capacity() - + sizeof(int32_t) * gap_count.capacity() - + sizeof(NodeType) * incoming_nodes.capacity(); - } - - size_t size() const { return size_; } - - uint8_t rank_prev_node(NodeType node) const { - assert(incoming_nodes.size() < std::numeric_limits::max()); - for (uint8_t i = 0; i < incoming_nodes.size(); ++i) { - if (incoming_nodes[i] == node) - return i + 1; - } - - incoming_nodes.push_back(node); - return incoming_nodes.size(); - } - - NodeType select_prev_node(uint8_t rank) const { - assert(rank); - return incoming_nodes.at(rank - 1); - } - }; - - DPTable() {} - - bool add_seed(const Alignment &seed, - const DBGAlignerConfig &config, - size_t size, - size_t start_pos, - size_t query_offset = 0); - - typedef tsl::hopscotch_map Storage; - - typedef NodeType key_type; - typedef Column mapped_type; - typedef typename Storage::value_type value_type; - - typedef typename Storage::iterator iterator; - typedef typename Storage::const_iterator const_iterator; - - void expand_to_cover(iterator it, size_t begin, size_t end) { - size_t old_size = it->second.bytes_taken(); - it.value().expand_to_cover(begin, end); - num_bytes_ += it->second.bytes_taken() - old_size; - } - - iterator begin() { return dp_table_.begin(); } - iterator end() { return dp_table_.end(); } - const_iterator begin() const { return dp_table_.begin(); } - const_iterator end() const { return dp_table_.end(); } - - iterator find(const NodeType &node) { return dp_table_.find(node); } - const_iterator find(const NodeType &node) const { return dp_table_.find(node); } - - size_t size() const { return dp_table_.size(); } - - size_t num_bytes() const { return num_bytes_; } - - void clear() { - dp_table_.clear(); - num_bytes_ = 0; - } - - template - std::pair emplace(Args&&... args) { - auto pair = dp_table_.emplace(std::forward(args)...); - - if (pair.second) - num_bytes_ += pair.first.value().bytes_taken(); - - return pair; - } - - void erase(NodeType key) { dp_table_.erase(key); } - size_t count(NodeType key) const { return dp_table_.count(key); } - - void extract_alignments(const DeBruijnGraph &graph, - const DBGAlignerConfig &config, - std::string_view query_view, - std::function&&, NodeType)> callback, - score_t min_path_score, - const Alignment &seed, - NodeType *node = nullptr); - - std::pair best_score() const { - auto mx = std::max_element(begin(), end(), utils::LessSecond()); - return std::make_pair(mx->first, mx->second.best_score()); - } - - const Storage& data() const { return dp_table_; } - size_t get_query_offset() const { return query_offset_; } - NodeType get_start_node() const { return start_node_; } - - private: - Storage dp_table_; - NodeType start_node_; - size_t query_offset_ = 0; - size_t num_bytes_ = 0; -}; - -} // namespace align -} // namespace graph -} // namespace mtg - -#endif // __ALIGNER_DP_TABLE_HPP__ diff --git a/metagraph/src/graph/alignment/aligner_extender_methods.cpp b/metagraph/src/graph/alignment/aligner_extender_methods.cpp index 26660294cd..d47ebb9da8 100644 --- a/metagraph/src/graph/alignment/aligner_extender_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_extender_methods.cpp @@ -1,12 +1,9 @@ #include "aligner_extender_methods.hpp" -#ifdef __AVX2__ -#include -#endif +#include -#include "common/logger.hpp" #include "common/utils/simd_utils.hpp" -#include "common/aligned_vector.hpp" +#include "common/utils/template_utils.hpp" #include "graph/representation/succinct/dbg_succinct.hpp" @@ -15,865 +12,744 @@ namespace mtg { namespace graph { namespace align { -using mtg::common::logger; +typedef DBGAlignerConfig::score_t score_t; +constexpr score_t ninf = std::numeric_limits::min() + 100; template DefaultColumnExtender::DefaultColumnExtender(const DeBruijnGraph &graph, const DBGAlignerConfig &config, std::string_view query) - : graph_(graph), config_(config), query(query) { + : graph_(graph), config_(config), query_(query) { assert(config_.check_config_scores()); - partial_sums_.resize(query.size()); - std::transform(query.begin(), query.end(), + partial_sums_.reserve(query_.size() + 1); + partial_sums_.resize(query_.size(), 0); + std::transform(query_.begin(), query_.end(), partial_sums_.begin(), [&](char c) { return config_.get_row(c)[c]; }); std::partial_sum(partial_sums_.rbegin(), partial_sums_.rend(), partial_sums_.rbegin()); - assert(config_.match_score(query) == partial_sums_.front()); - assert(config_.get_row(query.back())[query.back()] == partial_sums_.back()); + assert(config_.match_score(query_) == partial_sums_.front()); + assert(config_.get_row(query_.back())[query_.back()] == partial_sums_.back()); + partial_sums_.push_back(0); for (char c : graph_.alphabet()) { - auto &profile_score_row = profile_score.emplace(c, query.size() + 8).first.value(); - auto &profile_op_row = profile_op.emplace(c, query.size() + 8).first.value(); + auto &p_score_row = profile_score_.emplace(c, query_.size() + 9).first.value(); + auto &p_op_row = profile_op_.emplace(c, query_.size() + 9).first.value(); const auto &row = config_.get_row(c); - const auto &op_row = Cigar::get_op_row(c); + const auto &op_row = kCharToOp[c]; - std::transform(query.begin(), query.end(), profile_score_row.begin(), + // the first cell in a DP table row is one position before the last matched + // character, so we need to shift the indices of profile_score_ and profile_op_ + std::transform(query_.begin(), query_.end(), p_score_row.begin() + 1, [&row](char q) { return row[q]; }); - std::transform(query.begin(), query.end(), profile_op_row.begin(), - [&op_row](char q) { return op_row[q]; }); - } -} - -template -std::pair get_column_boundaries(DPTable &dp_table, - ColumnIt column_it, - DBGAlignerConfig::score_t xdrop_cutoff, - const Config &config) { - size_t bandwidth = config.bandwidth; - auto &column = column_it.value(); - auto &scores = column.scores; - size_t size = column.size(); - size_t best_pos = column.best_pos; - assert(best_pos >= column.start_index); - assert(best_pos - column.start_index < scores.size()); - - size_t shift = column.start_index; - if (xdrop_cutoff > scores[best_pos - shift]) - return std::make_pair(best_pos, best_pos); - - size_t begin = best_pos >= bandwidth ? best_pos - bandwidth : 0; - size_t end = bandwidth <= size - best_pos ? best_pos + bandwidth : size; - - if (column.min_score_ < xdrop_cutoff) { - begin = std::max(begin, column.start_index); - assert(column.start_index + scores.size() >= 8); - size_t cur_end = std::min(end, column.start_index + scores.size() - 8); - assert(begin < cur_end); - if (scores.at(cur_end - 1 - shift) >= xdrop_cutoff) { - size_t old_size = scores.size(); - dp_table.expand_to_cover(column_it, begin, end); - assert(shift == column.start_index); - if (scores.size() > old_size) { - update_ins_scores(config, - scores.data() + begin - shift, - column.prev_nodes.data() + begin - shift, - column.ops.data() + begin - shift, - nullptr, - end - begin, - xdrop_cutoff); - } - } else { - end = cur_end; - } - } else { - size_t old_size = scores.size(); - dp_table.expand_to_cover(column_it, begin, end); - shift = column.start_index; - if (scores.size() > old_size) { - update_ins_scores(config, - scores.data() + begin - shift, - column.prev_nodes.data() + begin - shift, - column.ops.data() + begin - shift, - nullptr, - end - begin, - xdrop_cutoff); - } - } - while (begin < end && scores[begin - shift] < xdrop_cutoff) { - ++begin; - } - - if (begin == scores.size() + shift) - return std::make_pair(begin, begin); - - // ensure that the next position is included in the range [begin, end) - size_t cur_end = best_pos + 2; - while (cur_end < end && scores[cur_end - shift] >= xdrop_cutoff) { - ++cur_end; + std::transform(query_.begin(), query_.end(), p_op_row.begin() + 1, + [&op_row](char q) { return op_row[q]; }); } - end = cur_end; - - assert(end > best_pos); - - if (begin >= end) - return std::make_pair(begin, begin); - - // align begin + 1 to 32-byte boundary - if (begin > 7) - begin = (begin & 0xFFFFFFFFFFFFFFF8) - 1; - - // round up to nearest multiple of 8 - end = std::min(begin + ((end - begin + 7) / 8) * 8, size); - dp_table.expand_to_cover(column_it, begin, end); - - assert(begin < end); - assert(begin <= best_pos); - assert(end > best_pos); - - return std::make_pair(begin, end); } template -std::pair::iterator, bool> DefaultColumnExtender -::emplace_node(NodeType node, - NodeType, - char c, - size_t size, - size_t best_pos, - size_t last_priority_pos, - size_t begin, - size_t end) { - end = std::min(end, size); - auto find = dp_table.find(node); - if (find == dp_table.end()) { - return dp_table.emplace(node, - typename DPTable::Column( - size, config_.min_cell_score, c, - best_pos + 1 != size ? best_pos + 1 : best_pos, - last_priority_pos, begin, end - )); - } else { - dp_table.expand_to_cover(find, begin, end); - auto [node_begin, node_end] = get_column_boundaries( - dp_table, find, xdrop_cutoff, config_ - ); - - if (node_begin != node_end) - overlapping_range_ |= (begin < std::min(size, node_end + 9) - && std::min(size, end + 9)> node_begin); - } +void DefaultColumnExtender::initialize(const DBGAlignment &seed) { + assert(seed.size()); + assert(seed.get_cigar().size()); + assert(seed.get_cigar().back().first == Cigar::MATCH + || seed.get_cigar().back().first == Cigar::MISMATCH); - return std::make_pair(find, false); + seed_ = &seed; + reset(); } -template -bool DefaultColumnExtender -::add_seed(size_t clipping) { - assert(path_->get_cigar().back().first == Cigar::MATCH - || path_->get_cigar().back().first == Cigar::MISMATCH); - - return dp_table.add_seed(*path_, config_, size, 0, clipping); +template +std::pair get_band(const Node &prev, + const Column &column_prev, + score_t xdrop_cutoff) { + const auto &S_prev = std::get<0>(column_prev[std::get<2>(prev)]); + size_t offset_prev = std::get<9>(column_prev[std::get<2>(prev)]); + size_t max_pos_prev = std::get<10>(column_prev[std::get<2>(prev)]); + assert(max_pos_prev - offset_prev < S_prev.size()); + assert(std::max_element(S_prev.begin(), S_prev.end()) + == S_prev.begin() + (max_pos_prev - offset_prev)); + + size_t start_pos = max_pos_prev - offset_prev; + if (S_prev[start_pos] < xdrop_cutoff) + return {}; + + auto stop = [cutoff=std::max(xdrop_cutoff, ninf)](score_t s) { return s < cutoff; }; + auto min_rit = std::find_if(std::make_reverse_iterator(S_prev.begin() + start_pos), + S_prev.rend(), stop); + auto max_it = std::find_if(S_prev.begin() + start_pos, S_prev.end(), stop); + + return std::make_pair(S_prev.rend() - min_rit + offset_prev, + max_it - S_prev.begin() + offset_prev); } -template -void DefaultColumnExtender::initialize(const DBGAlignment &seed) { - // this extender only works if at least one character has been matched - assert(seed.get_query().size()); - assert(query.data() <= seed.get_query().data()); +template +bool update_column(const DeBruijnGraph &graph_, + const DBGAlignerConfig &config_, + const Column &column_prev, + Scores &next_column, + char c, + size_t start, + size_t size, + score_t &xdrop_cutoff, + const ProfileScore &profile_score_, + const ProfileOp &profile_op_, + const Alignment &seed_) { + typedef DefaultColumnExtender Extender; + + auto &[S, E, F, OS, OE, OF, prev_node, PS, PF, offset, max_pos] = next_column; + size_t cur_size = S.size(); + assert(cur_size + offset <= size); + + auto &[S_prev, E_prev, F_prev, OS_prev, OE_prev, OF_prev, + prev_node_prev, PS_prev, PF_prev, offset_prev, max_pos_prev] + = column_prev[std::get<2>(prev_node)]; + assert(S_prev.size() + offset_prev <= size); + + // compute column boundaries for updating the match and deletion scores + ssize_t offset_diff = static_cast(offset_prev) - offset; + + // to define the boundaries for match scores + // need i + offset - offset_prev - 1 >= 0 + // && i + offset - offset_prev - 1 < S_prev.size() + // so offset_diff + 1 <= i < S_prev.size() + offset_diff + 1 + size_t match_begin = std::max((ssize_t)0, offset_diff + 1); + size_t match_end = std::max( + match_begin, + static_cast(std::min(S_prev.size() + offset_diff + 1, cur_size)) + ); - const char *query_end = query.data() + query.size(); - const char *seed_end = seed.get_query().data() + seed.get_query().size(); - assert(query_end >= seed_end); + // to define the boundaries for deletion scores + // need i + offset - offset_prev >= 0 + // && i + offset - offset_prev < S_prev.size() + // so offset_diff <= i < S_prev.size() + offset_diff + size_t del_begin = std::max((ssize_t)0, offset_diff); + size_t del_end = std::max( + del_begin, + static_cast(std::min(S_prev.size() + offset_diff, cur_size)) + ); - // size includes the last character of the seed since the upper-left corner - // of the score matrix is the seed score - size = query_end - seed_end + 1; + assert(del_end <= match_end); + assert(del_end + 1 >= match_end); - // extend_window_ doesn't include this last character - extend_window_ = { seed_end, size - 1 }; - match_score_begin = partial_sums_.data() + (seed_end - 1 - query.data()); - assert(config_.get_row(query.back())[query.back()] == match_score_begin[size - 1]); - assert(config_.match_score(std::string_view(extend_window_.data() - 1, size)) - == *match_score_begin); + const score_t *sprev = &S_prev[offset - offset_prev]; + const score_t *fprev = &F_prev[offset - offset_prev]; + const int8_t *profile = &profile_score_.find(c)->second[start + offset]; + const Cigar::Operator *profile_o = &profile_op_.find(c)->second[start + offset]; - start_node = seed.back(); - this->path_ = &seed; -} + std::fill(PS.begin() + match_begin, PS.begin() + match_end, Extender::PREV); + std::fill(PF.begin() + del_begin, PF.begin() + del_end, Extender::PREV); -template -void DefaultColumnExtender::check_and_push(ColumnRef&& next_column) { - const auto &[next_node, best_score_update, converged] = next_column; + bool updated = false; - // always push the next column if it hasn't converged - if (!converged) { - columns_to_update.emplace(std::move(next_column)); - return; - } + auto update_match = [sprev,profile,profile_o,S=S.data(),OS=OS.data()](ssize_t i) { + S[i + 1] = *(sprev + i) + profile[i + 1]; + OS[i + 1] = profile_o[i + 1]; + }; - assert(dp_table.find(next_node) != dp_table.end()); + auto update_del = [&config_,sprev,fprev,F=F.data(),OF=OF.data(), + S=S.data(),OS=OS.data()](size_t i) { + score_t del_open = sprev[i] + config_.gap_opening_penalty; + score_t del_extend = fprev[i] + config_.gap_extension_penalty; + F[i] = std::max(del_open, del_extend); + OF[i] = del_open < del_extend ? Cigar::DELETION : Cigar::MATCH; - // TODO: does the procedure below provably ensure that non-converged columns - // are not dropped? - const auto &column = dp_table.find(next_node)->second; + if (F[i] > S[i]) { + S[i] = F[i]; + OS[i] = Cigar::DELETION; + } + }; - // Ignore if there is no way it can be extended to an optimal alignment. - if (xdrop_cutoff > column.best_score() - || std::equal(match_score_begin + begin, - match_score_begin + end, - column.scores.data() + begin - column.start_index, - [&](auto a, auto b) { return a + b < score_cutoff; })) { - return; - } + if (del_begin < std::min(del_end, match_begin)) + update_del(del_begin); - // if the queue has space, push the next column - if (columns_to_update.size() < config_.queue_size) { - columns_to_update.emplace(std::move(next_column)); - return; - } + if (match_begin < match_end) + update_match(static_cast(match_begin) - 1); - const ColumnRef &bottom = columns_to_update.minimum(); +#ifndef __AVX2__ + for (size_t i = match_begin; i < del_end; ++i) { + update_del(i); + update_match(i); + } +#else + static_assert(sizeof(score_t) == sizeof(int32_t)); + for (size_t i = match_begin; i < del_end; i += 8) { + // vectorized update_match(i) + __m256i sprev_v = _mm256_loadu_si256((__m256i*)&sprev[i]); + __m256i profile_v = _mm256_cvtepi8_epi32(mm_loadu_si64(&profile[i + 1])); + _mm256_storeu_si256((__m256i*)&S[i + 1], _mm256_add_epi32(sprev_v, profile_v)); + *((uint64_t*)&OS[i + 1]) = *((uint64_t*)&profile_o[i + 1]); + + // vectorized update_del(i) + __m256i gap_open = _mm256_set1_epi32(config_.gap_opening_penalty); + __m256i del_open = _mm256_add_epi32(sprev_v, gap_open); + + __m256i fprev_v = _mm256_loadu_si256((__m256i*)&fprev[i]); + __m256i gap_extend = _mm256_set1_epi32(config_.gap_extension_penalty); + __m256i del_extend = _mm256_add_epi32(fprev_v, gap_extend); + + __m256i del_score = _mm256_max_epi32(del_open, del_extend); + _mm256_storeu_si256((__m256i*)&F[i], del_score); + + __m128i del_op_v = _mm_blendv_epi8( + _mm_set1_epi8(Cigar::MATCH), + _mm_set1_epi8(Cigar::DELETION), + mm256_cvtepi32_epi8(_mm256_cmpgt_epi32(del_extend, del_open)) + ); + mm_storeu_si64(&OF[i], del_op_v); - if (!utils::LessSecond()(bottom, next_column)) - return; + __m256i s_v = _mm256_loadu_si256((__m256i*)&S[i]); + __m256i score_max = _mm256_max_epi32(s_v, del_score); + _mm256_storeu_si256((__m256i*)&S[i], score_max); - if (std::get<2>(bottom) || std::get<1>(bottom) - != dp_table.find(std::get<0>(bottom))->second.last_priority_value()) { - // if the bottom has converged, or it is an invalidated reference - // (it's last priority value has changed), then replace the bottom element - columns_to_update.update(columns_to_update.begin(), std::move(next_column)); - return; - } else { - // otherwise, push - columns_to_update.emplace(std::move(next_column)); - return; + __m128i mask = mm256_cvtepi32_epi8(_mm256_cmpgt_epi32(del_score, s_v)); + mm_maskstorel_epi8((int8_t*)&OS[i], mask, _mm_set1_epi8(Cigar::DELETION)); } -} - +#endif -/* - * Helpers for column score updating - */ - -template -inline void update_ins_scores(const DBGAlignerConfig &config, - score_t *update_scores, - uint8_t *update_prevs, - Cigar::Operator *update_ops, - int8_t *updated_mask, - size_t length, - score_t xdrop_cutoff) { - for (size_t i = 1; i < length; ++i) { - if (update_ops[i - 1] != Cigar::DELETION) { - score_t ins_score = std::max(config.min_cell_score, - update_scores[i - 1] + (update_ops[i - 1] == Cigar::INSERTION - ? config.gap_extension_penalty - : config.gap_opening_penalty - )); - - if (ins_score >= xdrop_cutoff && ins_score > update_scores[i]) { - while (i < length && ins_score > update_scores[i]) { - update_scores[i] = ins_score; - update_ops[i] = Cigar::INSERTION; - update_prevs[i] = 0xFF; - - if (updated_mask) - updated_mask[i] = 0xFF; - - ins_score += config.gap_extension_penalty; - ++i; - } - --i; - } + auto update_max = [&xdrop_cutoff,&updated,&config_,ninf=ninf, + S=S.data(),E=E.data(),F=F.data(), + OS=OS.data(),OE=OE.data(),OF=OF.data(), + PS=PS.data(),PF=PF.data()](size_t i) { + if (S[i] < xdrop_cutoff) { + S[i] = ninf; + E[i] = ninf; + F[i] = ninf; + OS[i] = Cigar::CLIPPED; + OE[i] = Cigar::CLIPPED; + OF[i] = Cigar::CLIPPED; + PS[i] = Extender::NONE; + PF[i] = Extender::NONE; + } else if (S[i] <= 0) { + S[i] = 0; + OS[i] = Cigar::CLIPPED; + PS[i] = Extender::NONE; + updated = true; + } else { + xdrop_cutoff = std::max(xdrop_cutoff, S[i] - config_.xdrop); + updated = true; } - } -} + }; -#ifdef __AVX2__ + update_max(0); -// Drop-in replacement for _mm_loadu_si64 -inline __m128i mm_loadu_si64(const void *mem_addr) { - return _mm_loadl_epi64((const __m128i*)mem_addr); -} + size_t i = 1; + for ( ; i < cur_size; ++i) { + score_t ins_open = S[i - 1] + config_.gap_opening_penalty; + score_t ins_extend = E[i - 1] + config_.gap_extension_penalty; + E[i] = std::max(ins_open, ins_extend); + OE[i] = ins_open < ins_extend ? Cigar::INSERTION : Cigar::MATCH; -// Drop-in replacement for _mm_storeu_si64 -inline void mm_storeu_si64(void *mem_addr, __m128i a) { - _mm_storel_epi64((__m128i*)mem_addr, a); -} + if (E[i] > S[i]) { + S[i] = E[i]; + OS[i] = Cigar::INSERTION; + PS[i] = Extender::CUR; + } -inline void mm_maskstorel_epi8(int8_t *mem_addr, __m128i mask, __m128i a) { - __m128i orig = mm_loadu_si64((__m128i*)mem_addr); - a = _mm_blendv_epi8(orig, a, mask); - mm_storeu_si64(mem_addr, a); -} + update_max(i); -inline void compute_HE_avx2(size_t length, - __m128i prev_node, - __m256i gap_opening_penalty, - __m256i gap_extension_penalty, - int32_t *update_scores, - int32_t *update_gap_scores, - uint8_t *update_prevs, - int8_t *update_ops, - int32_t *update_gap_count, - uint8_t *update_gap_prevs, - const int32_t *incoming_scores, - const int32_t *incoming_gap_scores, - const int8_t *profile_scores, - const int8_t *profile_ops, - const int32_t *incoming_gap_count, - int8_t *updated_mask, - __m256i xdrop_cutoff) { - assert(update_scores != incoming_scores); - assert(update_gap_scores != incoming_gap_scores); - - __m128i del_p = _mm_set1_epi8(Cigar::DELETION); - for (size_t i = 1; i < length; i += 8) { - // store score updates - // load previous values for cells to update - __m256i H_orig = _mm256_loadu_si256((__m256i*)&update_scores[i]); - - // compute match score - __m256i incoming_p = _mm256_loadu_si256((__m256i*)&incoming_scores[i - 1]); - __m256i match_score = _mm256_add_epi32( - incoming_p, _mm256_cvtepi8_epi32(mm_loadu_si64(&profile_scores[i])) - ); + if (S[i] == ninf && E[i] == ninf) + break; + } - // compute score for cell update - __m256i H = _mm256_max_epi32(H_orig, match_score); + for ( ; i < cur_size; ++i) { + update_max(i); + } - // compute deletion score - __m256i update_score_open = _mm256_add_epi32( - rshiftpushback_epi32(incoming_p, incoming_scores[i + 7]), - gap_opening_penalty - ); - __m256i update_score_extend = _mm256_add_epi32( - _mm256_loadu_si256((__m256i*)&incoming_gap_scores[i]), - gap_extension_penalty - ); - __m256i update_score = _mm256_max_epi32(update_score_open, update_score_extend); - __m128i update_gap_prev = prev_node; - - // compute updated gap size count - __m256i is_extend = _mm256_cmpeq_epi32(update_score, update_score_extend); - __m256i incoming_count = _mm256_add_epi32( - _mm256_set1_epi32(1), - _mm256_and_si256(_mm256_loadu_si256((__m256i*)&incoming_gap_count[i]), is_extend) - ); + // extend to the right with insertion scores + while (offset + S.size() < size && S.back() >= xdrop_cutoff) { + score_t ins_open = S.back() + config_.gap_opening_penalty; + score_t ins_extend = E.back() + config_.gap_extension_penalty; + E.push_back(std::max(ins_open, ins_extend)); + F.push_back(ninf); + S.push_back(std::max({ 0, E.back(), F.back() })); + + OS.push_back(Cigar::CLIPPED); + OE.push_back(ins_open < ins_extend ? Cigar::INSERTION : Cigar::MATCH); + OF.push_back(Cigar::CLIPPED); + + PS.push_back(Extender::NONE); + PF.push_back(Extender::NONE); + if (S.back() > 0) { + assert(S.back() == E.back()); + updated = true; + PS.back() = Extender::CUR; + OS.back() = Cigar::INSERTION; + xdrop_cutoff = std::max(xdrop_cutoff, S.back() - config_.xdrop); + } + } - __m256i update_gap_scores_orig = _mm256_loadu_si256((__m256i*)&update_gap_scores[i]); - __m256i update_gap_count_orig = _mm256_loadu_si256((__m256i*)&update_gap_count[i]); - __m128i update_gap_prevs_orig = mm_loadu_si64(&update_gap_prevs[i]); - __m256i gap_updated = _mm256_cmpgt_epi32(update_score, update_gap_scores_orig); - __m128i gap_updated_small = mm256_cvtepi32_epi8(gap_updated); + // make sure that the first operation taken matches the seed + std::ignore = graph_; + std::ignore = seed_; + assert(std::get<0>(prev_node) != graph_.max_index() + 1 || std::get<2>(prev_node) + || (offset <= 1 && S[1 - offset] == seed_.get_score() + && OS[1 - offset] == seed_.get_cigar().back().first + && PS[1 - offset] == Extender::PREV)); - update_score = _mm256_blendv_epi8(update_gap_scores_orig, update_score, gap_updated); - incoming_count = _mm256_blendv_epi8(update_gap_count_orig, incoming_count, gap_updated); - update_gap_prev = _mm_blendv_epi8(update_gap_prevs_orig, update_gap_prev, gap_updated_small); + return updated; +} - // compute score for cell update. check if deleting improves the update - __m256i update_cmp = _mm256_cmpgt_epi32(update_score, H); - H = _mm256_max_epi32(H, update_score); +template +void backtrack(const Table &table_, + const Alignment &seed_, + const DeBruijnGraph &graph_, + const DBGAlignerConfig &config_, + score_t min_path_score, + AlignNode best_node, + StartSet &prev_starts, + size_t size, + std::string_view extend_window_, + std::string_view query, + std::vector> &extensions) { + typedef DefaultColumnExtender Extender; + + Cigar cigar; + std::vector path; + std::string seq; + NodeType start_node = DeBruijnGraph::npos; + + assert(table_.count(std::get<0>(best_node))); + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] + = table_.find(std::get<0>(best_node))->second.first.at(std::get<2>(best_node)); + + score_t max_score = S[max_pos - offset]; + score_t score = max_score; + Cigar::Operator last_op = OS[max_pos - offset]; + assert(last_op == Cigar::MATCH); + + if (max_pos + 1 < size) + cigar.append(Cigar::CLIPPED, size - max_pos - 1); + + size_t pos = max_pos; + while (true) { + assert(table_.count(std::get<0>(best_node))); + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] + = table_.find(std::get<0>(best_node))->second.first.at(std::get<2>(best_node)); + + prev_starts.emplace(best_node); + + assert(last_op == Cigar::MATCH || last_op == Cigar::MISMATCH); + last_op = OS[pos - offset]; + + if (last_op == Cigar::CLIPPED || S[pos - offset] == 0) { + assert(S[pos - offset] == 0); + max_score = score; + break; + } else if (pos == 1 && last_op != Cigar::DELETION) { + score -= S[pos - offset]; + if (std::get<0>(prev) != graph_.max_index() + 1 + || std::get<0>(best_node) != seed_.back() + || std::get<2>(best_node) + || last_op != seed_.get_cigar().back().first) { + // last op in the seed was skipped + // TODO: reconstruct the entire alignment. for now, throw this out + return; + } else { + assert(seed_.get_score() == S[pos - offset]); + start_node = seed_.back(); + } + break; + } - // determine which indices satisfy the x-drop criteria - __m256i xdrop_cmp = _mm256_cmpgt_epi32(H, xdrop_cutoff); + switch (last_op) { + case Cigar::MATCH: + case Cigar::MISMATCH: { + cigar.append(last_op); + path.push_back(std::get<0>(best_node)); + seq += std::get<1>(best_node); + assert((last_op == Cigar::MATCH) + == (graph_.get_node_sequence(std::get<0>(best_node)).back() + == extend_window_[pos - 1])); + switch (PS[pos - offset]) { + case Extender::PREV: { best_node = prev; } break; + case Extender::CUR: {} break; + case Extender::NONE: { assert(false); } + } + --pos; + assert(pos); + } break; + case Cigar::INSERTION: { + assert(PS[pos - offset] == Extender::CUR); + while (last_op == Cigar::INSERTION) { + last_op = OE[pos - offset]; + assert(last_op == Cigar::MATCH || last_op == Cigar::INSERTION); + cigar.append(Cigar::INSERTION); + --pos; + assert(pos); + } + } break; + case Cigar::DELETION: { + while (last_op == Cigar::DELETION) { + assert(table_.count(std::get<0>(best_node))); + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] + = table_.find(std::get<0>(best_node))->second.first.at(std::get<2>(best_node)); + last_op = OF[pos - offset]; + assert(last_op == Cigar::MATCH || last_op == Cigar::DELETION); + path.push_back(std::get<0>(best_node)); + seq += std::get<1>(best_node); + cigar.append(Cigar::DELETION); + switch (PF[pos - offset]) { + case Extender::PREV: { best_node = prev; } break; + case Extender::CUR: {} break; + case Extender::NONE: { assert(false); } + } + prev_starts.emplace(best_node); + } + } break; + case Cigar::CLIPPED: { assert(false); } + } - // revert values not satisfying the x-drop criteria - H = _mm256_blendv_epi8(H_orig, H, xdrop_cmp); + assert(pos); + } - __m256i both_cmp = _mm256_cmpgt_epi32(H, H_orig); - if (!_mm256_movemask_epi8(both_cmp)) - continue; + if (max_score < min_path_score) + return; - __m128i both_cmp_small = mm256_cvtepi32_epi8(both_cmp); - __m128i update_cmp_small = mm256_cvtepi32_epi8(update_cmp); + if (pos > 1) + cigar.append(Cigar::CLIPPED, pos - 1); - // update scores - _mm256_maskstore_epi32(&update_scores[i], both_cmp, H); + std::reverse(cigar.begin(), cigar.end()); + std::reverse(path.begin(), path.end()); + std::reverse(seq.begin(), seq.end()); - _mm256_storeu_si256((__m256i*)&update_gap_scores[i], - _mm256_blendv_epi8(update_gap_scores_orig, update_score, both_cmp)); - _mm256_storeu_si256((__m256i*)&update_gap_count[i], - _mm256_blendv_epi8(update_gap_count_orig, incoming_count, both_cmp)); + Alignment extension( + { extend_window_.data() + pos, max_pos - pos }, + std::move(path), std::move(seq), score, std::move(cigar), + 0, seed_.get_orientation(), graph_.get_k() - 1 + ); - update_gap_prev = _mm_blendv_epi8(update_gap_prevs_orig, update_gap_prev, both_cmp_small); - mm_storeu_si64(&update_gap_prevs[i], update_gap_prev); + std::ignore = config_; + assert(extension.is_valid(graph_, &config_)); + extension.extend_query_end(query.data() + query.size()); - __m128i update_op = _mm_blendv_epi8(mm_loadu_si64(&profile_ops[i]), del_p, update_cmp_small); - mm_maskstorel_epi8(&update_ops[i], both_cmp_small, update_op); + if (start_node) { + auto next_path = seed_; + next_path.append(std::move(extension)); + next_path.trim_offset(); + assert(next_path.is_valid(graph_, &config_)); - __m128i updated_mask_orig = mm_loadu_si64(&updated_mask[i]); - mm_storeu_si64(&updated_mask[i], _mm_or_si128(updated_mask_orig, both_cmp_small)); + DEBUG_LOG("Alignment (extended): {}", next_path); + extensions.emplace_back(std::move(next_path)); + } else { + extension.extend_query_begin(query.data()); + extension.trim_offset(); + assert(extension.is_valid(graph_, &config_)); - __m128i update_prev = _mm_blendv_epi8(prev_node, update_gap_prev, update_cmp_small); - mm_maskstorel_epi8((int8_t*)&update_prevs[i], both_cmp_small, update_prev); + DEBUG_LOG("Alignment (trim seed): {}", extension); + extensions.emplace_back(std::move(extension)); } } -#endif - -// direct translation of compute_HE_avx2 to scalar code -inline void compute_HE(size_t length, - uint8_t prev_node, - int32_t gap_opening_penalty, - int32_t gap_extension_penalty, - int32_t *update_scores, - int32_t *update_gap_scores, - uint8_t *update_prevs, - Cigar::Operator *update_ops, - int32_t *update_gap_count, - uint8_t *update_gap_prevs, - const int32_t *incoming_scores, - const int32_t *incoming_gap_scores, - const int8_t *profile_scores, - const Cigar::Operator *profile_ops, - const int32_t *incoming_gap_count, - int8_t *updated_mask, - int32_t xdrop_cutoff) { - // round to cover the same address space as the AVX2 version - for (size_t j = 1; j < length; j += 8) { - for (size_t i = j; i < j + 8; ++i) { - // store score updates - // load previous values for cells to update - int32_t H_orig = update_scores[i]; - - // compute match score - int32_t incoming_p = incoming_scores[i - 1]; - int32_t match_score = incoming_p + profile_scores[i]; - - // compute score for cell update - int32_t H = std::max(H_orig, match_score); - - // compute deletion score - int32_t update_score_open = incoming_scores[i] + gap_opening_penalty; - int32_t update_score_extend = incoming_gap_scores[i] + gap_extension_penalty; - int32_t update_score = std::max(update_score_open, update_score_extend); - int8_t update_gap_prev = prev_node; - - // compute updated gap size count - int32_t incoming_count = 1 + (update_score == update_score_extend - ? incoming_gap_count[i] - : 0); - - if (update_score <= update_gap_scores[i]) { - update_score = update_gap_scores[i]; - incoming_count = update_gap_count[i]; - update_gap_prev = update_gap_prevs[i]; - } +template +auto DefaultColumnExtender::get_extensions(score_t min_path_score) + -> std::vector { + const char *align_start = seed_->get_query().data() + seed_->get_query().size() - 1; + size_t start = align_start - query_.data(); + size_t size = query_.size() - start + 1; + assert(start + size == partial_sums_.size()); + match_score_begin_ = partial_sums_.data() + start; + + extend_window_ = { align_start, size - 1 }; + DEBUG_LOG("Extend query window: {}", extend_window_); + assert(extend_window_[0] == seed_->get_query().back()); + + auto &first_column = table_.emplace( + graph_.max_index() + 1, + Column{ { std::make_tuple( + ScoreVec(1, ninf), ScoreVec(1, ninf), ScoreVec(1, ninf), + OpVec(1, Cigar::CLIPPED), OpVec(1, Cigar::CLIPPED), OpVec(1, Cigar::CLIPPED), + AlignNode{}, PrevVec(1, NONE), PrevVec(1, NONE), + 0 /* offset */, 0 /* max_pos */ + ) }, false } + ).first.value().first[0]; + sanitize(first_column); + auto &[S, E, F, OS, OE, OF, prev_node, PS, PF, offset, max_pos] = first_column; + + size_t num_columns = 1; + constexpr size_t column_vector_size = sizeof(std::pair>); + + auto get_column_size = [&](const Scores &scores) { + size_t size = std::get<0>(scores).capacity(); + return sizeof(Scores) + size * ( + sizeof(score_t) * 3 + sizeof(Cigar::Operator) * 3 + sizeof(NodeId) * 2 + ); + }; + size_t total_size = column_vector_size + get_column_size(first_column); - // compute score for cell update. check if deleting improves the update - int32_t update_cmp = update_score > H ? 0xFFFFFFFF : 0x0; - H = std::max(H, update_score); + S[0] = seed_->get_score() - profile_score_[seed_->get_sequence().back()][start + 1]; - // determine which indices satisfy the x-drop criteria - int32_t xdrop_cmp = H > xdrop_cutoff ? 0xFFFFFFFF : 0x0; + AlignNode start_node{ graph_.max_index() + 1, + seed_->get_sequence()[seed_->get_sequence().size() - 2], + 0, 0 }; - // revert values not satisfying the x-drop criteria - if (!xdrop_cmp) - H = H_orig; + typedef std::pair Ref; + Ref best_start{ start_node, S[0] }; + std::vector starts; - int32_t both_cmp = H > H_orig ? 0xFFFFFFFF : 0x0; - if (!both_cmp) - continue; + std::priority_queue, utils::LessSecond> stack; + stack.emplace(start_node, S[0]); - // update scores - update_scores[i] = H; - update_gap_scores[i] = update_score; - update_gap_count[i] = incoming_count; - update_gap_prevs[i] = update_gap_prev; + while (stack.size()) { + AlignNode prev = stack.top().first; + stack.pop(); - update_ops[i] = update_cmp ? Cigar::DELETION : profile_ops[i]; - updated_mask[i] = 0xFF; - update_prevs[i] = update_cmp ? update_gap_prev : prev_node; + if (static_cast(total_size) / 1000000 + > config_.max_ram_per_alignment) { + DEBUG_LOG("Alignment RAM limit reached, stopping extension"); + break; } - } -} - -template ::score_t> -inline void compute_updates(Column &update_column, - size_t begin, - const DBGAlignerConfig &config, - const NodeType &prev_node, - const NodeType &node, - const score_t *incoming_scores, - const score_t *incoming_gap_scores, - const int32_t *incoming_gap_count, - const int8_t *profile_scores, - const Cigar::Operator *profile_ops, - AlignedVector &updated_mask, - size_t length, - score_t xdrop_cutoff) { - assert(length); - size_t shift = update_column.start_index; - score_t *update_scores = update_column.scores.data() + begin - shift; - score_t *update_gap_scores = update_column.gap_scores.data() + begin - shift; - uint8_t *update_prevs = update_column.prev_nodes.data() + begin - shift; - int32_t *update_gap_count = update_column.gap_count.data() + begin - shift; - uint8_t *update_gap_prevs = update_column.gap_prev_nodes.data() + begin - shift; - Cigar::Operator *update_ops = update_column.ops.data() + begin - shift; - - // handle first element (i.e., no match update possible) - score_t update_score = std::max( - incoming_scores[0] + config.gap_opening_penalty, - incoming_gap_scores[0] + config.gap_extension_penalty - ); - uint8_t prev_node_rank = prev_node != node - ? update_column.rank_prev_node(prev_node) - : 0xFF; - - if (update_score >= xdrop_cutoff && update_score > update_scores[0]) { - update_scores[0] = update_score; - update_ops[0] = Cigar::DELETION; - update_prevs[0] = prev_node_rank; - updated_mask[0] = 0xFF; - - update_gap_scores[0] = update_score; - update_gap_prevs[0] = prev_node_rank; - update_gap_count[0] = update_score == incoming_gap_scores[0] + config.gap_extension_penalty - ? incoming_gap_count[0] + 1 - : 1; - } - - // ensure sizes are the same before casting - static_assert(sizeof(Cigar::Operator) == sizeof(int8_t)); - - auto update_block = [&]() { - compute_HE(length, prev_node_rank, - config.gap_opening_penalty, config.gap_extension_penalty, - update_scores, update_gap_scores, - update_prevs, - update_ops, update_gap_count, - update_gap_prevs, - incoming_scores, incoming_gap_scores, - profile_scores, profile_ops, - incoming_gap_count, updated_mask.data(), xdrop_cutoff - 1); - }; - -#ifdef __AVX2__ - - if (prev_node != node) { - // update 8 scores at a time - compute_HE_avx2(length, - _mm_set1_epi8(prev_node_rank), - _mm256_set1_epi32(config.gap_opening_penalty), - _mm256_set1_epi32(config.gap_extension_penalty), - update_scores, update_gap_scores, - update_prevs, - reinterpret_cast(update_ops), - update_gap_count, - update_gap_prevs, - incoming_scores, incoming_gap_scores, - profile_scores, reinterpret_cast(profile_ops), - incoming_gap_count, updated_mask.data(), - _mm256_set1_epi32(xdrop_cutoff - 1)); - } else { - update_block(); - } - -#else - - update_block(); + if (static_cast(num_columns) / extend_window_.size() + > config_.max_nodes_per_seq_char) { + DEBUG_LOG("Alignment node limit reached, stopping extension"); + break; + } -#endif + size_t next_distance_from_origin = std::get<3>(prev) + 1; - update_ins_scores(config, update_scores, update_prevs, update_ops, - updated_mask.data(), length, xdrop_cutoff); -} + for (const auto &[next, c] : get_outgoing(prev)) { + auto &column_pair = table_[next]; + auto &[column, converged] = column_pair; + if (converged) + continue; + assert(table_.count(std::get<0>(prev))); + auto &column_prev = table_[std::get<0>(prev)].first; -template -void DefaultColumnExtender::operator()(ExtensionCallback callback, - score_t min_path_score) { - assert(columns_to_update.empty()); + score_t xdrop_cutoff = best_start.second - config_.xdrop; - const auto &path = get_seed(); + // compute bandwidth based on xdrop criterion + auto [min_i, max_i] = get_band(prev, column_prev, xdrop_cutoff); + if (min_i >= max_i) + continue; - if (!graph_.outdegree(path.back())) { - callback(DBGAlignment(), NodeType()); - return; - } + max_i = std::min(max_i + 1, size); - // stop path early if it can't be better than the min_path_score - if (path.get_score() + match_score_begin[1] < min_path_score) - return; + size_t depth = column.size(); + size_t cur_size = max_i - min_i; - size_t query_clipping = path.get_clipping() + path.get_query().size() - 1; - - if (dp_table.size() - && query_clipping >= dp_table.get_query_offset() - && std::all_of(path.begin(), path.end(), - [&](auto i) { return dp_table.find(i) != dp_table.end(); })) { - auto find = dp_table.find(path.back()); - size_t shift = find->second.start_index; - if (query_clipping - dp_table.get_query_offset() >= shift - && query_clipping - dp_table.get_query_offset() - shift < find->second.scores.size() - && find->second.scores[query_clipping - dp_table.get_query_offset() - shift] - >= path.get_score()) { - return; - } - } + Scores next_column(ScoreVec(cur_size, ninf), ScoreVec(cur_size, ninf), + ScoreVec(cur_size, ninf), OpVec(cur_size, Cigar::CLIPPED), + OpVec(cur_size, Cigar::CLIPPED), + OpVec(cur_size, Cigar::CLIPPED), + prev, PrevVec(cur_size, NONE), PrevVec(cur_size, NONE), + min_i /* offset */, 0 /* max_pos */); + sanitize(next_column); - reset(); - if (!add_seed(query_clipping)) - return; + bool updated = update_column( + graph_, config_, column_prev, next_column, c, start, size, + xdrop_cutoff, profile_score_, profile_op_, *seed_ + ); + sanitize(next_column); - start_score = dp_table.find(start_node)->second.best_score(); - score_cutoff = std::max(start_score, min_path_score); - xdrop_cutoff = score_cutoff - config_.xdrop; + auto &[S, E, F, OS, OE, OF, prev_node, PS, PF, offset, max_pos] = next_column; - if (xdrop_cutoff > start_score) - return; + auto max_it = std::max_element(S.begin(), S.end()); + max_pos = (max_it - S.begin()) + offset; + assert(max_pos < size); - begin = 0; - end = size; + converged = !updated || has_converged(column_pair, next_column); - check_and_push(ColumnRef(start_node, start_score, false)); + const score_t *match = &match_score_begin_[offset]; + bool extendable = false; + for (size_t i = 0; i < S.size() && !extendable; ++i) { + if (S[i] >= 0 && S[i] + match[i] >= min_path_score) + extendable = true; + } - if (columns_to_update.empty()) - return; + bool add_to_table = false; + AlignNode cur{ next, c, depth, next_distance_from_origin }; + if (OS[max_pos - offset] == Cigar::MATCH && *max_it > best_start.second) { + best_start.first = cur; + best_start.second = *max_it; + add_to_table = true; + } - max_num_nodes = std::numeric_limits::max(); - if (config_.max_nodes_per_seq_char < std::numeric_limits::max()) { - max_num_nodes = std::min(max_num_nodes, - static_cast(std::ceil(config_.max_nodes_per_seq_char - * static_cast(size))) - ); - } + assert(xdrop_cutoff == best_start.second - config_.xdrop); - if (ram_limit_reached()) { - logger->warn("Alignment RAM limit too low: {} MB > {} MB. Alignment may be fragmented.", - static_cast(dp_table.num_bytes()) / 1024 / 1024, - config_.max_ram_per_alignment); - } + if (*max_it >= xdrop_cutoff && extendable) { + stack.emplace(cur, *max_it); + add_to_table = true; + } - extend_main(callback, min_path_score); -} + if (add_to_table) { + total_size += get_column_size(next_column) + (!depth * column_vector_size); + ++num_columns; + if (OS[max_pos - offset] == Cigar::MATCH) + starts.emplace_back(cur, *max_it); -template -std::deque> DefaultColumnExtender -::fork_extension(NodeType node, ExtensionCallback, score_t) { - std::deque> out_columns; - - if (const auto *dbg_succ = dynamic_cast(&graph_)) { - // If an outgoing node is already in the DPTable, then there's no need - // to decode the last character of that node. - const auto &boss = dbg_succ->get_boss(); - graph_.adjacent_outgoing_nodes(node, [&](auto next_node) { - auto find = dp_table.find(next_node); - char c = find == dp_table.end() - ? boss.decode( - boss.get_W(dbg_succ->kmer_to_boss_index(next_node)) % boss.alph_size) - : find->second.last_char; - if (c != boss::BOSS::kSentinel - && ((dp_table.size() < max_num_nodes && !ram_limit_reached()) - || find != dp_table.end())) { - if (next_node == node) { - out_columns.emplace_front(next_node, c); - } else { - out_columns.emplace_back(next_node, c); - } + column.emplace_back(std::move(next_column)); + } else if (!depth) { + table_.erase(next); } - }); - } else { - graph_.call_outgoing_kmers(node, [&](auto next_node, char c) { - if (c != boss::BOSS::kSentinel - && ((dp_table.size() < max_num_nodes && !ram_limit_reached()) - || dp_table.count(next_node))) { - if (next_node == node) { - out_columns.emplace_front(next_node, c); - } else { - out_columns.emplace_back(next_node, c); - } - } - }); + } } - return out_columns; -} + std::sort(starts.begin(), starts.end(), utils::GreaterSecond()); + assert(starts.empty() || starts[0].second == best_start.second); -template -void DefaultColumnExtender::extend_main(ExtensionCallback callback, - score_t min_path_score) { - assert(start_score == dp_table.best_score().second); + struct AlignNodeHash { + uint64_t operator()(const AlignNode &x) const { + uint64_t seed = hasher1(std::get<0>(x)); + return seed ^ (hasher2(std::get<2>(x)) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); + } - while (columns_to_update.size()) { - ColumnRef top = columns_to_update.top(); - columns_to_update.pop(); + std::hash hasher1; + std::hash hasher2; + }; - NodeType node = std::get<0>(top); - score_t best_score_update = std::get<1>(top); - const auto &cur_col = dp_table.find(node)->second; + tsl::hopscotch_set prev_starts; - // if this happens, then it means that the column was in the priority - // queue multiple times, so we don't need to consider it again - if (best_score_update != cur_col.last_priority_value()) + std::vector extensions; + for (const auto &[best_node, max_score] : starts) { + if (prev_starts.count(best_node)) continue; - overlapping_range_ = false; - auto out_columns = fork_extension(node, callback, min_path_score); - - assert(std::all_of(out_columns.begin(), out_columns.end(), [&](const auto &pair) { - return graph_.traverse(node, pair.second) == pair.first; - })); + assert(table_.count(std::get<0>(best_node))); + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] + = table_[std::get<0>(best_node)].first.at(std::get<2>(best_node)); - bool ram_limit_reached_before = ram_limit_reached(); - update_columns(node, out_columns, min_path_score); + assert(S[max_pos - offset] == max_score); - if (!ram_limit_reached_before && ram_limit_reached()) { - logger->warn("Alignment RAM limit too low: {} MB > {} MB. Alignment may be fragmented.", - static_cast(dp_table.num_bytes()) / 1024 / 1024, - config_.max_ram_per_alignment); + if (max_pos < 2 && std::get<0>(best_node) == seed_->back() + && !std::get<2>(best_node)) { + if (seed_->get_score() >= min_path_score) { + DEBUG_LOG("Alignment (seed): {}", *seed_); + extensions.emplace_back(*seed_); + extensions.back().extend_query_end(query_.data() + query_.size()); + extensions.back().trim_offset(); + assert(extensions.back().is_valid(graph_, &config_)); + } + } else { + assert(OS[max_pos - offset] == Cigar::MATCH); + backtrack(table_, *seed_, graph_, config_, min_path_score, best_node, + prev_starts, size, extend_window_, query_, extensions); } - } -#ifndef NDEBUG - logger->trace("Extension completed:\tquery size:\t{}\tseed size:\t{}\texplored nodes:\t{}", - query.size(), path_->size(), dp_table.size()); -#endif - - assert(start_score > config_.min_cell_score); + assert(extensions.size() < 2 + || extensions.back().get_score() <= extensions[extensions.size() - 2].get_score()); - // no good path found - if (start_node == SequenceGraph::npos - || start_score == get_seed().get_score() - || score_cutoff > start_score) { - reset(); - callback(Alignment(), NodeType()); - return; + if (extensions.size() == config_.num_alternative_paths) + break; } - // check to make sure that start_node stores the best starting point - assert(start_score == dp_table.best_score().second); - - if (dp_table.find(start_node)->second.best_op() != Cigar::MATCH) - logger->trace("best alignment does not end with a MATCH"); - - // get all alignments - dp_table.extract_alignments(graph_, config_, extend_window_, callback, - min_path_score, get_seed(), &start_node); + return extensions; } template void DefaultColumnExtender -::update_columns(NodeType incoming_node, - const std::deque> &out_columns, - score_t min_path_score) { - if (out_columns.empty()) - return; - - // set boundaries for vertical band - auto incoming_find = dp_table.find(incoming_node); - - if (dp_table.size() == 1 && out_columns.size() && out_columns.front().first != incoming_node) { - dp_table.expand_to_cover(incoming_find, 0, size); - update_ins_scores(config_, - incoming_find.value().scores.data(), - incoming_find.value().prev_nodes.data(), - incoming_find.value().ops.data(), - nullptr, - size, - xdrop_cutoff); - } - - std::tie(begin, end) = get_column_boundaries(dp_table, incoming_find, - xdrop_cutoff, config_); - - if (begin >= end) - return; - - for (const auto &[next_node, c] : out_columns) { - auto emplace = emplace_node(next_node, incoming_node, c, size, begin, begin, begin, end); - - // emplace_node may have invalidated incoming, so update the iterator - if (emplace.second) - incoming_find = dp_table.find(incoming_node); - - auto &incoming = incoming_find.value(); - auto &next_column = emplace.first.value(); - - assert(begin >= next_column.start_index); - assert(begin >= incoming.start_index); - assert(next_column.start_index + next_column.scores.size() >= end); - assert(incoming.start_index + incoming.scores.size() >= end); - - // store the mask indicating which cells were updated - // this is padded to ensure that the vectorized code doesn't access - // out of bounds - AlignedVector updated_mask(end - begin + 9, 0x0); - - assert(next_column.scores.size() == next_column.gap_scores.size()); - assert(incoming.scores.size() == incoming.gap_scores.size()); - - size_t shift = incoming.start_index; - compute_updates( - next_column, - begin, - config_, - incoming_node, - next_node, - incoming.scores.data() + begin - shift, - incoming.gap_scores.data() + begin - shift, - incoming.gap_count.data() + begin - shift, - profile_score[next_column.last_char].data() + query.size() - size + begin, - profile_op[next_column.last_char].data() + query.size() - size + begin, - updated_mask, - end - begin, - xdrop_cutoff - ); - - shift = next_column.start_index; - - // Find the maximum changed value - const score_t *best_update = nullptr; - size_t best_pos = -1; - for (size_t i = begin, j = 0; i < size && j < updated_mask.size(); ++i, ++j) { - if (updated_mask[j] && (!best_update || next_column.scores[i - shift] > *best_update)) { - best_update = &next_column.scores[i - shift]; - best_pos = i; - assert(i >= shift); - assert(i < next_column.size()); +::call_visited_nodes(const std::function &callback) const { + size_t window_start = extend_window_.data() - query_.data(); + for (const auto &[node, columns] : table_) { + size_t start = query_.size(); + size_t end = 0; + size_t start_distance_from_origin = 0; + for (const auto &column : columns.first) { + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] = column; + + auto it = std::find_if(S.begin(), S.end(), [](score_t s) { return s > 0; }); + auto rit = std::find_if(S.rbegin(), S.rend(), [](score_t s) { return s > 0; }); + size_t start_c = (it - S.begin()) + offset; + size_t end_c = (S.rend() - rit) + offset; + + size_t prev_distance_from_origin = std::get<3>(prev); + + if (start_c) + --start_c; + + if (start_c < start) { + start = start_c; + start_distance_from_origin = prev_distance_from_origin + + (OS[it - S.begin()] != Cigar::INSERTION); } + + end = std::max(end, end_c); } - // put column back in the priority queue if it's updated - if (best_update) { - assert(updated_mask[best_pos - begin]); + if (start < end) { + assert(start_distance_from_origin); + --start_distance_from_origin; - next_column.last_priority_pos = best_pos; + size_t start_pos = window_start + start + 1 - graph_.get_k(); + if (start_distance_from_origin < seed_->get_offset()) + start_pos += seed_->get_offset() - start_distance_from_origin; - if (*best_update > next_column.best_score()) { - next_column.best_pos = next_column.last_priority_pos; + callback(node, start_pos, window_start + end); + } + } +} - assert(next_column.best_pos >= begin); - assert(next_column.best_pos < size); - } +template +bool DefaultColumnExtender::has_converged(const Column &column, + const Scores &next) { + if (column.second) + return true; + + if (column.first.empty()) + return false; + + const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] = next; + const auto &[S_b, E_b, F_b, OS_b, OE_b, OF_b, prev_b, PS_b, PF_b, offset_b, max_pos_b] + = column.first.back(); + + return offset == offset_b && max_pos == max_pos_b + && std::get<0>(prev) == std::get<0>(prev_b) + && S == S_b && E == E_b && F == F_b && OS == OS_b && OE == OE_b && OF == OF_b + && PS == PS_b && PF == PF_b; +} - assert(*best_update == next_column.best_score() - || next_column.best_pos < begin - || next_column.best_pos >= end - || !updated_mask[next_column.best_pos - begin]); - - // update global max score - if (*best_update > start_score) { - start_node = next_node; - start_score = *best_update; - xdrop_cutoff = std::max(start_score - config_.xdrop, xdrop_cutoff); - assert(start_score == dp_table.best_score().second); - score_cutoff = std::max(start_score, min_path_score); - } +template +void DefaultColumnExtender::sanitize(Scores &scores) { + auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] = scores; + + size_t size = S.size(); + size_t pad_size = ((size + 7) / 8) * 8 + 8; + size_t size_diff = pad_size - size; + + assert(size_diff); + + S.reserve(pad_size); + E.reserve(pad_size); + F.reserve(pad_size); + OS.reserve(pad_size); + OE.reserve(pad_size); + OF.reserve(pad_size); + PS.reserve(pad_size); + PF.reserve(pad_size); + + std::fill(&S[size], &S[size] + size_diff, ninf); + std::fill(&E[size], &E[size] + size_diff, ninf); + std::fill(&F[size], &F[size] + size_diff, ninf); + memset(&OS[size], 0, sizeof(typename decltype(OS)::value_type) * size_diff); + memset(&OE[size], 0, sizeof(typename decltype(OE)::value_type) * size_diff); + memset(&OF[size], 0, sizeof(typename decltype(OF)::value_type) * size_diff); + memset(&PS[size], 0, sizeof(typename decltype(PS)::value_type) * size_diff); + memset(&PF[size], 0, sizeof(typename decltype(PF)::value_type) * size_diff); +} - check_and_push(ColumnRef(next_node, *best_update, !overlapping_range_)); - } +template +std::vector> DefaultColumnExtender +::get_outgoing(const AlignNode &node) const { + std::vector> outgoing; + if (std::get<0>(node) == graph_.max_index() + 1) { + outgoing.emplace_back(seed_->back(), seed_->get_sequence().back()); + } else { + graph_.call_outgoing_kmers(std::get<0>(node), [&](NodeType next, char c) { + if (c != boss::BOSS::kSentinel) + outgoing.emplace_back(next, c); + }); } -} + return outgoing; +} template class DefaultColumnExtender<>; diff --git a/metagraph/src/graph/alignment/aligner_extender_methods.hpp b/metagraph/src/graph/alignment/aligner_extender_methods.hpp index 887f46c9cf..95c9e61cc7 100644 --- a/metagraph/src/graph/alignment/aligner_extender_methods.hpp +++ b/metagraph/src/graph/alignment/aligner_extender_methods.hpp @@ -1,14 +1,10 @@ #ifndef __DBG_ALIGNER_METHODS_HPP__ #define __DBG_ALIGNER_METHODS_HPP__ -#include - -#include +#include #include "aligner_alignment.hpp" -#include "aligner_dp_table.hpp" #include "common/aligned_vector.hpp" -#include "common/utils/template_utils.hpp" namespace mtg { @@ -24,16 +20,19 @@ class IExtender { typedef Alignment DBGAlignment; typedef typename DBGAlignment::node_index node_index; typedef typename DBGAlignment::score_t score_t; - typedef std::function ExtensionCallback; virtual ~IExtender() {} - virtual void - operator()(ExtensionCallback callback, - score_t min_path_score = std::numeric_limits::min()) = 0; + virtual std::vector + get_extensions(score_t min_path_score = std::numeric_limits::min()) = 0; virtual void initialize(const DBGAlignment &seed) = 0; + virtual void + call_visited_nodes(const std::function &callback) const = 0; + protected: virtual void reset() = 0; virtual const DBGAlignment& get_seed() const = 0; @@ -46,12 +45,12 @@ class DefaultColumnExtender : public IExtender { typedef typename IExtender::DBGAlignment DBGAlignment; typedef typename IExtender::node_index node_index; typedef typename IExtender::score_t score_t; - typedef typename IExtender::ExtensionCallback ExtensionCallback; - typedef std::tuple ColumnRef; - typedef boost::container::priority_deque, - utils::LessSecond> ColumnQueue; + enum NodeId : uint8_t { + NONE, + PREV, + CUR + }; DefaultColumnExtender(const DeBruijnGraph &graph, const DBGAlignerConfig &config, @@ -59,55 +58,43 @@ class DefaultColumnExtender : public IExtender { virtual ~DefaultColumnExtender() {} - virtual void - operator()(ExtensionCallback callback, - score_t min_path_score = std::numeric_limits::min()) override; + virtual std::vector + get_extensions(score_t min_path_score = std::numeric_limits::min()) override; virtual void initialize(const DBGAlignment &seed) override; - const DPTable& get_dp_table() const { return dp_table; } + virtual void + call_visited_nodes(const std::function &callback) const override; protected: const DeBruijnGraph &graph_; const DBGAlignerConfig &config_; - std::string_view query; - - // keep track of which columns to use next - ColumnQueue columns_to_update; + std::string_view query_; - DPTable dp_table; + typedef std::tuple AlignNode; - virtual void reset() override { dp_table.clear(); } + typedef AlignedVector ScoreVec; + typedef AlignedVector PrevVec; + typedef AlignedVector OpVec; + typedef std::tuple Scores; + typedef std::pair, bool> Column; - virtual std::pair::iterator, bool> - emplace_node(NodeType node, - NodeType incoming_node, - char c, - size_t size, - size_t best_pos = 0, - size_t last_priority_pos = 0, - size_t begin = 0, - size_t end = std::numeric_limits::max()); + tsl::hopscotch_map table_; - virtual bool add_seed(size_t clipping); + virtual void reset() override { table_.clear(); } - virtual const DBGAlignment& get_seed() const override { return *path_; } + virtual const DBGAlignment& get_seed() const override { return *seed_; } - virtual void check_and_push(ColumnRef&& next_column); - - void extend_main(ExtensionCallback callback, score_t min_path_score); - - void update_columns(NodeType incoming_node, - const std::deque> &out_columns, - score_t min_path_score); - - virtual std::deque> - fork_extension(NodeType /* fork after this node */, ExtensionCallback, score_t); - - inline bool ram_limit_reached() const { - return static_cast(dp_table.num_bytes()) / 1024 / 1024 - > config_.max_ram_per_alignment; - } + virtual std::vector> get_outgoing(const AlignNode &node) const; private: // compute perfect match scores for all suffixes @@ -115,27 +102,20 @@ class DefaultColumnExtender : public IExtender { std::vector partial_sums_; // a quick lookup table of char pair match/mismatch scores for the current query - tsl::hopscotch_map> profile_score; - tsl::hopscotch_map> profile_op; + tsl::hopscotch_map> profile_score_; + tsl::hopscotch_map> profile_op_; // the initial seed - const DBGAlignment *path_; + const DBGAlignment *seed_; std::string_view extend_window_; - // max size of a column - size_t size; - // start of the partial sum table - const score_t *match_score_begin; - NodeType start_node; - score_t start_score; - score_t score_cutoff; - size_t begin; - size_t end; - score_t xdrop_cutoff; - bool overlapping_range_; - size_t max_num_nodes; + const score_t *match_score_begin_; + + static bool has_converged(const Column &column, const Scores &next); + + static void sanitize(Scores &scores); }; } // namespace align diff --git a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp index e37cb05f6b..82ebab472e 100644 --- a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp @@ -76,7 +76,7 @@ auto ExactSeeder::get_seeds() const -> std::vector { score_t match_score = partial_sum[i + k] - partial_sum[i]; if (match_score > config.min_cell_score) { - seeds.emplace_back(std::string_view(query.data() + i, k), + seeds.emplace_back(query.substr(i, k), std::vector{ query_nodes[i] }, match_score, i, orientation); assert(seeds.back().is_valid(graph, &config)); @@ -166,7 +166,7 @@ auto SuffixSeeder::get_seeds() const -> std::vector { auto append_suffix_seed = [&](size_t i, node_index alt_node, size_t seed_length) { assert(i < suffix_seeds.size()); - std::string_view seed_seq(this->query_.data() + i, seed_length); + std::string_view seed_seq = this->query_.substr(i, seed_length); DBGAlignerConfig::score_t match_score = this->config_.match_score(seed_seq); if (match_score <= this->config_.min_cell_score) @@ -184,8 +184,12 @@ auto SuffixSeeder::get_seeds() const -> std::vector { for (size_t i = 0; i < min_seed_length.size(); ++i) { if (i >= this->query_nodes_.size() || !this->query_nodes_[i]) { if (i) { - min_seed_length[i] = std::max(min_seed_length[i - 1] - 1, - min_seed_length[i]); + size_t new_min_seed_length = std::max(min_seed_length[i - 1] - 1, + min_seed_length[i]); + if (new_min_seed_length > min_seed_length[i]) { + min_seed_length[i] = new_min_seed_length; + suffix_seeds[i].clear(); + } } size_t max_seed_length = std::min({ this->config_.max_seed_length, @@ -193,8 +197,11 @@ auto SuffixSeeder::get_seeds() const -> std::vector { this->query_.size() - i }); if (max_seed_length >= min_seed_length[i]) { dbg_succ_.call_nodes_with_suffix_matching_longest_prefix( - std::string_view(this->query_.data() + i, max_seed_length), + this->query_.substr(i, max_seed_length), [&](node_index alt_node, size_t seed_length) { + if (seed_length > min_seed_length[i]) + suffix_seeds[i].clear(); + min_seed_length[i] = seed_length; append_suffix_seed(i, alt_node, seed_length); }, @@ -202,8 +209,12 @@ auto SuffixSeeder::get_seeds() const -> std::vector { ); if (i + 1 < min_seed_length.size()) { - min_seed_length[i + 1] = std::max(min_seed_length[i + 1], - min_seed_length[i]); + size_t new_min_seed_length = std::max(min_seed_length[i + 1], + min_seed_length[i]); + if (new_min_seed_length > min_seed_length[i + 1]) { + min_seed_length[i + 1] = new_min_seed_length; + suffix_seeds[i + 1].clear(); + } } } } @@ -212,6 +223,7 @@ auto SuffixSeeder::get_seeds() const -> std::vector { const auto *canonical = dynamic_cast(&this->graph_); if (canonical) { // find sub-k matches in the reverse complement + // TODO: find sub-k seeds which are sink tips in the underlying graph std::string query_rc(this->query_); reverse_complement(query_rc.begin(), query_rc.end()); @@ -237,7 +249,7 @@ auto SuffixSeeder::get_seeds() const -> std::vector { size_t j_max = query_rc.size() - i - this->config_.min_seed_length; // skip over positions which have better matches - while (min_seed_length[j_min] > max_seed_length && j_min <= j_max) { + while (j_min <= j_max && min_seed_length[j_min] > max_seed_length) { ++j_min; --max_seed_length; } @@ -261,7 +273,10 @@ auto SuffixSeeder::get_seeds() const -> std::vector { continue; } - min_seed_length[j] = seed_length; + if (seed_length > min_seed_length[j]) { + min_seed_length[j] = seed_length; + suffix_seeds[j].clear(); + } // clear out shorter seeds size_t s = seed_length; @@ -292,6 +307,8 @@ auto SuffixSeeder::get_seeds() const -> std::vector { if (pos_seeds.empty()) continue; + assert(std::equal(pos_seeds.begin() + 1, pos_seeds.end(), pos_seeds.begin())); + if (!pos_seeds[0].get_offset()) { assert(min_seed_length[i] == this->graph_.get_k()); assert(pos_seeds.size() == 1); @@ -360,23 +377,23 @@ auto MEMSeeder::get_seeds() const -> std::vector { size_t mem_length = (next - it) + k - 1; assert(i + mem_length <= this->query_.size()); - const char *begin_it = this->query_.data() + i; - const char *end_it = begin_it + mem_length; + if (mem_length >= this->config_.min_seed_length) { + const char *begin_it = this->query_.data() + i; + const char *end_it = begin_it + mem_length; - assert(end_it >= begin_it + this->config_.min_seed_length); + score_t match_score = this->partial_sum_[end_it - this->query_.data()] + - this->partial_sum_[i]; - score_t match_score = this->partial_sum_[end_it - this->query_.data()] - - this->partial_sum_[i]; + auto node_begin_it = this->query_nodes_.begin() + i; + auto node_end_it = node_begin_it + (next - it); + assert(std::find(node_begin_it, node_end_it, DeBruijnGraph::npos) == node_end_it); - auto node_begin_it = this->query_nodes_.begin() + i; - auto node_end_it = node_begin_it + (next - it); - assert(std::find(node_begin_it, node_end_it, DeBruijnGraph::npos) == node_end_it); - - if (match_score > this->config_.min_cell_score) { - seeds.emplace_back(std::string_view(begin_it, mem_length), - std::vector{ node_begin_it, node_end_it }, - match_score, i,this->orientation_); - assert(seeds.back().is_valid(this->graph_, &this->config_)); + if (match_score > this->config_.min_cell_score) { + seeds.emplace_back(std::string_view(begin_it, mem_length), + std::vector{ node_begin_it, node_end_it }, + match_score, i,this->orientation_); + assert(seeds.back().is_valid(this->graph_, &this->config_)); + } } it = next; diff --git a/metagraph/src/graph/alignment/dbg_aligner.cpp b/metagraph/src/graph/alignment/dbg_aligner.cpp index 038b6ba4f5..1f0e2e5c0d 100644 --- a/metagraph/src/graph/alignment/dbg_aligner.cpp +++ b/metagraph/src/graph/alignment/dbg_aligner.cpp @@ -36,77 +36,92 @@ template void SeedAndExtendAlignerCore ::align_core(std::string_view query, const ISeeder &seeder, - IExtender&& extender, + IExtender &extender, const LocalAlignmentCallback &callback, const MinScoreComputer &get_min_path_score) const { - for (auto &seed : seeder.get_seeds()) { -#ifndef NDEBUG - mtg::common::logger->trace("Seed: {}", seed); -#endif + bool filter_seeds = dynamic_cast*>(&seeder); + + std::vector seeds = seeder.get_seeds(); + std::sort(seeds.begin(), seeds.end(), LocalAlignmentGreater()); + + for (DBGAlignment &seed : seeds) { score_t min_path_score = get_min_path_score(seed); + // check if this seed has been explored before in an alignment and discard + // it if so + if (filter_seeds) { + size_t found_count = 0; + std::pair idx_range { + seed.get_clipping(), + seed.get_clipping() + graph_.get_k() - seed.get_offset() + }; + for (node_index node : seed) { + auto emplace = visited_nodes_.emplace(node, idx_range); + auto &range = emplace.first.value(); + if (emplace.second) { + } else if (range.first > idx_range.first || range.second < idx_range.second) { + DEBUG_LOG("Node: {}; Prev_range: [{},{})", node, range.first, range.second); + range.first = std::min(range.first, idx_range.first); + range.second = std::max(range.second, idx_range.second); + DEBUG_LOG("Node: {}; cur_range: [{},{})", node, range.first, range.second); + } else { + ++found_count; + } + + if (idx_range.second - idx_range.first == graph_.get_k()) + ++idx_range.first; + + ++idx_range.second; + } + + if (found_count == seed.size()) { + DEBUG_LOG("Skipping seed: {}", seed); + continue; + } + } + if (seed.get_query().data() + seed.get_query().size() == query.data() + query.size()) { if (seed.get_score() >= min_path_score) { seed.trim_offset(); assert(seed.is_valid(graph_, &config_)); -#ifndef NDEBUG - mtg::common::logger->trace("Alignment: {}", seed); -#endif + DEBUG_LOG("Alignment: {}", seed); callback(std::move(seed)); } continue; } - bool extended = false; + DEBUG_LOG("Min path score: {}\tSeed: {}", min_path_score, seed); + extender.initialize(seed); - extender([&](DBGAlignment&& extension, auto start_node) { - if (!start_node && !extended) { - // no good extension found - if (seed.get_score() >= min_path_score) { - seed.extend_query_end(query.data() + query.size()); - seed.trim_offset(); - assert(seed.is_valid(graph_, &config_)); -#ifndef NDEBUG - mtg::common::logger->trace("Alignment: {}", seed); -#endif - callback(std::move(seed)); + auto extensions = extender.get_extensions(min_path_score); + + // if the ManualSeeder is not used, then add nodes to the visited_nodes_ + // table to allow for seed filtration + if (filter_seeds) { + extender.call_visited_nodes([&](node_index node, size_t begin, size_t end) { + auto emplace = visited_nodes_.emplace(node, std::make_pair(begin, end)); + auto &range = emplace.first.value(); + if (!emplace.second) { + range.first = std::min(range.first, begin); + range.second = std::max(range.second, end); } - extended = true; - return; - } - - assert(extension.is_valid(graph_, &config_)); - extension.extend_query_end(query.data() + query.size()); - - if (extension.get_clipping() || start_node != seed.back()) { - // if the extension starts at a different position - // from the seed end, then it's a new alignment - extension.extend_query_begin(query.data()); - extension.trim_offset(); - assert(extension.is_valid(graph_, &config_)); -#ifndef NDEBUG - mtg::common::logger->trace("Alignment: {}", extension); -#endif - callback(std::move(extension)); - return; - } - - assert(extension.get_offset() == graph_.get_k() - 1); - auto next_path = seed; - next_path.append(std::move(extension)); - next_path.trim_offset(); - assert(next_path.is_valid(graph_, &config_)); + }); + } -#ifndef NDEBUG - mtg::common::logger->trace("Alignment: {}", next_path); -#endif - callback(std::move(next_path)); - extended = true; - }, min_path_score); + if (extensions.empty() && seed.get_score() >= min_path_score) { + seed.extend_query_end(query.data() + query.size()); + seed.trim_offset(); + assert(seed.is_valid(graph_, &config_)); + DEBUG_LOG("Alignment (seed): {}", seed); + callback(std::move(seed)); + } - // if !extended, then the seed was not extended because of early cutoff + for (auto&& extension : extensions) { + assert(extension.is_valid(graph_, &config_)); + callback(std::move(extension)); + } } } @@ -120,8 +135,7 @@ ::align_one_direction(DBGQueryAlignment &paths, align_aggregate(paths, [&](const auto &alignment_callback, const auto &get_min_path_score) { - align_core(query, seeder, std::move(extender), - alignment_callback, get_min_path_score); + align_core(query, seeder, extender, alignment_callback, get_min_path_score); }); } @@ -137,12 +151,8 @@ ::align_best_direction(DBGQueryAlignment &paths, align_aggregate(paths, [&](const auto &alignment_callback, const auto &get_min_path_score) { - align_core(forward, seeder, std::move(extender), - alignment_callback, get_min_path_score); - - align_core(reverse, seeder_rc, std::move(extender_rc), - alignment_callback, get_min_path_score); - + align_core(forward, seeder, extender, alignment_callback, get_min_path_score); + align_core(reverse, seeder_rc, extender_rc, alignment_callback, get_min_path_score); }); } @@ -150,39 +160,35 @@ template void SeedAndExtendAlignerCore ::align_both_directions(DBGQueryAlignment &paths, const ISeeder &forward_seeder, + const ISeeder &reverse_seeder, IExtender&& forward_extender, + IExtender&& reverse_extender, const AlignCoreGenerator &rev_comp_core_generator) const { std::string_view forward = paths.get_query(); std::string_view reverse = paths.get_query(true); - std::vector reverse_seeds; - align_aggregate(paths, [&](const auto &alignment_callback, const auto &get_min_path_score) { -#ifndef NDEBUG - mtg::common::logger->trace("Aligning forwards"); -#endif - - // First get forward alignments - align_core(forward, forward_seeder, std::move(forward_extender), - [&](DBGAlignment&& path) { + auto get_forward_alignments = [&](std::string_view query, + std::string_view query_rc, + const ISeeder &seeder, + IExtender &extender) { + std::vector rc_of_alignments; + + DEBUG_LOG("Extending in forwards direction"); + align_core(query, seeder, extender, [&](DBGAlignment&& path) { score_t min_path_score = get_min_path_score(path); - // If the alignment starts from the beginning of the query, - // there's no sequence left for aligning backwards. - if (!path.get_clipping()) { - if (path.get_score() >= min_path_score) - alignment_callback(std::move(path)); + if (path.get_score() >= min_path_score) + alignment_callback(DBGAlignment(path)); + if (!path.get_clipping()) return; - } auto rev = path; - rev.reverse_complement(graph_, reverse); + rev.reverse_complement(graph_, query_rc); if (rev.empty()) { -#ifndef NDEBUG - mtg::common::logger->trace("Alignment cannot be reversed, returning"); -#endif + DEBUG_LOG("Alignment cannot be reversed, returning"); if (path.get_score() >= min_path_score) alignment_callback(std::move(path)); @@ -197,48 +203,32 @@ ::align_both_directions(DBGQueryAlignment &paths, // Pass the reverse complement of the forward alignment // as a seed for extension - reverse_seeds.emplace_back(std::move(rev)); - }, - [&](const auto&) { - // ignore the min path score for the forward alignment, - // since it may have a score that is too low before it is - // extended backwards - return config_.min_cell_score; - } + rc_of_alignments.emplace_back(std::move(rev)); + }, [&](const auto&) { return config_.min_cell_score; }); + + return rc_of_alignments; + }; + + std::vector rc_of_reverse = get_forward_alignments( + reverse, forward, reverse_seeder, reverse_extender + ); + std::vector rc_of_forward = get_forward_alignments( + forward, reverse, forward_seeder, forward_extender ); -#ifndef NDEBUG - mtg::common::logger->trace("Aligning backwards"); -#endif - - // Then use the reverse complements of the forward alignments as seeds - rev_comp_core_generator(reverse, forward_seeder, std::move(reverse_seeds), - [&](const auto &seeder_rc, auto&& extender_rc) { - align_core(reverse, seeder_rc, std::move(extender_rc), - [&](DBGAlignment&& path) { - // If the path originated from a backwards alignment (forward - // alignment of a reverse complement) and did not skip the first - // characters (so it is unable to be reversed), change back - // to the forward orientation - if (path.get_orientation()) { - auto forward_path = path; - forward_path.reverse_complement(graph_, forward); - if (!forward_path.empty()) { - path = std::move(forward_path); -#ifndef NDEBUG - } else { - mtg::common::logger->trace( - "Backwards alignment cannot be reversed, returning"); -#endif - } - } - - assert(path.is_valid(graph_, &config_)); - alignment_callback(std::move(path)); - }, - get_min_path_score - ); - }); + auto extend_reverse = [&](std::string_view query_rc, + const ISeeder &seeder, + std::vector&& rc_of_alignments) { + DEBUG_LOG("Extending in reverse direction"); + rev_comp_core_generator(query_rc, seeder, std::move(rc_of_alignments), + [&](const auto &seeder_rc, auto&& extender_rc) { + align_core(query_rc, seeder_rc, extender_rc, + alignment_callback, get_min_path_score); + }); + }; + + extend_reverse(forward, reverse_seeder, std::move(rc_of_reverse)); + extend_reverse(reverse, forward_seeder, std::move(rc_of_forward)); }); } diff --git a/metagraph/src/graph/alignment/dbg_aligner.hpp b/metagraph/src/graph/alignment/dbg_aligner.hpp index 37dbe0380c..78edbb78b7 100644 --- a/metagraph/src/graph/alignment/dbg_aligner.hpp +++ b/metagraph/src/graph/alignment/dbg_aligner.hpp @@ -4,6 +4,8 @@ #include #include +#include + #include "aligner_alignment.hpp" #include "aligner_seeder_methods.hpp" #include "aligner_extender_methods.hpp" @@ -49,12 +51,12 @@ class ISeedAndExtendAligner : public IDBGAligner { }; -template > +template class SeedAndExtendAlignerCore; template , class Extender = DefaultColumnExtender<>, - class AlignmentCompare = LocalAlignmentLess<>> + class AlignmentCompare = LocalAlignmentLess> class DBGAligner : public ISeedAndExtendAligner { public: typedef IDBGAligner::node_index node_index; @@ -65,7 +67,12 @@ class DBGAligner : public ISeedAndExtendAligner { typedef IDBGAligner::AlignmentCallback AlignmentCallback; DBGAligner(const DeBruijnGraph &graph, const DBGAlignerConfig &config) - : graph_(graph), config_(config), aligner_core_(graph_, config_) {} + : graph_(graph), config_(config) { + assert(config_.num_alternative_paths); + if (!config_.check_config_scores()) { + throw std::runtime_error("Error: invalid scoring parameters"); + } + } virtual void align_batch(const QueryGenerator &generate_query, const AlignmentCallback &callback) const override final; @@ -75,7 +82,6 @@ class DBGAligner : public ISeedAndExtendAligner { protected: const DeBruijnGraph &graph_; DBGAlignerConfig config_; - SeedAndExtendAlignerCore aligner_core_; }; template @@ -99,12 +105,7 @@ class SeedAndExtendAlignerCore { const AlignCoreCallback&)> AlignCoreGenerator; SeedAndExtendAlignerCore(const DeBruijnGraph &graph, const DBGAlignerConfig &config) - : graph_(graph), config_(config) { - assert(config_.num_alternative_paths); - if (!config_.check_config_scores()) { - throw std::runtime_error("Error: sum of min_cell_score and lowest penalty too low."); - } - } + : graph_(graph), config_(config) {} // Align the query sequence in the given orientation (false is forward, // true is reverse complement) @@ -121,21 +122,24 @@ class SeedAndExtendAlignerCore { IExtender&& extender, IExtender&& extender_rc) const; - // Align both forwards and backwards from a given seed. Procedure - // 1. Given each seed, extend forward to produce an alignment A - // 2. Reverse complement the alignment to get A', treated like a new seed - // 3. Extend A' forwards - // 4. Reverse complement A' to get the final alignment A'' + // Align the forward and reverse complement of the query sequence in both + // directions and return the overall best alignment. e.g., for the forward query + // 1. Find all seeds of its reverse complement + // 2. Given a seed, extend forwards to get alignment A + // 3. Reverse complement the alignment to get A', treat it like a new seed + // 4. Extend A' forwards to get the final alignment A'' void align_both_directions(DBGQueryAlignment &paths, const ISeeder &forward_seeder, + const ISeeder &reverse_seeder, IExtender&& forward_extender, + IExtender&& reverse_extender, const AlignCoreGenerator &rev_comp_core_generator) const; protected: // Generate seeds, then extend them void align_core(std::string_view query, const ISeeder &seeder, - IExtender&& extender, + IExtender &extender, const LocalAlignmentCallback &callback, const MinScoreComputer &get_min_path_score) const; @@ -147,6 +151,8 @@ class SeedAndExtendAlignerCore { const DeBruijnGraph &graph_; const DBGAlignerConfig &config_; + + mutable tsl::hopscotch_map> visited_nodes_; }; @@ -157,6 +163,7 @@ ::align_batch(const QueryGenerator &generate_query, generate_query([&](std::string_view header, std::string_view query, bool is_reverse_complement) { + SeedAndExtendAlignerCore aligner_core(graph_, config_); DBGQueryAlignment paths(query, is_reverse_complement); std::string_view this_query = paths.get_query(is_reverse_complement); assert(this_query == query); @@ -181,8 +188,13 @@ ::align_batch(const QueryGenerator &generate_query, // From a given seed, align forwards, then reverse complement and // align backwards. The graph needs to be canonical to ensure that // all paths exist even when complementing. - aligner_core_.align_both_directions(paths, seeder, std::move(extender), - build_rev_comp_alignment_core); + std::string_view reverse = paths.get_query(true); + Seeder seeder_rc(graph_, reverse, !is_reverse_complement, + map_sequence_to_nodes(graph_, reverse), config_); + aligner_core.align_both_directions(paths, seeder, seeder_rc, + std::move(extender), + Extender(graph_, config_, reverse), + build_rev_comp_alignment_core); } else if (config_.forward_and_reverse_complement) { assert(!is_reverse_complement); std::string_view reverse = paths.get_query(true); @@ -190,12 +202,12 @@ ::align_batch(const QueryGenerator &generate_query, Seeder seeder_rc(graph_, reverse, !is_reverse_complement, map_sequence_to_nodes(graph_, reverse), config_); - aligner_core_.align_best_direction(paths, seeder, seeder_rc, - std::move(extender), - Extender(graph_, config_, reverse)); + aligner_core.align_best_direction(paths, seeder, seeder_rc, + std::move(extender), + Extender(graph_, config_, reverse)); } else { - aligner_core_.align_one_direction(paths, is_reverse_complement, seeder, - std::move(extender)); + aligner_core.align_one_direction(paths, is_reverse_complement, seeder, + std::move(extender)); } callback(header, std::move(paths)); diff --git a/metagraph/tests/graph/test_aligner.cpp b/metagraph/tests/graph/test_aligner.cpp index 318d8075dd..b147452a68 100644 --- a/metagraph/tests/graph/test_aligner.cpp +++ b/metagraph/tests/graph/test_aligner.cpp @@ -512,16 +512,16 @@ TYPED_TEST(DBGAlignerTest, multiple_variations) { check_extend(graph, aligner.get_config(), paths, query); } -TYPED_TEST(DBGAlignerTest, noise_in_branching_point) { +TYPED_TEST(DBGAlignerTest, align_noise_in_branching_point) { size_t k = 4; std::string reference_1 = "AAAACTTTTTT"; std::string reference_2 = "AAAATTGGGGG"; std::string query = "AAAATTTTTTT"; - // X + // D auto graph = build_graph_batch(k, { reference_1, reference_2 }); - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2), -3, -1); + DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -3, -3), -3, -1); config.num_alternative_paths = 2; DBGAligner<> aligner(*graph, config); @@ -531,11 +531,10 @@ TYPED_TEST(DBGAlignerTest, noise_in_branching_point) { EXPECT_NE(paths[0], paths[1]); auto path = paths[0]; - EXPECT_EQ(query.size() - k + 1, path.size()); - EXPECT_EQ(reference_1, path.get_sequence()); - EXPECT_EQ(config.score_sequences(query, reference_1), path.get_score()); - EXPECT_EQ("4=1X6=", path.get_cigar().to_string()); - EXPECT_EQ(10u, path.get_num_matches()); + EXPECT_EQ(query.size() - k + 2, path.size()); + EXPECT_EQ(reference_1 + "T", path.get_sequence()); + EXPECT_EQ("4=1D7=", path.get_cigar().to_string()); + EXPECT_EQ(11u, path.get_num_matches()); EXPECT_FALSE(path.is_exact_match()); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); @@ -559,7 +558,6 @@ TYPED_TEST(DBGAlignerTest, alternative_path_basic) { DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2), -3, -1); config.num_alternative_paths = 2; - config.queue_size = 100; DBGAligner<> aligner(*graph, config); auto paths = aligner.align(query); @@ -609,41 +607,6 @@ TYPED_TEST(DBGAlignerTest, align_multiple_misalignment) { check_extend(graph, aligner.get_config(), paths, query); } -TYPED_TEST(DBGAlignerTest, align_multiple_misalignment_bandwidth) { - size_t k = 4; - std::string reference = "AAAGCGGACCCTTTCCGTTAT"; - std::string query = "AAAGGGGACCCTTTTCGTTAT"; - // X X - - auto graph = build_graph_batch(k, { reference }); - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); - - for (uint64_t bandwidth : std::vector{ 2, 5, 10, std::numeric_limits::max()}) { - auto config_bandwidth = config; - config_bandwidth.bandwidth = bandwidth; - - DBGAligner<> aligner(*graph, config_bandwidth); - auto paths = aligner.align(query); - - ASSERT_EQ(1ull, paths.size()); - auto path = paths[0]; - - EXPECT_EQ(query.size() - k + 1, path.size()); - EXPECT_EQ(reference, path.get_sequence()); - EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); - EXPECT_EQ("4=1X9=1X6=", path.get_cigar().to_string()); - EXPECT_EQ(19u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); - EXPECT_EQ(0u, path.get_clipping()); - EXPECT_EQ(0u, path.get_end_clipping()); - EXPECT_EQ(0u, path.get_offset()); - EXPECT_TRUE(path.is_valid(*graph, &config)); - check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); - - check_extend(graph, aligner.get_config(), paths, query); - } -} - TYPED_TEST(DBGAlignerTest, align_insert_non_existent) { size_t k = 4; std::string reference = "TTTCCTTGTT"; @@ -714,7 +677,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_long) { // IIIIIIIII auto graph = build_graph_batch(k, { reference }); - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); + DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -1)); config.gap_opening_penalty = -1; config.gap_extension_penalty = -1; DBGAligner<> aligner(*graph, config); @@ -740,7 +703,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_long) { } TYPED_TEST(DBGAlignerTest, align_insert_long_offset) { - size_t k = 4; + size_t k = 5; std::string reference = "TTTCCGGTTGTTA"; std::string query = "TTTCCGCAAAAAAAAATTGTTA"; // XIIIIIIIII @@ -1027,6 +990,7 @@ TYPED_TEST(DBGAlignerTest, align_repeat_sequence_no_delete_after_insert) { "TTTGTGGCTAGAGCTCGAGATCGCGCGGCCACAATTGACAAATGAGATCTAATGAAACTAAAGAGCTTCTGCACAGCAAAAGAAACTGTCATC" ) + config.gap_opening_penalty + score_t(6) * config.gap_extension_penalty, path.get_score()); EXPECT_TRUE(path.get_cigar().to_string() == "45=7I8=1X39=" + || path.get_cigar().to_string() == "45=5I1=2I7=1X39=" || path.get_cigar().to_string() == "44=2I1=5I8=1X39=" || path.get_cigar().to_string() == "44=3I1=4I8=1X39=" || path.get_cigar().to_string() == "44=4I1=3I8=1X39=") @@ -1050,6 +1014,7 @@ TYPED_TEST(DBGAlignerTest, align_repeat_sequence_no_delete_after_insert) { "TTTGTGGCTAGAGCTCGAGATCGCGCGGCCACAATTGACAAATGA" "GATCTAAT" "G" "AAACTAAAGAGCTTCTGCACAGCAAAAGAAACTGTCATC" ) + config.gap_opening_penalty + score_t(6) * config.gap_extension_penalty, path.get_score()); EXPECT_TRUE(path.get_cigar().to_string() == "45=7I8=1X39=" + || path.get_cigar().to_string() == "45=5I1=2I7=1X39=" || path.get_cigar().to_string() == "44=2I1=5I8=1X39=" || path.get_cigar().to_string() == "44=3I1=4I8=1X39=" || path.get_cigar().to_string() == "44=4I1=3I8=1X39=") @@ -1291,7 +1256,6 @@ TYPED_TEST(DBGAlignerTest, align_low_similarity4) { config.xdrop = xdrop; config.min_exact_match = discovery_fraction; config.max_nodes_per_seq_char = 10.0; - config.queue_size = 20; config.num_alternative_paths = 2; config.min_path_score = 0; config.min_cell_score = 0; @@ -1310,6 +1274,26 @@ TYPED_TEST(DBGAlignerTest, align_low_similarity4) { } } +TYPED_TEST(DBGAlignerTest, align_low_similarity5) { + size_t k = 31; + std::string reference = "GTCGTCAGATCGGAAGAGCGTCGTGTAGGGAAAGGTCTTCGCCTGTGTAGATCTCGGTGGTCG"; + std::string query = "GTCAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTTCCTGGTGGTGTAGATC"; + + auto graph = std::make_shared(k); + graph->add_sequence(reference); + + DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -3, -3)); + DBGAligner<> aligner(*graph, config); + auto paths = aligner.align(query); + ASSERT_EQ(1ull, paths.size()); + auto path = paths[0]; + EXPECT_TRUE(path.is_valid(*graph, &config)); + check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); + + check_extend(graph, aligner.get_config(), paths, query); + +} + TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { { size_t k = 7; @@ -1354,7 +1338,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { auto graph = std::make_shared(k); graph->add_sequence(reference); - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); + DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -1)); config.min_seed_length = 1; config.max_num_seeds_per_locus = std::numeric_limits::max(); config.min_cell_score = std::numeric_limits::min() + 100; @@ -1388,29 +1372,93 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_canonical) { std::string query = "GGGGGCTTTCGAGGCCAA"; // SSSSS - auto graph = std::make_shared(k, DeBruijnGraph::CANONICAL); - graph->add_sequence(reference); + std::string reference_rc = "TTGGCCTCGAAAGTTTTT"; + std::string query_rc = "TTGGCCTCGAAAGCCCCC"; + + for (auto mode : { DeBruijnGraph::PRIMARY, DeBruijnGraph::CANONICAL }) { + auto dbg_succ = std::make_shared(k, mode); + dbg_succ->add_sequence(reference_rc); + + std::shared_ptr graph; + if (mode == DeBruijnGraph::PRIMARY) { + graph = std::make_shared(*dbg_succ); + } else { + graph = dbg_succ; + } + + DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); + config.max_num_seeds_per_locus = std::numeric_limits::max(); + config.min_cell_score = std::numeric_limits::min() + 100; + config.min_path_score = std::numeric_limits::min() + 100; + config.max_seed_length = k; + config.min_seed_length = 13; + DBGAligner>> aligner(*graph, config); + auto paths = aligner.align(query); + ASSERT_EQ(1ull, paths.size()); + auto path = paths[0]; + + EXPECT_EQ(1u, path.size()); // includes dummy k-mers + if (path.get_sequence() == reference.substr(5)) { + EXPECT_EQ(config.score_sequences(query.substr(5), reference.substr(5)), + path.get_score()); + EXPECT_EQ("5S13=", path.get_cigar().to_string()); + EXPECT_EQ(5u, path.get_clipping()); + EXPECT_EQ(0u, path.get_end_clipping()); + } else { + ASSERT_EQ(reference_rc.substr(0, 13), path.get_sequence()); + EXPECT_EQ(config.score_sequences(query_rc.substr(0, 13), + reference_rc.substr(0, 13)), + path.get_score()); + EXPECT_EQ("13=5S", path.get_cigar().to_string()); + EXPECT_EQ(0u, path.get_clipping()); + EXPECT_EQ(5u, path.get_end_clipping()); + } + EXPECT_EQ(5u, path.get_offset()); + EXPECT_EQ(13u, path.get_num_matches()); + EXPECT_FALSE(path.is_exact_match()); + EXPECT_TRUE(path.is_valid(*graph, &config)); + check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); + // TODO: sub-k seeds which are sink tips in the underlying graph currently can't be found + if (mode != DeBruijnGraph::PRIMARY) + check_extend(graph, aligner.get_config(), paths, query); + } +} + +TYPED_TEST(DBGAlignerTest, align_both_directions) { + size_t k = 7; + std::string reference = "AAAAGCTTTCGAGGCCAA"; + std::string query = "AAAAGTTTTCGAGGCCAA"; + // X + + std::string reference_rc = "TTGGCCTCGAAAGCTTTT"; + std::string query_rc = "TTGGCCTCGAAAACTTTT"; + // X + + auto graph = build_graph_batch(k, { reference }, DeBruijnGraph::CANONICAL); DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); - config.max_num_seeds_per_locus = std::numeric_limits::max(); - config.min_cell_score = std::numeric_limits::min() + 100; - config.min_path_score = std::numeric_limits::min() + 100; config.max_seed_length = k; - config.min_seed_length = 13; - DBGAligner>> aligner(*graph, config); + DBGAligner<> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; - EXPECT_EQ(1u, path.size()); // includes dummy k-mers - EXPECT_EQ(reference.substr(5), path.get_sequence()); - EXPECT_EQ(config.score_sequences(query.substr(5), reference.substr(5)), path.get_score()); - EXPECT_EQ("5S13=", path.get_cigar().to_string()); - EXPECT_EQ(13u, path.get_num_matches()); + EXPECT_EQ(12u, path.size()); + + if (path.get_sequence() == reference) { + EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); + EXPECT_EQ("5=1X12=", path.get_cigar().to_string()); + } else { + ASSERT_EQ(reference_rc, path.get_sequence()); + EXPECT_EQ(config.score_sequences(query_rc, reference_rc), path.get_score()); + EXPECT_EQ("12=1X5=", path.get_cigar().to_string()); + } + + EXPECT_EQ(17u, path.get_num_matches()); EXPECT_FALSE(path.is_exact_match()); - EXPECT_EQ(5u, path.get_clipping()); + EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); - EXPECT_EQ(5u, path.get_offset()); + EXPECT_EQ(0u, path.get_offset()); EXPECT_TRUE(path.is_valid(*graph, &config)); check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); @@ -1447,37 +1495,6 @@ TYPED_TEST(DBGAlignerTest, align_nodummy) { check_extend(graph, aligner.get_config(), paths, query); } -#if ! _PROTEIN_GRAPH -TYPED_TEST(DBGAlignerTest, align_both_directions) { - size_t k = 7; - std::string reference = "AAAAGCTTTCGAGGCCAA"; - std::string query = "AAAAGTTTTCGAGGCCAA"; - // X - - auto graph = build_graph_batch(k, { reference }, DeBruijnGraph::CANONICAL); - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); - config.max_seed_length = k; - DBGAligner<> aligner(*graph, config); - auto paths = aligner.align(query); - ASSERT_EQ(1ull, paths.size()); - auto path = paths[0]; - - EXPECT_EQ(12u, path.size()); - EXPECT_EQ(reference, path.get_sequence()); - EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); - EXPECT_EQ("5=1X12=", path.get_cigar().to_string()); - EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); - EXPECT_EQ(0u, path.get_clipping()); - EXPECT_EQ(0u, path.get_end_clipping()); - EXPECT_EQ(0u, path.get_offset()); - EXPECT_TRUE(path.is_valid(*graph, &config)); - check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); - - check_extend(graph, aligner.get_config(), paths, query); -} -#endif - TYPED_TEST(DBGAlignerTest, align_seed_to_end) { size_t k = 5; std::string reference = "ATCCCTTTTAAAA"; @@ -1542,53 +1559,14 @@ TEST(DBGAlignerTest, align_extended_insert_after_match) { ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; - EXPECT_EQ(47u, path.size()); - EXPECT_EQ(reference_1, path.get_sequence()); - EXPECT_EQ("22=3I2=3X9=3D4=1I1=1X2I3=2I1=1I7=1D1=2D2=1X3=1X6=6S", - path.get_cigar().to_string()); EXPECT_TRUE(path.is_valid(*graph, &config)); + EXPECT_EQ(52u, path.get_score()) + << path.get_score() << " " << path.get_cigar().to_string(); check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); check_extend(graph, aligner.get_config(), paths, query); } -TEST(DBGAlignerTest, align_suffix_seed_snp_canonical_wrapper) { - size_t k = 18; - std::string reference = "TTGGCCTCGAAAG" "TTTTT"; - std::string query = "GGGGG" "CTTTCGAGGCCAA"; - std::string ref_rev = "AAAAA" "CTTTCGAGGCCAA"; - - auto dbg_succ = std::make_shared(k, DeBruijnGraph::PRIMARY); - dbg_succ->add_sequence(reference); - auto graph = std::make_shared(*dbg_succ); - - DBGAlignerConfig config(DBGAlignerConfig::dna_scoring_matrix(2, -1, -2)); - config.max_num_seeds_per_locus = std::numeric_limits::max(); - config.min_cell_score = std::numeric_limits::min() + 100; - config.min_path_score = std::numeric_limits::min() + 100; - config.min_seed_length = 13; - - for (size_t max_seed_length : { k, k + 100 }) { - config.max_seed_length = max_seed_length; - DBGAligner>> aligner(*graph, config); - auto paths = aligner.align(query); - ASSERT_EQ(1ull, paths.size()); - auto path = paths[0]; - - EXPECT_EQ(1u, path.size()); // includes dummy k-mers - EXPECT_EQ(ref_rev.substr(5), path.get_sequence()); - EXPECT_EQ(config.score_sequences(query.substr(5), ref_rev.substr(5)), path.get_score()); - EXPECT_EQ("5S13=", path.get_cigar().to_string()); - EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); - EXPECT_EQ(5u, path.get_clipping()); - EXPECT_EQ(0u, path.get_end_clipping()); - EXPECT_EQ(5u, path.get_offset()); - EXPECT_TRUE(path.is_valid(*graph, &config)); - check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); - } -} - TEST(DBGAlignerTest, align_suffix_seed_no_full_seeds) { size_t k = 31; std::string reference = "CTGCTGCGCCATCGCAACCCACGGTTGCTTTTTGAGTCGCTGCTCACGTTAGCCATCACACTGACGTTAAGCTGGCTTTCGATGCTGTATC";