From 40bb74ff7c17774aa61d150861a55f1a7819ccca Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 16 Jun 2021 12:22:06 +0200 Subject: [PATCH 01/24] verbose logs --- metagraph/src/annotation/row_diff_builder.cpp | 100 ++++++++++++------ 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index aee80ac04c..fcbfea3c2d 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -889,6 +889,13 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } }; + graph::DBGSuccinct graph(2); + logger->trace("Loading graph..."); + if (!graph.load(pred_succ_fprefix)) { + logger->error("Cannot load graph from {}", pred_succ_fprefix); + std::exit(1); + } + traverse_anno_chunked( num_rows, pred_succ_fprefix, sources, [&](uint64_t chunk_size) { @@ -919,20 +926,40 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, // or if the successor has zero diff uint64_t succ_value = is_anchor ? 0 : get_value(source_col, source_idx, j, *succ); if (succ_value != curr_value) { - // no reduction, we must keep the bit - auto &v = set_rows_fwd[source_idx][j]; - if constexpr(with_values) { - v.emplace_back(row_idx, curr_value - succ_value); - } else { - v.push_back(row_idx); - } - if (is_anchor) + if (is_anchor) { num_relations_anchored[source_idx][j]++; + } else { + // no reduction, we must keep the bit + auto &v = set_rows_fwd[source_idx][j]; + if constexpr(with_values) { + v.emplace_back(row_idx, curr_value - succ_value); + } else { + v.push_back(row_idx); + } - if (v.size() == v.capacity()) { - // dump chunk to disk - dump_chunk_to_disk(v, source_idx, j, 0); - v.resize(0); + if (v.size() == v.capacity()) { + // dump chunk to disk + dump_chunk_to_disk(v, source_idx, j, 0); + v.resize(0); + } + } + } else { + if (!succ) { + auto edge = graph.kmer_to_boss_index(to_node(row_idx)); + const auto &boss = graph.get_boss(); + logger->trace("Reduction: value {}" + "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" + "\nTo NULL", + curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); + } else { + auto edge = graph.kmer_to_boss_index(to_node(row_idx)); + auto next_edge = graph.kmer_to_boss_index(to_node(*succ)); + const auto &boss = graph.get_boss(); + logger->trace("Reduction: value {}" + "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" + "\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", + curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge), + anchor[*succ], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); } } } @@ -1098,7 +1125,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, for (uint64_t chunk = 0; chunk < row_reduction.size(); chunk += BLOCK_SIZE) { row_nbits_block.assign(std::min(BLOCK_SIZE, row_reduction.size() - chunk), 0); - #pragma omp parallel for num_threads(num_threads) schedule(dynamic) + #pragma omp parallel for num_threads(get_num_threads()) schedule(dynamic) for (size_t l_idx = 0; l_idx < diff_columns.size(); ++l_idx) { for (const auto &col_ptr : diff_columns[l_idx]) { col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), @@ -1126,9 +1153,6 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } async_writer.join(); - - logger->trace("Rows with negative row reduction: {} in vector {}", - num_larger_rows, row_reduction_fname); } void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, @@ -1347,30 +1371,40 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, bool is_anchor = anchor[row_idx]; - if (is_anchor) + if (is_anchor) { num_coords_anchored[s][j] += curr_value.size(); + } else { + const auto diff = is_anchor + ? curr_value + : get_diff(curr_value, get_value(source_col, s, j, *succ)); - const auto diff = is_anchor - ? curr_value - : get_diff(curr_value, get_value(source_col, s, j, *succ)); + if (diff.size()) { + // must write the coordinates/diff + auto &v = set_rows_fwd[s][j]; - if (diff.size()) { - // must write the coordinates/diff - auto &v = set_rows_fwd[s][j]; + if (is_anchor) { + logger->trace("anchor: {}", curr_value.size()); + } else { + logger->trace("diff: {}{}: {} -> {}", + curr_value.size() <= get_value(source_col, s, j, *succ).size() + ? "\t\t\t + " : "\t - ", + diff.size(), curr_value.size(), get_value(source_col, s, j, *succ).size()); + } - for (uint64_t coord : diff) { - assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) - && "coordinates must be unique and can't repeat"); - v.emplace_back(row_idx, coord); - row_diff_coords[s][j]++; + for (uint64_t coord : diff) { + assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) + && "coordinates must be unique and can't repeat"); + v.emplace_back(row_idx, coord); + row_diff_coords[s][j]++; - if (v.size() == v.capacity()) { - // dump chunk to disk - dump_chunk_to_disk(v, s, j, 0); - v.resize(0); + if (v.size() == v.capacity()) { + // dump chunk to disk + dump_chunk_to_disk(v, s, j, 0); + v.resize(0); + } } + row_diff_bits[s][j]++; } - row_diff_bits[s][j]++; } // check non-anchor predecessor nodes and add them if they are zero From a2acd8562327d3c328535b01713401f4b6c47d89 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 16 Jun 2021 12:42:00 +0200 Subject: [PATCH 02/24] changes --- metagraph/src/annotation/row_diff_builder.cpp | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index fcbfea3c2d..a1edb3beb3 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -926,23 +926,21 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, // or if the successor has zero diff uint64_t succ_value = is_anchor ? 0 : get_value(source_col, source_idx, j, *succ); if (succ_value != curr_value) { - if (is_anchor) { - num_relations_anchored[source_idx][j]++; + // no reduction, we must keep the bit + auto &v = set_rows_fwd[source_idx][j]; + if constexpr(with_values) { + v.emplace_back(row_idx, curr_value - succ_value); } else { - // no reduction, we must keep the bit - auto &v = set_rows_fwd[source_idx][j]; - if constexpr(with_values) { - v.emplace_back(row_idx, curr_value - succ_value); - } else { - v.push_back(row_idx); - } + v.push_back(row_idx); + } - if (v.size() == v.capacity()) { - // dump chunk to disk - dump_chunk_to_disk(v, source_idx, j, 0); - v.resize(0); - } + if (v.size() == v.capacity()) { + // dump chunk to disk + dump_chunk_to_disk(v, source_idx, j, 0); + v.resize(0); } + if (is_anchor) + num_relations_anchored[source_idx][j]++; } else { if (!succ) { auto edge = graph.kmer_to_boss_index(to_node(row_idx)); @@ -1125,7 +1123,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, for (uint64_t chunk = 0; chunk < row_reduction.size(); chunk += BLOCK_SIZE) { row_nbits_block.assign(std::min(BLOCK_SIZE, row_reduction.size() - chunk), 0); - #pragma omp parallel for num_threads(get_num_threads()) schedule(dynamic) + #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (size_t l_idx = 0; l_idx < diff_columns.size(); ++l_idx) { for (const auto &col_ptr : diff_columns[l_idx]) { col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), @@ -1153,6 +1151,9 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } async_writer.join(); + + logger->trace("Rows with negative row reduction: {} in vector {}", + num_larger_rows, row_reduction_fname); } void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, @@ -1371,40 +1372,39 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, bool is_anchor = anchor[row_idx]; - if (is_anchor) { + if (is_anchor) num_coords_anchored[s][j] += curr_value.size(); - } else { - const auto diff = is_anchor - ? curr_value - : get_diff(curr_value, get_value(source_col, s, j, *succ)); - if (diff.size()) { - // must write the coordinates/diff - auto &v = set_rows_fwd[s][j]; + const auto diff = is_anchor + ? curr_value + : get_diff(curr_value, get_value(source_col, s, j, *succ)); - if (is_anchor) { - logger->trace("anchor: {}", curr_value.size()); - } else { - logger->trace("diff: {}{}: {} -> {}", - curr_value.size() <= get_value(source_col, s, j, *succ).size() - ? "\t\t\t + " : "\t - ", - diff.size(), curr_value.size(), get_value(source_col, s, j, *succ).size()); - } + if (diff.size()) { + // must write the coordinates/diff + auto &v = set_rows_fwd[s][j]; - for (uint64_t coord : diff) { - assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) - && "coordinates must be unique and can't repeat"); - v.emplace_back(row_idx, coord); - row_diff_coords[s][j]++; + if (is_anchor) { + logger->trace("anchor: {}", curr_value.size()); + } else { + logger->trace("diff: {}{}: {} -> {}", + curr_value.size() <= get_value(source_col, s, j, *succ).size() + ? "\t\t\t + " : "\t - ", + diff.size(), curr_value.size(), get_value(source_col, s, j, *succ).size()); + } - if (v.size() == v.capacity()) { - // dump chunk to disk - dump_chunk_to_disk(v, s, j, 0); - v.resize(0); - } + for (uint64_t coord : diff) { + assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) + && "coordinates must be unique and can't repeat"); + v.emplace_back(row_idx, coord); + row_diff_coords[s][j]++; + + if (v.size() == v.capacity()) { + // dump chunk to disk + dump_chunk_to_disk(v, s, j, 0); + v.resize(0); } - row_diff_bits[s][j]++; } + row_diff_bits[s][j]++; } // check non-anchor predecessor nodes and add them if they are zero From e3d3c7b1ac15a5b0f6ee9e70827a0726393ce631 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 16 Jun 2021 17:57:44 +0200 Subject: [PATCH 03/24] assign multiple successors at forks --- .../src/annotation/annotation_converters.cpp | 17 ++ .../binary_matrix/row_diff/row_diff.hpp | 2 +- metagraph/src/annotation/row_diff_builder.cpp | 276 ++++++++++++------ .../common/elias_fano/elias_fano_merger.hpp | 7 +- .../graph/representation/succinct/boss.cpp | 57 +++- .../graph/representation/succinct/boss.hpp | 5 + 6 files changed, 262 insertions(+), 102 deletions(-) diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp index 736af6a631..17464f87ab 100644 --- a/metagraph/src/annotation/annotation_converters.cpp +++ b/metagraph/src/annotation/annotation_converters.cpp @@ -1174,6 +1174,23 @@ void convert_to_row_diff(const std::vector &files, mem_bytes -= anchor_size; } + // TODO: move + if (construction_stage != RowDiffStage::COUNT_LABELS) { + const std::string rd_succ_fname = graph_fname + kRowDiffForkSuccExt; + if (!fs::exists(rd_succ_fname)) { + logger->error("Can't find row-diff successor bitmap at {}", rd_succ_fname); + exit(1); + } + uint64_t rd_succ_size = fs::file_size(rd_succ_fname); + if (rd_succ_size > mem_bytes) { + logger->warn("row-diff successor bitmap ({} MiB) is larger than" + " the memory allocated ({} MiB). Reserve more RAM.", + rd_succ_size >> 20, mem_bytes >> 20); + return; + } + mem_bytes -= rd_succ_size; + } + if (!files.size()) return; diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index feb66a3ba5..5ae71fa22c 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -169,7 +169,7 @@ std::vector RowDiff::get_rows(const std::vector &row_ids) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes()); // diff rows annotating nodes along the row-diff paths std::vector rd_ids; diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index a1edb3beb3..8cb9a5d6b6 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -257,10 +257,10 @@ void sum_and_call_counts(const fs::path &dir, } } -rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, - const std::string &rd_succ_filename, - const std::string &count_vectors_dir, - const std::string &row_count_extension) { +void route_at_forks(const graph::DBGSuccinct &graph, + const std::string &rd_succ_fname, + const std::string &count_vectors_dir, + const std::string &row_count_extension) { logger->trace("Assigning row-diff successors at forks..."); rd_succ_bv_type rd_succ; @@ -280,7 +280,7 @@ rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, std::vector outgoing_counts; - sdsl::bit_vector rd_succ_bv(last.size(), false); + sdsl::bit_vector rd_succ_bv(graph.num_nodes(), false); sum_and_call_counts(count_vectors_dir, row_count_extension, "row counts", [&](int32_t count) { @@ -291,7 +291,7 @@ rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, size_t max_pos = std::max_element(outgoing_counts.rbegin(), outgoing_counts.rend()) - outgoing_counts.rbegin(); - rd_succ_bv[graph.kmer_to_boss_index(graph_idx - max_pos)] = true; + rd_succ_bv[to_row(graph_idx - max_pos)] = true; outgoing_counts.resize(0); } graph_idx++; @@ -312,11 +312,10 @@ rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, count_vectors_dir); } - std::ofstream f(rd_succ_filename, ios::binary); + std::ofstream f(rd_succ_fname, ios::binary); rd_succ.serialize(f); logger->trace("RowDiff successors are assigned for forks and written to {}", - rd_succ_filename); - return rd_succ; + rd_succ_fname); } void build_pred_succ(const std::string &graph_fname, @@ -343,8 +342,8 @@ void build_pred_succ(const std::string &graph_fname, } // assign row-diff successors at forks - rd_succ_bv_type rd_succ = route_at_forks(graph, outfbase + kRowDiffForkSuccExt, - count_vectors_dir, row_count_extension); + route_at_forks(graph, outfbase + kRowDiffForkSuccExt, + count_vectors_dir, row_count_extension); const BOSS &boss = graph.get_boss(); @@ -378,28 +377,22 @@ void build_pred_succ(const std::string &graph_fname, BOSS::edge_index next = boss.fwd(boss_idx, d); assert(next); if (!dummy[next]) { - while (rd_succ.size() && !rd_succ[next]) { - next--; - assert(!boss.get_last(next)); - } - succ_buf.push_back(to_row(graph.boss_to_kmer_index(next))); - succ_boundary_buf.push_back(0); + do { + succ_buf.push_back(to_row(graph.boss_to_kmer_index(next))); + succ_boundary_buf.push_back(0); + } while (!boss.get_last(--next)); } - // compute predecessors only for row-diff successors - if (rd_succ.size() ? rd_succ[boss_idx] : boss.get_last(boss_idx)) { - BOSS::TAlphabet d = boss.get_node_last_value(boss_idx); - BOSS::edge_index back_idx = boss.bwd(boss_idx); - boss.call_incoming_to_target(back_idx, d, - [&](BOSS::edge_index pred) { - // dummy predecessors are ignored - if (!dummy[pred]) { - uint64_t node_index = graph.boss_to_kmer_index(pred); - pred_buf.push_back(to_row(node_index)); - pred_boundary_buf.push_back(0); - } + BOSS::edge_index back_idx = boss.bwd(boss_idx); + boss.call_incoming_to_target(back_idx, boss.get_node_last_value(boss_idx), + [&](BOSS::edge_index pred) { + // dummy predecessors are ignored + if (!dummy[pred]) { + uint64_t node_index = graph.boss_to_kmer_index(pred); + pred_buf.push_back(to_row(node_index)); + pred_boundary_buf.push_back(0); } - ); - } + } + ); } succ_boundary_buf.push_back(1); pred_boundary_buf.push_back(1); @@ -487,7 +480,34 @@ void assign_anchors(const std::string &graph_fname, if (rd_succ.size()) { logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); + { + sdsl::bit_vector rd_succ_bv(boss.get_last().size(), 0); + rd_succ.call_ones([&](uint64_t i) { + rd_succ_bv[graph.kmer_to_boss_index(to_node(i))] = 1; + }); + rd_succ = rd_succ_bv_type(rd_succ_bv); + } boss.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); + + logger->trace("Adding branching off forks to RowDiff successors {}...", rd_succ_fname); + uint64_t num_fork_successors = rd_succ.num_set_bits(); + sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); + rd_succ = rd_succ_bv_type(); + boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); + { + sdsl::bit_vector bv(num_rows, 0); + call_ones(rd_succ_bv, [&](BOSS::edge_index i) { + bv[to_row(graph.boss_to_kmer_index(i))] = 1; + }); + rd_succ_bv = std::move(bv); + } + rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); + logger->trace("Number of successors at forks increased from {} to {}", + num_fork_successors, rd_succ.num_set_bits()); + std::ofstream out(rd_succ_fname, ios::binary); + rd_succ.serialize(out); + logger->trace("Updated RowDiff successors {}", rd_succ_fname); + } else { logger->warn("Assigning anchors without chosen RowDiff successors." " The last outgoing edges will be used for routing."); @@ -497,7 +517,7 @@ void assign_anchors(const std::string &graph_fname, // anchors_bv uses BOSS edges as indices, so we need to map it to annotation indices { - sdsl::bit_vector anchors(num_rows, false); + sdsl::bit_vector anchors(num_rows, 0); for (BOSS::edge_index i = 1; i < anchors_bv.size(); ++i) { if (anchors_bv[i]) { uint64_t graph_idx = graph.boss_to_kmer_index(i); @@ -535,7 +555,7 @@ using CallOnes = std::function; void read_next_block(sdsl::int_vector_buffer<>::iterator &it, @@ -656,12 +676,10 @@ void traverse_anno_chunked( source_col.call_ones_in_range(chunk, chunk + block_size, [&](uint64_t i) { assert(succ_chunk_idx[i - chunk + 1] >= succ_chunk_idx[i - chunk]); - assert(succ_chunk_idx[i - chunk + 1] <= succ_chunk_idx[i - chunk] + 1); - const uint64_t *succ = succ_chunk_idx[i - chunk + 1] - > succ_chunk_idx[i - chunk] - ? succ_chunk.data() + succ_chunk_idx[i - chunk] - : NULL; - call_ones(source_col, i, i - chunk, l_idx, j, succ, + assert(pred_chunk_idx[i - chunk + 1] >= pred_chunk_idx[i - chunk]); + call_ones(source_col, i, i - chunk, l_idx, j, + succ_chunk.data() + succ_chunk_idx[i - chunk], + succ_chunk.data() + succ_chunk_idx[i - chunk + 1], pred_chunk.data() + pred_chunk_idx[i - chunk], pred_chunk.data() + pred_chunk_idx[i - chunk + 1]); } @@ -781,6 +799,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } anchor_bv_type anchor; + rd_succ_bv_type rd_succ; if (!compute_row_reduction) { const std::string anchors_fname = pred_succ_fprefix + kRowDiffAnchorExt; std::ifstream f(anchors_fname, std::ios::binary); @@ -794,6 +813,19 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, anchors_fname, anchor.size(), num_rows); exit(1); } + + const std::string &rd_succ_fname = pred_succ_fprefix + kRowDiffForkSuccExt; + f = std::ifstream(rd_succ_fname, ios::binary); + if (!rd_succ.load(f)) { + logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); + exit(1); + } + if (rd_succ.size() != num_rows) { + logger->error("Successor bitmap {} is incompatible with annotations." + " Vector size: {}, number of rows: {}", + rd_succ_fname, rd_succ.size(), num_rows); + exit(1); + } } const fs::path tmp_path = utils::create_temp_dir(swap_dir, "col"); @@ -877,15 +909,22 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, // get bit at position |i| or its value auto get_value = [&](const bit_vector &col, - size_t s, size_t j, uint64_t i) -> uint64_t { + size_t s, size_t j, + const uint64_t *it, const uint64_t *end) -> uint64_t { if (!with_values) { - return col[i]; + for ( ; it != end; ++it) { + if ((!rd_succ.size() || rd_succ[*it]) && col[*it]) + return true; + } + return false; } else { - if (uint64_t rk = col.conditional_rank1(i)) { - return values[s][j][rk - 1]; - } else { - return 0; + uint64_t value = 0; + uint64_t rk; + for ( ; it != end; ++it) { + if ((!rd_succ.size() || rd_succ[*it]) && (rk = col.conditional_rank1(*it))) + value += values[s][j][rk - 1]; } + return value; } }; @@ -904,7 +943,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, }, [&](const bit_vector &source_col, uint64_t row_idx, uint64_t chunk_idx, size_t source_idx, size_t j, - const uint64_t *succ, + const uint64_t *succ_begin, const uint64_t *succ_end, const uint64_t *pred_begin, const uint64_t *pred_end) { // get bits for these positions (or values, hence uint64_t) @@ -916,7 +955,9 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } if (compute_row_reduction) { - if (succ && curr_value == get_value(source_col, source_idx, j, *succ)) { + uint64_t succ_value = get_value(source_col, source_idx, + j, succ_begin, succ_end); + if (succ_begin != succ_end && curr_value == succ_value) { // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], 1, __ATOMIC_RELAXED); } @@ -924,7 +965,8 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, bool is_anchor = anchor[row_idx]; // add current bit if this node is an anchor // or if the successor has zero diff - uint64_t succ_value = is_anchor ? 0 : get_value(source_col, source_idx, j, *succ); + uint64_t succ_value = is_anchor ? 0 : get_value(source_col, source_idx, + j, succ_begin, succ_end); if (succ_value != curr_value) { // no reduction, we must keep the bit auto &v = set_rows_fwd[source_idx][j]; @@ -942,29 +984,34 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, if (is_anchor) num_relations_anchored[source_idx][j]++; } else { - if (!succ) { + if (succ_begin == succ_end) { auto edge = graph.kmer_to_boss_index(to_node(row_idx)); const auto &boss = graph.get_boss(); logger->trace("Reduction: value {}" - "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" - "\nTo NULL", - curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); + "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" + "\nTo NULL", + curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); } else { auto edge = graph.kmer_to_boss_index(to_node(row_idx)); - auto next_edge = graph.kmer_to_boss_index(to_node(*succ)); const auto &boss = graph.get_boss(); logger->trace("Reduction: value {}" - "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" - "\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", - curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge), - anchor[*succ], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); + "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}", + curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); + for (const uint64_t *it = succ_begin; it != succ_end; ++it) { + auto next_edge = graph.kmer_to_boss_index(to_node(*it)); + logger->trace("\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", + anchor[*it], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); + } } } } + if (!curr_value || !rd_succ.size() || !rd_succ[row_idx]) + return; + // check non-anchor predecessor nodes and add them if they are zero for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { - if (curr_value && !source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { + if (!source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { auto &v = set_rows_bwd[source_idx][j]; if constexpr(with_values) { v.emplace_back(*pred_p, -curr_value); @@ -1059,26 +1106,42 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, std::min(values[l_idx][j].width() + 1, 64)); } - auto call_ones = [&](const std::function &call) { - std::vector filenames; - // skip chunk with fwd bits which have already been counted if stage 1 - for (uint32_t chunk = compute_row_reduction ? 1 : 0; - chunk < num_chunks[l_idx][j]; ++chunk) { - filenames.push_back(tmp_file(l_idx, j, chunk)); - } + std::vector filenames; + // skip chunk with fwd bits which have already been counted if stage 1 + for (uint32_t chunk = compute_row_reduction ? 1 : 0; + chunk < num_chunks[l_idx][j]; ++chunk) { + filenames.push_back(tmp_file(l_idx, j, chunk)); + } - const bool remove_chunks = true; - uint64_t r = 0; - elias_fano::merge_files(filenames, [&](T v) { - call(utils::get_first(v)); - if constexpr(with_values) { - assert(v.second && "zero diffs must have been skipped"); - values[l_idx][j][r++] = matrix::encode_diff(v.second); + const bool remove_chunks = true; + uint64_t r = 0; + std::vector indexes; + indexes.reserve(row_diff_bits[l_idx][j]); + int64_t value = 0; + elias_fano::merge_files(filenames, [&](T v) { + if constexpr(with_values) { + assert(v.second && "zero diffs must have been skipped"); + if (indexes.empty() || v.first != indexes.back()) { + if (!indexes.empty()) + values[l_idx][j][r++] = matrix::encode_diff(value); + value = 0; + indexes.push_back(v.first); } - }, remove_chunks, chunks_open_per_thread); - }; - columns[j] = std::make_unique(call_ones, num_rows, - row_diff_bits[l_idx][j]); + value += v.second; + } else { + indexes.push_back(v); + } + }, remove_chunks, chunks_open_per_thread, false); + if constexpr(with_values) { + values[l_idx][j][r++] = matrix::encode_diff(value); + values[l_idx][j].resize(r); + assert(r == indexes.size()); + } + assert(row_diff_bits[l_idx][j] >= indexes.size()); + + columns[j] = std::make_unique( + [&](const auto &callback) { std::for_each(indexes.begin(), indexes.end(), callback); }, + num_rows, indexes.size()); } if (compute_row_reduction) { @@ -1188,14 +1251,20 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, ); logger->trace("Done loading coordinates"); + rd_succ_bv_type rd_succ; + // get bit at position |i| or its value auto get_value = [&](const bit_vector &col, - size_t s, size_t j, uint64_t i) { + size_t s, size_t j, + const uint64_t *it, const uint64_t *end) { std::vector result; - if (uint64_t rk = col.conditional_rank1(i)) { - uint64_t t = delims[s][j].select1(rk); - while (!delims[s][j][++t]) { - result.push_back(coords[s][j][t - rk]); + uint64_t rk; + for ( ; it != end; ++it) { + if ((!rd_succ.size() || rd_succ[*it]) && (rk = col.conditional_rank1(*it))) { + uint64_t t = delims[s][j].select1(rk); + while (!delims[s][j][++t]) { + result.push_back(coords[s][j][t - rk]); + } } } std::sort(result.begin(), result.end()); @@ -1263,13 +1332,14 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, }, [&](const bit_vector &source_col, uint64_t row_idx, uint64_t chunk_idx, size_t s, size_t j, - const uint64_t *succ, const uint64_t *, const uint64_t *) { - if (!succ) + const uint64_t *succ_begin, const uint64_t *succ_end, + const uint64_t *, const uint64_t *) { + if (succ_begin == succ_end) return; // get annotated coordinates for this k-mer - const auto curr_value = get_value(source_col, s, j, row_idx); - const auto diff = get_diff(curr_value, get_value(source_col, s, j, *succ)); + const auto curr_value = get_value(source_col, s, j, &row_idx, &row_idx + 1); + const auto diff = get_diff(curr_value, get_value(source_col, s, j, succ_begin, succ_end)); // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], curr_value.size() - diff.size(), @@ -1297,9 +1367,22 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, return; } + const std::string &rd_succ_fname = pred_succ_fprefix + kRowDiffForkSuccExt; + std::ifstream f(rd_succ_fname, ios::binary); + if (!rd_succ.load(f)) { + logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); + exit(1); + } + if (rd_succ.size() != num_rows) { + logger->error("Successor bitmap {} is incompatible with annotations." + " Vector size: {}, number of rows: {}", + rd_succ_fname, rd_succ.size(), num_rows); + exit(1); + } + anchor_bv_type anchor; const std::string anchors_fname = pred_succ_fprefix + kRowDiffAnchorExt; - std::ifstream f(anchors_fname, std::ios::binary); + f = std::ifstream(anchors_fname, std::ios::binary); if (!anchor.load(f)) { logger->error("Can't load anchors from {}", anchors_fname); exit(1); @@ -1365,10 +1448,10 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, [&](uint64_t) {}, [&](const bit_vector &source_col, uint64_t row_idx, uint64_t, size_t s, size_t j, - const uint64_t *succ, + const uint64_t *succ_begin, const uint64_t *succ_end, const uint64_t *pred_begin, const uint64_t *pred_end) { // get annotated coordinates for this k-mer - const auto curr_value = get_value(source_col, s, j, row_idx); + const auto curr_value = get_value(source_col, s, j, &row_idx, &row_idx + 1); bool is_anchor = anchor[row_idx]; @@ -1377,7 +1460,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, const auto diff = is_anchor ? curr_value - : get_diff(curr_value, get_value(source_col, s, j, *succ)); + : get_diff(curr_value, get_value(source_col, s, j, succ_begin, succ_end)); if (diff.size()) { // must write the coordinates/diff @@ -1386,12 +1469,11 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, if (is_anchor) { logger->trace("anchor: {}", curr_value.size()); } else { + auto next = get_value(source_col, s, j, succ_begin, succ_end); logger->trace("diff: {}{}: {} -> {}", - curr_value.size() <= get_value(source_col, s, j, *succ).size() - ? "\t\t\t + " : "\t - ", - diff.size(), curr_value.size(), get_value(source_col, s, j, *succ).size()); + curr_value.size() <= next.size() ? "\t\t\t + " : "\t - ", + diff.size(), curr_value.size(), next.size()); } - for (uint64_t coord : diff) { assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) && "coordinates must be unique and can't repeat"); @@ -1407,9 +1489,15 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, row_diff_bits[s][j]++; } + if (!curr_value.size() || !rd_succ[row_idx]) + return; + // check non-anchor predecessor nodes and add them if they are zero for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { - if (curr_value.size() && !source_col[*pred_p] && !anchor[*pred_p]) { + if (!source_col[*pred_p] && !anchor[*pred_p] + && (!num_coords_per_seq + || std::any_of(curr_value.begin(), curr_value.end(), + [&](uint64_t c) { return c % num_coords_per_seq; }))) { auto &v = set_rows_bwd[s][j]; // indicate that there are no coordinates for the predecessor v.emplace_back(*pred_p, DUMMY_COORD); diff --git a/metagraph/src/common/elias_fano/elias_fano_merger.hpp b/metagraph/src/common/elias_fano/elias_fano_merger.hpp index f42298c470..939aa64535 100644 --- a/metagraph/src/common/elias_fano/elias_fano_merger.hpp +++ b/metagraph/src/common/elias_fano/elias_fano_merger.hpp @@ -188,7 +188,8 @@ template void merge_files(std::vector sources, const std::function &on_new_item, bool remove_sources = true, - size_t max_sources_open = -1) { + size_t max_sources_open = -1, + bool deduplicate = true) { if (!sources.size()) return; @@ -223,7 +224,7 @@ void merge_files(std::vector sources, EliasFanoEncoderBuffered::append_block(buf, new_chunks.back()); buf.resize(0); } - }); + }, remove_sources, -1, deduplicate); if (buf.size()) { EliasFanoEncoderBuffered::append_block(buf, new_chunks.back()); } @@ -238,7 +239,7 @@ void merge_files(std::vector sources, T last = decoder.pop(); while (!decoder.empty()) { T curr = decoder.pop(); - if (curr != last) { + if (!deduplicate || curr != last) { on_new_item(last); last = curr; } diff --git a/metagraph/src/graph/representation/succinct/boss.cpp b/metagraph/src/graph/representation/succinct/boss.cpp index 2fd1b4885b..887b15e69b 100644 --- a/metagraph/src/graph/representation/succinct/boss.cpp +++ b/metagraph/src/graph/representation/succinct/boss.cpp @@ -2681,7 +2681,7 @@ void traverse_rd_path_backward(const BOSS &boss, T max_length, sdsl::bit_vector *visited, sdsl::bit_vector *terminal, - sdsl::bit_vector *dummy, + const sdsl::bit_vector &dummy, ProgressBar &progress_bar) { constexpr bool async = true; @@ -2696,7 +2696,7 @@ void traverse_rd_path_backward(const BOSS &boss, std::tie(edge, dist_to_anchor) = queue.back(); queue.pop_back(); - assert(boss.get_W(edge) && !fetch_bit(dummy->data(), edge, async)); + assert(boss.get_W(edge) && !dummy[edge]); // mark as visited // also check if the node had already been visited (in case of loop) @@ -2725,7 +2725,7 @@ void traverse_rd_path_backward(const BOSS &boss, // So, we propagate the diff path backward. boss.call_incoming_to_target(boss.bwd(edge), boss.get_node_last_value(edge), [&](edge_index pred) { - if (!fetch_bit(dummy->data(), pred, async)) + if (!dummy[pred]) queue.emplace_back(pred, dist_to_anchor + 1); } ); @@ -2805,7 +2805,7 @@ void BOSS::row_diff_traverse(size_t num_threads, // backward traversal traverse_path = [&](edge_index anchor) { traverse_rd_path_backward(*this, rd_succ, anchor, max_length, &visited, - terminal, &dummy, progress_bar); + terminal, dummy, progress_bar); }; // run backward traversal from every anchor @@ -2918,6 +2918,55 @@ void BOSS::row_diff_traverse(size_t num_threads, #endif } +void BOSS::row_diff_add_forks(size_t num_threads, + const sdsl::bit_vector &anchor, + sdsl::bit_vector *rd_succ, + size_t max_length) const { + ProgressBar progress_bar(W_->size(), "Adding fork successors", + std::cerr, !common::get_verbose()); + + constexpr bool async = true; + sdsl::bit_vector excluded(rd_succ->size(), false); + + // start from 0 and go with blocks of 1024 bits to avoid race conditions + #pragma omp parallel for num_threads(num_threads) schedule(static, 1024) + for (edge_index i = 0; i < W_->size(); ++i) { + ++progress_bar; + // skip edge if it's not a fork or it's already selected + if (i < 2 || is_single_outgoing(i) || (*rd_succ)[i]) + continue; + + // (*rd_succ)[i] = true; + // continue; + // TODO: test this + // if (!is_single_incoming(i, get_W(i))) + // continue; + + std::vector queue = { i }; + + // make branching edge a successor if it reaches anchors not too deep + for (size_t depth = 0; depth < max_length && queue.size(); ++depth) { + edge_index edge = queue.back(); + queue.pop_back(); + // stop branch if anchor is reached + if (anchor[edge]) + continue; + + assert(get_W(edge) % alph_size); + + call_outgoing(fwd(edge, get_W(edge) % alph_size), [&](BOSS::edge_index next) { + if (!fetch_bit(excluded.data(), next, async)) + queue.push_back(next); + }); + } + if (queue.empty()) { + (*rd_succ)[i] = true; + } else { + set_bit(excluded.data(), i, async); + } + } +} + void BOSS::call_unitigs(Call&&> callback, size_t num_threads, size_t min_tip_size, diff --git a/metagraph/src/graph/representation/succinct/boss.hpp b/metagraph/src/graph/representation/succinct/boss.hpp index 8397f82809..c161d8d984 100644 --- a/metagraph/src/graph/representation/succinct/boss.hpp +++ b/metagraph/src/graph/representation/succinct/boss.hpp @@ -156,6 +156,11 @@ class BOSS { const bit_vector &rd_succ, sdsl::bit_vector *terminal) const; + void row_diff_add_forks(size_t num_threads, + const sdsl::bit_vector &anchors, + sdsl::bit_vector *rd_succ, + size_t max_length) const; + edge_index row_diff_successor(edge_index edge, const bit_vector &rd_succ) const { TAlphabet d = get_W(edge) % alph_size; assert(d != kSentinelCode && "sinks have no row-diff successors"); From c5feb4312ed0eb138d92a5b6da47e0a26c01998e Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 16 Jun 2021 22:08:48 +0200 Subject: [PATCH 04/24] adapted fork_succ bitmap --- .../install_test_environment.sh | 2 +- .../src/annotation/annotation_converters.cpp | 3 +- .../binary_matrix/row_diff/row_diff.hpp | 33 ++++------- .../int_matrix/row_diff/int_row_diff.hpp | 31 +++------- metagraph/src/annotation/row_diff_builder.cpp | 59 +++++++++++-------- .../graph/representation/succinct/boss.hpp | 1 + .../representation/succinct/dbg_succinct.hpp | 24 ++++++++ 7 files changed, 81 insertions(+), 72 deletions(-) diff --git a/metagraph/integration_tests/install_test_environment.sh b/metagraph/integration_tests/install_test_environment.sh index 33f7b1e13f..37d5734422 100755 --- a/metagraph/integration_tests/install_test_environment.sh +++ b/metagraph/integration_tests/install_test_environment.sh @@ -12,7 +12,7 @@ VENV_DIR="$2" if [ -f ${VENV_DIR}/DONE ]; then echo "Found a previously set up virtual environment in ${VENV_DIR}" - return + exit 0 fi echo "Setting up virtual environment in ${VENV_DIR}" python3 -m venv ${VENV_DIR} diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp index 17464f87ab..820e5d0372 100644 --- a/metagraph/src/annotation/annotation_converters.cpp +++ b/metagraph/src/annotation/annotation_converters.cpp @@ -1247,8 +1247,7 @@ void convert_to_row_diff(const std::vector &files, } Timer timer; - logger->trace("Annotations in batch: {}", - file_batch.size()); + logger->trace("Annotations in batch: {}", file_batch.size()); if (construction_stage == RowDiffStage::COUNT_LABELS) { count_labels_per_row(file_batch, count_vector_fname, with_coordinates); diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index 5ae71fa22c..bc320c03d0 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -13,7 +13,6 @@ #include "common/logger.hpp" #include "common/utils/template_utils.hpp" #include "graph/annotated_dbg.hpp" -#include "graph/representation/succinct/boss.hpp" #include "graph/representation/succinct/dbg_succinct.hpp" @@ -110,7 +109,7 @@ template bool RowDiff::get(Row row, Column column) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); SetBitPositions set_bits = get_row(row); SetBitPositions::iterator v = std::lower_bound(set_bits.begin(), set_bits.end(), column); @@ -125,7 +124,7 @@ template std::vector RowDiff::get_column(Column column) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); // TODO: implement a more efficient algorithm std::vector result; @@ -140,21 +139,16 @@ template BinaryMatrix::SetBitPositions RowDiff::get_row(Row row) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); Vector result = diffs_.get_row(row); std::sort(result.begin(), result.end()); - uint64_t boss_edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); - const graph::boss::BOSS &boss = graph_->get_boss(); - const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last(); + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); while (!anchor_[row]) { - boss_edge = boss.row_diff_successor(boss_edge, rd_succ); - - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->boss_to_kmer_index(boss_edge)); + node = graph_->row_diff_successor(node, fork_succ_); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); auto diff_row = diffs_.get_row(row); std::sort(diff_row.begin(), diff_row.end()); @@ -169,7 +163,7 @@ std::vector RowDiff::get_rows(const std::vector &row_ids) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); // diff rows annotating nodes along the row-diff paths std::vector rd_ids; @@ -184,19 +178,10 @@ RowDiff::get_rows(const std::vector &row_ids) const { // been reached before, and thus, will be reconstructed before this one. std::vector> rd_paths_trunc(row_ids.size()); - const graph::boss::BOSS &boss = graph_->get_boss(); - const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last(); - for (size_t i = 0; i < row_ids.size(); ++i) { Row row = row_ids[i]; - graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); - while (true) { - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->boss_to_kmer_index(boss_edge)); - auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); rd_paths_trunc[i].push_back(it.value()); @@ -212,7 +197,9 @@ RowDiff::get_rows(const std::vector &row_ids) const { if (anchor_[row]) break; - boss_edge = boss.row_diff_successor(boss_edge, rd_succ); + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); + node = graph_->row_diff_successor(node, fork_succ_); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); } } diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 3f5655d76d..a57fe51ab6 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -13,7 +13,6 @@ #include "common/logger.hpp" #include "common/utils/template_utils.hpp" #include "graph/annotated_dbg.hpp" -#include "graph/representation/succinct/boss.hpp" #include "graph/representation/succinct/dbg_succinct.hpp" #include "annotation/binary_matrix/row_diff/row_diff.hpp" #include "annotation/int_matrix/base/int_matrix.hpp" @@ -99,7 +98,7 @@ template std::vector IntRowDiff::get_column(Column j) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); // TODO: implement a more efficient algorithm std::vector result; @@ -124,22 +123,17 @@ template IntMatrix::RowValues IntRowDiff::get_row_values(Row row) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); RowValues result = diffs_.get_row_values(row); decode_diffs(&result); std::sort(result.begin(), result.end()); - uint64_t boss_edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); - const graph::boss::BOSS &boss = graph_->get_boss(); - const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last(); + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); while (!anchor_[row]) { - boss_edge = boss.row_diff_successor(boss_edge, rd_succ); - - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->boss_to_kmer_index(boss_edge)); + node = graph_->row_diff_successor(node, fork_succ_); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); RowValues diff_row = diffs_.get_row_values(row); decode_diffs(&diff_row); @@ -176,7 +170,7 @@ std::vector IntRowDiff::get_row_values(const std::vector &row_ids) const { assert(graph_ && "graph must be loaded"); assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); // diff rows annotating nodes along the row-diff paths std::vector rd_ids; @@ -191,19 +185,10 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { // been reached before, and thus, will be reconstructed before this one. std::vector> rd_paths_trunc(row_ids.size()); - const graph::boss::BOSS &boss = graph_->get_boss(); - const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last(); - for (size_t i = 0; i < row_ids.size(); ++i) { Row row = row_ids[i]; - graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); - while (true) { - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->boss_to_kmer_index(boss_edge)); - auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); rd_paths_trunc[i].push_back(it.value()); @@ -219,7 +204,9 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { if (anchor_[row]) break; - boss_edge = boss.row_diff_successor(boss_edge, rd_succ); + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); + node = graph_->row_diff_successor(node, fork_succ_); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); } } diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 8cb9a5d6b6..fe5ea901c9 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -280,7 +280,7 @@ void route_at_forks(const graph::DBGSuccinct &graph, std::vector outgoing_counts; - sdsl::bit_vector rd_succ_bv(graph.num_nodes(), false); + sdsl::bit_vector rd_succ_bv(graph.num_nodes() + 1, false); sum_and_call_counts(count_vectors_dir, row_count_extension, "row counts", [&](int32_t count) { @@ -291,7 +291,7 @@ void route_at_forks(const graph::DBGSuccinct &graph, size_t max_pos = std::max_element(outgoing_counts.rbegin(), outgoing_counts.rend()) - outgoing_counts.rbegin(); - rd_succ_bv[to_row(graph_idx - max_pos)] = true; + rd_succ_bv[graph_idx - max_pos] = true; outgoing_counts.resize(0); } graph_idx++; @@ -483,7 +483,7 @@ void assign_anchors(const std::string &graph_fname, { sdsl::bit_vector rd_succ_bv(boss.get_last().size(), 0); rd_succ.call_ones([&](uint64_t i) { - rd_succ_bv[graph.kmer_to_boss_index(to_node(i))] = 1; + rd_succ_bv[graph.kmer_to_boss_index(i)] = 1; }); rd_succ = rd_succ_bv_type(rd_succ_bv); } @@ -493,11 +493,11 @@ void assign_anchors(const std::string &graph_fname, uint64_t num_fork_successors = rd_succ.num_set_bits(); sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); rd_succ = rd_succ_bv_type(); - boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); + // boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); { - sdsl::bit_vector bv(num_rows, 0); + sdsl::bit_vector bv(graph.num_nodes() + 1, 0); call_ones(rd_succ_bv, [&](BOSS::edge_index i) { - bv[to_row(graph.boss_to_kmer_index(i))] = 1; + bv[graph.boss_to_kmer_index(i)] = 1; }); rd_succ_bv = std::move(bv); } @@ -820,7 +820,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); exit(1); } - if (rd_succ.size() != num_rows) { + if (rd_succ.size() != num_rows + 1) { logger->error("Successor bitmap {} is incompatible with annotations." " Vector size: {}, number of rows: {}", rd_succ_fname, rd_succ.size(), num_rows); @@ -913,7 +913,11 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const uint64_t *it, const uint64_t *end) -> uint64_t { if (!with_values) { for ( ; it != end; ++it) { - if ((!rd_succ.size() || rd_succ[*it]) && col[*it]) + if (!rd_succ.size()) { + // the last edge (first succ row) is the only successor + return col[*it]; + } + if (rd_succ[to_node(*it)] && col[*it]) return true; } return false; @@ -921,7 +925,12 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, uint64_t value = 0; uint64_t rk; for ( ; it != end; ++it) { - if ((!rd_succ.size() || rd_succ[*it]) && (rk = col.conditional_rank1(*it))) + if (!rd_succ.size()) { + // the last edge (first succ row) is the only successor + rk = col.conditional_rank1(*it); + return rk ? values[s][j][rk - 1] : 0; + } + if (rd_succ[to_node(*it)] && (rk = col.conditional_rank1(*it))) value += values[s][j][rk - 1]; } return value; @@ -1006,7 +1015,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } } - if (!curr_value || !rd_succ.size() || !rd_succ[row_idx]) + if (!curr_value || !rd_succ.size() || !rd_succ[to_node(row_idx)]) return; // check non-anchor predecessor nodes and add them if they are zero @@ -1115,33 +1124,35 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const bool remove_chunks = true; uint64_t r = 0; - std::vector indexes; - indexes.reserve(row_diff_bits[l_idx][j]); + // TODO: use int_vector_buffer + std::vector ids; + ids.reserve(row_diff_bits[l_idx][j]); int64_t value = 0; elias_fano::merge_files(filenames, [&](T v) { if constexpr(with_values) { assert(v.second && "zero diffs must have been skipped"); - if (indexes.empty() || v.first != indexes.back()) { - if (!indexes.empty()) + if (ids.empty() || v.first != ids.back()) { + if (!ids.empty()) values[l_idx][j][r++] = matrix::encode_diff(value); value = 0; - indexes.push_back(v.first); + ids.push_back(v.first); } value += v.second; } else { - indexes.push_back(v); + if (ids.empty() || v != ids.back()) + ids.push_back(v); } }, remove_chunks, chunks_open_per_thread, false); - if constexpr(with_values) { + if (with_values && ids.size()) { values[l_idx][j][r++] = matrix::encode_diff(value); values[l_idx][j].resize(r); - assert(r == indexes.size()); + assert(r == ids.size()); } - assert(row_diff_bits[l_idx][j] >= indexes.size()); + assert(row_diff_bits[l_idx][j] >= ids.size()); columns[j] = std::make_unique( - [&](const auto &callback) { std::for_each(indexes.begin(), indexes.end(), callback); }, - num_rows, indexes.size()); + [&](const auto &callback) { std::for_each(ids.begin(), ids.end(), callback); }, + num_rows, ids.size()); } if (compute_row_reduction) { @@ -1260,7 +1271,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, std::vector result; uint64_t rk; for ( ; it != end; ++it) { - if ((!rd_succ.size() || rd_succ[*it]) && (rk = col.conditional_rank1(*it))) { + if ((!rd_succ.size() || rd_succ[to_node(*it)]) && (rk = col.conditional_rank1(*it))) { uint64_t t = delims[s][j].select1(rk); while (!delims[s][j][++t]) { result.push_back(coords[s][j][t - rk]); @@ -1373,7 +1384,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); exit(1); } - if (rd_succ.size() != num_rows) { + if (rd_succ.size() != num_rows + 1) { logger->error("Successor bitmap {} is incompatible with annotations." " Vector size: {}, number of rows: {}", rd_succ_fname, rd_succ.size(), num_rows); @@ -1489,7 +1500,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, row_diff_bits[s][j]++; } - if (!curr_value.size() || !rd_succ[row_idx]) + if (!curr_value.size() || !rd_succ[to_node(row_idx)]) return; // check non-anchor predecessor nodes and add them if they are zero diff --git a/metagraph/src/graph/representation/succinct/boss.hpp b/metagraph/src/graph/representation/succinct/boss.hpp index c161d8d984..f4d3dfdc63 100644 --- a/metagraph/src/graph/representation/succinct/boss.hpp +++ b/metagraph/src/graph/representation/succinct/boss.hpp @@ -161,6 +161,7 @@ class BOSS { sdsl::bit_vector *rd_succ, size_t max_length) const; + // TODO: remove edge_index row_diff_successor(edge_index edge, const bit_vector &rd_succ) const { TAlphabet d = get_W(edge) % alph_size; assert(d != kSentinelCode && "sinks have no row-diff successors"); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 3aa32a5cce..87ae42d61e 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -166,6 +166,8 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; + node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const; + uint64_t kmer_to_boss_index(node_index kmer_index) const; node_index boss_to_kmer_index(uint64_t boss_index) const; @@ -191,6 +193,28 @@ class DBGSuccinct : public DeBruijnGraph { std::unique_ptr> bloom_filter_; }; +inline DBGSuccinct::node_index +DBGSuccinct::row_diff_successor(node_index node, const bit_vector &rd_succ) const { + const boss::BOSS &boss = *boss_graph_; + boss::BOSS::edge_index edge = kmer_to_boss_index(node); + boss::BOSS::TAlphabet d = boss.get_W(edge) % boss.alph_size; + assert(d != boss::BOSS::kSentinelCode && "sinks have no row-diff successors"); + // make one traversal step + edge = boss.fwd(edge, d); + node = boss_to_kmer_index(edge); + + if (!rd_succ.size() || boss.get_last(edge - 1)) + return node; + + // pick the row-diff successor + while (!rd_succ[node]) { + node--; + edge--; + assert(!boss.get_last(edge) && "a row-diff successor must exist"); + } + return node; +} + } // namespace graph } // namespace mtg From 5eec5742cde901d0f420a6990f4fd309fd894533 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 21 Jun 2021 11:48:24 +0100 Subject: [PATCH 05/24] always construct rd_succ bitmap --- metagraph/src/annotation/row_diff_builder.cpp | 91 ++++++++++--------- .../representation/succinct/dbg_succinct.cpp | 3 +- 2 files changed, 49 insertions(+), 45 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index fe5ea901c9..52adb551a9 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -263,7 +263,8 @@ void route_at_forks(const graph::DBGSuccinct &graph, const std::string &row_count_extension) { logger->trace("Assigning row-diff successors at forks..."); - rd_succ_bv_type rd_succ; + const bit_vector &last = graph.get_boss().get_last(); + sdsl::bit_vector rd_succ_bv(graph.num_nodes() + 1, false); bool optimize_forks = false; for (const auto &p : fs::directory_iterator(count_vectors_dir)) { @@ -275,13 +276,9 @@ void route_at_forks(const graph::DBGSuccinct &graph, logger->trace("RowDiff successors will be set to the adjacent nodes with" " the largest number of labels"); - const bit_vector &last = graph.get_boss().get_last(); graph::DeBruijnGraph::node_index graph_idx = to_node(0); - std::vector outgoing_counts; - sdsl::bit_vector rd_succ_bv(graph.num_nodes() + 1, false); - sum_and_call_counts(count_vectors_dir, row_count_extension, "row counts", [&](int32_t count) { // TODO: skip single outgoing @@ -304,14 +301,21 @@ void route_at_forks(const graph::DBGSuccinct &graph, exit(1); } - rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); - } else { + // TODO: remove this mode? logger->warn("No count vectors could be found in {}. The last outgoing" " edges will be selected for assigning RowDiff successors", count_vectors_dir); + + last.call_ones([&](BOSS::edge_index i) { + rd_succ_bv[graph.boss_to_kmer_index(i)] = true; + }); + // npos is never a successor + rd_succ_bv[0] = false; } + rd_succ_bv_type rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); + std::ofstream f(rd_succ_fname, ios::binary); rd_succ.serialize(f); logger->trace("RowDiff successors are assigned for forks and written to {}", @@ -477,42 +481,41 @@ void assign_anchors(const std::string &graph_fname, logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); exit(1); } + if (rd_succ.size() != num_rows + 1) { + logger->error("Successor bitmap {} is incompatible with annotations." + " Vector size: {}, number of rows: {}", + rd_succ_fname, rd_succ.size(), num_rows); + exit(1); + } - if (rd_succ.size()) { - logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); - { - sdsl::bit_vector rd_succ_bv(boss.get_last().size(), 0); - rd_succ.call_ones([&](uint64_t i) { - rd_succ_bv[graph.kmer_to_boss_index(i)] = 1; - }); - rd_succ = rd_succ_bv_type(rd_succ_bv); - } - boss.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); - - logger->trace("Adding branching off forks to RowDiff successors {}...", rd_succ_fname); - uint64_t num_fork_successors = rd_succ.num_set_bits(); - sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); - rd_succ = rd_succ_bv_type(); - // boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); - { - sdsl::bit_vector bv(graph.num_nodes() + 1, 0); - call_ones(rd_succ_bv, [&](BOSS::edge_index i) { - bv[graph.boss_to_kmer_index(i)] = 1; - }); - rd_succ_bv = std::move(bv); - } - rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); - logger->trace("Number of successors at forks increased from {} to {}", - num_fork_successors, rd_succ.num_set_bits()); - std::ofstream out(rd_succ_fname, ios::binary); - rd_succ.serialize(out); - logger->trace("Updated RowDiff successors {}", rd_succ_fname); + logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); + { + sdsl::bit_vector rd_succ_bv(boss.get_last().size(), 0); + rd_succ.call_ones([&](uint64_t i) { + rd_succ_bv[graph.kmer_to_boss_index(i)] = 1; + }); + rd_succ = rd_succ_bv_type(rd_succ_bv); + } + boss.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); - } else { - logger->warn("Assigning anchors without chosen RowDiff successors." - " The last outgoing edges will be used for routing."); - boss.row_diff_traverse(num_threads, max_length, boss.get_last(), &anchors_bv); + logger->trace("Adding branching off forks to RowDiff successors {}...", rd_succ_fname); + uint64_t num_fork_successors = rd_succ.num_set_bits(); + sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); + rd_succ = rd_succ_bv_type(); + // boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); + { + sdsl::bit_vector bv(graph.num_nodes() + 1, 0); + call_ones(rd_succ_bv, [&](BOSS::edge_index i) { + bv[graph.boss_to_kmer_index(i)] = 1; + }); + rd_succ_bv = std::move(bv); } + rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); + logger->trace("Number of successors at forks increased from {} to {}", + num_fork_successors, rd_succ.num_set_bits()); + std::ofstream out(rd_succ_fname, ios::binary); + rd_succ.serialize(out); + logger->trace("Updated RowDiff successors {}", rd_succ_fname); } // anchors_bv uses BOSS edges as indices, so we need to map it to annotation indices @@ -913,7 +916,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const uint64_t *it, const uint64_t *end) -> uint64_t { if (!with_values) { for ( ; it != end; ++it) { - if (!rd_succ.size()) { + if (compute_row_reduction) { // the last edge (first succ row) is the only successor return col[*it]; } @@ -925,7 +928,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, uint64_t value = 0; uint64_t rk; for ( ; it != end; ++it) { - if (!rd_succ.size()) { + if (compute_row_reduction) { // the last edge (first succ row) is the only successor rk = col.conditional_rank1(*it); return rk ? values[s][j][rk - 1] : 0; @@ -1015,7 +1018,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } } - if (!curr_value || !rd_succ.size() || !rd_succ[to_node(row_idx)]) + if (!curr_value || compute_row_reduction || !rd_succ[to_node(row_idx)]) return; // check non-anchor predecessor nodes and add them if they are zero @@ -1271,7 +1274,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, std::vector result; uint64_t rk; for ( ; it != end; ++it) { - if ((!rd_succ.size() || rd_succ[to_node(*it)]) && (rk = col.conditional_rank1(*it))) { + if ((compute_row_reduction || rd_succ[to_node(*it)]) && (rk = col.conditional_rank1(*it))) { uint64_t t = delims[s][j].select1(rk); while (!delims[s][j][++t]) { result.push_back(coords[s][j][t - rk]); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 423b14d548..a7cccf8140 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -870,7 +870,8 @@ void DBGSuccinct::mask_dummy_kmers(size_t num_threads, bool with_pruning) { } uint64_t DBGSuccinct::kmer_to_boss_index(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0); + assert(node <= num_nodes()); if (!valid_edges_.get()) return node; From 78508ada5d5a73bde618fd29153073ad8c8e08a5 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 21 Jun 2021 13:17:45 +0100 Subject: [PATCH 06/24] assign multiple rd successors only for counts or coordinates --- .../src/annotation/annotation_converters.cpp | 3 ++- metagraph/src/annotation/row_diff_builder.cpp | 20 ++++++++++--------- metagraph/src/annotation/row_diff_builder.hpp | 3 ++- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp index 820e5d0372..6d5093ca1e 100644 --- a/metagraph/src/annotation/annotation_converters.cpp +++ b/metagraph/src/annotation/annotation_converters.cpp @@ -1157,7 +1157,8 @@ void convert_to_row_diff(const std::vector &files, if (construction_stage == RowDiffStage::CONVERT) { assign_anchors(graph_fname, graph_fname, out_dir, max_path_length, - ".row_reduction", get_num_threads()); + ".row_reduction", get_num_threads(), + with_values || with_coordinates); const std::string anchors_fname = graph_fname + kRowDiffAnchorExt; if (!fs::exists(anchors_fname)) { diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 52adb551a9..98d055a85b 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -421,7 +421,8 @@ void assign_anchors(const std::string &graph_fname, const std::filesystem::path &count_vectors_dir, uint32_t max_length, const std::string &row_reduction_extension, - uint32_t num_threads) { + uint32_t num_threads, + bool multiple_fork_successors) { std::string anchor_filename = outfbase + kRowDiffAnchorExt; if (fs::exists(anchor_filename)) { logger->trace("Using existing anchors {}", anchor_filename); @@ -502,14 +503,15 @@ void assign_anchors(const std::string &graph_fname, uint64_t num_fork_successors = rd_succ.num_set_bits(); sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); rd_succ = rd_succ_bv_type(); - // boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); - { - sdsl::bit_vector bv(graph.num_nodes() + 1, 0); - call_ones(rd_succ_bv, [&](BOSS::edge_index i) { - bv[graph.boss_to_kmer_index(i)] = 1; - }); - rd_succ_bv = std::move(bv); - } + if (multiple_fork_successors) + boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); + + sdsl::bit_vector bv(graph.num_nodes() + 1, 0); + call_ones(rd_succ_bv, [&](BOSS::edge_index i) { + bv[graph.boss_to_kmer_index(i)] = 1; + }); + rd_succ_bv = std::move(bv); + rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); logger->trace("Number of successors at forks increased from {} to {}", num_fork_successors, rd_succ.num_set_bits()); diff --git a/metagraph/src/annotation/row_diff_builder.hpp b/metagraph/src/annotation/row_diff_builder.hpp index f57fe4c38c..5718eeb8f4 100644 --- a/metagraph/src/annotation/row_diff_builder.hpp +++ b/metagraph/src/annotation/row_diff_builder.hpp @@ -27,7 +27,8 @@ void assign_anchors(const std::string &graph_filename, const std::filesystem::path &dest_dir, uint32_t max_length, const std::string &row_reduction_extension, - uint32_t num_threads); + uint32_t num_threads, + bool multiple_fork_successors = false); void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const std::vector &source_files, From 5a1a20fe2322c44325e178b929692634b333d2b1 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Tue, 22 Jun 2021 12:44:46 +0100 Subject: [PATCH 07/24] adapted query algorithm for multiple fork successors --- .../int_matrix/row_diff/int_row_diff.hpp | 82 +++++++++---------- metagraph/src/annotation/row_diff_builder.hpp | 2 +- .../representation/succinct/dbg_succinct.hpp | 27 ++++++ 3 files changed, 67 insertions(+), 44 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index a57fe51ab6..44726a8bef 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -121,31 +121,7 @@ IntMatrix::SetBitPositions IntRowDiff::get_row(Row i) const { template IntMatrix::RowValues IntRowDiff::get_row_values(Row row) const { - assert(graph_ && "graph must be loaded"); - assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); - assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); - - RowValues result = diffs_.get_row_values(row); - decode_diffs(&result); - std::sort(result.begin(), result.end()); - - auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); - - while (!anchor_[row]) { - node = graph_->row_diff_successor(node, fork_succ_); - row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); - - RowValues diff_row = diffs_.get_row_values(row); - decode_diffs(&diff_row); - std::sort(diff_row.begin(), diff_row.end()); - add_diff(diff_row, &result); - } - - assert(std::all_of(result.begin(), result.end(), - [](auto &p) { return p.second; })); - assert(std::all_of(result.begin(), result.end(), - [](auto &p) { return (int64_t)p.second > 0; })); - return result; + return get_row_values(std::vector{ row })[0]; } template @@ -183,31 +159,50 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { // Truncated row-diff paths, indexes to |rd_rows|. // The last index in each path points to an anchor or to a row which had // been reached before, and thus, will be reconstructed before this one. - std::vector> rd_paths_trunc(row_ids.size()); + std::vector>> rd_paths_trunc(row_ids.size()); for (size_t i = 0; i < row_ids.size(); ++i) { - Row row = row_ids[i]; - - while (true) { + std::vector> &rd_path = rd_paths_trunc[i]; + + std::vector path; + Vector> queue; + queue.emplace_back(0, row_ids[i]); + + while (queue.size()) { + size_t depth = queue.back().first; + Row row = queue.back().second; + queue.pop_back(); + while (depth < path.size()) { + assert(path.size() > 1); + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); + } auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); - rd_paths_trunc[i].push_back(it.value()); - + path.push_back(it.value()); // If a node had been reached before, we interrupt the diff path. // The annotation for that node will have been reconstructed earlier // than for other nodes in this path as well. Thus, we will start // reconstruction from that node and don't need its successors. if (!is_new) - break; + continue; rd_ids.push_back(row); if (anchor_[row]) - break; + continue; auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); - node = graph_->row_diff_successor(node, fork_succ_); - row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node); + graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) { + queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ)); + }); + } + + while (path.size() > 1) { + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); } + assert(path.size()); + rd_path.emplace_back(-1, path[0]); } node_to_rd = VectorMap(); @@ -215,6 +210,7 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { std::vector rd_rows = diffs_.get_row_values(rd_ids); for (auto &row : rd_rows) { decode_diffs(&row); + std::sort(row.begin(), row.end()); } rd_ids = std::vector(); @@ -223,17 +219,17 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { std::vector rows(row_ids.size()); for (size_t i = 0; i < row_ids.size(); ++i) { - RowValues &result = rows[i]; + const auto &rd_path = rd_paths_trunc[i]; // propagate back and reconstruct full annotations for predecessors - for (auto it = rd_paths_trunc[i].rbegin(); it != rd_paths_trunc[i].rend(); ++it) { - std::sort(rd_rows[*it].begin(), rd_rows[*it].end()); - add_diff(rd_rows[*it], &result); - // replace diff row with full reconstructed annotation - rd_rows[*it] = result; + for (size_t j = 0; j + 1 < rd_path.size(); ++j) { + auto [node, succ] = rd_path[j]; + // reconstruct annotation by adding the diff (full succ + diff) + add_diff(rd_rows[succ], &rd_rows[node]); } - assert(std::all_of(result.begin(), result.end(), + rows[i] = rd_rows[rd_path.back().second]; + assert(std::all_of(rows[i].begin(), rows[i].end(), [](auto &p) { return p.second; })); - assert(std::all_of(result.begin(), result.end(), + assert(std::all_of(rows[i].begin(), rows[i].end(), [](auto &p) { return (int64_t)p.second > 0; })); } diff --git a/metagraph/src/annotation/row_diff_builder.hpp b/metagraph/src/annotation/row_diff_builder.hpp index 5718eeb8f4..d9ae8dd1ed 100644 --- a/metagraph/src/annotation/row_diff_builder.hpp +++ b/metagraph/src/annotation/row_diff_builder.hpp @@ -28,7 +28,7 @@ void assign_anchors(const std::string &graph_filename, uint32_t max_length, const std::string &row_reduction_extension, uint32_t num_threads, - bool multiple_fork_successors = false); + bool multiple_fork_successors); void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const std::vector &source_files, diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 87ae42d61e..bcc94b90a1 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -167,6 +167,8 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const; + template + void call_row_diff_successors(node_index node, const bit_vector &rd_succ, const Callback &callback) const; uint64_t kmer_to_boss_index(node_index kmer_index) const; node_index boss_to_kmer_index(uint64_t boss_index) const; @@ -215,6 +217,31 @@ DBGSuccinct::row_diff_successor(node_index node, const bit_vector &rd_succ) cons return node; } +template +inline void DBGSuccinct::call_row_diff_successors(node_index node, + const bit_vector &rd_succ, + const Callback &callback) const { + const boss::BOSS &boss = *boss_graph_; + boss::BOSS::edge_index edge = kmer_to_boss_index(node); + boss::BOSS::TAlphabet d = boss.get_W(edge) % boss.alph_size; + assert(d != boss::BOSS::kSentinelCode && "sinks have no row-diff successors"); + // make one traversal step + edge = boss.fwd(edge, d); + node = boss_to_kmer_index(edge); + + if (!rd_succ.size() || boss.get_last(edge - 1)) { + callback(node); + return; + } + + // pick the row-diff successor + do { + if (rd_succ[node]) + callback(node); + node--; + } while (!boss.get_last(--edge)); +} + } // namespace graph } // namespace mtg From bdfd7c14bc0f766255959ebe6770cf09952544ec Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 5 Jul 2021 18:10:41 +0200 Subject: [PATCH 08/24] streamlined successor assignment --- .../src/annotation/annotation_converters.cpp | 3 - metagraph/src/annotation/row_diff_builder.cpp | 43 +++++-------- .../graph/representation/succinct/boss.cpp | 63 +++---------------- .../graph/representation/succinct/boss.hpp | 13 ++-- .../representation/succinct/dbg_succinct.cpp | 55 ++++++++++++++++ .../representation/succinct/dbg_succinct.hpp | 5 ++ metagraph/tests/graph/succinct/test_boss.cpp | 14 ++--- 7 files changed, 93 insertions(+), 103 deletions(-) diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp index 6d5093ca1e..af74064b62 100644 --- a/metagraph/src/annotation/annotation_converters.cpp +++ b/metagraph/src/annotation/annotation_converters.cpp @@ -1173,10 +1173,7 @@ void convert_to_row_diff(const std::vector &files, return; } mem_bytes -= anchor_size; - } - // TODO: move - if (construction_stage != RowDiffStage::COUNT_LABELS) { const std::string rd_succ_fname = graph_fname + kRowDiffForkSuccExt; if (!fs::exists(rd_succ_fname)) { logger->error("Can't find row-diff successor bitmap at {}", rd_succ_fname); diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 98d055a85b..f867dd7863 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -314,10 +314,8 @@ void route_at_forks(const graph::DBGSuccinct &graph, rd_succ_bv[0] = false; } - rd_succ_bv_type rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); - std::ofstream f(rd_succ_fname, ios::binary); - rd_succ.serialize(f); + rd_succ_bv.serialize(f); logger->trace("RowDiff successors are assigned for forks and written to {}", rd_succ_fname); } @@ -475,44 +473,33 @@ void assign_anchors(const std::string &graph_fname, // assign extra anchors and restrict the length of row-diff paths logger->trace("Assigning required anchors..."); { - rd_succ_bv_type rd_succ; + sdsl::bit_vector rd_succ_bv; const std::string &rd_succ_fname = outfbase + kRowDiffForkSuccExt; std::ifstream f(rd_succ_fname, ios::binary); - if (!rd_succ.load(f)) { + try { + rd_succ_bv.load(f); + } catch (...) { logger->error("Couldn't load row-diff successor bitmap from {}", rd_succ_fname); exit(1); } - if (rd_succ.size() != num_rows + 1) { - logger->error("Successor bitmap {} is incompatible with annotations." - " Vector size: {}, number of rows: {}", - rd_succ_fname, rd_succ.size(), num_rows); + if (rd_succ_bv.size() != graph.num_nodes() + 1) { + logger->error("Successor bitmap {} is incompatible with the graph." + " Vector size: {}, number of nodes: {}", + rd_succ_fname, rd_succ_bv.size(), graph.num_nodes()); exit(1); } logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); - { - sdsl::bit_vector rd_succ_bv(boss.get_last().size(), 0); - rd_succ.call_ones([&](uint64_t i) { - rd_succ_bv[graph.kmer_to_boss_index(i)] = 1; - }); - rd_succ = rd_succ_bv_type(rd_succ_bv); - } - boss.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); + boss.row_diff_traverse(num_threads, max_length, + [&](BOSS::edge_index i) { return rd_succ_bv[graph.boss_to_kmer_index(i)]; }, + &anchors_bv); logger->trace("Adding branching off forks to RowDiff successors {}...", rd_succ_fname); - uint64_t num_fork_successors = rd_succ.num_set_bits(); - sdsl::bit_vector rd_succ_bv = rd_succ.convert_to(); - rd_succ = rd_succ_bv_type(); + uint64_t num_fork_successors = sdsl::util::cnt_one_bits(rd_succ_bv); if (multiple_fork_successors) - boss.row_diff_add_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); - - sdsl::bit_vector bv(graph.num_nodes() + 1, 0); - call_ones(rd_succ_bv, [&](BOSS::edge_index i) { - bv[graph.boss_to_kmer_index(i)] = 1; - }); - rd_succ_bv = std::move(bv); + graph.add_rd_successors_at_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); - rd_succ = rd_succ_bv_type(std::move(rd_succ_bv)); + rd_succ_bv_type rd_succ(std::move(rd_succ_bv)); logger->trace("Number of successors at forks increased from {} to {}", num_fork_successors, rd_succ.num_set_bits()); std::ofstream out(rd_succ_fname, ios::binary); diff --git a/metagraph/src/graph/representation/succinct/boss.cpp b/metagraph/src/graph/representation/succinct/boss.cpp index 887b15e69b..1f3001c08c 100644 --- a/metagraph/src/graph/representation/succinct/boss.cpp +++ b/metagraph/src/graph/representation/succinct/boss.cpp @@ -2452,7 +2452,7 @@ void update_terminal_bits(size_t max_length, * if the row-diff successor in a fork has already been visited. */ void traverse_rd_path(const BOSS &boss, - const bit_vector &rd_succ, + const std::function &is_successor, edge_index edge, size_t max_length, sdsl::bit_vector *visited, @@ -2480,7 +2480,7 @@ void traverse_rd_path(const BOSS &boss, path.push_back(edge); - edge = boss.row_diff_successor(edge, rd_succ); + edge = boss.row_diff_successor(edge, is_successor); } // mark terminal and near terminal nodes @@ -2676,7 +2676,7 @@ void BOSS::call_sequences(Call&&> callbac // Reach all k-mers that merge into anchor |edge| by following their diff paths. template void traverse_rd_path_backward(const BOSS &boss, - const bit_vector &rd_succ, + const std::function &is_successor, edge_index edge, T max_length, sdsl::bit_vector *visited, @@ -2717,7 +2717,7 @@ void traverse_rd_path_backward(const BOSS &boss, // AAAX - AAX$ // ^^^^ // AAAY - **** - if (!rd_succ[edge]) + if (!is_successor(edge)) continue; // |edge| is the row-diff successor. Thus, it is part of a diff @@ -2734,7 +2734,7 @@ void traverse_rd_path_backward(const BOSS &boss, void BOSS::row_diff_traverse(size_t num_threads, size_t max_length, - const bit_vector &rd_succ, + const std::function &is_successor, sdsl::bit_vector *terminal) const { // TODO: can we do it without using this extra |dummy| bitmap? sdsl::bit_vector dummy(W_->size(), false); @@ -2804,7 +2804,7 @@ void BOSS::row_diff_traverse(size_t num_threads, // backward traversal traverse_path = [&](edge_index anchor) { - traverse_rd_path_backward(*this, rd_succ, anchor, max_length, &visited, + traverse_rd_path_backward(*this, is_successor, anchor, max_length, &visited, terminal, dummy, progress_bar); }; @@ -2816,7 +2816,7 @@ void BOSS::row_diff_traverse(size_t num_threads, // forward traversal traverse_path = [&](edge_index start) { - traverse_rd_path(*this, rd_succ, start, max_length, &visited, + traverse_rd_path(*this, is_successor, start, max_length, &visited, terminal, &near_terminal, progress_bar); }; // start traversal from the dummy source edges first ($X...X) @@ -2918,55 +2918,6 @@ void BOSS::row_diff_traverse(size_t num_threads, #endif } -void BOSS::row_diff_add_forks(size_t num_threads, - const sdsl::bit_vector &anchor, - sdsl::bit_vector *rd_succ, - size_t max_length) const { - ProgressBar progress_bar(W_->size(), "Adding fork successors", - std::cerr, !common::get_verbose()); - - constexpr bool async = true; - sdsl::bit_vector excluded(rd_succ->size(), false); - - // start from 0 and go with blocks of 1024 bits to avoid race conditions - #pragma omp parallel for num_threads(num_threads) schedule(static, 1024) - for (edge_index i = 0; i < W_->size(); ++i) { - ++progress_bar; - // skip edge if it's not a fork or it's already selected - if (i < 2 || is_single_outgoing(i) || (*rd_succ)[i]) - continue; - - // (*rd_succ)[i] = true; - // continue; - // TODO: test this - // if (!is_single_incoming(i, get_W(i))) - // continue; - - std::vector queue = { i }; - - // make branching edge a successor if it reaches anchors not too deep - for (size_t depth = 0; depth < max_length && queue.size(); ++depth) { - edge_index edge = queue.back(); - queue.pop_back(); - // stop branch if anchor is reached - if (anchor[edge]) - continue; - - assert(get_W(edge) % alph_size); - - call_outgoing(fwd(edge, get_W(edge) % alph_size), [&](BOSS::edge_index next) { - if (!fetch_bit(excluded.data(), next, async)) - queue.push_back(next); - }); - } - if (queue.empty()) { - (*rd_succ)[i] = true; - } else { - set_bit(excluded.data(), i, async); - } - } -} - void BOSS::call_unitigs(Call&&> callback, size_t num_threads, size_t min_tip_size, diff --git a/metagraph/src/graph/representation/succinct/boss.hpp b/metagraph/src/graph/representation/succinct/boss.hpp index f4d3dfdc63..98d7917106 100644 --- a/metagraph/src/graph/representation/succinct/boss.hpp +++ b/metagraph/src/graph/representation/succinct/boss.hpp @@ -153,23 +153,18 @@ class BOSS { */ void row_diff_traverse(size_t num_threads, size_t max_length, - const bit_vector &rd_succ, + const std::function &is_successor, sdsl::bit_vector *terminal) const; - void row_diff_add_forks(size_t num_threads, - const sdsl::bit_vector &anchors, - sdsl::bit_vector *rd_succ, - size_t max_length) const; - - // TODO: remove - edge_index row_diff_successor(edge_index edge, const bit_vector &rd_succ) const { + // TODO: move all row-diff related things to DBGSuccinct and remove this function + edge_index row_diff_successor(edge_index edge, const std::function &is_successor) const { TAlphabet d = get_W(edge) % alph_size; assert(d != kSentinelCode && "sinks have no row-diff successors"); // make one traversal step edge = fwd(edge, d); // pick the row-diff successor if (!get_last(edge - 1)) { - while (!rd_succ[edge]) { + while (!is_successor(edge)) { edge--; assert(!get_last(edge) && "a row-diff successor must exist"); } diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index a7cccf8140..d15d0f2e49 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -6,6 +6,8 @@ #include #include +#include + #include "common/seq_tools/reverse_complement.hpp" #include "common/serialization.hpp" #include "common/logger.hpp" @@ -1013,5 +1015,58 @@ void DBGSuccinct::print(std::ostream &out) const { } } +void DBGSuccinct::add_rd_successors_at_forks(size_t num_threads, + const sdsl::bit_vector &anchor, + sdsl::bit_vector *rd_succ, + size_t max_length) const { + const auto &W = boss_graph_->get_W(); + + ProgressBar progress_bar(W.size(), "Adding fork successors", + std::cerr, !common::get_verbose()); + + constexpr bool async = true; + sdsl::bit_vector excluded(W.size(), false); + + // start from 0 and go with blocks of 1024 bits to avoid race conditions + #pragma omp parallel for num_threads(num_threads) schedule(static, 1024) + for (BOSS::edge_index i = 0; i < W.size(); ++i) { + ++progress_bar; + // skip edge if it's not a fork or it's already selected + if (i < 2 || boss_graph_->is_single_outgoing(i) || (*rd_succ)[boss_to_kmer_index(i)]) + continue; + + // (*rd_succ)[i] = true; + // continue; + // TODO: test this + // if (!is_single_incoming(i, get_W(i))) + // continue; + + std::vector queue = { i }; + + // make branching edge a successor if it reaches anchors not too deep + for (size_t depth = 0; depth < max_length && queue.size(); ++depth) { + BOSS::edge_index edge = queue.back(); + queue.pop_back(); + // stop branch if anchor is reached + if (anchor[edge]) + continue; + + assert(W[edge] % boss_graph_->alph_size); + + boss_graph_->call_outgoing(boss_graph_->fwd(edge, W[edge] % boss_graph_->alph_size), + [&](BOSS::edge_index next) { + if (!fetch_bit(excluded.data(), next, async)) + queue.push_back(next); + } + ); + } + if (queue.empty()) { + (*rd_succ)[boss_to_kmer_index(i)] = true; + } else { + set_bit(excluded.data(), i, async); + } + } +} + } // namespace graph } // namespace mtg diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index bcc94b90a1..628bd4a125 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -170,6 +170,11 @@ class DBGSuccinct : public DeBruijnGraph { template void call_row_diff_successors(node_index node, const bit_vector &rd_succ, const Callback &callback) const; + void add_rd_successors_at_forks(size_t num_threads, + const sdsl::bit_vector &anchors, + sdsl::bit_vector *rd_succ, + size_t max_length) const; + uint64_t kmer_to_boss_index(node_index kmer_index) const; node_index boss_to_kmer_index(uint64_t boss_index) const; diff --git a/metagraph/tests/graph/succinct/test_boss.cpp b/metagraph/tests/graph/succinct/test_boss.cpp index 39da899143..23d42ab266 100644 --- a/metagraph/tests/graph/succinct/test_boss.cpp +++ b/metagraph/tests/graph/succinct/test_boss.cpp @@ -952,7 +952,7 @@ TEST(BOSS, CallSequencesRowDiff_EmptyGraph) { BOSS empty(k); sdsl::bit_vector terminal(empty.get_last().size(), false); - empty.row_diff_traverse(num_threads, 1, empty.get_last(), &terminal); + empty.row_diff_traverse(num_threads, 1, [&](auto i) { return empty.get_last(i); }, &terminal); ASSERT_EQ(sdsl::bit_vector(2, false), terminal) << "Empty graph must have no anchors"; } @@ -1067,7 +1067,7 @@ TEST(BOSS, CallSequenceRowDiff_TwoLoops) { ASSERT_EQ(2u, graph.num_edges()); sdsl::bit_vector terminal(graph.get_last().size(), false); - graph.row_diff_traverse(num_threads, 1, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 1, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(sdsl::bit_vector({ 0, 0, 1 }), terminal); } @@ -1128,11 +1128,11 @@ TEST(BOSS, CallSequenceRowDiff_TwoBigLoops) { BOSS graph(&constructor); sdsl::bit_vector terminal(graph.get_last().size(), false); - graph.row_diff_traverse(num_threads, 100, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 100, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(2, std::accumulate(terminal.begin() + 1, terminal.end(), 0U)); - graph.row_diff_traverse(num_threads, 1, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 1, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(104, std::accumulate(terminal.begin() + 1, terminal.end(), 0U)); } @@ -1204,7 +1204,7 @@ TEST(BOSS, CallSequenceRowDiff_FourLoops) { BOSS graph(&constructor); sdsl::bit_vector terminal(graph.get_last().size(), false); - graph.row_diff_traverse(num_threads, 1, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 1, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(4, std::accumulate(terminal.begin() + 1, terminal.end(), 0U)); } @@ -1220,11 +1220,11 @@ TEST(BOSS, CallSequenceRowDiff_FourPaths) { BOSS graph(&constructor); for (size_t num_threads : { 1, 4 }) { sdsl::bit_vector terminal(graph.get_last().size(), false); - graph.row_diff_traverse(num_threads, 20, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 20, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(4, std::accumulate(terminal.begin() + 1, terminal.end(), 0U)); - graph.row_diff_traverse(num_threads, 1, graph.get_last(), &terminal); + graph.row_diff_traverse(num_threads, 1, [&](auto i) { return graph.get_last(i); }, &terminal); ASSERT_EQ(graph.num_edges() + 1, terminal.size()); ASSERT_EQ(19, std::accumulate(terminal.begin() + 1, terminal.end(), 0U)); } From 675adfa986988ca2a1da9cda1988cf1d07cf84eb Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 5 Jul 2021 20:29:17 +0200 Subject: [PATCH 09/24] optimizations --- metagraph/src/annotation/row_diff_builder.cpp | 113 ++++++++++-------- 1 file changed, 60 insertions(+), 53 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index f867dd7863..6021957af2 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -2,6 +2,7 @@ #include #include +#include #include "annotation/binary_matrix/row_diff/row_diff.hpp" #include "annotation/int_matrix/row_diff/int_row_diff.hpp" @@ -495,7 +496,7 @@ void assign_anchors(const std::string &graph_fname, &anchors_bv); logger->trace("Adding branching off forks to RowDiff successors {}...", rd_succ_fname); - uint64_t num_fork_successors = sdsl::util::cnt_one_bits(rd_succ_bv); + const uint64_t num_fork_successors = sdsl::util::cnt_one_bits(rd_succ_bv); if (multiple_fork_successors) graph.add_rd_successors_at_forks(num_threads, anchors_bv, &rd_succ_bv, 10 * max_length); @@ -738,6 +739,10 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } } +uint8_t code_len(uint64_t x) { + return sdsl::coder::elias_delta::encoding_length(x); +} + // 'T' is either a row index, or a pair (row index, value) template void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, @@ -905,11 +910,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, const uint64_t *it, const uint64_t *end) -> uint64_t { if (!with_values) { for ( ; it != end; ++it) { - if (compute_row_reduction) { - // the last edge (first succ row) is the only successor - return col[*it]; - } - if (rd_succ[to_node(*it)] && col[*it]) + if ((compute_row_reduction || rd_succ[to_node(*it)]) && col[*it]) return true; } return false; @@ -917,24 +918,19 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, uint64_t value = 0; uint64_t rk; for ( ; it != end; ++it) { - if (compute_row_reduction) { - // the last edge (first succ row) is the only successor - rk = col.conditional_rank1(*it); - return rk ? values[s][j][rk - 1] : 0; - } - if (rd_succ[to_node(*it)] && (rk = col.conditional_rank1(*it))) + if ((compute_row_reduction || rd_succ[to_node(*it)]) && (rk = col.conditional_rank1(*it))) value += values[s][j][rk - 1]; } return value; } }; - graph::DBGSuccinct graph(2); - logger->trace("Loading graph..."); - if (!graph.load(pred_succ_fprefix)) { - logger->error("Cannot load graph from {}", pred_succ_fprefix); - std::exit(1); - } + // graph::DBGSuccinct graph(2); + // logger->trace("Loading graph..."); + // if (!graph.load(pred_succ_fprefix)) { + // logger->error("Cannot load graph from {}", pred_succ_fprefix); + // std::exit(1); + // } traverse_anno_chunked( num_rows, pred_succ_fprefix, sources, @@ -956,12 +952,13 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } if (compute_row_reduction) { - uint64_t succ_value = get_value(source_col, source_idx, - j, succ_begin, succ_end); - if (succ_begin != succ_end && curr_value == succ_value) { - // reduction (zero diff) - __atomic_add_fetch(&row_nbits_block[chunk_idx], 1, __ATOMIC_RELAXED); - } + uint64_t succ_value = get_value(source_col, source_idx, j, succ_begin, succ_end); + // if anchor + int n_bits_before = with_values ? code_len(matrix::encode_diff(curr_value)) : 1; + // if diff + int n_bits_after = with_values ? code_len(matrix::encode_diff(curr_value - succ_value)) : succ_value; + // add reduction + __atomic_add_fetch(&row_nbits_block[chunk_idx], n_bits_before - n_bits_after, __ATOMIC_RELAXED); } else { bool is_anchor = anchor[row_idx]; // add current bit if this node is an anchor @@ -984,30 +981,31 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } if (is_anchor) num_relations_anchored[source_idx][j]++; - } else { - if (succ_begin == succ_end) { - auto edge = graph.kmer_to_boss_index(to_node(row_idx)); - const auto &boss = graph.get_boss(); - logger->trace("Reduction: value {}" - "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" - "\nTo NULL", - curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); - } else { - auto edge = graph.kmer_to_boss_index(to_node(row_idx)); - const auto &boss = graph.get_boss(); - logger->trace("Reduction: value {}" - "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}", - curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); - for (const uint64_t *it = succ_begin; it != succ_end; ++it) { - auto next_edge = graph.kmer_to_boss_index(to_node(*it)); - logger->trace("\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", - anchor[*it], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); - } - } } + // else { + // if (succ_begin == succ_end) { + // auto edge = graph.kmer_to_boss_index(to_node(row_idx)); + // const auto &boss = graph.get_boss(); + // logger->trace("Reduction: value {}" + // "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" + // "\nTo NULL", + // curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); + // } else { + // auto edge = graph.kmer_to_boss_index(to_node(row_idx)); + // const auto &boss = graph.get_boss(); + // logger->trace("Reduction: value {}" + // "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}", + // curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); + // for (const uint64_t *it = succ_begin; it != succ_end; ++it) { + // auto next_edge = graph.kmer_to_boss_index(to_node(*it)); + // logger->trace("\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", + // anchor[*it], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); + // } + // } + // } } - if (!curr_value || compute_row_reduction || !rd_succ[to_node(row_idx)]) + if (!curr_value || (!compute_row_reduction && !rd_succ[to_node(row_idx)])) return; // check non-anchor predecessor nodes and add them if they are zero @@ -1120,6 +1118,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, std::vector ids; ids.reserve(row_diff_bits[l_idx][j]); int64_t value = 0; + // TODO: sum in merge? elias_fano::merge_files(filenames, [&](T v) { if constexpr(with_values) { assert(v.second && "zero diffs must have been skipped"); @@ -1165,16 +1164,15 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, for (size_t j = 0; j < label_encoders[l_idx].size(); ++j) { values[l_idx][j].serialize(outstream); } - logger->trace("Serialized {}", fpath); - values[l_idx].clear(); } else { fpath.replace_extension().replace_extension(RowDiffColumnAnnotator::kExtension); RowDiffColumnAnnotator( std::make_unique>(nullptr, ColumnMajor(std::move(columns))), std::move(label_encoders[l_idx])).serialize(fpath); - logger->trace("Serialized {}", fpath); } + + logger->trace("Serialized {}", fpath); } } @@ -1191,12 +1189,21 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (size_t l_idx = 0; l_idx < diff_columns.size(); ++l_idx) { - for (const auto &col_ptr : diff_columns[l_idx]) { - col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), - [&](uint64_t i) { - __atomic_add_fetch(&row_nbits_block[i - chunk], 1, __ATOMIC_RELAXED); + for (size_t j = 0; j < diff_columns[l_idx].size(); ++j) { + const auto &col_ptr = diff_columns[l_idx][j]; + if (with_values) { + const uint64_t max_rank = col_ptr->rank1(chunk + row_nbits_block.size() - 1); + for (uint64_t r = chunk ? col_ptr->rank1(chunk - 1) + 1 : 1; r <= max_rank; ++r) { + __atomic_add_fetch(&row_nbits_block[col_ptr->select1(r) - chunk], + code_len(values[l_idx][j][r - 1]), __ATOMIC_RELAXED); } - ); + } else { + col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), + [&](uint64_t i) { + __atomic_add_fetch(&row_nbits_block[i - chunk], 1, __ATOMIC_RELAXED); + } + ); + } } } From 1f505784b7272b76bba1a003ddfd5233922a7184 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 5 Jul 2021 20:34:18 +0200 Subject: [PATCH 10/24] minor --- metagraph/src/annotation/row_diff_builder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 6021957af2..9542463591 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -1118,7 +1118,8 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, std::vector ids; ids.reserve(row_diff_bits[l_idx][j]); int64_t value = 0; - // TODO: sum in merge? + // Call merge_files with a single template parameter (not pair). + // It can't extract and add counts, so we do this manually. elias_fano::merge_files(filenames, [&](T v) { if constexpr(with_values) { assert(v.second && "zero diffs must have been skipped"); From 36698db12e3044d32e0ab72555add31b6db27a24 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 7 Jul 2021 16:49:52 +0200 Subject: [PATCH 11/24] changed formula --- metagraph/src/annotation/row_diff_builder.cpp | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 9542463591..0585d439e2 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -2,7 +2,6 @@ #include #include -#include #include "annotation/binary_matrix/row_diff/row_diff.hpp" #include "annotation/int_matrix/row_diff/int_row_diff.hpp" @@ -740,7 +739,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } uint8_t code_len(uint64_t x) { - return sdsl::coder::elias_delta::encoding_length(x); + return sdsl::bits::hi(x) + 1; } // 'T' is either a row index, or a pair (row index, value) @@ -952,13 +951,19 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } if (compute_row_reduction) { - uint64_t succ_value = get_value(source_col, source_idx, j, succ_begin, succ_end); - // if anchor - int n_bits_before = with_values ? code_len(matrix::encode_diff(curr_value)) : 1; - // if diff - int n_bits_after = with_values ? code_len(matrix::encode_diff(curr_value - succ_value)) : succ_value; - // add reduction - __atomic_add_fetch(&row_nbits_block[chunk_idx], n_bits_before - n_bits_after, __ATOMIC_RELAXED); + if (succ_begin != succ_end) { + const uint64_t succ_value = get_value(source_col, source_idx, j, succ_begin, succ_end); + // if anchor + int n_bits_before = curr_value + ? (with_values ? code_len(matrix::encode_diff(curr_value)) : 1) + : 0; + // if diff + int n_bits_after = curr_value != succ_value + ? (with_values ? code_len(matrix::encode_diff(curr_value - succ_value)) : 1) + : 0; + // reduction + __atomic_add_fetch(&row_nbits_block[chunk_idx], n_bits_before - n_bits_after, __ATOMIC_RELAXED); + } } else { bool is_anchor = anchor[row_idx]; // add current bit if this node is an anchor @@ -1165,15 +1170,16 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, for (size_t j = 0; j < label_encoders[l_idx].size(); ++j) { values[l_idx][j].serialize(outstream); } + logger->trace("Serialized {}", fpath); + values[l_idx].clear(); } else { fpath.replace_extension().replace_extension(RowDiffColumnAnnotator::kExtension); RowDiffColumnAnnotator( std::make_unique>(nullptr, ColumnMajor(std::move(columns))), std::move(label_encoders[l_idx])).serialize(fpath); + logger->trace("Serialized {}", fpath); } - - logger->trace("Serialized {}", fpath); } } From b7f57f9f337af8c756171c012a8361bfc2169077 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 19 Jul 2021 16:15:28 +0200 Subject: [PATCH 12/24] cleanup --- metagraph/src/annotation/row_diff_builder.cpp | 73 ++++++------------- 1 file changed, 22 insertions(+), 51 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 0585d439e2..4b61954f79 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -924,13 +924,6 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } }; - // graph::DBGSuccinct graph(2); - // logger->trace("Loading graph..."); - // if (!graph.load(pred_succ_fprefix)) { - // logger->error("Cannot load graph from {}", pred_succ_fprefix); - // std::exit(1); - // } - traverse_anno_chunked( num_rows, pred_succ_fprefix, sources, [&](uint64_t chunk_size) { @@ -962,7 +955,8 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, ? (with_values ? code_len(matrix::encode_diff(curr_value - succ_value)) : 1) : 0; // reduction - __atomic_add_fetch(&row_nbits_block[chunk_idx], n_bits_before - n_bits_after, __ATOMIC_RELAXED); + __atomic_add_fetch(&row_nbits_block[chunk_idx], + n_bits_before - n_bits_after, __ATOMIC_RELAXED); } } else { bool is_anchor = anchor[row_idx]; @@ -978,55 +972,33 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } else { v.push_back(row_idx); } + if (is_anchor) + num_relations_anchored[source_idx][j]++; if (v.size() == v.capacity()) { // dump chunk to disk dump_chunk_to_disk(v, source_idx, j, 0); v.resize(0); } - if (is_anchor) - num_relations_anchored[source_idx][j]++; } - // else { - // if (succ_begin == succ_end) { - // auto edge = graph.kmer_to_boss_index(to_node(row_idx)); - // const auto &boss = graph.get_boss(); - // logger->trace("Reduction: value {}" - // "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}" - // "\nTo NULL", - // curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); - // } else { - // auto edge = graph.kmer_to_boss_index(to_node(row_idx)); - // const auto &boss = graph.get_boss(); - // logger->trace("Reduction: value {}" - // "\nFrom anchor: {}, only_incoming: {}, only_outgoing: {}", - // curr_value, anchor[row_idx], boss.is_single_incoming(edge, boss.get_W()[edge]), boss.is_single_outgoing(edge)); - // for (const uint64_t *it = succ_begin; it != succ_end; ++it) { - // auto next_edge = graph.kmer_to_boss_index(to_node(*it)); - // logger->trace("\nTo anchor: {}, only_incoming: {}, only_outgoing: {}", - // anchor[*it], boss.is_single_incoming(next_edge, boss.get_W()[next_edge]), boss.is_single_outgoing(next_edge)); - // } - // } - // } } - if (!curr_value || (!compute_row_reduction && !rd_succ[to_node(row_idx)])) - return; - - // check non-anchor predecessor nodes and add them if they are zero - for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { - if (!source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { - auto &v = set_rows_bwd[source_idx][j]; - if constexpr(with_values) { - v.emplace_back(*pred_p, -curr_value); - } else { - v.push_back(*pred_p); - } + if (curr_value && (compute_row_reduction || rd_succ[to_node(row_idx)])) { + // check non-anchor predecessor nodes and add them if they are zero + for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { + if (!source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { + auto &v = set_rows_bwd[source_idx][j]; + if constexpr(with_values) { + v.emplace_back(*pred_p, -curr_value); + } else { + v.push_back(*pred_p); + } - if (v.size() == v.capacity()) { - std::sort(v.begin(), v.end()); - dump_chunk_to_disk(v, source_idx, j, num_chunks[source_idx][j]++); - v.resize(0); + if (v.size() == v.capacity()) { + std::sort(v.begin(), v.end()); + dump_chunk_to_disk(v, source_idx, j, num_chunks[source_idx][j]++); + v.resize(0); + } } } } @@ -1123,8 +1095,8 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, std::vector ids; ids.reserve(row_diff_bits[l_idx][j]); int64_t value = 0; - // Call merge_files with a single template parameter (not pair). - // It can't extract and add counts, so we do this manually. + // merge_files with a single template parameter (not pair) + // can't extract and add counts, so we do this manually. elias_fano::merge_files(filenames, [&](T v) { if constexpr(with_values) { assert(v.second && "zero diffs must have been skipped"); @@ -1359,8 +1331,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, const auto diff = get_diff(curr_value, get_value(source_col, s, j, succ_begin, succ_end)); // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], - curr_value.size() - diff.size(), - __ATOMIC_RELAXED); + curr_value.size() - diff.size(), __ATOMIC_RELAXED); }, [&](uint64_t block_begin) { __atomic_thread_fence(__ATOMIC_ACQUIRE); From 575baaca3a86b723bc465f9624516efff1500b38 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Mon, 19 Jul 2021 16:28:40 +0200 Subject: [PATCH 13/24] check only the number of attributes without their width --- metagraph/src/annotation/row_diff_builder.cpp | 66 +++++++------------ 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 4b61954f79..d846b1c5e0 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -738,10 +738,6 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } } -uint8_t code_len(uint64_t x) { - return sdsl::bits::hi(x) + 1; -} - // 'T' is either a row index, or a pair (row index, value) template void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, @@ -946,17 +942,11 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, if (compute_row_reduction) { if (succ_begin != succ_end) { const uint64_t succ_value = get_value(source_col, source_idx, j, succ_begin, succ_end); - // if anchor - int n_bits_before = curr_value - ? (with_values ? code_len(matrix::encode_diff(curr_value)) : 1) - : 0; - // if diff - int n_bits_after = curr_value != succ_value - ? (with_values ? code_len(matrix::encode_diff(curr_value - succ_value)) : 1) - : 0; + bool before = curr_value; // if anchor + bool after = (curr_value != succ_value); // if diff // reduction __atomic_add_fetch(&row_nbits_block[chunk_idx], - n_bits_before - n_bits_after, __ATOMIC_RELAXED); + (int)before - (int)after, __ATOMIC_RELAXED); } } else { bool is_anchor = anchor[row_idx]; @@ -983,22 +973,23 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, } } - if (curr_value && (compute_row_reduction || rd_succ[to_node(row_idx)])) { - // check non-anchor predecessor nodes and add them if they are zero - for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { - if (!source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { - auto &v = set_rows_bwd[source_idx][j]; - if constexpr(with_values) { - v.emplace_back(*pred_p, -curr_value); - } else { - v.push_back(*pred_p); - } + if (!curr_value || !(compute_row_reduction || rd_succ[to_node(row_idx)])) + return; - if (v.size() == v.capacity()) { - std::sort(v.begin(), v.end()); - dump_chunk_to_disk(v, source_idx, j, num_chunks[source_idx][j]++); - v.resize(0); - } + // check non-anchor predecessor nodes and add them if they are zero + for (const uint64_t *pred_p = pred_begin; pred_p < pred_end; ++pred_p) { + if (!source_col[*pred_p] && (compute_row_reduction || !anchor[*pred_p])) { + auto &v = set_rows_bwd[source_idx][j]; + if constexpr(with_values) { + v.emplace_back(*pred_p, -curr_value); + } else { + v.push_back(*pred_p); + } + + if (v.size() == v.capacity()) { + std::sort(v.begin(), v.end()); + dump_chunk_to_disk(v, source_idx, j, num_chunks[source_idx][j]++); + v.resize(0); } } } @@ -1168,21 +1159,12 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (size_t l_idx = 0; l_idx < diff_columns.size(); ++l_idx) { - for (size_t j = 0; j < diff_columns[l_idx].size(); ++j) { - const auto &col_ptr = diff_columns[l_idx][j]; - if (with_values) { - const uint64_t max_rank = col_ptr->rank1(chunk + row_nbits_block.size() - 1); - for (uint64_t r = chunk ? col_ptr->rank1(chunk - 1) + 1 : 1; r <= max_rank; ++r) { - __atomic_add_fetch(&row_nbits_block[col_ptr->select1(r) - chunk], - code_len(values[l_idx][j][r - 1]), __ATOMIC_RELAXED); + for (const auto &col_ptr : diff_columns[l_idx]) { + col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), + [&](uint64_t i) { + __atomic_add_fetch(&row_nbits_block[i - chunk], 1, __ATOMIC_RELAXED); } - } else { - col_ptr->call_ones_in_range(chunk, chunk + row_nbits_block.size(), - [&](uint64_t i) { - __atomic_add_fetch(&row_nbits_block[i - chunk], 1, __ATOMIC_RELAXED); - } - ); - } + ); } } From e6113b4d5fef7b439fcb1ad755a161f33cc8236e Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Thu, 16 Dec 2021 01:04:52 +0100 Subject: [PATCH 14/24] update zlib --- metagraph/external-libraries/zlib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/external-libraries/zlib b/metagraph/external-libraries/zlib index 4a94dad277..959b4ea305 160000 --- a/metagraph/external-libraries/zlib +++ b/metagraph/external-libraries/zlib @@ -1 +1 @@ -Subproject commit 4a94dad2771bfb1fe5a169878623b4632ee0a134 +Subproject commit 959b4ea305821e753385e873ec4edfaa9a5d49b7 From 8b071de54c331bc82f74b5e475d81f5511fb6924 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Thu, 16 Dec 2021 17:08:52 +0100 Subject: [PATCH 15/24] reorganized methods --- .../annotation/int_matrix/base/int_matrix.cpp | 51 +++++++++++++++++++ .../annotation/int_matrix/base/int_matrix.hpp | 6 +++ .../int_matrix/row_diff/int_row_diff.hpp | 29 ----------- .../int_matrix/row_diff/tuple_row_diff.hpp | 29 ----------- 4 files changed, 57 insertions(+), 58 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/base/int_matrix.cpp b/metagraph/src/annotation/int_matrix/base/int_matrix.cpp index 11b8418a2f..7c02c04e02 100644 --- a/metagraph/src/annotation/int_matrix/base/int_matrix.cpp +++ b/metagraph/src/annotation/int_matrix/base/int_matrix.cpp @@ -47,6 +47,57 @@ IntMatrix::sum_row_values(const std::vector> &index_count return result; } +IntMatrix::SetBitPositions IntMatrix::get_row(Row i) const { + RowValues row = get_row_values(i); + SetBitPositions result(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result[k] = row[k].first; + } + return result; +} + +std::vector +IntMatrix::get_rows(const std::vector &row_ids) const { + std::vector result; + result.reserve(row_ids.size()); + + for (auto&& row : get_row_values(row_ids)) { + result.emplace_back(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result.back()[k] = row[k].first; + } + row = RowValues(); + } + + return result; +} + +// return the positions of all non-empty tuples in the row +MultiIntMatrix::SetBitPositions MultiIntMatrix::get_row(Row i) const { + RowTuples row = get_row_tuples(i); + SetBitPositions result(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result[k] = row[k].first; + } + return result; +} + +// for each row return the positions of all non-empty tuples +std::vector +MultiIntMatrix::get_rows(const std::vector &row_ids) const { + std::vector result; + result.reserve(row_ids.size()); + + for (auto&& row : get_row_tuples(row_ids)) { + result.emplace_back(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result.back()[k] = row[k].first; + } + row = RowTuples(); + } + + return result; +} // return sizes of all non-empty tuples in the row MultiIntMatrix::RowValues MultiIntMatrix::get_row_values(Row row) const { diff --git a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp index a0a4c7d36e..8cf713f4b7 100644 --- a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp +++ b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp @@ -17,6 +17,9 @@ class IntMatrix : public binmat::BinaryMatrix { virtual ~IntMatrix() {} + virtual SetBitPositions get_row(Row i) const; + virtual std::vector get_rows(const std::vector &row_ids) const; + // |row| is in [0, num_rows), |column| is in [0, num_columns) virtual RowValues get_row_values(Row row) const = 0; @@ -40,6 +43,9 @@ class MultiIntMatrix : public IntMatrix { virtual ~MultiIntMatrix() {} + virtual SetBitPositions get_row(Row i) const; + virtual std::vector get_rows(const std::vector &row_ids) const; + // return tuple sizes (if not zero) at each entry virtual RowValues get_row_values(Row row) const; diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index deae5be8ac..650efae448 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -57,8 +57,6 @@ class IntRowDiff : public binmat::IRowDiff, public IntMatrix { bool get(Row i, Column j) const override; std::vector get_column(Column j) const override; - SetBitPositions get_row(Row i) const override; - std::vector get_rows(const std::vector &rows) const override; // query integer values RowValues get_row_values(Row i) const override; std::vector get_row_values(const std::vector &rows) const override; @@ -115,38 +113,11 @@ std::vector IntRowDiff::get_column(Column j) const { return result; } -template -IntMatrix::SetBitPositions IntRowDiff::get_row(Row i) const { - RowValues row = get_row_values(i); - SetBitPositions result(row.size()); - for (size_t k = 0; k < row.size(); ++k) { - result[k] = row[k].first; - } - return result; -} - template IntMatrix::RowValues IntRowDiff::get_row_values(Row row) const { return get_row_values(std::vector{ row })[0]; } -template -std::vector -IntRowDiff::get_rows(const std::vector &row_ids) const { - std::vector result; - result.reserve(row_ids.size()); - - for (auto&& row : get_row_values(row_ids)) { - result.emplace_back(row.size()); - for (size_t k = 0; k < row.size(); ++k) { - result.back()[k] = row[k].first; - } - row = RowValues(); - } - - return result; -} - template std::vector IntRowDiff::get_row_values(const std::vector &row_ids) const { diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 5dd6a2fe39..7dc0fc361c 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -37,8 +37,6 @@ class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { bool get(Row i, Column j) const override; std::vector get_column(Column j) const override; - SetBitPositions get_row(Row i) const override; - std::vector get_rows(const std::vector &rows) const override; RowTuples get_row_tuples(Row i) const override; std::vector get_row_tuples(const std::vector &rows) const override; @@ -95,33 +93,6 @@ std::vector TupleRowDiff::get_column(Column j) return result; } -template -MultiIntMatrix::SetBitPositions TupleRowDiff::get_row(Row i) const { - RowTuples row = get_row_tuples(i); - SetBitPositions result(row.size()); - for (size_t k = 0; k < row.size(); ++k) { - result[k] = row[k].first; - } - return result; -} - -template -std::vector -TupleRowDiff::get_rows(const std::vector &row_ids) const { - std::vector result; - result.reserve(row_ids.size()); - - for (auto&& row : get_row_tuples(row_ids)) { - result.emplace_back(row.size()); - for (size_t k = 0; k < row.size(); ++k) { - result.back()[k] = row[k].first; - } - row = RowTuples(); - } - - return result; -} - template MultiIntMatrix::RowTuples TupleRowDiff::get_row_tuples(Row row) const { return get_row_tuples(std::vector{ row })[0]; From 67cd1590ce9fb9310a8c67c9612e010aa8bcf96c Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Thu, 16 Dec 2021 17:18:30 +0100 Subject: [PATCH 16/24] moved anchor/fork_succ to IRowDiff --- .../binary_matrix/row_diff/row_diff.cpp | 15 ++------ .../binary_matrix/row_diff/row_diff.hpp | 19 +++++----- .../int_matrix/row_diff/int_row_diff.hpp | 36 ------------------- .../int_matrix/row_diff/tuple_row_diff.hpp | 36 ------------------- 4 files changed, 12 insertions(+), 94 deletions(-) diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp index 4dda7cab78..e9f7c76110 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp @@ -11,8 +11,7 @@ namespace mtg { namespace annot { namespace binmat { -template -void RowDiff::load_anchor(const std::string &filename) { +void IRowDiff::load_anchor(const std::string &filename) { if (!std::filesystem::exists(filename)) { common::logger->error("Can't read anchor file: {}", filename); std::exit(1); @@ -23,11 +22,9 @@ void RowDiff::load_anchor(const std::string &filename) { std::exit(1); } anchor_.load(f); - f.close(); } -template -void RowDiff::load_fork_succ(const std::string &filename) { +void IRowDiff::load_fork_succ(const std::string &filename) { if (!std::filesystem::exists(filename)) { common::logger->error("Can't read fork successor file: {}", filename); std::exit(1); @@ -38,16 +35,8 @@ void RowDiff::load_fork_succ(const std::string &filename) { std::exit(1); } fork_succ_.load(f); - f.close(); } -template -class RowDiff; -template -class RowDiff; -template -class RowDiff; - } // namespace binmat } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index bac8405d89..89bcc63337 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -28,13 +28,23 @@ const size_t RD_PATH_RESERVE_SIZE = 2; class IRowDiff { public: + typedef bit_vector_small anchor_bv_type; + typedef bit_vector_small fork_succ_bv_type; + virtual ~IRowDiff() {} const graph::DBGSuccinct* graph() const { return graph_; } void set_graph(const graph::DBGSuccinct *graph) { graph_ = graph; } + void load_fork_succ(const std::string &filename); + void load_anchor(const std::string &filename); + + const anchor_bv_type& anchor() const { return anchor_; } + protected: const graph::DBGSuccinct *graph_ = nullptr; + anchor_bv_type anchor_; + fork_succ_bv_type fork_succ_; }; /** @@ -61,9 +71,6 @@ class IRowDiff { template class RowDiff : public IRowDiff, public BinaryMatrix { public: - using anchor_bv_type = bit_vector_small; - using fork_succ_bv_type = bit_vector_small; - RowDiff() {} RowDiff(const graph::DBGSuccinct *graph, BaseMatrix &&diff) @@ -90,10 +97,6 @@ class RowDiff : public IRowDiff, public BinaryMatrix { bool load(std::istream &f) override; void serialize(std::ostream &f) const override; - void load_fork_succ(const std::string &filename); - void load_anchor(const std::string &filename); - const anchor_bv_type& anchor() const { return anchor_; } - const BaseMatrix& diffs() const { return diffs_; } BaseMatrix& diffs() { return diffs_; } @@ -101,8 +104,6 @@ class RowDiff : public IRowDiff, public BinaryMatrix { static void add_diff(const Vector &diff, Vector *row); BaseMatrix diffs_; - anchor_bv_type anchor_; - fork_succ_bv_type fork_succ_; }; template diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 650efae448..50e3ea70ca 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -46,8 +46,6 @@ inline int64_t decode_diff(uint64_t c) { template class IntRowDiff : public binmat::IRowDiff, public IntMatrix { public: - using anchor_bv_type = bit_vector_small; - using fork_succ_bv_type = bit_vector_small; static_assert(std::is_convertible::value); IntRowDiff() {} @@ -68,10 +66,6 @@ class IntRowDiff : public binmat::IRowDiff, public IntMatrix { bool load(std::istream &in) override; void serialize(std::ostream &out) const override; - void load_fork_succ(const std::string &filename); - void load_anchor(const std::string &filename); - - const anchor_bv_type& anchor() const { return anchor_; } const BaseMatrix& diffs() const { return diffs_; } BaseMatrix& diffs() { return diffs_; } @@ -80,8 +74,6 @@ class IntRowDiff : public binmat::IRowDiff, public IntMatrix { static void add_diff(const RowValues &diff, RowValues *row); BaseMatrix diffs_; - anchor_bv_type anchor_; - fork_succ_bv_type fork_succ_; }; @@ -268,34 +260,6 @@ void IntRowDiff::add_diff(const RowValues &diff, RowValues *row) { row->swap(result); } -template -void IntRowDiff::load_anchor(const std::string &filename) { - if (!std::filesystem::exists(filename)) { - common::logger->error("Can't read anchor file: {}", filename); - std::exit(1); - } - std::ifstream in(filename, ios::binary); - if (!in.good()) { - common::logger->error("Could not open anchor file {}", filename); - std::exit(1); - } - anchor_.load(in); -} - -template -void IntRowDiff::load_fork_succ(const std::string &filename) { - if (!std::filesystem::exists(filename)) { - common::logger->error("Can't read fork successor file: {}", filename); - std::exit(1); - } - std::ifstream in(filename, ios::binary); - if (!in.good()) { - common::logger->error("Could not open fork successor file {}", filename); - std::exit(1); - } - fork_succ_.load(in); -} - } // namespace matrix } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 7dc0fc361c..07d9f1b5d9 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -25,8 +25,6 @@ namespace matrix { template class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { public: - using anchor_bv_type = bit_vector_small; - using fork_succ_bv_type = bit_vector_small; static_assert(std::is_convertible::value); static const int SHIFT = 1; // coordinates increase by 1 at each edge @@ -48,10 +46,6 @@ class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { bool load(std::istream &in) override; void serialize(std::ostream &out) const override; - void load_fork_succ(const std::string &filename); - void load_anchor(const std::string &filename); - - const anchor_bv_type& anchor() const { return anchor_; } const BaseMatrix& diffs() const { return diffs_; } BaseMatrix& diffs() { return diffs_; } @@ -60,8 +54,6 @@ class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { static void add_diff(const RowTuples &diff, RowTuples *row); BaseMatrix diffs_; - anchor_bv_type anchor_; - fork_succ_bv_type fork_succ_; }; @@ -258,34 +250,6 @@ void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { } } -template -void TupleRowDiff::load_anchor(const std::string &filename) { - if (!std::filesystem::exists(filename)) { - common::logger->error("Can't read anchor file: {}", filename); - std::exit(1); - } - std::ifstream in(filename, ios::binary); - if (!in.good()) { - common::logger->error("Could not open anchor file {}", filename); - std::exit(1); - } - anchor_.load(in); -} - -template -void TupleRowDiff::load_fork_succ(const std::string &filename) { - if (!std::filesystem::exists(filename)) { - common::logger->error("Can't read fork successor file: {}", filename); - std::exit(1); - } - std::ifstream in(filename, ios::binary); - if (!in.good()) { - common::logger->error("Could not open fork successor file {}", filename); - std::exit(1); - } - fork_succ_.load(in); -} - } // namespace matrix } // namespace annot } // namespace mtg From fdd7860981687da7a0c66209ca2ec20d7b701b1d Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Thu, 16 Dec 2021 17:33:10 +0100 Subject: [PATCH 17/24] minor --- .../src/annotation/int_matrix/row_diff/int_row_diff.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 50e3ea70ca..f7ee2cef55 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -22,9 +22,6 @@ namespace mtg { namespace annot { namespace matrix { -const size_t RD_PATH_RESERVE_SIZE = 2; - - /** * Convert deltas to positive integer for enable compression: * 0 -> X (not allowed, zero diffs must be skipped) @@ -117,6 +114,8 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); + const size_t RD_PATH_RESERVE_SIZE = 2; + // diff rows annotating nodes along the row-diff paths std::vector rd_ids; rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); @@ -196,8 +195,6 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { add_diff(rd_rows[succ], &rd_rows[node]); } rows[i] = rd_rows[rd_path.back().second]; - assert(std::all_of(rows[i].begin(), rows[i].end(), - [](auto &p) { return p.second; })); assert(std::all_of(rows[i].begin(), rows[i].end(), [](auto &p) { return (int64_t)p.second > 0; })); } From bdebcd4017d28829834cefa0d6cfaefb6ba56d15 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Thu, 16 Dec 2021 17:52:45 +0100 Subject: [PATCH 18/24] fix: don't keep empty tuples --- metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 07d9f1b5d9..a566b40d98 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -230,6 +230,8 @@ void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { std::set_symmetric_difference(it->second.begin(), it->second.end(), it2->second.begin(), it2->second.end(), std::back_inserter(result.back().second)); + if (result.back().second.empty()) + result.pop_back(); } ++it; ++it2; From 36333bcd811bd3a89c0a76f56408e1657ebe470e Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Fri, 17 Dec 2021 00:43:02 +0100 Subject: [PATCH 19/24] fix --- metagraph/src/annotation/row_diff_builder.cpp | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 148cf33dc9..c68cd02a6b 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -1409,7 +1409,6 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, // edges, for each of the sources, per chunk. set_rows_fwd is already sorted std::vector>> set_rows_bwd(sources.size()); std::vector>> set_rows_fwd(sources.size()); - std::vector> row_diff_bits(sources.size()); std::vector> row_diff_coords(sources.size()); std::vector> num_coords_anchored(sources.size()); std::vector> num_chunks(sources.size()); @@ -1430,7 +1429,6 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, for (size_t s = 0; s < sources.size(); ++s) { set_rows_fwd[s].resize(sources[s].num_labels()); set_rows_bwd[s].resize(sources[s].num_labels()); - row_diff_bits[s].assign(sources[s].num_labels(), 0); row_diff_coords[s].assign(sources[s].num_labels(), 0); num_coords_anchored[s].assign(sources[s].num_labels(), 0); // The first chunk will contain forward bits, all sorted. @@ -1487,7 +1485,6 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, v.resize(0); } } - row_diff_bits[s][j]++; } if (!curr_value.size() || !rd_succ[to_node(row_idx)]) @@ -1501,8 +1498,9 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, [&](uint64_t c) { return c % num_coords_per_seq; }))) { auto &v = set_rows_bwd[s][j]; // indicate that there are no coordinates for the predecessor + // FYI: in case of multiple successors at forks, this adds duplicate + // DUMMY_COORD from each fork-successor the empty predecessor. v.emplace_back(*pred_p, DUMMY_COORD); - row_diff_bits[s][j]++; if (v.size() == v.capacity()) { std::sort(v.begin(), v.end()); @@ -1575,10 +1573,9 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, // diff values may be negative, hence we need wider integers sdsl::int_vector<> diff_coords(row_diff_coords[l_idx][j]); auto coords_it = diff_coords.begin(); - sdsl::bit_vector diff_delims(diff_coords.size() + row_diff_bits[l_idx][j] + 1, 0); - auto delims_it = diff_delims.begin(); - - auto call_ones = [&](const std::function &call) { + std::vector ids; + std::vector diff_delims; + { std::vector filenames; // skip chunk with fwd bits which have already been counted if stage 1 for (uint32_t chunk = 0; chunk < num_chunks[l_idx][j]; ++chunk) { @@ -1589,22 +1586,23 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, elias_fano::merge_files(filenames, [&](T pair) { const auto &[i, coord] = pair; if (i != last) { - call(i); + ids.push_back(i); last = i; - *delims_it++ = 1; + diff_delims.push_back(1); } if (coord != DUMMY_COORD) { *coords_it++ = coord; - ++delims_it; + diff_delims.push_back(0); } }, true, chunks_open_per_thread); - *delims_it++ = 1; + diff_delims.push_back(1); assert(coords_it == diff_coords.end()); - assert(delims_it == diff_delims.end()); - }; - columns[j] = std::make_unique(call_ones, num_rows, - row_diff_bits[l_idx][j]); - bit_vector_smart(std::move(diff_delims)).serialize(out_coord); + assert(diff_delims.size() == diff_coords.size() + ids.size() + 1); + } + columns[j] = std::make_unique( + [&](auto callback) { std::for_each(ids.begin(), ids.end(), callback); }, + num_rows, ids.size()); + bit_vector_smart(to_sdsl(diff_delims)).serialize(out_coord); sdsl::util::bit_compress(diff_coords); diff_coords.serialize(out_coord); } From 75c9f9934e15d2b4b6bec733892f4f7d0aa15e2d Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Fri, 17 Dec 2021 16:25:27 +0100 Subject: [PATCH 20/24] fix --- .../int_matrix/row_diff/tuple_row_diff.hpp | 23 ++++++++--- metagraph/src/annotation/row_diff_builder.cpp | 38 ++++++++++++------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index a566b40d98..4d55d90f55 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -177,7 +177,14 @@ TupleRowDiff::get_row_tuples(const std::vector &row_ids) const // reconstruct annotation by adding the diff (full succ + diff) add_diff(rd_rows[succ], &rd_rows[node]); } - rows[i] = rd_rows[rd_path.back().second]; + auto &row = rd_rows[rd_path.back().second]; + rows[i].reserve(std::count_if(row.begin(), row.end(), + [](const auto &v) { return !v.second.empty(); })); + for (auto&& v : row) { + if (v.second.size()) + rows[i].push_back(std::move(v)); + } + row = rows[i]; assert(std::all_of(rows[i].begin(), rows[i].end(), [](auto &p) { return p.second.size(); })); } @@ -222,11 +229,14 @@ void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { result.push_back(*it); ++it; } else if (it->first > it2->first) { - result.push_back(*it2); + if (it2->second.size()) + result.push_back(*it2); ++it2; } else { - if (it2->second.size()) { - result.emplace_back(it->first, Tuple{}); + result.emplace_back(it->first, Tuple{}); + if (it->second.size()) { + assert(std::is_sorted(it->second.begin(), it->second.end())); + assert(std::is_sorted(it2->second.begin(), it2->second.end())); std::set_symmetric_difference(it->second.begin(), it->second.end(), it2->second.begin(), it2->second.end(), std::back_inserter(result.back().second)); @@ -238,7 +248,10 @@ void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { } } std::copy(it, row->end(), std::back_inserter(result)); - std::copy(it2, diff.end(), std::back_inserter(result)); + for ( ; it2 != diff.end(); ++it2) { + if (it2->second.size()) + result.push_back(*it2); + } row->swap(result); } diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index c68cd02a6b..c4b524cdcb 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -1263,10 +1263,22 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, rd_succ_bv_type rd_succ; - // get bit at position |i| or its value - auto get_value = [&](const bit_vector &col, - size_t s, size_t j, - const uint64_t *it, const uint64_t *end) { + // get coordinates at position |i| + auto get_value = [&](const bit_vector &col, size_t s, size_t j, uint64_t i) { + std::vector result; + if (uint64_t rk = col.conditional_rank1(i)) { + uint64_t t = delims[s][j].select1(rk); + while (!delims[s][j][++t]) { + result.push_back(coords[s][j][t - rk]); + } + } + assert(std::is_sorted(result.begin(), result.end())); + return result; + }; + + auto get_succ = [&](const bit_vector &col, + size_t s, size_t j, + const uint64_t *it, const uint64_t *end) { std::vector result; uint64_t rk; for ( ; it != end; ++it) { @@ -1348,8 +1360,8 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, return; // get annotated coordinates for this k-mer - const auto curr_value = get_value(source_col, s, j, &row_idx, &row_idx + 1); - const auto diff = get_diff(curr_value, get_value(source_col, s, j, succ_begin, succ_end)); + const auto curr_value = get_value(source_col, s, j, row_idx); + const auto diff = get_diff(curr_value, get_succ(source_col, s, j, succ_begin, succ_end)); // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], curr_value.size() - diff.size(), __ATOMIC_RELAXED); @@ -1448,7 +1460,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, // We use this dummy index for an optimization where we don't store diff // for non-anchor k-mers with no coordinates. - uint64_t DUMMY_COORD = -1; + const uint64_t DUMMY_COORD = -1; traverse_anno_chunked( num_rows, pred_succ_fprefix, sources, @@ -1458,22 +1470,22 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, const uint64_t *succ_begin, const uint64_t *succ_end, const uint64_t *pred_begin, const uint64_t *pred_end) { // get annotated coordinates for this k-mer - const auto curr_value = get_value(source_col, s, j, &row_idx, &row_idx + 1); + const auto curr_value = get_value(source_col, s, j, row_idx); bool is_anchor = anchor[row_idx]; if (is_anchor) num_coords_anchored[s][j] += curr_value.size(); - const auto diff = is_anchor + const auto value = is_anchor ? curr_value - : get_diff(curr_value, get_value(source_col, s, j, succ_begin, succ_end)); + : get_diff(curr_value, get_succ(source_col, s, j, succ_begin, succ_end)); - if (diff.size()) { + if (value.size()) { // must write the coordinates/diff auto &v = set_rows_fwd[s][j]; - for (uint64_t coord : diff) { + for (uint64_t coord : value) { assert((!v.size() || v.back() != std::make_pair(row_idx, coord)) && "coordinates must be unique and can't repeat"); v.emplace_back(row_idx, coord); @@ -1594,7 +1606,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, *coords_it++ = coord; diff_delims.push_back(0); } - }, true, chunks_open_per_thread); + }, true, chunks_open_per_thread, false); diff_delims.push_back(1); assert(coords_it == diff_coords.end()); assert(diff_delims.size() == diff_coords.size() + ids.size() + 1); From 16333e47df299f1629838420dc5ceb054aedb58a Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Fri, 17 Dec 2021 17:53:03 +0100 Subject: [PATCH 21/24] final fix --- .../int_matrix/row_diff/tuple_row_diff.hpp | 25 ++++++++++++++----- metagraph/src/annotation/row_diff_builder.cpp | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 4d55d90f55..493ea9fcfd 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -171,13 +171,32 @@ TupleRowDiff::get_row_tuples(const std::vector &row_ids) const for (size_t i = 0; i < row_ids.size(); ++i) { const auto &rd_path = rd_paths_trunc[i]; + uint64_t last_node = -1; // propagate back and reconstruct full annotations for predecessors for (size_t j = 0; j + 1 < rd_path.size(); ++j) { auto [node, succ] = rd_path[j]; + if (last_node == succ) { + for (auto &[j, tuple] : rd_rows[last_node]) { + assert(std::is_sorted(tuple.begin(), tuple.end())); + for (uint64_t &c : tuple) { + c -= SHIFT; + } + } + } // reconstruct annotation by adding the diff (full succ + diff) add_diff(rd_rows[succ], &rd_rows[node]); + last_node = node; } auto &row = rd_rows[rd_path.back().second]; + if (last_node != (uint64_t)-1) { + assert(last_node == rd_path.back().second); + for (auto &[j, tuple] : rd_rows[last_node]) { + assert(std::is_sorted(tuple.begin(), tuple.end())); + for (uint64_t &c : tuple) { + c -= SHIFT; + } + } + } rows[i].reserve(std::count_if(row.begin(), row.end(), [](const auto &v) { return !v.second.empty(); })); for (auto&& v : row) { @@ -257,12 +276,6 @@ void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { } assert(std::is_sorted(row->begin(), row->end())); - for (auto &[j, tuple] : *row) { - assert(std::is_sorted(tuple.begin(), tuple.end())); - for (uint64_t &c : tuple) { - c -= SHIFT; - } - } } } // namespace matrix diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index c4b524cdcb..343213ae6b 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -1271,6 +1271,7 @@ void convert_batch_to_row_diff_coord(const std::string &pred_succ_fprefix, while (!delims[s][j][++t]) { result.push_back(coords[s][j][t - rk]); } + assert(result.size()); } assert(std::is_sorted(result.begin(), result.end())); return result; From 427706e724ea365afe8410db60dfa53d20a6d174 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Fri, 17 Dec 2021 18:00:08 +0100 Subject: [PATCH 22/24] cleanup --- .../int_matrix/row_diff/tuple_row_diff.hpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 493ea9fcfd..d5da458cf1 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -51,6 +51,7 @@ class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { private: static void decode_diffs(RowTuples *diffs); + static void shift_coords(RowTuples *diffs); static void add_diff(const RowTuples &diff, RowTuples *row); BaseMatrix diffs_; @@ -175,28 +176,17 @@ TupleRowDiff::get_row_tuples(const std::vector &row_ids) const // propagate back and reconstruct full annotations for predecessors for (size_t j = 0; j + 1 < rd_path.size(); ++j) { auto [node, succ] = rd_path[j]; - if (last_node == succ) { - for (auto &[j, tuple] : rd_rows[last_node]) { - assert(std::is_sorted(tuple.begin(), tuple.end())); - for (uint64_t &c : tuple) { - c -= SHIFT; - } - } - } + if (last_node == succ) + shift_coords(&rd_rows[last_node]); // reconstruct annotation by adding the diff (full succ + diff) add_diff(rd_rows[succ], &rd_rows[node]); last_node = node; } - auto &row = rd_rows[rd_path.back().second]; if (last_node != (uint64_t)-1) { assert(last_node == rd_path.back().second); - for (auto &[j, tuple] : rd_rows[last_node]) { - assert(std::is_sorted(tuple.begin(), tuple.end())); - for (uint64_t &c : tuple) { - c -= SHIFT; - } - } + shift_coords(&rd_rows[last_node]); } + auto &row = rd_rows[rd_path.back().second]; rows[i].reserve(std::count_if(row.begin(), row.end(), [](const auto &v) { return !v.second.empty(); })); for (auto&& v : row) { @@ -232,6 +222,16 @@ void TupleRowDiff::decode_diffs(RowTuples *diffs) { // no encoding } +template +void TupleRowDiff::shift_coords(RowTuples *diffs) { + for (auto &[j, tuple] : *diffs) { + assert(std::is_sorted(tuple.begin(), tuple.end())); + for (uint64_t &c : tuple) { + c -= SHIFT; + } + } +} + template void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { assert(std::is_sorted(row->begin(), row->end())); From 779813552867643a098c46c9097f7d176be8f5de Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Sat, 18 Dec 2021 15:49:48 +0100 Subject: [PATCH 23/24] final final fix --- metagraph/src/annotation/row_diff_builder.cpp | 87 ++++++++++++------- 1 file changed, 55 insertions(+), 32 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 343213ae6b..a925cf4f30 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -849,7 +849,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, v.resize(0); }; - std::function &)>(size_t, size_t)> call_diffs; + std::function(size_t, size_t)> get_diff_values; if (swap_disk) { uint64_t total_num_labels = 0; for (size_t s = 0; s < sources.size(); ++s) { @@ -878,46 +878,61 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, exit(1); } - call_diffs = [&,chunks_open_per_thread](size_t s, size_t j) { - return [&,s,j](const std::function &call) { - std::vector filenames; - // for stage 1, fwd bits are already counted, so we skip that chunk - for (uint32_t chunk = compute_row_reduction ? 1 : 0; - chunk < num_chunks[s][j]; ++chunk) { - filenames.push_back(tmp_file(s, j, chunk)); - } + get_diff_values = [&,chunks_open_per_thread](size_t s, size_t j) { + std::vector filenames; + // for stage 1, fwd bits are already counted, so we skip that chunk + for (uint32_t chunk = compute_row_reduction ? 1 : 0; + chunk < num_chunks[s][j]; ++chunk) { + filenames.push_back(tmp_file(s, j, chunk)); + } - const bool remove_chunks = true; - uint64_t r = 0; + std::vector buffer; + const bool remove_chunks = true; + if constexpr(with_values) { elias_fano::merge_files(filenames, [&](T v) { - call(utils::get_first(v)); - if constexpr(with_values) { - assert(v.second && "zero diffs must have been skipped"); - values[s][j][r++] = matrix::encode_diff(v.second); + assert(v.second && "zero diffs must have been skipped"); + if (buffer.size() && buffer.back().first == v.first) { + buffer.back().second += v.second; + } else { + buffer.push_back(v); } }, remove_chunks, chunks_open_per_thread); - }; + } else { + elias_fano::merge_files(filenames, [&](T v) { buffer.push_back(v); }, + remove_chunks, chunks_open_per_thread, true /* dedupe */); + } + return buffer; }; } else { logger->info("Diff-transform in memory without disk swap"); - call_diffs = [&](size_t s, size_t j) { - return [&,s,j](const std::function &call) { - auto &v = set_rows_fwd[s][j]; - v.insert(v.end(), set_rows_bwd[s][j].begin(), set_rows_bwd[s][j].end()); - set_rows_bwd[s][j] = {}; + get_diff_values = [&](size_t s, size_t j) { + auto &buffer = set_rows_fwd[s][j]; + assert(std::is_sorted(buffer.begin(), buffer.end())); + + buffer.insert(buffer.end(), set_rows_bwd[s][j].begin(), set_rows_bwd[s][j].end()); + set_rows_bwd[s][j] = {}; - std::sort(v.begin(), v.end()); + std::sort(buffer.begin(), buffer.end()); + if constexpr(with_values) { uint64_t r = 0; - for (size_t i = 0; i < v.size(); ++i) { - call(utils::get_first(v[i])); - if constexpr(with_values) { - assert(v[i].second && "zero diffs must have been skipped"); - values[s][j][r++] = matrix::encode_diff(v[i].second); + for (size_t i = 1; i < buffer.size(); ++i) { + assert(buffer[i].second && "zero diffs must have been skipped"); + if (buffer[i].first == buffer[r].first) { + buffer[r].second += buffer[i].second; + } else { + buffer[++r] = buffer[i]; } } - }; + if (buffer.size()) + buffer.resize(r + 1); + + } else { + buffer.erase(std::unique(buffer.begin(), buffer.end()), buffer.end()); + } + + return std::move(buffer); }; } @@ -1143,14 +1158,22 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, std::vector> columns(label_encoders[l_idx].size()); for (size_t j = 0; j < label_encoders[l_idx].size(); ++j) { + std::vector buffer = get_diff_values(l_idx, j); if constexpr(with_values) { // diff values may be negative, hence we need wider integers - values[l_idx][j] = sdsl::int_vector<>(row_diff_bits[l_idx][j], 0, + values[l_idx][j] = sdsl::int_vector<>(buffer.size(), 0, std::min(values[l_idx][j].width() + 1, 64)); + for (size_t i = 0; i < buffer.size(); ++i) { + values[l_idx][j][i] = matrix::encode_diff(buffer[i].second); + } } - - columns[j] = std::make_unique(call_diffs(l_idx, j), num_rows, - row_diff_bits[l_idx][j]); + columns[j] = std::make_unique( + [&](auto call_index) { + for (const auto &v : buffer) { + call_index(utils::get_first(v)); + } + }, + num_rows, buffer.size()); } if (compute_row_reduction) { From a06ab67a994fe3d37e9179ff7199119d9a4485c2 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Sat, 18 Dec 2021 17:40:19 +0100 Subject: [PATCH 24/24] cleanup --- .../binary_matrix/row_diff/row_diff.cpp | 69 +++++++++++++++++++ .../binary_matrix/row_diff/row_diff.hpp | 4 ++ .../int_matrix/row_diff/int_row_diff.hpp | 62 +---------------- .../int_matrix/row_diff/tuple_row_diff.hpp | 62 +---------------- 4 files changed, 77 insertions(+), 120 deletions(-) diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp index e9f7c76110..5e22027c1d 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp @@ -37,6 +37,75 @@ void IRowDiff::load_fork_succ(const std::string &filename) { fork_succ_.load(f); } +std::pair, std::vector>>> +IRowDiff::get_rd_ids(const std::vector &row_ids) const { + assert(graph_ && "graph must be loaded"); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); + + using Row = BinaryMatrix::Row; + + const size_t RD_PATH_RESERVE_SIZE = 2; + + // diff rows annotating nodes along the row-diff paths + std::vector rd_ids; + rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + + // map row index to its index in |rd_rows| + VectorMap node_to_rd; + node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + + // Truncated row-diff paths, indexes to |rd_rows|. + // The last index in each path points to an anchor or to a row which had + // been reached before, and thus, will be reconstructed before this one. + std::vector>> rd_paths_trunc(row_ids.size()); + + for (size_t i = 0; i < row_ids.size(); ++i) { + std::vector> &rd_path = rd_paths_trunc[i]; + + std::vector path; + Vector> queue; + queue.emplace_back(0, row_ids[i]); + + while (queue.size()) { + size_t depth = queue.back().first; + Row row = queue.back().second; + queue.pop_back(); + while (depth < path.size()) { + assert(path.size() > 1); + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); + } + auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); + path.push_back(it.value()); + // If a node had been reached before, we interrupt the diff path. + // The annotation for that node will have been reconstructed earlier + // than for other nodes in this path as well. Thus, we will start + // reconstruction from that node and don't need its successors. + if (!is_new) + continue; + + rd_ids.push_back(row); + + if (anchor_[row]) + continue; + + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); + graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) { + queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ)); + }); + } + + while (path.size() > 1) { + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); + } + assert(path.size()); + rd_path.emplace_back(-1, path[0]); + } + + return std::make_pair(std::move(rd_ids), std::move(rd_paths_trunc)); +} + } // namespace binmat } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index 89bcc63337..74aecb0d62 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -42,6 +42,10 @@ class IRowDiff { const anchor_bv_type& anchor() const { return anchor_; } protected: + // get row-diff paths starting at |row_ids| + std::pair, std::vector>>> + get_rd_ids(const std::vector &row_ids) const; + const graph::DBGSuccinct *graph_ = nullptr; anchor_bv_type anchor_; fork_succ_bv_type fork_succ_; diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index f7ee2cef55..fa5da63d3b 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -114,66 +114,8 @@ IntRowDiff::get_row_values(const std::vector &row_ids) const { assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); - const size_t RD_PATH_RESERVE_SIZE = 2; - - // diff rows annotating nodes along the row-diff paths - std::vector rd_ids; - rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); - - // map row index to its index in |rd_rows| - VectorMap node_to_rd; - node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); - - // Truncated row-diff paths, indexes to |rd_rows|. - // The last index in each path points to an anchor or to a row which had - // been reached before, and thus, will be reconstructed before this one. - std::vector>> rd_paths_trunc(row_ids.size()); - - for (size_t i = 0; i < row_ids.size(); ++i) { - std::vector> &rd_path = rd_paths_trunc[i]; - - std::vector path; - Vector> queue; - queue.emplace_back(0, row_ids[i]); - - while (queue.size()) { - size_t depth = queue.back().first; - Row row = queue.back().second; - queue.pop_back(); - while (depth < path.size()) { - assert(path.size() > 1); - rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); - path.pop_back(); - } - auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); - path.push_back(it.value()); - // If a node had been reached before, we interrupt the diff path. - // The annotation for that node will have been reconstructed earlier - // than for other nodes in this path as well. Thus, we will start - // reconstruction from that node and don't need its successors. - if (!is_new) - continue; - - rd_ids.push_back(row); - - if (anchor_[row]) - continue; - - auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); - graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) { - queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ)); - }); - } - - while (path.size() > 1) { - rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); - path.pop_back(); - } - assert(path.size()); - rd_path.emplace_back(-1, path[0]); - } - - node_to_rd = VectorMap(); + // get row-diff paths + auto [rd_ids, rd_paths_trunc] = get_rd_ids(row_ids); std::vector rd_rows = diffs_.get_row_values(rd_ids); for (auto &row : rd_rows) { diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index d5da458cf1..45e310754e 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -98,66 +98,8 @@ TupleRowDiff::get_row_tuples(const std::vector &row_ids) const assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); - const size_t RD_PATH_RESERVE_SIZE = 2; - - // diff rows annotating nodes along the row-diff paths - std::vector rd_ids; - rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); - - // map row index to its index in |rd_rows| - VectorMap node_to_rd; - node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); - - // Truncated row-diff paths, indexes to |rd_rows|. - // The last index in each path points to an anchor or to a row which had - // been reached before, and thus, will be reconstructed before this one. - std::vector>> rd_paths_trunc(row_ids.size()); - - for (size_t i = 0; i < row_ids.size(); ++i) { - std::vector> &rd_path = rd_paths_trunc[i]; - - std::vector path; - Vector> queue; - queue.emplace_back(0, row_ids[i]); - - while (queue.size()) { - size_t depth = queue.back().first; - Row row = queue.back().second; - queue.pop_back(); - while (depth < path.size()) { - assert(path.size() > 1); - rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); - path.pop_back(); - } - auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); - path.push_back(it.value()); - // If a node had been reached before, we interrupt the diff path. - // The annotation for that node will have been reconstructed earlier - // than for other nodes in this path as well. Thus, we will start - // reconstruction from that node and don't need its successors. - if (!is_new) - continue; - - rd_ids.push_back(row); - - if (anchor_[row]) - continue; - - auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); - graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) { - queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ)); - }); - } - - while (path.size() > 1) { - rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); - path.pop_back(); - } - assert(path.size()); - rd_path.emplace_back(-1, path[0]); - } - - node_to_rd = VectorMap(); + // get row-diff paths + auto [rd_ids, rd_paths_trunc] = get_rd_ids(row_ids); std::vector rd_rows = diffs_.get_row_tuples(rd_ids); for (auto &row : rd_rows) {