Skip to content

Commit

Permalink
Modify index long running
Browse files Browse the repository at this point in the history
  • Loading branch information
shosseinimotlagh committed Oct 17, 2024
1 parent d75d803 commit af3794e
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 10 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.4.64"
version = "6.4.65"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
2 changes: 1 addition & 1 deletion src/include/homestore/index/index_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
child_node->to_string(), child_last_key.to_string());

if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) {
// We have reached the last key, we can stop now
// We have reached the last key and the parent node doesn't have an edge, so we can stop now
break;
}

Expand Down
3 changes: 3 additions & 0 deletions src/lib/index/index_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ void IndexService::start() {
for (const auto& [_, tbl] : m_index_map) {
tbl->recovery_completed();
}
// Force a CP once recovery is done. This makes sure that the index table is in a consistent state and that buffers
// left dirty after recovery can be added to the dirty list for flushing in the new CP.
hs()->cp_mgr().trigger_cp_flush(true /* force */);
}

// Stops the index service by calling reset() on m_wb_cache.
// NOTE(review): m_wb_cache is declared outside this view; presumably an owning smart pointer to the
// write-back cache, in which case this releases it - confirm.
void IndexService::stop() {
    m_wb_cache.reset();
}
Expand Down
13 changes: 12 additions & 1 deletion src/lib/index/wb_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,17 @@ void IndexWBCache::recover(sisl::byte_view sb) {
}
l0_bufs.push_back(buf);
// Insert this buffer's up-buffer, and every up-buffer above it, into the recovering_upbuffers set until nullptr.
// Consider the following scenario, in which the crash happened while flushing B:
// ├── A (WAITING FOR B)
// │ ├── B (CRASHED)
// │ │ ├── C (FLUSHED)
// │ │ └── D (FLUSHED)
// │ │ └── E (FLUSHED)
// │ └── F (FLUSHED)
// └── G (FLUSHED)
// We need to recover B and A. During recover_buf(A), we also need to recover F and commit its blk. So in
// this case, we give new buffers a second chance to be part of the repair. There can still be cases where
// committing new buffers is not needed, because they are not part of any recovery path.
auto up_buf = buf->m_up_buffer;
while (up_buf) {
recovering_upbuffers.insert(up_buf);
Expand Down Expand Up @@ -686,7 +697,7 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
if (buf->m_crash_flag_on) {
std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot";
LOGINFO("\nSimulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename);
cp_ctx->to_string_dot(filename);
// cp_ctx->to_string_dot(filename);
hs()->crash_simulator().crash();
cp_ctx->complete(true);
return;
Expand Down
2 changes: 1 addition & 1 deletion src/tests/btree_helpers/btree_test_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ struct BtreeTestHelper {

LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range));
}
void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); }
void visualize_keys(const std::string& file) const { /*m_bt->visualize_tree_keys(file);*/ }

void compare_files(const std::string& before, const std::string& after) {
std::ifstream b(before, std::ifstream::ate);
Expand Down
5 changes: 4 additions & 1 deletion src/tests/test_index_crash_recovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ SISL_LOGGING_DECL(test_index_crash_recovery)

SISL_OPTION_GROUP(
test_index_crash_recovery,
(num_iters, "", "num_iters", "number of iterations for rand ops",
::cxxopts::value< uint32_t >()->default_value("500"), "number"),
(num_entries, "", "num_entries", "number of entries to test with",
::cxxopts::value< uint32_t >()->default_value("5000"), "number"),
(run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"),
Expand Down Expand Up @@ -428,7 +430,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
this->print_keys("Post crash and recovery, btree structure: ");
}
sanity_check(operations);
test_common::HSTestHelper::trigger_cp(true);
// A forced CP is now triggered by the index service right after recovery, so it is not needed here.
// test_common::HSTestHelper::trigger_cp(true);
LOGINFO("Before Reapply: {} keys in shadow map and actually {} in trees operation size {}",
this->m_shadow_map.size(), tree_key_count(), operations.size());
this->reapply_after_crash(operations);
Expand Down
4 changes: 4 additions & 0 deletions src/tests/test_mem_btree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test {

void SetUp() override {
BtreeTestHelper< TestType >::SetUp();
#ifdef _PRERELEASE
this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >();
#endif
this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg);
}
};
Expand Down Expand Up @@ -303,7 +305,9 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin
.hugepage_size_mb = 0});

BtreeTestHelper< TestType >::SetUp();
#ifdef _PRERELEASE
this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >();
#endif
this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg);
}

Expand Down
17 changes: 12 additions & 5 deletions src/tests/test_scripts/index_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ def run_test(options, type):
raise TestFailedError(f"Test failed for type {type}")
print("Test completed")


def run_crash_test(options):
cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}"
cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} "
# print(f"Running test with options: {cmd_opts}")
try:
subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True)
subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT,
shell=True)
except subprocess.CalledProcessError as e:
print(f"Test failed: {e}")
raise TestFailedError(f"Test failed for type {type}")
Expand All @@ -49,7 +51,9 @@ def parse_arguments():
parser.add_argument('--dev_list', help='Device list', default='')
parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False)
parser.add_argument('--init_device', help='Initialize device', type=bool, default=True)
parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20)
parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5)
parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000)
parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60)

# Parse the known arguments and ignore any unknown arguments
args, unknown = parser.parse_known_args()
Expand All @@ -73,7 +77,6 @@ def long_runnig_index(options, type=0):

def long_running_clean_shutdown(options, type=0):
print("Long running clean shutdown started")
options['run_time'] = int(options['run_time']) // 10 # 20 minutes

try:
run_test(options, type)
Expand All @@ -87,14 +90,18 @@ def long_running_clean_shutdown(options, type=0):
raise
print("Long running clean shutdown completed")


def long_running_crash_put(options):
print("Long running crash put started")
options['num_entries'] = 20480 # 20K
options['num_entries'] = 2097152 # 2M
options['init_device'] = True
options['run_time'] = 14400 # 4 hours
options['preload_size'] = 100
print(f"options: {options}")
run_crash_test(options)
print("Long running crash put completed")


def main():
options = parse_arguments()
test_suite_name = options['test_suits']
Expand Down

0 comments on commit af3794e

Please sign in to comment.