From 40d9b5ce535e34499a531720bf664d4a5c8fdb49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xing=20Chen=20=C2=B7=20=E5=8D=95=E5=8D=95?=
Date: Tue, 5 Sep 2023 00:48:40 +0800
Subject: [PATCH] tools: add rime_table_decompiler (#706)

---------

Co-authored-by: Qijia Liu
---
 src/rime/dict/table.cc         |  39 -----------
 src/rime/dict/table.h          |  41 ++++++++++-
 tools/CMakeLists.txt           |  13 ++++
 tools/rime_table_decompiler.cc | 124 +++++++++++++++++++++++++++++++++
 4 files changed, 177 insertions(+), 40 deletions(-)
 create mode 100644 tools/rime_table_decompiler.cc

diff --git a/src/rime/dict/table.cc b/src/rime/dict/table.cc
index 4a26c5e35f..70472cc2f0 100644
--- a/src/rime/dict/table.cc
+++ b/src/rime/dict/table.cc
@@ -21,45 +21,6 @@ const int kTableFormatLowestCompatible = 4.0;
 const char kTableFormatPrefix[] = "Rime::Table/";
 const size_t kTableFormatPrefixLen = sizeof(kTableFormatPrefix) - 1;

-class TableQuery {
- public:
-  TableQuery(table::Index* index) : lv1_index_(index) { Reset(); }
-
-  TableAccessor Access(SyllableId syllable_id, double credibility = 0.0) const;
-  void AccessAll(vector<TableAccessor>& accessors, double credibility = 0.0);
-
-  // down to next level
-  bool Advance(SyllableId syllable_id, double credibility = 0.0);
-
-  // up one level
-  bool Backdate();
-
-  // back to root
-  void Reset();
-
-  size_t level() const { return level_; }
-
- protected:
-  size_t level_ = 0;
-  Code index_code_;
-  vector<double> credibility_;
-
- private:
-  bool Walk(SyllableId syllable_id);
-
-  table::HeadIndex* lv1_index_ = nullptr;
-  table::TrunkIndex* lv2_index_ = nullptr;
-  table::TrunkIndex* lv3_index_ = nullptr;
-  table::TailIndex* lv4_index_ = nullptr;
-};
-
-struct QueryQueue {
-  size_t pos;
-  TableQuery query;
-  bool isRegularSpelling;
-  bool hasNoEntry;
-};
-
 TableAccessor::TableAccessor(const Code& index_code,
                              const List<table::Entry>* list,
                              double credibility)
diff --git a/src/rime/dict/table.h b/src/rime/dict/table.h
index a903ce173b..264cb6ab9c 100644
--- a/src/rime/dict/table.h
+++ b/src/rime/dict/table.h
@@ -128,7 +128,38 @@ class TableAccessor {
 using TableQueryResult = map<int, vector<TableAccessor>>;

 struct SyllableGraph;
-class TableQuery;
+
+class TableQuery {
+ public:
+  TableQuery(table::Index* index) : lv1_index_(index) { Reset(); }
+
+  TableAccessor Access(SyllableId syllable_id, double credibility = 0.0) const;
+  void AccessAll(vector<TableAccessor>& accessors, double credibility = 0.0);
+
+  // down to next level
+  bool Advance(SyllableId syllable_id, double credibility = 0.0);
+
+  // up one level
+  bool Backdate();
+
+  // back to root
+  void Reset();
+
+  size_t level() const { return level_; }
+
+ protected:
+  size_t level_ = 0;
+  Code index_code_;
+  vector<double> credibility_;
+
+ private:
+  bool Walk(SyllableId syllable_id);
+
+  table::HeadIndex* lv1_index_ = nullptr;
+  table::TrunkIndex* lv2_index_ = nullptr;
+  table::TrunkIndex* lv3_index_ = nullptr;
+  table::TailIndex* lv4_index_ = nullptr;
+};

 class Table : public MappedFile {
  public:
@@ -153,6 +184,7 @@ class Table : public MappedFile {
   RIME_API string GetEntryText(const table::Entry& entry);

   uint32_t dict_file_checksum() const;
+  table::Metadata* metadata() const { return metadata_; }

  private:
   table::Index* BuildIndex(const Vocabulary& vocabulary, size_t num_syllables);
@@ -184,6 +216,13 @@ class Table : public MappedFile {
   the<StringTableBuilder> string_table_builder_;
 };

+struct QueryQueue {
+  size_t pos;
+  TableQuery query;
+  bool isRegularSpelling;
+  bool hasNoEntry;
+};
+
 }  // namespace rime

 #endif  // RIME_TABLE_H_
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8a5ee1b44b..9e6e153713 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -37,7 +37,20 @@ target_link_libraries(rime_deployer
   ${rime_dict_library}
   ${rime_levers_library})

+set(rime_table_decompiler_src
+  "rime_table_decompiler.cc"
+  ${CMAKE_SOURCE_DIR}/src/rime/dict/table.cc
+  ${CMAKE_SOURCE_DIR}/src/rime/dict/mapped_file.cc
+  ${CMAKE_SOURCE_DIR}/src/rime/dict/string_table.cc
+  ${CMAKE_SOURCE_DIR}/src/rime/dict/vocabulary.cc
+  )
+add_executable(rime_table_decompiler ${rime_table_decompiler_src})
+target_link_libraries(rime_table_decompiler
+  ${rime_library}
+  ${rime_dict_library})
+
 install(TARGETS rime_deployer DESTINATION ${BIN_INSTALL_DIR})
 install(TARGETS rime_dict_manager DESTINATION ${BIN_INSTALL_DIR})
+install(TARGETS rime_table_decompiler DESTINATION ${BIN_INSTALL_DIR})
 install(TARGETS rime_patch DESTINATION ${BIN_INSTALL_DIR})

diff --git a/tools/rime_table_decompiler.cc b/tools/rime_table_decompiler.cc
new file mode 100644
index 0000000000..76a1168323
--- /dev/null
+++ b/tools/rime_table_decompiler.cc
@@ -0,0 +1,124 @@
+// rime_table_decompiler.cc
+// nopdan
+//
+#include <cmath>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <rime/dict/table.h>
+#include <rime/dict/vocabulary.h>
+
+// usage:
+//   rime_table_decompiler <rime-table-file> [save-path]
+// example:
+//   rime_table_decompiler pinyin.table.bin pinyin.dict.yaml
+
+void outCode(rime::Table* table, const rime::Code code, std::ofstream& fout) {
+  if (code.empty()) {
+    return;
+  }
+  auto item = code.begin();
+  fout << table->GetSyllableById(*item);
+  item++;
+  for (; item != code.end(); ++item) {
+    fout << " ";
+    fout << table->GetSyllableById(*item);
+  }
+  return;
+}
+
+void access(rime::Table* table,
+            rime::TableAccessor accessor,
+            std::ofstream& fout) {
+  while (!accessor.exhausted()) {
+    auto word = table->GetEntryText(*accessor.entry());
+    fout << word << "\t";
+    outCode(table, accessor.code(), fout);
+
+    auto weight = accessor.entry()->weight;
+    if (weight >= 0) {
+      fout << "\t" << exp(weight);
+    }
+    fout << std::endl;
+    accessor.Next();
+  }
+}
+
+// recursively traverse table
+void recursion(rime::Table* table,
+               rime::TableQuery* query,
+               std::ofstream& fout) {
+  for (int i = 0; i < table->metadata()->num_syllables; i++) {
+    auto accessor = query->Access(i);
+    access(table, accessor, fout);
+    if (query->Advance(i)) {
+      if (query->level() < 3) {
+        recursion(table, query, fout);
+      } else {
+        auto accessor = query->Access(0);
+        access(table, accessor, fout);
+      }
+      query->Backdate();
+    }
+  }
+}
+
+void traversal(rime::Table* table, std::ofstream& fout) {
+  auto metadata = table->metadata();
+  std::cout << "num_syllables: " << metadata->num_syllables << std::endl;
+  std::cout << "num_entries: " << metadata->num_entries << std::endl;
+
+  fout << std::fixed;
+  fout << std::setprecision(0);
+  rime::TableQuery query(table->metadata()->index.get());
+  recursion(table, &query, fout);
+}
+
+int main(int argc, char* argv[]) {
+  if (argc < 2 || argc > 3) {
+    std::cout << "Usage: rime_table_decompiler <rime-table-file> [save-path]"
+              << std::endl;
+    std::cout << "Example: rime_table_decompiler pinyin.table.bin pinyin.dict.yaml"
+              << std::endl;
+    return 0;
+  }
+
+  std::string fileName(argv[1]);
+  rime::Table table(fileName);
+  bool success = table.Load();
+  if (!success) {
+    std::cerr << "Failed to load table." << std::endl;
+    return 1;
+  }
+
+  // Remove the extension ".table.bin" if present.
+  const size_t table_bin_idx = fileName.rfind(".table.bin");
+  if (std::string::npos != table_bin_idx) {
+    fileName.erase(table_bin_idx);
+  }
+  const std::string outputName =
+      (argc == 3) ? argv[2]: fileName + ".yaml";
+
+  std::ofstream fout;
+  fout.open(outputName);
+  if (!fout.is_open()) {
+    std::cerr << "Failed to open file " << outputName << std::endl;
+    return 1;
+  }
+
+  // schema id
+  const size_t last_slash_idx = fileName.find_last_of("\\/");
+  if (std::string::npos != last_slash_idx) {
+    fileName.erase(0, last_slash_idx + 1);
+  }
+  fout << "# Rime dictionary\n\n";
+  fout << "---\n"
+          "name: " << fileName << "\n"
+          "version: \"1.0\"\n"
+          "...\n\n";
+  traversal(&table, fout);
+  std::cout << "Save to: " << outputName << std::endl;
+  fout.close();
+  return 0;
+}