From 42a93a4b986e217c7579f063d45c1ba57c57dfbf Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 8 Dec 2024 16:46:24 +0800 Subject: [PATCH] Refactor decoding functions to use UTF-8 compliant methods Updated multiple files to replace instances of DecodeRunesInString with DecodeUTF8RunesInString, ensuring proper handling of UTF-8 encoded strings. This change enhances the robustness of string decoding across the cppjieba library, including updates in DictTrie, HMMModel, PosTagger, PreFilter, SegmentBase, and Unicode files. Additionally, corresponding unit tests have been modified to reflect these changes. --- include/cppjieba/DictTrie.hpp | 4 ++-- include/cppjieba/HMMModel.hpp | 2 +- include/cppjieba/PosTagger.hpp | 2 +- include/cppjieba/PreFilter.hpp | 2 +- include/cppjieba/SegmentBase.hpp | 2 +- include/cppjieba/Unicode.hpp | 24 ++++++++++++------------ test/unittest/trie_test.cpp | 26 +++++++++++++------------- test/unittest/unicode_test.cpp | 6 +++--- 8 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index f5c71902..3478db40 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -85,7 +85,7 @@ class DictTrie { { const DictUnit *tmp = NULL; RuneStrArray runes; - if (!DecodeRunesInString(word, runes)) + if (!DecodeUTF8RunesInString(word, runes)) { XLOG(ERROR) << "Decode failed."; } @@ -197,7 +197,7 @@ class DictTrie { const string& word, double weight, const string& tag) { - if (!DecodeRunesInString(word, node_info.word)) { + if (!DecodeUTF8RunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index 27e6b662..3921faaf 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { + if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index 78853d53..a6810b2d 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -34,7 +34,7 @@ class PosTagger { RuneStrArray runes; const DictTrie * dict = segment.GetDictTrie(); assert(dict != NULL); - if (!DecodeRunesInString(str, runes)) { + if (!DecodeUTF8RunesInString(str, runes)) { XLOG(ERROR) << "Decode failed."; return POS_X; } diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index ecb81c0b..e73b9ab5 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -17,7 +17,7 @@ class PreFilter { PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - if (!DecodeRunesInString(sentence, sentence_)) { + if (!DecodeUTF8RunesInString(sentence, sentence_)) { XLOG(ERROR) << "decode failed. "; } cursor_ = sentence_.begin(); diff --git a/include/cppjieba/SegmentBase.hpp b/include/cppjieba/SegmentBase.hpp index 79c80094..2885b83e 100644 --- a/include/cppjieba/SegmentBase.hpp +++ b/include/cppjieba/SegmentBase.hpp @@ -25,7 +25,7 @@ class SegmentBase { bool ResetSeparators(const string& s) { symbols_.clear(); RuneStrArray runes; - if (!DecodeRunesInString(s, runes)) { + if (!DecodeUTF8RunesInString(s, runes)) { XLOG(ERROR) << "decode " << s << " failed"; return false; } diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 7f064569..9adec2ca 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -84,7 +84,7 @@ struct RuneStrLite { } }; // struct RuneStrLite -inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { +inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) { RuneStrLite rp(0, 0); if (str == NULL || len == 0) { return rp; @@ -139,11 +139,11 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { return rp; } -inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { +inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) { runes.clear(); runes.reserve(len / 2); for (uint32_t i = 0, j = 0; i < len;) { - RuneStrLite rp = DecodeRuneInString(s + i, len - i); + RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i); if (rp.len == 0) { runes.clear(); return false; @@ -156,14 +156,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) return true; } -inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { - return DecodeRunesInString(s.c_str(), s.size(), runes); +inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) { + return DecodeUTF8RunesInString(s.c_str(), s.size(), runes); } -inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { +inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) { unicode.clear(); RuneStrArray runes; - if (!DecodeRunesInString(s, len, runes)) { + if (!DecodeUTF8RunesInString(s, len, runes)) { return false; } unicode.reserve(runes.size()); @@ -174,17 +174,17 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { } inline bool IsSingleWord(const string& str) { - RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); + RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size()); return rp.len == str.size(); } -inline bool DecodeRunesInString(const string& s, Unicode& unicode) { - return DecodeRunesInString(s.c_str(), s.size(), unicode); +inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) { + return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode); } -inline Unicode DecodeRunesInString(const string& s) { +inline Unicode DecodeUTF8RunesInString(const string& s) { Unicode result; - DecodeRunesInString(s, result); + DecodeUTF8RunesInString(s, result); return result; } diff --git a/test/unittest/trie_test.cpp b/test/unittest/trie_test.cpp index 1f035406..2e519930 100644 --- a/test/unittest/trie_test.cpp +++ b/test/unittest/trie_test.cpp @@ -15,7 +15,7 @@ TEST(TrieTest, Empty) { TEST(TrieTest, Construct) { vector keys; vector values; - keys.push_back(DecodeRunesInString("你")); + keys.push_back(DecodeUTF8RunesInString("你")); values.push_back((const DictUnit*)(NULL)); Trie trie(keys, values); } @@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) { ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); string word("来到"); cppjieba::RuneStrArray uni; - ASSERT_TRUE(DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, uni)); //DictUnit nodeInfo; //nodeInfo.word = uni; //nodeInfo.tag = "v"; @@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) { LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { - ASSERT_TRUE(DecodeRunesInString(words[i], uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); } vector > vec; vector dags; - ASSERT_TRUE(DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, uni)); trie.Find(uni.begin(), uni.end(), dags); ASSERT_EQ(dags.size(), uni.size()); ASSERT_NE(dags.size(), 0u); @@ -72,20 +72,20 @@ TEST(DictTrieTest, UserDict) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_NEAR(unit->weight, -14.100, 0.001); word = "蓝翔"; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_EQ(unit->tag, "nz"); ASSERT_NEAR(unit->weight, -14.100, 0.001); word = "区块链"; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_EQ(unit->tag, "nz"); @@ -96,7 +96,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); string word = "云计算"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -2.975, 0.001); @@ -108,7 +108,7 @@ TEST(DictTrieTest, Dag) { { string word = "清华大学"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -122,7 +122,7 @@ TEST(DictTrieTest, Dag) { { string word = "北京邮电大学"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -136,7 +136,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -150,7 +150,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 3); @@ -164,7 +164,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 4); diff --git a/test/unittest/unicode_test.cpp b/test/unittest/unicode_test.cpp index a22096e9..89113b9d 100644 --- a/test/unittest/unicode_test.cpp +++ b/test/unittest/unicode_test.cpp @@ -8,7 +8,7 @@ using namespace std; TEST(UnicodeTest, Test1) { string s = "你好世界"; RuneStrArray runes; - ASSERT_TRUE(DecodeRunesInString(s, runes)); + ASSERT_TRUE(DecodeUTF8RunesInString(s, runes)); string actual; string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; actual << runes; @@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) { TEST(UnicodeTest, Illegal) { string s = "123\x80"; RuneStrArray runes; - ASSERT_FALSE(DecodeRunesInString(s, runes)); + ASSERT_FALSE(DecodeUTF8RunesInString(s, runes)); string actual; string expected = "[]"; actual << runes; @@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) { s[rand() % len] = rand(); } RuneStrArray runes; - DecodeRunesInString(s, runes); + DecodeUTF8RunesInString(s, runes); } }