Skip to content

Commit

Permalink
Refactor decoding functions to use UTF-8 compliant methods
Browse files (browse the repository at this point in the history)
Updated multiple files to replace instances of DecodeRunesInString with DecodeUTF8RunesInString, and the single-rune helper DecodeRuneInString with DecodeUTF8ToRune, ensuring proper handling of UTF-8 encoded strings and making the expected input encoding explicit at every call site. This change enhances the robustness of string decoding across the cppjieba library, including updates in the DictTrie, HMMModel, PosTagger, PreFilter, SegmentBase, and Unicode headers. Additionally, the corresponding unit tests (trie_test.cpp and unicode_test.cpp) have been modified to reflect these changes.
  • Loading branch information
yanyiwu committed Dec 8, 2024
1 parent 5ee74d7 commit 42a93a4
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 34 deletions.
4 changes: 2 additions & 2 deletions include/cppjieba/DictTrie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class DictTrie {
{
const DictUnit *tmp = NULL;
RuneStrArray runes;
if (!DecodeRunesInString(word, runes))
if (!DecodeUTF8RunesInString(word, runes))
{
XLOG(ERROR) << "Decode failed.";
}
Expand Down Expand Up @@ -197,7 +197,7 @@ class DictTrie {
const string& word,
double weight,
const string& tag) {
if (!DecodeRunesInString(word, node_info.word)) {
if (!DecodeUTF8RunesInString(word, node_info.word)) {
XLOG(ERROR) << "Decode " << word << " failed.";
return false;
}
Expand Down
2 changes: 1 addition & 1 deletion include/cppjieba/HMMModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ struct HMMModel {
XLOG(ERROR) << "emitProb illegal.";
return false;
}
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) {
XLOG(ERROR) << "TransCode failed.";
return false;
}
Expand Down
2 changes: 1 addition & 1 deletion include/cppjieba/PosTagger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class PosTagger {
RuneStrArray runes;
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
if (!DecodeRunesInString(str, runes)) {
if (!DecodeUTF8RunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed.";
return POS_X;
}
Expand Down
2 changes: 1 addition & 1 deletion include/cppjieba/PreFilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class PreFilter {
PreFilter(const unordered_set<Rune>& symbols,
const string& sentence)
: symbols_(symbols) {
if (!DecodeRunesInString(sentence, sentence_)) {
if (!DecodeUTF8RunesInString(sentence, sentence_)) {
XLOG(ERROR) << "decode failed. ";
}
cursor_ = sentence_.begin();
Expand Down
2 changes: 1 addition & 1 deletion include/cppjieba/SegmentBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class SegmentBase {
bool ResetSeparators(const string& s) {
symbols_.clear();
RuneStrArray runes;
if (!DecodeRunesInString(s, runes)) {
if (!DecodeUTF8RunesInString(s, runes)) {
XLOG(ERROR) << "decode " << s << " failed";
return false;
}
Expand Down
24 changes: 12 additions & 12 deletions include/cppjieba/Unicode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ struct RuneStrLite {
}
}; // struct RuneStrLite

inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
RuneStrLite rp(0, 0);
if (str == NULL || len == 0) {
return rp;
Expand Down Expand Up @@ -139,11 +139,11 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
return rp;
}

inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear();
runes.reserve(len / 2);
for (uint32_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
if (rp.len == 0) {
runes.clear();
return false;
Expand All @@ -156,14 +156,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
return true;
}

inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
return DecodeRunesInString(s.c_str(), s.size(), runes);
inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
}

inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
unicode.clear();
RuneStrArray runes;
if (!DecodeRunesInString(s, len, runes)) {
if (!DecodeUTF8RunesInString(s, len, runes)) {
return false;
}
unicode.reserve(runes.size());
Expand All @@ -174,17 +174,17 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
}

inline bool IsSingleWord(const string& str) {
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
return rp.len == str.size();
}

inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
return DecodeRunesInString(s.c_str(), s.size(), unicode);
inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
}

inline Unicode DecodeRunesInString(const string& s) {
inline Unicode DecodeUTF8RunesInString(const string& s) {
Unicode result;
DecodeRunesInString(s, result);
DecodeUTF8RunesInString(s, result);
return result;
}

Expand Down
26 changes: 13 additions & 13 deletions test/unittest/trie_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ TEST(TrieTest, Empty) {
TEST(TrieTest, Construct) {
vector<Unicode> keys;
vector<const DictUnit*> values;
keys.push_back(DecodeRunesInString(""));
keys.push_back(DecodeUTF8RunesInString(""));
values.push_back((const DictUnit*)(NULL));
Trie trie(keys, values);
}
Expand All @@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) {
ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
string word("来到");
cppjieba::RuneStrArray uni;
ASSERT_TRUE(DecodeRunesInString(word, uni));
ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
//DictUnit nodeInfo;
//nodeInfo.word = uni;
//nodeInfo.tag = "v";
Expand All @@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) {
LocalVector<pair<size_t, const DictUnit*> > res;
const char * words[] = {"", "清华", "清华大学"};
for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(DecodeRunesInString(words[i], uni));
ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
}
vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags;
ASSERT_TRUE(DecodeRunesInString(word, uni));
ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
trie.Find(uni.begin(), uni.end(), dags);
ASSERT_EQ(dags.size(), uni.size());
ASSERT_NE(dags.size(), 0u);
Expand All @@ -72,20 +72,20 @@ TEST(DictTrieTest, UserDict) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_NEAR(unit->weight, -14.100, 0.001);

word = "蓝翔";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
ASSERT_NEAR(unit->weight, -14.100, 0.001);

word = "区块链";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
Expand All @@ -96,7 +96,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit);
ASSERT_NEAR(unit->weight, -2.975, 0.001);
Expand All @@ -108,7 +108,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "清华大学";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);

Expand All @@ -122,7 +122,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "北京邮电大学";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);

Expand All @@ -136,7 +136,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);

Expand All @@ -150,7 +150,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 3);

Expand All @@ -164,7 +164,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 4);

Expand Down
6 changes: 3 additions & 3 deletions test/unittest/unicode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using namespace std;
TEST(UnicodeTest, Test1) {
string s = "你好世界";
RuneStrArray runes;
ASSERT_TRUE(DecodeRunesInString(s, runes));
ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
string actual;
string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
actual << runes;
Expand All @@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) {
TEST(UnicodeTest, Illegal) {
string s = "123\x80";
RuneStrArray runes;
ASSERT_FALSE(DecodeRunesInString(s, runes));
ASSERT_FALSE(DecodeUTF8RunesInString(s, runes));
string actual;
string expected = "[]";
actual << runes;
Expand All @@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) {
s[rand() % len] = rand();
}
RuneStrArray runes;
DecodeRunesInString(s, runes);
DecodeUTF8RunesInString(s, runes);
}
}

0 comments on commit 42a93a4

Please sign in to comment.