diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index f5c71902..3478db40 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -85,7 +85,7 @@ class DictTrie { { const DictUnit *tmp = NULL; RuneStrArray runes; - if (!DecodeRunesInString(word, runes)) + if (!DecodeUTF8RunesInString(word, runes)) { XLOG(ERROR) << "Decode failed."; } @@ -197,7 +197,7 @@ class DictTrie { const string& word, double weight, const string& tag) { - if (!DecodeRunesInString(word, node_info.word)) { + if (!DecodeUTF8RunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index 27e6b662..3921faaf 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { + if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index 78853d53..a6810b2d 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -34,7 +34,7 @@ class PosTagger { RuneStrArray runes; const DictTrie * dict = segment.GetDictTrie(); assert(dict != NULL); - if (!DecodeRunesInString(str, runes)) { + if (!DecodeUTF8RunesInString(str, runes)) { XLOG(ERROR) << "Decode failed."; return POS_X; } diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index ecb81c0b..e73b9ab5 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -17,7 +17,7 @@ class PreFilter { PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - if (!DecodeRunesInString(sentence, sentence_)) { + if (!DecodeUTF8RunesInString(sentence, sentence_)) { XLOG(ERROR) << "decode failed. "; } cursor_ = sentence_.begin(); diff --git a/include/cppjieba/SegmentBase.hpp b/include/cppjieba/SegmentBase.hpp index 79c80094..2885b83e 100644 --- a/include/cppjieba/SegmentBase.hpp +++ b/include/cppjieba/SegmentBase.hpp @@ -25,7 +25,7 @@ class SegmentBase { bool ResetSeparators(const string& s) { symbols_.clear(); RuneStrArray runes; - if (!DecodeRunesInString(s, runes)) { + if (!DecodeUTF8RunesInString(s, runes)) { XLOG(ERROR) << "decode " << s << " failed"; return false; } diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 7f064569..9adec2ca 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -84,7 +84,7 @@ struct RuneStrLite { } }; // struct RuneStrLite -inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { +inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) { RuneStrLite rp(0, 0); if (str == NULL || len == 0) { return rp; @@ -139,11 +139,11 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { return rp; } -inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { +inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) { runes.clear(); runes.reserve(len / 2); for (uint32_t i = 0, j = 0; i < len;) { - RuneStrLite rp = DecodeRuneInString(s + i, len - i); + RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i); if (rp.len == 0) { runes.clear(); return false; @@ -156,14 +156,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) return true; } -inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { - return DecodeRunesInString(s.c_str(), s.size(), runes); +inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) { + return DecodeUTF8RunesInString(s.c_str(), s.size(), runes); } -inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { +inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) { unicode.clear(); RuneStrArray runes; - if (!DecodeRunesInString(s, len, runes)) { + if (!DecodeUTF8RunesInString(s, len, runes)) { return false; } unicode.reserve(runes.size()); @@ -174,17 +174,17 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { } inline bool IsSingleWord(const string& str) { - RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); + RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size()); return rp.len == str.size(); } -inline bool DecodeRunesInString(const string& s, Unicode& unicode) { - return DecodeRunesInString(s.c_str(), s.size(), unicode); +inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) { + return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode); } -inline Unicode DecodeRunesInString(const string& s) { +inline Unicode DecodeUTF8RunesInString(const string& s) { Unicode result; - DecodeRunesInString(s, result); + DecodeUTF8RunesInString(s, result); return result; } diff --git a/test/unittest/trie_test.cpp b/test/unittest/trie_test.cpp index 1f035406..2e519930 100644 --- a/test/unittest/trie_test.cpp +++ b/test/unittest/trie_test.cpp @@ -15,7 +15,7 @@ TEST(TrieTest, Empty) { TEST(TrieTest, Construct) { vector keys; vector values; - keys.push_back(DecodeRunesInString("你")); + keys.push_back(DecodeUTF8RunesInString("你")); values.push_back((const DictUnit*)(NULL)); Trie trie(keys, values); } @@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) { ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); string word("来到"); cppjieba::RuneStrArray uni; - ASSERT_TRUE(DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, uni)); //DictUnit nodeInfo; //nodeInfo.word = uni; //nodeInfo.tag = "v"; @@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) { LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { - ASSERT_TRUE(DecodeRunesInString(words[i], uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); } vector > vec; vector dags; - ASSERT_TRUE(DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, uni)); trie.Find(uni.begin(), uni.end(), dags); ASSERT_EQ(dags.size(), uni.size()); ASSERT_NE(dags.size(), 0u); @@ -72,20 +72,20 @@ TEST(DictTrieTest, UserDict) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_NEAR(unit->weight, -14.100, 0.001); word = "蓝翔"; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_EQ(unit->tag, "nz"); ASSERT_NEAR(unit->weight, -14.100, 0.001); word = "区块链"; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit != NULL); ASSERT_EQ(unit->tag, "nz"); @@ -96,7 +96,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); string word = "云计算"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -2.975, 0.001); @@ -108,7 +108,7 @@ TEST(DictTrieTest, Dag) { { string word = "清华大学"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -122,7 +122,7 @@ TEST(DictTrieTest, Dag) { { string word = "北京邮电大学"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -136,7 +136,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -150,7 +150,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 3); @@ -164,7 +164,7 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; cppjieba::RuneStrArray unicode; - ASSERT_TRUE(DecodeRunesInString(word, unicode)); + ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 4); diff --git a/test/unittest/unicode_test.cpp b/test/unittest/unicode_test.cpp index a22096e9..89113b9d 100644 --- a/test/unittest/unicode_test.cpp +++ b/test/unittest/unicode_test.cpp @@ -8,7 +8,7 @@ using namespace std; TEST(UnicodeTest, Test1) { string s = "你好世界"; RuneStrArray runes; - ASSERT_TRUE(DecodeRunesInString(s, runes)); + ASSERT_TRUE(DecodeUTF8RunesInString(s, runes)); string actual; string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; actual << runes; @@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) { TEST(UnicodeTest, Illegal) { string s = "123\x80"; RuneStrArray runes; - ASSERT_FALSE(DecodeRunesInString(s, runes)); + ASSERT_FALSE(DecodeUTF8RunesInString(s, runes)); string actual; string expected = "[]"; actual << runes; @@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) { s[rand() % len] = rand(); } RuneStrArray runes; - DecodeRunesInString(s, runes); + DecodeUTF8RunesInString(s, runes); } }