|
@@ -65,12 +65,17 @@ class BiasLm {
|
|
|
if (text.size() > 1) {
|
|
if (text.size() > 1) {
|
|
|
score = std::stof(text[1]);
|
|
score = std::stof(text[1]);
|
|
|
}
|
|
}
|
|
|
- Utf8ToCharset(text[0], split_str);
|
|
|
|
|
|
|
+ SplitChiEngCharacters(text[0], split_str);
|
|
|
for (auto &str : split_str) {
|
|
for (auto &str : split_str) {
|
|
|
- split_id.push_back(phn_set_.String2Id(str));
|
|
|
|
|
- if (!phn_set_.Find(str)) {
|
|
|
|
|
- is_oov = true;
|
|
|
|
|
- break;
|
|
|
|
|
|
|
+ std::vector<string> lex_vec;
|
|
|
|
|
+ std::string lex_str = vocab_.Word2Lex(str);
|
|
|
|
|
+ SplitStringToVector(lex_str, " ", true, &lex_vec);
|
|
|
|
|
+ for (auto &token : lex_vec) {
|
|
|
|
|
+ split_id.push_back(phn_set_.String2Id(token));
|
|
|
|
|
+ if (!phn_set_.Find(token)) {
|
|
|
|
|
+ is_oov = true;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
if (!is_oov) {
|
|
if (!is_oov) {
|
|
@@ -103,12 +108,17 @@ class BiasLm {
|
|
|
std::vector<std::string> split_str;
|
|
std::vector<std::string> split_str;
|
|
|
std::vector<int> split_id;
|
|
std::vector<int> split_id;
|
|
|
score = kv.second;
|
|
score = kv.second;
|
|
|
- Utf8ToCharset(kv.first, split_str);
|
|
|
|
|
|
|
+ SplitChiEngCharacters(kv.first, split_str);
|
|
|
for (auto &str : split_str) {
|
|
for (auto &str : split_str) {
|
|
|
- split_id.push_back(phn_set_.String2Id(str));
|
|
|
|
|
- if (!phn_set_.Find(str)) {
|
|
|
|
|
- is_oov = true;
|
|
|
|
|
- break;
|
|
|
|
|
|
|
+ std::vector<string> lex_vec;
|
|
|
|
|
+ std::string lex_str = vocab_.Word2Lex(str);
|
|
|
|
|
+ SplitStringToVector(lex_str, " ", true, &lex_vec);
|
|
|
|
|
+ for (auto &token : lex_vec) {
|
|
|
|
|
+ split_id.push_back(phn_set_.String2Id(token));
|
|
|
|
|
+ if (!phn_set_.Find(token)) {
|
|
|
|
|
+ is_oov = true;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
if (!is_oov) {
|
|
if (!is_oov) {
|