|
|
@@ -6,11 +6,12 @@ import pickle
|
|
|
from pathlib import Path
|
|
|
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
|
|
|
|
|
|
+import re
|
|
|
import numpy as np
|
|
|
import yaml
|
|
|
from onnxruntime import (GraphOptimizationLevel, InferenceSession,
|
|
|
SessionOptions, get_available_providers, get_device)
|
|
|
-
|
|
|
+import jieba
|
|
|
import warnings
|
|
|
|
|
|
root_dir = Path(__file__).resolve().parent
|
|
|
@@ -230,6 +231,64 @@ def code_mix_split_words(text: str):
|
|
|
words.append(current_word)
|
|
|
return words
|
|
|
|
|
|
def isEnglish(text: str):
    """Return True when *text* consists only of ASCII letters and apostrophes.

    Args:
        text: The token to classify.

    Returns:
        ``True`` if every character of ``text`` is in ``a-z``, ``A-Z`` or
        ``'`` and ``text`` is non-empty; ``False`` otherwise.
    """
    # fullmatch must consume the entire string. The earlier '^...$' search
    # also accepted a token ending in '\n', because '$' matches just before
    # a trailing newline.
    return re.fullmatch(r"[a-zA-Z']+", text) is not None
|
|
|
+
|
|
|
def join_chinese_and_english(input_list):
    """Concatenate tokens into one string, spacing out English tokens.

    Each token for which ``isEnglish`` is true is prefixed with a single
    space; every other token is appended directly. Leading and trailing
    whitespace of the assembled string is stripped before it is returned.

    Args:
        input_list: Iterable of string tokens.

    Returns:
        The joined string.
    """
    pieces = [
        (' ' + token) if isEnglish(token) else token
        for token in input_list
    ]
    return ''.join(pieces).strip()
|
|
|
+
|
|
|
def code_mix_split_words_jieba(seg_dict_file: str):
    """Build a word splitter for whitespace-separated mixed-language text.

    ``seg_dict_file`` is passed to ``jieba.load_userdict`` once, when this
    factory is called, not on every invocation of the returned function.

    Args:
        seg_dict_file: Path handed to ``jieba.load_userdict``.

    Returns:
        A function ``_fn(text)`` that splits ``text`` on whitespace, groups
        consecutive tokens by whether ``isEnglish`` accepts them, keeps
        English tokens unchanged, and re-segments each non-English run with
        ``jieba.cut(..., HMM=False)``. It returns the resulting word list.
    """
    jieba.load_userdict(seg_dict_file)

    def _fn(text: str):
        # Phase 1: collect maximal runs of adjacent tokens that share the
        # same classification. Each run is (is_english, [tokens...]).
        runs = []
        for piece in text.split():
            english = isEnglish(piece)
            if runs and runs[-1][0] == english:
                runs[-1][1].append(piece)
            else:
                runs.append((english, [piece]))

        # Phase 2: English runs pass through token by token; every other run
        # is merged back into one string and segmented by jieba.
        words = []
        for english, pieces in runs:
            if english:
                words.extend(pieces)
                continue
            merged = join_chinese_and_english(pieces)
            words.extend(jieba.cut(merged, HMM=False))
        return words

    return _fn
|
|
|
+
|
|
|
def read_yaml(yaml_path: Union[str, Path]) -> Dict:
|
|
|
if not Path(yaml_path).exists():
|
|
|
raise FileExistsError(f'The {yaml_path} does not exist.')
|