|
|
@@ -664,26 +664,6 @@ class CodeMixTokenizerCommonPreprocessor(CommonPreprocessor):
|
|
|
if self.seg_jieba:
|
|
|
jieba.load_userdict(seg_dict_file)
|
|
|
|
|
|
- @classmethod
|
|
|
- def split_words(cls, text: str):
|
|
|
- words = []
|
|
|
- segs = text.split()
|
|
|
- for seg in segs:
|
|
|
- # There is no space in seg.
|
|
|
- current_word = ""
|
|
|
- for c in seg:
|
|
|
- if len(c.encode()) == 1:
|
|
|
- # This is an ASCII char.
|
|
|
- current_word += c
|
|
|
- else:
|
|
|
- # This is a Chinese char.
|
|
|
- if len(current_word) > 0:
|
|
|
- words.append(current_word)
|
|
|
- current_word = ""
|
|
|
- words.append(c)
|
|
|
- if len(current_word) > 0:
|
|
|
- words.append(current_word)
|
|
|
- return words
|
|
|
|
|
|
@classmethod
|
|
|
def isEnglish(cls, text:str):
|