|
|
@@ -1,4 +1,4 @@
|
|
|
-
|
|
|
+import re
|
|
|
|
|
|
def split_to_mini_sentence(words: list, word_limit: int = 20):
|
|
|
assert word_limit > 1
|
|
|
@@ -14,23 +14,98 @@ def split_to_mini_sentence(words: list, word_limit: int = 20):
|
|
|
return sentences
|
|
|
|
|
|
|
|
|
-def split_words(text: str):
|
|
|
- words = []
|
|
|
- segs = text.split()
|
|
|
- for seg in segs:
|
|
|
- # There is no space in seg.
|
|
|
- current_word = ""
|
|
|
- for c in seg:
|
|
|
- if len(c.encode()) == 1:
|
|
|
- # This is an ASCII char.
|
|
|
- current_word += c
|
|
|
+# def split_words(text: str, **kwargs):
|
|
|
+# words = []
|
|
|
+# segs = text.split()
|
|
|
+# for seg in segs:
|
|
|
+# # There is no space in seg.
|
|
|
+# current_word = ""
|
|
|
+# for c in seg:
|
|
|
+# if len(c.encode()) == 1:
|
|
|
+# # This is an ASCII char.
|
|
|
+# current_word += c
|
|
|
+# else:
|
|
|
+# # This is a Chinese char.
|
|
|
+# if len(current_word) > 0:
|
|
|
+# words.append(current_word)
|
|
|
+# current_word = ""
|
|
|
+# words.append(c)
|
|
|
+# if len(current_word) > 0:
|
|
|
+# words.append(current_word)
|
|
|
+#
|
|
|
+# return words
|
|
|
+
|
|
|
+def split_words(text: str, jieba_usr_dict=None, **kwargs):
|
|
|
+ if jieba_usr_dict:
|
|
|
+ input_list = text.split()
|
|
|
+ token_list_all = []
|
|
|
+        language_list = []
|
|
|
+        token_list_tmp = []
|
|
|
+        language_flag = None
|
|
|
+        for token in input_list:
|
|
|
+            if isEnglish(token) and language_flag == 'Chinese':
|
|
|
+                token_list_all.append(token_list_tmp)
|
|
|
+                language_list.append('Chinese')
|
|
|
+                token_list_tmp = []
|
|
|
+            elif not isEnglish(token) and language_flag == 'English':
|
|
|
+                token_list_all.append(token_list_tmp)
|
|
|
+                language_list.append('English')
|
|
|
+                token_list_tmp = []
|
|
|
+
|
|
|
+            token_list_tmp.append(token)
|
|
|
+
|
|
|
+            if isEnglish(token):
|
|
|
+                language_flag = 'English'
|
|
|
+            else:
|
|
|
+                language_flag = 'Chinese'
|
|
|
+
|
|
|
+        if token_list_tmp:
|
|
|
+            token_list_all.append(token_list_tmp)
|
|
|
+            language_list.append(language_flag)
|
|
|
+
|
|
|
+        result_list = []
|
|
|
+        for token_list_tmp, language_flag in zip(token_list_all, language_list):
|
|
|
+ if language_flag == 'English':
|
|
|
+ result_list.extend(token_list_tmp)
|
|
|
else:
|
|
|
- # This is a Chinese char.
|
|
|
- if len(current_word) > 0:
|
|
|
- words.append(current_word)
|
|
|
- current_word = ""
|
|
|
- words.append(c)
|
|
|
- if len(current_word) > 0:
|
|
|
- words.append(current_word)
|
|
|
-
|
|
|
- return words
|
|
|
+ seg_list = jieba_usr_dict.cut(join_chinese_and_english(token_list_tmp), HMM=False)
|
|
|
+ result_list.extend(seg_list)
|
|
|
+
|
|
|
+ return result_list
|
|
|
+
|
|
|
+ else:
|
|
|
+ words = []
|
|
|
+ segs = text.split()
|
|
|
+ for seg in segs:
|
|
|
+ # There is no space in seg.
|
|
|
+ current_word = ""
|
|
|
+ for c in seg:
|
|
|
+ if len(c.encode()) == 1:
|
|
|
+ # This is an ASCII char.
|
|
|
+ current_word += c
|
|
|
+ else:
|
|
|
+ # This is a Chinese char.
|
|
|
+ if len(current_word) > 0:
|
|
|
+ words.append(current_word)
|
|
|
+ current_word = ""
|
|
|
+ words.append(c)
|
|
|
+ if len(current_word) > 0:
|
|
|
+ words.append(current_word)
|
|
|
+ return words
|
|
|
+
|
|
|
+def isEnglish(text: str):
|
|
|
+ if re.search('^[a-zA-Z\']+$', text):
|
|
|
+ return True
|
|
|
+ else:
|
|
|
+ return False
|
|
|
+
|
|
|
+def join_chinese_and_english(input_list):
|
|
|
+ line = ''
|
|
|
+ for token in input_list:
|
|
|
+ if isEnglish(token):
|
|
|
+ line = line + ' ' + token
|
|
|
+ else:
|
|
|
+ line = line + token
|
|
|
+
|
|
|
+ line = line.strip()
|
|
|
+ return line
|