tokenize.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. #!/usr/bin/env python
  2. import re
  3. import numpy as np
  4. def forward_segment(text, seg_dict):
  5. word_list = []
  6. i = 0
  7. while i < len(text):
  8. longest_word = text[i]
  9. for j in range(i + 1, len(text) + 1):
  10. word = text[i:j]
  11. if word in seg_dict:
  12. if len(word) > len(longest_word):
  13. longest_word = word
  14. word_list.append(longest_word)
  15. i += len(longest_word)
  16. return word_list
  17. def seg_tokenize(txt, seg_dict):
  18. out_txt = ""
  19. pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
  20. for word in txt:
  21. if pattern.match(word):
  22. if word in seg_dict:
  23. out_txt += seg_dict[word] + " "
  24. else:
  25. out_txt += "<unk>" + " "
  26. else:
  27. continue
  28. return out_txt.strip().split()
  29. def tokenize(data,
  30. vocab=None,
  31. seg_dict=None):
  32. assert "text" in data
  33. assert isinstance(vocab, dict)
  34. text = data["text"]
  35. token = []
  36. if seg_dict is not None:
  37. assert isinstance(seg_dict, dict)
  38. txt = forward_segment("".join(text).lower(), seg_dict)
  39. text = seg_tokenize(txt, seg_dict)
  40. for x in text:
  41. if x in vocab:
  42. token.append(vocab[x])
  43. else:
  44. token.append(vocab['<unk>'])
  45. data["text"] = np.array(token)
  46. return data