# text_tokenize.py

import re
import argparse

def load_dict(seg_file):
    """Load a word -> token-sequence mapping from a segmentation file."""
    seg_dict = {}
    with open(seg_file, 'r') as infile:
        for line in infile:
            s = line.strip().split()
            key = s[0]      # the word itself
            value = s[1:]   # its tokens
            seg_dict[key] = " ".join(value)
    return seg_dict
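
# Illustrative only (the entry below is hypothetical): load_dict expects one
# mapping per line, a word followed by its tokens, whitespace-separated, e.g.
#   hello h e l l o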

def forward_segment(text, dic):
    """Segment text by forward maximum matching.

    At each position, take the longest dictionary entry that starts there,
    falling back to a single character when nothing matches.
    """
    word_list = []
    i = 0
    while i < len(text):
        longest_word = text[i]
        for j in range(i + 1, len(text) + 1):
            word = text[i:j]
            if word in dic and len(word) > len(longest_word):
                longest_word = word
        word_list.append(longest_word)
        i += len(longest_word)
    return word_list
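
# A minimal sketch of the matching behavior, with a hypothetical dictionary
# (only key membership matters here):
#   forward_segment("abcd", {"ab": 1, "abc": 1}) -> ["abc", "d"]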

def tokenize(txt, seg_dict):
    """Map each segmented word to its token sequence, or <unk> if unknown.

    Words that do not start with a CJK or alphanumeric character
    (e.g. punctuation) are dropped.
    """
    out_txt = ""
    pattern = re.compile(r"[\u4E00-\u9FA5A-Za-z0-9]")
    for word in txt:
        if pattern.match(word):
            if word in seg_dict:
                out_txt += seg_dict[word] + " "
            else:
                out_txt += "<unk> "
    return out_txt.strip()

def get_parser():
    parser = argparse.ArgumentParser(
        description="text tokenize",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--text-file",
        "-t",
        required=True,
        type=str,
        help="input text file, one '<id> <text>' line per entry",
    )
    parser.add_argument(
        "--seg-file",
        "-s",
        required=True,
        type=str,
        help="segmentation dictionary file",
    )
    parser.add_argument(
        "--txt-index",
        "-i",
        default=1,
        type=int,
        help="index used in the output file names",
    )
    parser.add_argument(
        "--output-dir",
        "-o",
        required=True,
        type=str,
        help="output directory",
    )
    return parser

def main():
    parser = get_parser()
    args = parser.parse_args()
    seg_dict = load_dict(args.seg_file)
    with open(args.text_file, 'r') as infile, \
         open("{}/text.{}.txt".format(args.output_dir, args.txt_index), 'w') as txt_writer, \
         open("{}/len.{}".format(args.output_dir, args.txt_index), 'w') as shape_writer:
        for line in infile:
            s = line.strip().split()
            text_id = s[0]  # first field is the line id; the rest is the text
            text_list = forward_segment("".join(s[1:]).lower(), seg_dict)
            text = tokenize(text_list, seg_dict)
            lens = len(text.split())
            txt_writer.write(text_id + " " + text + "\n")
            shape_writer.write(text_id + " " + str(lens) + "\n")


if __name__ == '__main__':
    main()
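
# Example invocation (paths here are hypothetical):
#   python text_tokenize.py -t text.txt -s seg.dict -i 1 -o out
# writes out/text.1.txt with one '<id> <tokens...>' line per input line, and
# out/len.1 with the matching '<id> <token count>' lines.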