proce_text.py 689 B

12345678910111213141516171819202122232425262728293031
  1. import sys
  2. import re
  3. in_f = sys.argv[1]
  4. out_f = sys.argv[2]
  5. with open(in_f, "r", encoding="utf-8") as f:
  6. lines = f.readlines()
  7. with open(out_f, "w", encoding="utf-8") as f:
  8. for line in lines:
  9. outs = line.strip().split(" ", 1)
  10. if len(outs) == 2:
  11. idx, text = outs
  12. text = re.sub("</s>", "", text)
  13. text = re.sub("<s>", "", text)
  14. text = re.sub("@@", "", text)
  15. text = re.sub("@", "", text)
  16. text = re.sub("<unk>", "", text)
  17. text = re.sub(" ", "", text)
  18. text = text.lower()
  19. else:
  20. idx = outs[0]
  21. text = " "
  22. text = [x for x in text]
  23. text = " ".join(text)
  24. out = "{} {}\n".format(idx, text)
  25. f.write(out)