text2token.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. #!/usr/bin/env python3
  2. # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
  3. # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
  4. import argparse
  5. import codecs
  6. import re
  7. import sys
  8. is_python2 = sys.version_info[0] == 2
  9. def exist_or_not(i, match_pos):
  10. start_pos = None
  11. end_pos = None
  12. for pos in match_pos:
  13. if pos[0] <= i < pos[1]:
  14. start_pos = pos[0]
  15. end_pos = pos[1]
  16. break
  17. return start_pos, end_pos
  18. def get_parser():
  19. parser = argparse.ArgumentParser(
  20. description="convert raw text to tokenized text",
  21. formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  22. )
  23. parser.add_argument(
  24. "--nchar",
  25. "-n",
  26. default=1,
  27. type=int,
  28. help="number of characters to split, i.e., \
  29. aabb -> a a b b with -n 1 and aa bb with -n 2",
  30. )
  31. parser.add_argument(
  32. "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
  33. )
  34. parser.add_argument("--space", default="<space>", type=str, help="space symbol")
  35. parser.add_argument(
  36. "--non-lang-syms",
  37. "-l",
  38. default=None,
  39. type=str,
  40. help="list of non-linguistic symobles, e.g., <NOISE> etc.",
  41. )
  42. parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
  43. parser.add_argument(
  44. "--trans_type",
  45. "-t",
  46. type=str,
  47. default="char",
  48. choices=["char", "phn"],
  49. help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
  50. If trans_type is char,
  51. read from SI1279.WRD file -> "bricks are an alternative"
  52. Else if trans_type is phn,
  53. read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
  54. sil t er n ih sil t ih v sil" """,
  55. )
  56. return parser
  57. def main():
  58. parser = get_parser()
  59. args = parser.parse_args()
  60. rs = []
  61. if args.non_lang_syms is not None:
  62. with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
  63. nls = [x.rstrip() for x in f.readlines()]
  64. rs = [re.compile(re.escape(x)) for x in nls]
  65. if args.text:
  66. f = codecs.open(args.text, encoding="utf-8")
  67. else:
  68. f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
  69. sys.stdout = codecs.getwriter("utf-8")(
  70. sys.stdout if is_python2 else sys.stdout.buffer
  71. )
  72. line = f.readline()
  73. n = args.nchar
  74. while line:
  75. x = line.split()
  76. print(" ".join(x[: args.skip_ncols]), end=" ")
  77. a = " ".join(x[args.skip_ncols :])
  78. # get all matched positions
  79. match_pos = []
  80. for r in rs:
  81. i = 0
  82. while i >= 0:
  83. m = r.search(a, i)
  84. if m:
  85. match_pos.append([m.start(), m.end()])
  86. i = m.end()
  87. else:
  88. break
  89. if args.trans_type == "phn":
  90. a = a.split(" ")
  91. else:
  92. if len(match_pos) > 0:
  93. chars = []
  94. i = 0
  95. while i < len(a):
  96. start_pos, end_pos = exist_or_not(i, match_pos)
  97. if start_pos is not None:
  98. chars.append(a[start_pos:end_pos])
  99. i = end_pos
  100. else:
  101. chars.append(a[i])
  102. i += 1
  103. a = chars
  104. a = [a[j : j + n] for j in range(0, len(a), n)]
  105. a_flat = []
  106. for z in a:
  107. a_flat.append("".join(z))
  108. a_chars = [z.replace(" ", args.space) for z in a_flat]
  109. if args.trans_type == "phn":
  110. a_chars = [z.replace("sil", args.space) for z in a_chars]
  111. print(" ".join(a_chars))
  112. line = f.readline()
# Run the tokenizer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()