time.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. import pynini
  2. from fun_text_processing.text_normalization.en.graph_utils import (
  3. DAMO_NOT_QUOTE,
  4. DAMO_SIGMA,
  5. GraphFst,
  6. delete_preserve_order,
  7. delete_space,
  8. insert_space,
  9. )
  10. from fun_text_processing.text_normalization.es.utils import get_abs_path
  11. from pynini.lib import pynutil
  12. alt_minutes = pynini.string_file(get_abs_path("data/time/alt_minutes.tsv"))
  13. morning_times = pynini.string_file(get_abs_path("data/time/morning_times.tsv"))
  14. afternoon_times = pynini.string_file(get_abs_path("data/time/afternoon_times.tsv"))
  15. evening_times = pynini.string_file(get_abs_path("data/time/evening_times.tsv"))
  16. class TimeFst(GraphFst):
  17. """
  18. Finite state transducer for verbalizing time, e.g.
  19. time { hours: "doce" minutes: "media" suffix: "a m" } -> doce y media de la noche
  20. time { hours: "doce" } -> twelve o'clock
  21. Args:
  22. deterministic: if True will provide a single transduction option,
  23. for False multiple transduction are generated (used for audio-based normalization)
  24. """
  25. def __init__(self, deterministic: bool = True):
  26. super().__init__(name="time", kind="verbalize", deterministic=deterministic)
  27. change_minutes = pynini.cdrewrite(alt_minutes, pynini.accep("[BOS]"), pynini.accep("[EOS]"), DAMO_SIGMA)
  28. morning_phrases = pynini.cross("am", "de la mañana")
  29. afternoon_phrases = pynini.cross("pm", "de la tarde")
  30. evening_phrases = pynini.cross("pm", "de la noche")
  31. # For the 12's
  32. mid_times = pynini.accep("doce")
  33. mid_phrases = (
  34. pynini.string_map([("pm", "del mediodía"), ("am", "de la noche")])
  35. if deterministic
  36. else pynini.string_map(
  37. [
  38. ("pm", "de la mañana"),
  39. ("pm", "del día"),
  40. ("pm", "del mediodía"),
  41. ("am", "de la noche"),
  42. ("am", "de la medianoche"),
  43. ]
  44. )
  45. )
  46. hour = (
  47. pynutil.delete("hours:")
  48. + delete_space
  49. + pynutil.delete("\"")
  50. + pynini.closure(DAMO_NOT_QUOTE, 1)
  51. + pynutil.delete("\"")
  52. )
  53. minute = (
  54. pynutil.delete("minutes:")
  55. + delete_space
  56. + pynutil.delete("\"")
  57. + pynini.closure(DAMO_NOT_QUOTE, 1)
  58. + pynutil.delete("\"")
  59. )
  60. minute = (minute @ change_minutes) if deterministic else pynini.union(minute, minute @ change_minutes)
  61. suffix = (
  62. pynutil.delete("suffix:")
  63. + delete_space
  64. + pynutil.delete("\"")
  65. + pynini.closure(DAMO_NOT_QUOTE, 1)
  66. + pynutil.delete("\"")
  67. )
  68. zone = (
  69. pynutil.delete("zone:")
  70. + delete_space
  71. + pynutil.delete("\"")
  72. + pynini.closure(DAMO_NOT_QUOTE, 1)
  73. + pynutil.delete("\"")
  74. )
  75. optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
  76. second = (
  77. pynutil.delete("seconds:")
  78. + delete_space
  79. + pynutil.delete("\"")
  80. + pynini.closure(DAMO_NOT_QUOTE, 1)
  81. + pynutil.delete("\"")
  82. )
  83. graph_hms = (
  84. hour
  85. + pynutil.insert(" horas ")
  86. + delete_space
  87. + minute
  88. + pynutil.insert(" minutos y ")
  89. + delete_space
  90. + second
  91. + pynutil.insert(" segundos")
  92. )
  93. graph_hm = hour + delete_space + pynutil.insert(" y ") + minute
  94. graph_hm |= pynini.union(
  95. (hour @ morning_times)
  96. + delete_space
  97. + pynutil.insert(" y ")
  98. + minute
  99. + delete_space
  100. + insert_space
  101. + (suffix @ morning_phrases),
  102. (hour @ afternoon_times)
  103. + delete_space
  104. + pynutil.insert(" y ")
  105. + minute
  106. + delete_space
  107. + insert_space
  108. + (suffix @ afternoon_phrases),
  109. (hour @ evening_times)
  110. + delete_space
  111. + pynutil.insert(" y ")
  112. + minute
  113. + delete_space
  114. + insert_space
  115. + (suffix @ evening_phrases),
  116. (hour @ mid_times)
  117. + delete_space
  118. + pynutil.insert(" y ")
  119. + minute
  120. + delete_space
  121. + insert_space
  122. + (suffix @ mid_phrases),
  123. )
  124. graph_h = pynini.union(
  125. hour,
  126. (hour @ morning_times) + delete_space + insert_space + (suffix @ morning_phrases),
  127. (hour @ afternoon_times) + delete_space + insert_space + (suffix @ afternoon_phrases),
  128. (hour @ evening_times) + delete_space + insert_space + (suffix @ evening_phrases),
  129. (hour @ mid_times) + delete_space + insert_space + (suffix @ mid_phrases),
  130. )
  131. graph = (graph_hms | graph_hm | graph_h) + optional_zone
  132. if not deterministic:
  133. graph_style_1 = pynutil.delete(" style: \"1\"")
  134. graph_style_2 = pynutil.delete(" style: \"2\"")
  135. graph_menos = hour + delete_space + pynutil.insert(" menos ") + minute + graph_style_1
  136. graph_menos |= (
  137. (hour @ morning_times)
  138. + delete_space
  139. + pynutil.insert(" menos ")
  140. + minute
  141. + delete_space
  142. + insert_space
  143. + (suffix @ morning_phrases)
  144. + graph_style_1
  145. )
  146. graph_menos |= (
  147. (hour @ afternoon_times)
  148. + delete_space
  149. + pynutil.insert(" menos ")
  150. + minute
  151. + delete_space
  152. + insert_space
  153. + (suffix @ afternoon_phrases)
  154. + graph_style_1
  155. )
  156. graph_menos |= (
  157. (hour @ evening_times)
  158. + delete_space
  159. + pynutil.insert(" menos ")
  160. + minute
  161. + delete_space
  162. + insert_space
  163. + (suffix @ evening_phrases)
  164. + graph_style_1
  165. )
  166. graph_menos |= (
  167. (hour @ mid_times)
  168. + delete_space
  169. + pynutil.insert(" menos ")
  170. + minute
  171. + delete_space
  172. + insert_space
  173. + (suffix @ mid_phrases)
  174. + graph_style_1
  175. )
  176. graph_menos += optional_zone
  177. graph_para = minute + pynutil.insert(" para las ") + delete_space + hour + graph_style_2
  178. graph_para |= (
  179. minute
  180. + pynutil.insert(" para las ")
  181. + delete_space
  182. + (hour @ morning_times)
  183. + delete_space
  184. + insert_space
  185. + (suffix @ morning_phrases)
  186. + graph_style_2
  187. )
  188. graph_para |= (
  189. minute
  190. + pynutil.insert(" para las ")
  191. + delete_space
  192. + (hour @ afternoon_times)
  193. + delete_space
  194. + insert_space
  195. + (suffix @ afternoon_phrases)
  196. + graph_style_2
  197. )
  198. graph_para |= (
  199. minute
  200. + pynutil.insert(" para las ")
  201. + delete_space
  202. + (hour @ evening_times)
  203. + delete_space
  204. + insert_space
  205. + (suffix @ evening_phrases)
  206. + graph_style_2
  207. )
  208. graph_para |= (
  209. minute
  210. + pynutil.insert(" para las ")
  211. + delete_space
  212. + (hour @ mid_times)
  213. + delete_space
  214. + insert_space
  215. + (suffix @ mid_phrases)
  216. + graph_style_2
  217. )
  218. graph_para += optional_zone
  219. graph_para @= pynini.cdrewrite(
  220. pynini.cross(" las ", " la "), "para", "una", DAMO_SIGMA
  221. ) # Need agreement with one
  222. graph |= graph_menos | graph_para
  223. delete_tokens = self.delete_tokens(graph + delete_preserve_order)
  224. self.fst = delete_tokens.optimize()