fraction.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. import pynini
  2. from fun_text_processing.text_normalization.en.graph_utils import (
  3. DAMO_CHAR,
  4. DAMO_NOT_QUOTE,
  5. DAMO_NOT_SPACE,
  6. DAMO_SIGMA,
  7. DAMO_SPACE,
  8. GraphFst,
  9. delete_space,
  10. insert_space,
  11. )
  12. from fun_text_processing.text_normalization.es.graph_utils import (
  13. accents,
  14. shift_cardinal_gender,
  15. strip_cardinal_apocope,
  16. )
  17. from pynini.lib import pynutil
  class FractionFst(GraphFst):
      """
      Finite state transducer for verbalizing Spanish fractions.

      Fields consumed, in order: an optional `integer_part`, then `numerator`, then `denominator`.
      The denominator may be followed by `morphosyntactic_features` set to "add_root" (an " avo"
      stem is appended) or "ordinal" (used as is); with no feature the "<numerator> sobre
      <denominator>" reading is produced.
          e.g. (illustrative; the outer token wrapper is stripped by self.delete_tokens)
          integer_part: "treinta y tres" numerator: "cuatro" denominator: "quinto" morphosyntactic_features: "ordinal" ->
          treinta y tres y cuatro quintos

      Args:
          deterministic: if True will provide a single transduction option,
              for False multiple transduction are generated (used for audio-based normalization)
      """

      def __init__(self, deterministic: bool = True):
          super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
          allow_variants = not deterministic

          # ---- Small reusable pieces -------------------------------------------------
          unquoted = pynini.closure(DAMO_NOT_QUOTE)
          open_numerator = pynutil.delete('numerator: "')
          close_numerator = pynutil.delete('" ')
          open_denominator = pynutil.delete('denominator: "')
          respace = delete_space + insert_space  # drop whatever spacing is there, emit one space
          insert_sobre = pynutil.insert(" sobre ")
          insert_s = pynutil.insert("s")
          insert_y = pynutil.insert(" y ")
          # Derived denominators take 'avo' as a suffix; the leading space is a processing aid
          # that the merge rules below remove again.
          insert_avo = pynutil.insert(" avo")

          # ---- Field parsers ---------------------------------------------------------
          whole = (
              pynutil.delete('integer_part: "')
              + strip_cardinal_apocope(pynini.closure(DAMO_NOT_QUOTE))
              + pynutil.delete('"')
          )

          # The numerator "un" is kept apart from every other numerator because it selects
          # the singular denominator.
          num_un = open_numerator + pynini.accep("un") + close_numerator
          num_other = open_numerator + pynini.difference(unquoted, "un") + close_numerator

          den_avo = open_denominator + (
              unquoted + insert_avo + pynutil.delete('" morphosyntactic_features: "add_root"')
          )
          den_ordinal = open_denominator + (
              unquoted + pynutil.delete('" morphosyntactic_features: "ordinal"')
          )
          den_cardinal = open_denominator + (unquoted + pynutil.delete('"'))

          den_singular = den_avo | den_ordinal
          if allow_variants:
              # Occasional exceptions
              irregular = pynini.string_map([("once avo", "undécimo"), ("doce avo", "duodécimo")])
              den_singular = den_singular | (den_avo @ irregular)
          den_plural = den_singular + insert_s

          # ---- Rewrites that fuse a denominator into one word ------------------------
          # The conjunction " y " becomes "i", then remaining spaces are deleted.
          y_to_i = pynini.cdrewrite(pynini.cross(" y ", "i"), "", "", DAMO_SIGMA)
          drop_spaces = pynini.cdrewrite(
              delete_space, "", pynini.difference(DAMO_CHAR, "parte"), DAMO_SIGMA
          )
          merge = y_to_i @ drop_spaces

          # Fusing can leave doubled vowels, which the orthography does not allow.
          dedupe_vowels = pynini.cdrewrite(
              pynini.string_map([("aa", "a"), ("oo", "o")]), "", "", DAMO_SIGMA
          )

          # Apply the `accents` mapping only inside a word that ends in one of the
          # fractional/ordinal suffixes listed in the right context.
          left_of_accent = pynini.union(DAMO_SPACE, pynini.accep("[BOS]")) + pynini.closure(DAMO_NOT_SPACE)
          right_of_accent = pynini.closure(DAMO_NOT_SPACE) + pynini.union("avo", "ava", "ésimo", "ésima")
          strip_accents = pynini.cdrewrite(accents, left_of_accent, right_of_accent, DAMO_SIGMA)

          to_single_word = merge @ strip_accents @ dedupe_vowels

          # ---- Base readings ---------------------------------------------------------
          frac_plural = num_other + respace + (den_plural @ to_single_word)
          frac_un = num_un + respace + (den_singular @ to_single_word)
          frac_sobre = strip_cardinal_apocope(num_other | num_un) + (
              delete_space + insert_sobre + strip_cardinal_apocope(den_cardinal)
          )

          if allow_variants:
              # Alternative rendering where the ordinal is an adjective of 'parte', which needs
              # the feminine. Other rules manage "un" at the end, so only endings are handled.
              fix_tercia = pynini.cdrewrite(
                  pynini.string_map([("tercia", "tercera")]), "", "", DAMO_SIGMA
              )
              final_o_to_a = pynini.cdrewrite(
                  pynini.cross("o", "a"), "", pynini.accep("[EOS]"), DAMO_SIGMA
              )

              den_singular_f = shift_cardinal_gender(den_singular) @ final_o_to_a @ fix_tercia
              den_plural_f = den_singular_f + insert_s

              num_un_f = shift_cardinal_gender(num_un)
              num_other_f = shift_cardinal_gender(num_other)

              frac_sobre = frac_sobre | (
                  (num_un_f | num_other_f)
                  + delete_space
                  + insert_sobre
                  + shift_cardinal_gender(den_cardinal)
              )

              # Stems still need handling: only glue the suffix on, leaving other spacing alone.
              glue_stem = pynini.cdrewrite(
                  delete_space, "", pynini.union("avo", "ava", "avos", "avas"), DAMO_SIGMA
              )
              glue_stem = glue_stem @ (strip_accents @ dedupe_vowels)

              # Both spellings (stem-glued and fully merged) exist.
              una_parte = (
                  (num_un_f + respace)
                  + pynini.union(den_singular_f @ glue_stem, den_singular_f @ to_single_word)
                  + pynutil.insert(" parte")
              )
              # "media" not "una media"
              una_parte = una_parte @ pynini.cdrewrite(
                  pynini.cross("una media", "media"), "", "", DAMO_SIGMA
              )

              varias_partes = (
                  (num_other_f + respace)
                  + pynini.union(den_plural_f @ glue_stem, den_plural_f @ to_single_word)
                  + pynutil.insert(" partes")
              )

              frac_plural = pynini.union(
                  frac_plural,
                  num_other + respace + (den_plural @ glue_stem),  # case of no merger
                  varias_partes,
              )
              frac_un = pynini.union(
                  frac_un,
                  num_un + respace + (den_singular @ glue_stem),
                  una_parte,
              )

          # "medio" not "un medio"
          frac_un = frac_un @ pynini.cdrewrite(pynini.cross("un medio", "medio"), "", "", DAMO_SIGMA)

          # ---- Masculine graph -------------------------------------------------------
          fraction_masc = frac_un | frac_plural | frac_sobre
          graph_masc = pynini.closure(whole + delete_space + insert_y, 0, 1) + fraction_masc

          # ---- Feminine graph --------------------------------------------------------
          # Feminine gender only shows on the integer, except for "medio".
          whole_f = shift_cardinal_gender(whole)
          frac_plural_all = frac_plural | (
              shift_cardinal_gender(num_other)
              + respace
              + (den_plural @ pynini.cross("medios", "medias"))
          )
          frac_un_all = frac_un | (
              pynutil.delete(num_un) + delete_space + (den_singular @ pynini.cross("medio", "media"))
          )
          fraction_fem = frac_un_all | frac_plural_all | frac_sobre
          graph_fem = pynini.closure(whole_f + delete_space + insert_y, 0, 1) + fraction_fem

          # ---- Published attributes --------------------------------------------------
          self.graph_masc = pynini.optimize(graph_masc)
          self.graph_fem = pynini.optimize(graph_fem)
          self.graph = graph_masc | graph_fem

          self.fst = self.delete_tokens(self.graph).optimize()