# Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import pynini
  15. from fun_text_processing.text_normalization.en.graph_utils import DAMO_SIGMA, DAMO_SPACE
  16. from fun_text_processing.text_normalization.es import LOCALIZATION
  17. from fun_text_processing.text_normalization.es.utils import get_abs_path, load_labels
  18. from pynini.lib import pynutil
  19. digits = pynini.project(pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input")
  20. tens = pynini.project(pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input")
  21. teens = pynini.project(pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input")
  22. twenties = pynini.project(pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input")
  23. hundreds = pynini.project(pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input")
  24. accents = pynini.string_map([("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")])
  25. if LOCALIZATION == "am": # Setting localization for central and northern america formatting
  26. cardinal_separator = pynini.string_map([",", DAMO_SPACE])
  27. decimal_separator = pynini.accep(".")
  28. else:
  29. cardinal_separator = pynini.string_map([".", DAMO_SPACE])
  30. decimal_separator = pynini.accep(",")
  31. ones = pynini.union("un", "ún")
  32. fem_ones = pynini.union(pynini.cross("un", "una"), pynini.cross("ún", "una"), pynini.cross("uno", "una"))
  33. one_to_one_hundred = pynini.union(digits, "uno", tens, teens, twenties, tens + pynini.accep(" y ") + digits)
  34. fem_hundreds = hundreds @ pynini.cdrewrite(pynini.cross("ientos", "ientas"), "", "", DAMO_SIGMA)
  35. def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  36. """
  37. Converts all accented vowels to non-accented equivalents
  38. Args:
  39. fst: Any fst. Composes vowel conversion onto fst's output strings
  40. """
  41. return fst @ pynini.cdrewrite(accents, "", "", DAMO_SIGMA)
  42. def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  43. """
  44. Applies gender conversion rules to a cardinal string. These include: rendering all masculine forms of "uno" (including apocopated forms) as "una" and
  45. Converting all gendered numbers in the hundreds series (200,300,400...) to feminine equivalent (e.g. "doscientos" -> "doscientas"). Conversion only applies
  46. to value place for <1000 and multiple of 1000. (e.g. "doscientos mil doscientos" -> "doscientas mil doscientas".) For place values greater than the thousands, there
  47. is no gender shift as the higher powers of ten ("millones", "billones") are masculine nouns and any conversion would be formally
  48. ungrammatical.
  49. e.g.
  50. "doscientos" -> "doscientas"
  51. "doscientos mil" -> "doscientas mil"
  52. "doscientos millones" -> "doscientos millones"
  53. "doscientos mil millones" -> "doscientos mil millones"
  54. "doscientos millones doscientos mil doscientos" -> "doscientos millones doscientas mil doscientas"
  55. Args:
  56. fst: Any fst. Composes conversion onto fst's output strings
  57. """
  58. before_mil = (
  59. DAMO_SPACE
  60. + (pynini.accep("mil") | pynini.accep("milésimo"))
  61. + pynini.closure(DAMO_SPACE + hundreds, 0, 1)
  62. + pynini.closure(DAMO_SPACE + one_to_one_hundred, 0, 1)
  63. + pynini.union(pynini.accep("[EOS]"), pynini.accep("\""), decimal_separator)
  64. )
  65. before_double_digits = pynini.closure(DAMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
  66. pynini.accep("[EOS]"), pynini.accep("\"")
  67. )
  68. fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, DAMO_SIGMA) # doscientas mil dosciento
  69. fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, DAMO_SIGMA) # doscientas mil doscienta
  70. fem_allign @= pynini.cdrewrite(
  71. fem_ones, "", pynini.union("[EOS]", "\"", decimal_separator), DAMO_SIGMA
  72. ) # If before a quote or EOS, we know it's the end of a string
  73. return fst @ fem_allign
  74. def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  75. """
  76. Performs gender conversion on all verbalized numbers in output. All values in the hundreds series (200,300,400) are changed to
  77. feminine gender (e.g. "doscientos" -> "doscientas") and all forms of "uno" (including apocopated forms) are converted to "una".
  78. This has no boundary restriction and will perform shift across all values in output string.
  79. e.g.
  80. "doscientos" -> "doscientas"
  81. "doscientos millones" -> "doscientas millones"
  82. "doscientos millones doscientos" -> "doscientas millones doscientas"
  83. Args:
  84. fst: Any fst. Composes conversion onto fst's output strings
  85. """
  86. fem_allign = pynini.cdrewrite(fem_hundreds, "", "", DAMO_SIGMA)
  87. fem_allign @= pynini.cdrewrite(
  88. fem_ones, "", pynini.union(DAMO_SPACE, pynini.accep("[EOS]"), pynini.accep("\"")), DAMO_SIGMA
  89. ) # If before a quote or EOS, we know it's the end of a string
  90. return fst @ fem_allign
  91. def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  92. """
  93. Reverts apocope on cardinal strings in line with formation rules. e.g. "un" -> "uno". Due to cardinal formation rules, this in effect only
  94. affects strings where the final value is a variation of "un".
  95. e.g.
  96. "un" -> "uno"
  97. "veintiún" -> "veintiuno"
  98. Args:
  99. fst: Any fst. Composes conversion onto fst's output strings
  100. """
  101. # Since cardinals use apocope by default for large values (e.g. "millón"), this only needs to act on the last instance of one
  102. strip = pynini.cross("un", "uno") | pynini.cross("ún", "uno")
  103. strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), DAMO_SIGMA)
  104. return fst @ strip
  105. def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  106. """
  107. Adds apocope on cardinal strings in line with stressing rules. e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This is not predictable
  108. with text string, so is included for non-deterministic cases.
  109. e.g.
  110. "una" -> "un"
  111. "veintiuna" -> "veintiun"
  112. Args:
  113. fst: Any fst. Composes conversion onto fst's output strings
  114. """
  115. # Since the stress trigger follows the cardinal string and only affects the preceding sound, this only needs to act on the last instance of one
  116. strip = pynini.cross("una", "un") | pynini.cross("veintiuna", "veintiún")
  117. strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), DAMO_SIGMA)
  118. return fst @ strip
  119. def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
  120. """
  121. Alters given fst to convert Roman integers (lower and upper cased) into Arabic numerals. Valid for values up to 1000.
  122. e.g.
  123. "V" -> "5"
  124. "i" -> "1"
  125. Args:
  126. fst: Any fst. Composes fst onto Roman conversion outputs.
  127. """
  128. def _load_roman(file: str):
  129. roman = load_labels(get_abs_path(file))
  130. roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y) for x, y in roman]
  131. return pynini.string_map(roman_numerals)
  132. digit = _load_roman("data/roman/digit.tsv")
  133. ties = _load_roman("data/roman/ties.tsv")
  134. hundreds = _load_roman("data/roman/hundreds.tsv")
  135. graph = (
  136. digit
  137. | ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
  138. | (
  139. hundreds
  140. + (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
  141. + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
  142. )
  143. ).optimize()
  144. return graph @ fst