| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- import pynini
- from fun_text_processing.text_normalization.en.graph_utils import (
- DAMO_CHAR,
- DAMO_NOT_QUOTE,
- DAMO_NOT_SPACE,
- DAMO_SIGMA,
- DAMO_SPACE,
- GraphFst,
- delete_space,
- insert_space,
- )
- from fun_text_processing.text_normalization.es.graph_utils import (
- accents,
- shift_cardinal_gender,
- strip_cardinal_apocope,
- )
- from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing Spanish fractions.

    Reads the serialized fields ``integer_part`` (optional), ``numerator`` and
    ``denominator``; the denominator may be tagged with
    ``morphosyntactic_features: "add_root"`` or ``"ordinal"``.
        e.g. tokens { fraction { integer_part: "treinta y tres" numerator: "cuatro" denominator: "quinto" morphosyntactic_features: "ordinal" } } ->
            treinta y tres y cuatro quintos

    Exposes ``graph_masc``, ``graph_fem``, ``graph`` (their union) and ``fst``.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        # Pieces that are inserted into the output. The "avo" suffix is inserted with a
        # leading space; the merge rewrites further down remove that space again.
        avo_suffix = pynutil.insert(" avo")
        plural_s = pynutil.insert("s")
        y_joiner = pynutil.insert(" y ")

        # ---- Field readers: strip the field name and quotes, keep the payload ----
        whole_part = (
            pynutil.delete('integer_part: "')
            + strip_cardinal_apocope(pynini.closure(DAMO_NOT_QUOTE))
            + pynutil.delete('"')
        )

        # The numerator "un" is handled apart from every other numerator.
        num_un = pynutil.delete('numerator: "') + pynini.accep("un") + pynutil.delete('" ')
        num_other = (
            pynutil.delete('numerator: "')
            + pynini.difference(pynini.closure(DAMO_NOT_QUOTE), "un")
            + pynutil.delete('" ')
        )

        # Denominator tagged "add_root": payload followed by the inserted " avo".
        den_rooted = pynutil.delete('denominator: "') + (
            pynini.closure(DAMO_NOT_QUOTE)
            + avo_suffix
            + pynutil.delete('" morphosyntactic_features: "add_root"')
        )
        # Denominator tagged "ordinal": payload passed through unchanged.
        den_ordinal = pynutil.delete('denominator: "') + (
            pynini.closure(DAMO_NOT_QUOTE) + pynutil.delete('" morphosyntactic_features: "ordinal"')
        )
        # Untagged denominator: used for the "<numerator> sobre <denominator>" reading.
        den_plain = pynutil.delete('denominator: "') + (
            pynini.closure(DAMO_NOT_QUOTE) + pynutil.delete('"')
        )

        den_single = den_rooted | den_ordinal
        if not deterministic:
            # Irregular alternatives offered next to the regular "-avo" forms
            den_single |= den_rooted @ pynini.string_map(
                [("once avo", "undécimo"), ("doce avo", "duodécimo")]
            )
        den_many = den_single + plural_s

        # ---- Rewrites that collapse the denominator into one word ----
        # The conjunction " y " becomes "i" ...
        glue = pynini.cdrewrite(pynini.cross(" y ", "i"), "", "", DAMO_SIGMA)
        # ... and spaces are deleted.
        # NOTE(review): the right context subtracts the string "parte" from DAMO_CHAR;
        # presumably this was meant to protect the space before "parte" - confirm.
        glue @= pynini.cdrewrite(delete_space, "", pynini.difference(DAMO_CHAR, "parte"), DAMO_SIGMA)

        # Gluing words together can leave "aa" / "oo"; reduce each to a single vowel.
        vowel_pairs = pynini.string_map([("aa", "a"), ("oo", "o")])
        dedupe_vowels = pynini.cdrewrite(vowel_pairs, "", "", DAMO_SIGMA)

        # Apply the `accents` mapping inside a word that ends in avo/ava/ésimo/ésima.
        strip_accents = pynini.cdrewrite(
            accents,
            pynini.union(DAMO_SPACE, pynini.accep("[BOS]")) + pynini.closure(DAMO_NOT_SPACE),
            pynini.closure(DAMO_NOT_SPACE) + pynini.union("avo", "ava", "ésimo", "ésima"),
            DAMO_SIGMA,
        )

        to_single_word = glue @ strip_accents @ dedupe_vowels

        # ---- Base readings ----
        # Numerator other than "un" -> plural denominator
        frac_plural = num_other + delete_space + insert_space + (den_many @ to_single_word)
        # Numerator "un" -> singular denominator
        frac_single = num_un + delete_space + insert_space + (den_single @ to_single_word)

        # "<numerator> sobre <denominator>" with an untagged denominator
        frac_sobre = strip_cardinal_apocope(pynini.union(num_other, num_un))
        frac_sobre += delete_space + pynutil.insert(" sobre ") + strip_cardinal_apocope(den_plain)

        if not deterministic:
            # Alternative rendering in which the ordinal modifies "parte(s)", which calls for
            # feminine forms. Only the endings are adjusted here.
            irregular_fem = pynini.string_map([("tercia", "tercera")])
            fix_irregular_fem = pynini.cdrewrite(irregular_fem, "", "", DAMO_SIGMA)
            final_o_to_a = pynini.cdrewrite(pynini.cross("o", "a"), "", pynini.accep("[EOS]"), DAMO_SIGMA)

            den_single_fem = shift_cardinal_gender(den_single) @ final_o_to_a @ fix_irregular_fem
            den_many_fem = den_single_fem + plural_s

            num_un_fem = shift_cardinal_gender(num_un)
            num_other_fem = shift_cardinal_gender(num_other)

            # Feminine variant of the "sobre" reading
            frac_sobre |= (
                (num_un_fem | num_other_fem)
                + delete_space
                + pynutil.insert(" sobre ")
                + shift_cardinal_gender(den_plain)
            )

            # Alternative spacing: only the space in front of the avo/ava(s) stem is removed.
            attach_stem = pynini.cdrewrite(
                delete_space, "", pynini.union("avo", "ava", "avos", "avas"), DAMO_SIGMA
            )
            attach_stem @= strip_accents @ dedupe_vowels

            # "una <denominator> parte"; both the stem-attached and the fully merged form are accepted
            frac_single_fem = num_un_fem + delete_space + insert_space
            frac_single_fem += pynini.union(den_single_fem @ attach_stem, den_single_fem @ to_single_word)
            frac_single_fem += pynutil.insert(" parte")
            # "media", never "una media"
            frac_single_fem @= pynini.cdrewrite(pynini.cross("una media", "media"), "", "", DAMO_SIGMA)

            # "<numerator> <denominator>s partes"
            frac_plural_fem = num_other_fem + delete_space + insert_space
            frac_plural_fem += pynini.union(den_many_fem @ attach_stem, den_many_fem @ to_single_word)
            frac_plural_fem += pynutil.insert(" partes")

            # Masculine forms without the full merge, followed by the feminine alternatives
            frac_plural |= num_other + delete_space + insert_space + (den_many @ attach_stem)
            frac_plural |= frac_plural_fem

            frac_single |= num_un + delete_space + insert_space + (den_single @ attach_stem)
            frac_single |= frac_single_fem

        # "medio", never "un medio"
        frac_single @= pynini.cdrewrite(pynini.cross("un medio", "medio"), "", "", DAMO_SIGMA)

        # ---- Masculine graph: optional "<integer> y " followed by any fraction reading ----
        any_fraction = frac_single | frac_plural | frac_sobre
        masc = pynini.closure(whole_part + delete_space + y_joiner, 0, 1) + any_fraction

        # ---- Feminine graph ----
        # Gender is shifted on the integer; within the fraction only "medio(s)" gets a
        # feminine counterpart. These alternatives are added after `masc` has been built,
        # so they are part of the feminine graph only.
        whole_part_fem = shift_cardinal_gender(whole_part)
        frac_plural |= (
            shift_cardinal_gender(num_other)
            + delete_space
            + insert_space
            + (den_many @ pynini.cross("medios", "medias"))
        )
        # The numerator "un" is dropped altogether in front of "media"
        frac_single |= pynutil.delete(num_un) + delete_space + (den_single @ pynini.cross("medio", "media"))

        any_fraction_fem = frac_single | frac_plural | frac_sobre
        fem = pynini.closure(whole_part_fem + delete_space + y_joiner, 0, 1) + any_fraction_fem

        self.graph_masc = pynini.optimize(masc)
        self.graph_fem = pynini.optimize(fem)

        # Combined graph accepting either gender
        self.graph = masc | fem

        # Final verbalizer FST: the combined graph passed through the inherited delete_tokens helper
        token_free = self.delete_tokens(self.graph)
        self.fst = token_free.optimize()
|