| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245 |
- import pynini
- from fun_text_processing.text_normalization.en.graph_utils import (
- DAMO_NOT_QUOTE,
- DAMO_SIGMA,
- GraphFst,
- delete_preserve_order,
- delete_space,
- insert_space,
- )
- from fun_text_processing.text_normalization.es.utils import get_abs_path
- from pynini.lib import pynutil
# Lookup tables for the time verbalizer, each compiled from a .tsv file into an
# FST: alternative minute wordings, plus one hour table per day period.
alt_minutes, morning_times, afternoon_times, evening_times = (
    pynini.string_file(get_abs_path(tsv_path))
    for tsv_path in (
        "data/time/alt_minutes.tsv",
        "data/time/morning_times.tsv",
        "data/time/afternoon_times.tsv",
        "data/time/evening_times.tsv",
    )
)
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
        time { hours: "doce" minutes: "media" suffix: "am" } -> doce y media de la noche
        time { hours: "doce" } -> doce

    The ``suffix`` value is matched literally against "am" / "pm" and replaced by a
    day-period phrase; which phrase applies depends on the hour table
    (morning / afternoon / evening / "doce") that accepts the ``hours`` value.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        def quoted_field(key: str):
            """Return an FST that strips ``key: "value"`` down to the bare, non-empty value."""
            return (
                pynutil.delete(f"{key}:")
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        # Anchored at [BOS]/[EOS], so a minutes value is only rewritten when an
        # alt_minutes entry spans the whole string.
        change_minutes = pynini.cdrewrite(alt_minutes, pynini.accep("[BOS]"), pynini.accep("[EOS]"), DAMO_SIGMA)

        morning_phrases = pynini.cross("am", "de la mañana")
        afternoon_phrases = pynini.cross("pm", "de la tarde")
        evening_phrases = pynini.cross("pm", "de la noche")

        # For the 12's
        mid_times = pynini.accep("doce")
        mid_phrases = (
            pynini.string_map([("pm", "del mediodía"), ("am", "de la noche")])
            if deterministic
            else pynini.string_map(
                [
                    ("pm", "de la mañana"),
                    ("pm", "del día"),
                    ("pm", "del mediodía"),
                    ("am", "de la noche"),
                    ("am", "de la medianoche"),
                ]
            )
        )

        hour = quoted_field("hours")
        minute = quoted_field("minutes")
        # Deterministic mode always applies the minutes rewrite; otherwise both the
        # raw and the rewritten wording are offered.
        minute = (minute @ change_minutes) if deterministic else pynini.union(minute, minute @ change_minutes)
        suffix = quoted_field("suffix")
        zone = quoted_field("zone")
        optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
        second = quoted_field("seconds")

        graph_hms = (
            hour
            + pynutil.insert(" horas ")
            + delete_space
            + minute
            + pynutil.insert(" minutos y ")
            + delete_space
            + second
            + pynutil.insert(" segundos")
        )

        # One (hour limited to a day period, " " + period phrase) pair per period.
        # Every reading below reuses these pairs, in this order.
        period_variants = [
            (hour @ times, delete_space + insert_space + (suffix @ phrases))
            for times, phrases in (
                (morning_times, morning_phrases),
                (afternoon_times, afternoon_phrases),
                (evening_times, evening_phrases),
                (mid_times, mid_phrases),
            )
        ]

        # "<hour> y <minute>", with or without a day-period phrase.
        and_sep = delete_space + pynutil.insert(" y ")
        graph_hm = pynini.union(
            hour + and_sep + minute,
            *(
                period_hour + and_sep + minute + period_suffix
                for period_hour, period_suffix in period_variants
            ),
        )

        # Hour only, with or without a day-period phrase. Built with pynini.union
        # (not ``|=``) so the shared ``hour`` FST is never mutated in place.
        graph_h = pynini.union(
            hour,
            *(period_hour + period_suffix for period_hour, period_suffix in period_variants),
        )

        graph = (graph_hms | graph_hm | graph_h) + optional_zone

        if not deterministic:
            # Style markers are consumed here; each one selects an alternative reading.
            graph_style_1 = pynutil.delete(' style: "1"')
            graph_style_2 = pynutil.delete(' style: "2"')

            # Style 1: "<hour> menos <minute>".
            menos_sep = delete_space + pynutil.insert(" menos ")
            graph_menos = (
                pynini.union(
                    hour + menos_sep + minute + graph_style_1,
                    *(
                        period_hour + menos_sep + minute + period_suffix + graph_style_1
                        for period_hour, period_suffix in period_variants
                    ),
                )
                + optional_zone
            )

            # Style 2: "<minute> para las <hour>".
            para_sep = pynutil.insert(" para las ") + delete_space
            graph_para = (
                pynini.union(
                    minute + para_sep + hour + graph_style_2,
                    *(
                        minute + para_sep + period_hour + period_suffix + graph_style_2
                        for period_hour, period_suffix in period_variants
                    ),
                )
                + optional_zone
            )
            graph_para @= pynini.cdrewrite(
                pynini.cross(" las ", " la "), "para", "una", DAMO_SIGMA
            )  # Need agreement with one

            graph |= graph_menos | graph_para

        delete_tokens = self.delete_tokens(graph + delete_preserve_order)
        self.fst = delete_tokens.optimize()
|