from argparse import ArgumentParser
from typing import List

import regex as re

from fun_text_processing.text_normalization.data_loader_utils import (
    EOS_TYPE,
    Instance,
    load_files,
    training_data_to_sentences,
)
"""
This file is for evaluation purposes.

filter_loaded_data() cleans data (a list of instances) for text normalization.
Filters and cleaners can be specified individually for each semiotic class.
For example, normalized text should only include characters and whitespace but no punctuation.
Cardinal unnormalized instances should contain at least one integer; all other characters are removed.
"""
  16. class Filter:
  17. """
  18. Filter class
  19. Args:
  20. class_type: semiotic class used in dataset
  21. process_func: function to transform text
  22. filter_func: function to filter text
  23. """
  24. def __init__(self, class_type: str, process_func: object, filter_func: object):
  25. self.class_type = class_type
  26. self.process_func = process_func
  27. self.filter_func = filter_func
  28. def filter(self, instance: Instance) -> bool:
  29. """
  30. filter function
  31. Args:
  32. filters given instance with filter function
  33. Returns: True if given instance fulfills criteria or does not belong to class type
  34. """
  35. if instance.token_type != self.class_type:
  36. return True
  37. return self.filter_func(instance)
  38. def process(self, instance: Instance) -> Instance:
  39. """
  40. process function
  41. Args:
  42. processes given instance with process function
  43. Returns: processed instance if instance belongs to expected class type or original instance
  44. """
  45. if instance.token_type != self.class_type:
  46. return instance
  47. return self.process_func(instance)
  48. def filter_cardinal_1(instance: Instance) -> bool:
  49. ok = re.search(r"[0-9]", instance.un_normalized)
  50. return ok
  51. def process_cardinal_1(instance: Instance) -> Instance:
  52. un_normalized = instance.un_normalized
  53. normalized = instance.normalized
  54. un_normalized = re.sub(r"[^0-9]", "", un_normalized)
  55. normalized = re.sub(r"[^a-z ]", "", normalized)
  56. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  57. def filter_ordinal_1(instance: Instance) -> bool:
  58. ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized)
  59. return ok
  60. def process_ordinal_1(instance: Instance) -> Instance:
  61. un_normalized = instance.un_normalized
  62. normalized = instance.normalized
  63. un_normalized = re.sub(r"[,\s]", "", un_normalized)
  64. normalized = re.sub(r"[^a-z ]", "", normalized)
  65. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  66. def filter_decimal_1(instance: Instance) -> bool:
  67. ok = re.search(r"[0-9]", instance.un_normalized)
  68. return ok
  69. def process_decimal_1(instance: Instance) -> Instance:
  70. un_normalized = instance.un_normalized
  71. un_normalized = re.sub(r",", "", un_normalized)
  72. normalized = instance.normalized
  73. normalized = re.sub(r"[^a-z ]", "", normalized)
  74. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  75. def filter_measure_1(instance: Instance) -> bool:
  76. ok = True
  77. return ok
  78. def process_measure_1(instance: Instance) -> Instance:
  79. un_normalized = instance.un_normalized
  80. normalized = instance.normalized
  81. un_normalized = re.sub(r",", "", un_normalized)
  82. un_normalized = re.sub(r"m2", "m²", un_normalized)
  83. un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
  84. normalized = re.sub(r"[^a-z\s]", "", normalized)
  85. normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
  86. normalized = re.sub(r"[^a-z ]", "", normalized)
  87. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  88. def filter_money_1(instance: Instance) -> bool:
  89. ok = re.search(r"[0-9]", instance.un_normalized)
  90. return ok
  91. def process_money_1(instance: Instance) -> Instance:
  92. un_normalized = instance.un_normalized
  93. normalized = instance.normalized
  94. un_normalized = re.sub(r",", "", un_normalized)
  95. un_normalized = re.sub(r"a\$", r"$", un_normalized)
  96. un_normalized = re.sub(r"us\$", r"$", un_normalized)
  97. un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
  98. un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
  99. normalized = re.sub(r"[^a-z ]", "", normalized)
  100. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  101. def filter_time_1(instance: Instance) -> bool:
  102. ok = re.search(r"[0-9]", instance.un_normalized)
  103. return ok
  104. def process_time_1(instance: Instance) -> Instance:
  105. un_normalized = instance.un_normalized
  106. un_normalized = re.sub(r": ", ":", un_normalized)
  107. un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
  108. un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
  109. normalized = instance.normalized
  110. normalized = re.sub(r"[^a-z ]", "", normalized)
  111. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  112. def filter_plain_1(instance: Instance) -> bool:
  113. ok = True
  114. return ok
  115. def process_plain_1(instance: Instance) -> Instance:
  116. un_normalized = instance.un_normalized
  117. normalized = instance.normalized
  118. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  119. def filter_punct_1(instance: Instance) -> bool:
  120. ok = True
  121. return ok
  122. def process_punct_1(instance: Instance) -> Instance:
  123. un_normalized = instance.un_normalized
  124. normalized = instance.normalized
  125. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  126. def filter_date_1(instance: Instance) -> bool:
  127. ok = True
  128. return ok
  129. def process_date_1(instance: Instance) -> Instance:
  130. un_normalized = instance.un_normalized
  131. un_normalized = re.sub(r",", "", un_normalized)
  132. normalized = instance.normalized
  133. normalized = re.sub(r"[^a-z ]", "", normalized)
  134. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  135. def filter_letters_1(instance: Instance) -> bool:
  136. ok = True
  137. return ok
  138. def process_letters_1(instance: Instance) -> Instance:
  139. un_normalized = instance.un_normalized
  140. normalized = instance.normalized
  141. normalized = re.sub(r"[^a-z ]", "", normalized)
  142. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  143. def filter_verbatim_1(instance: Instance) -> bool:
  144. ok = True
  145. return ok
  146. def process_verbatim_1(instance: Instance) -> Instance:
  147. un_normalized = instance.un_normalized
  148. normalized = instance.normalized
  149. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  150. def filter_digit_1(instance: Instance) -> bool:
  151. ok = re.search(r"[0-9]", instance.un_normalized)
  152. return ok
  153. def process_digit_1(instance: Instance) -> Instance:
  154. un_normalized = instance.un_normalized
  155. normalized = instance.normalized
  156. normalized = re.sub(r"[^a-z ]", "", normalized)
  157. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  158. def filter_telephone_1(instance: Instance) -> bool:
  159. ok = re.search(r"[0-9]", instance.un_normalized)
  160. return ok
  161. def process_telephone_1(instance: Instance) -> Instance:
  162. un_normalized = instance.un_normalized
  163. normalized = instance.normalized
  164. normalized = re.sub(r"[^a-z ]", "", normalized)
  165. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  166. def filter_electronic_1(instance: Instance) -> bool:
  167. ok = re.search(r"[0-9]", instance.un_normalized)
  168. return ok
  169. def process_electronic_1(instance: Instance) -> Instance:
  170. un_normalized = instance.un_normalized
  171. normalized = instance.normalized
  172. normalized = re.sub(r"[^a-z ]", "", normalized)
  173. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  174. def filter_fraction_1(instance: Instance) -> bool:
  175. ok = re.search(r"[0-9]", instance.un_normalized)
  176. return ok
  177. def process_fraction_1(instance: Instance) -> Instance:
  178. un_normalized = instance.un_normalized
  179. normalized = instance.normalized
  180. normalized = re.sub(r"[^a-z ]", "", normalized)
  181. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  182. def filter_address_1(instance: Instance) -> bool:
  183. ok = True
  184. return ok
  185. def process_address_1(instance: Instance) -> Instance:
  186. un_normalized = instance.un_normalized
  187. normalized = instance.normalized
  188. normalized = re.sub(r"[^a-z ]", "", normalized)
  189. return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
  190. filters = []
  191. filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1))
  192. filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1))
  193. filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1))
  194. filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1))
  195. filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1))
  196. filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1))
  197. filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1))
  198. filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1))
  199. filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1))
  200. filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1))
  201. filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1))
  202. filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1))
  203. filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1))
  204. filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1))
  205. filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1))
  206. filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1))
  207. filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True))
  208. def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
  209. """
  210. Filters list of instances
  211. Args:
  212. data: list of instances
  213. Returns: filtered and transformed list of instances
  214. """
  215. updates_instances = []
  216. for instance in data:
  217. updated_instance = False
  218. for fil in filters:
  219. if tl.class_type == instance.token_type and tl.filter(instance):
  220. instance = fil.process(instance)
  221. updated_instance = True
  222. if updated_instance:
  223. if verbose:
  224. print(instance)
  225. updates_instances.append(instance)
  226. return updates_instances
  227. def parse_args():
  228. parser = ArgumentParser()
  229. parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100')
  230. parser.add_argument("--verbose", help="print filtered instances", action='store_true')
  231. return parser.parse_args()
  232. if __name__ == "__main__":
  233. args = parse_args()
  234. file_path = args.input
  235. print("Loading training data: " + file_path)
  236. instance_list = load_files([file_path]) # List of instances
  237. filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
  238. training_data_to_sentences(filtered_instance_list)