# token_parser.py
# Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
from collections import OrderedDict
from typing import Dict, List, Optional, Union

  17. PRESERVE_ORDER_KEY = "preserve_order"
  18. EOS = "<EOS>"
  19. class TokenParser:
  20. """
  21. Parses tokenized/classified text, e.g. 'tokens { money { integer: "20" currency: "$" } } tokens { name: "left"}'
  22. Args
  23. text: tokenized text
  24. """
  25. def __call__(self, text):
  26. """
  27. Setup function
  28. Args:
  29. text: text to be parsed
  30. """
  31. self.text = text
  32. self.len_text = len(text)
  33. self.char = text[0] # cannot handle empty string
  34. self.index = 0
  35. def parse(self) -> List[dict]:
  36. """
  37. Main function. Implements grammar:
  38. A -> space F space F space F ... space
  39. Returns list of dictionaries
  40. """
  41. l = list()
  42. while self.parse_ws():
  43. token = self.parse_token()
  44. if not token:
  45. break
  46. l.append(token)
  47. return l
  48. def parse_token(self) -> Dict[str, Union[str, dict]]:
  49. """
  50. Implements grammar:
  51. F-> no_space KG no_space
  52. Returns: K, G as dictionary values
  53. """
  54. d = OrderedDict()
  55. key = self.parse_string_key()
  56. if key is None:
  57. return None
  58. self.parse_ws()
  59. if key == PRESERVE_ORDER_KEY:
  60. self.parse_char(":")
  61. self.parse_ws()
  62. value = self.parse_chars("true")
  63. else:
  64. value = self.parse_token_value()
  65. d[key] = value
  66. return d
  67. def parse_token_value(self) -> Union[str, dict]:
  68. """
  69. Implements grammar:
  70. G-> no_space :"VALUE" no_space | no_space {A} no_space
  71. Returns: string or dictionary
  72. """
  73. if self.char == ":":
  74. self.parse_char(":")
  75. self.parse_ws()
  76. self.parse_char("\"")
  77. value_string = self.parse_string_value()
  78. self.parse_char("\"")
  79. return value_string
  80. elif self.char == "{":
  81. d = OrderedDict()
  82. self.parse_char("{")
  83. list_token_dicts = self.parse()
  84. # flatten tokens
  85. for tok_dict in list_token_dicts:
  86. for k, v in tok_dict.items():
  87. d[k] = v
  88. self.parse_char("}")
  89. return d
  90. else:
  91. raise ValueError()
  92. def parse_char(self, exp) -> bool:
  93. """
  94. Parses character
  95. Args:
  96. exp: character to read in
  97. Returns true if successful
  98. """
  99. assert self.char == exp
  100. self.read()
  101. return True
  102. def parse_chars(self, exp) -> bool:
  103. """
  104. Parses characters
  105. Args:
  106. exp: characters to read in
  107. Returns true if successful
  108. """
  109. ok = False
  110. for x in exp:
  111. ok |= self.parse_char(x)
  112. return ok
  113. def parse_string_key(self) -> str:
  114. """
  115. Parses string key, can only contain ascii and '_' characters
  116. Returns parsed string key
  117. """
  118. assert self.char not in string.whitespace and self.char != EOS
  119. incl_criterium = string.ascii_letters + "_"
  120. l = []
  121. while self.char in incl_criterium:
  122. l.append(self.char)
  123. if not self.read():
  124. raise ValueError()
  125. if not l:
  126. return None
  127. return "".join(l)
  128. def parse_string_value(self) -> str:
  129. """
  130. Parses string value, ends with quote followed by space
  131. Returns parsed string value
  132. """
  133. # assert self.char not in string.whitespace and self.char != EOS
  134. assert self.char != EOS
  135. l = []
  136. while self.char != "\"" or self.text[self.index + 1] != " ":
  137. l.append(self.char)
  138. if not self.read():
  139. raise ValueError()
  140. if not l:
  141. return None
  142. return "".join(l)
  143. def parse_ws(self):
  144. """
  145. Deletes whitespaces.
  146. Returns true if not EOS after parsing
  147. """
  148. not_eos = self.char != EOS
  149. while not_eos and self.char == " ":
  150. not_eos = self.read()
  151. return not_eos
  152. def read(self):
  153. """
  154. Reads in next char.
  155. Returns true if not EOS
  156. """
  157. if self.index < self.len_text - 1: # should be unique
  158. self.index += 1
  159. self.char = self.text[self.index]
  160. return True
  161. self.char = EOS
  162. return False