# token_parser.py
  1. import string
  2. from collections import OrderedDict
  3. from typing import Dict, List, Union
  4. PRESERVE_ORDER_KEY = "preserve_order"
  5. EOS = "<EOS>"
  6. class TokenParser:
  7. """
  8. Parses tokenized/classified text, e.g. 'tokens { money { integer: "20" currency: "$" } } tokens { name: "left"}'
  9. Args
  10. text: tokenized text
  11. """
  12. def __call__(self, text):
  13. """
  14. Setup function
  15. Args:
  16. text: text to be parsed
  17. """
  18. self.text = text
  19. self.len_text = len(text)
  20. self.char = text[0] # cannot handle empty string
  21. self.index = 0
  22. def parse(self) -> List[dict]:
  23. """
  24. Main function. Implements grammar:
  25. A -> space F space F space F ... space
  26. Returns list of dictionaries
  27. """
  28. l = list()
  29. while self.parse_ws():
  30. token = self.parse_token()
  31. if not token:
  32. break
  33. l.append(token)
  34. return l
  35. def parse_token(self) -> Dict[str, Union[str, dict]]:
  36. """
  37. Implements grammar:
  38. F-> no_space KG no_space
  39. Returns: K, G as dictionary values
  40. """
  41. d = OrderedDict()
  42. key = self.parse_string_key()
  43. if key is None:
  44. return None
  45. self.parse_ws()
  46. if key == PRESERVE_ORDER_KEY:
  47. self.parse_char(":")
  48. self.parse_ws()
  49. value = self.parse_chars("true")
  50. else:
  51. value = self.parse_token_value()
  52. d[key] = value
  53. return d
  54. def parse_token_value(self) -> Union[str, dict]:
  55. """
  56. Implements grammar:
  57. G-> no_space :"VALUE" no_space | no_space {A} no_space
  58. Returns: string or dictionary
  59. """
  60. if self.char == ":":
  61. self.parse_char(":")
  62. self.parse_ws()
  63. self.parse_char("\"")
  64. value_string = self.parse_string_value()
  65. self.parse_char("\"")
  66. return value_string
  67. elif self.char == "{":
  68. d = OrderedDict()
  69. self.parse_char("{")
  70. list_token_dicts = self.parse()
  71. # flatten tokens
  72. for tok_dict in list_token_dicts:
  73. for k, v in tok_dict.items():
  74. d[k] = v
  75. self.parse_char("}")
  76. return d
  77. else:
  78. raise ValueError()
  79. def parse_char(self, exp) -> bool:
  80. """
  81. Parses character
  82. Args:
  83. exp: character to read in
  84. Returns true if successful
  85. """
  86. assert self.char == exp
  87. self.read()
  88. return True
  89. def parse_chars(self, exp) -> bool:
  90. """
  91. Parses characters
  92. Args:
  93. exp: characters to read in
  94. Returns true if successful
  95. """
  96. ok = False
  97. for x in exp:
  98. ok |= self.parse_char(x)
  99. return ok
  100. def parse_string_key(self) -> str:
  101. """
  102. Parses string key, can only contain ascii and '_' characters
  103. Returns parsed string key
  104. """
  105. assert self.char not in string.whitespace and self.char != EOS
  106. incl_criterium = string.ascii_letters + "_"
  107. l = []
  108. while self.char in incl_criterium:
  109. l.append(self.char)
  110. if not self.read():
  111. raise ValueError()
  112. if not l:
  113. return None
  114. return "".join(l)
  115. def parse_string_value(self) -> str:
  116. """
  117. Parses string value, ends with quote followed by space
  118. Returns parsed string value
  119. """
  120. # assert self.char not in string.whitespace and self.char != EOS
  121. assert self.char != EOS
  122. l = []
  123. while self.char != "\"" or self.text[self.index + 1] != " ":
  124. l.append(self.char)
  125. if not self.read():
  126. raise ValueError()
  127. if not l:
  128. return None
  129. return "".join(l)
  130. def parse_ws(self):
  131. """
  132. Deletes whitespaces.
  133. Returns true if not EOS after parsing
  134. """
  135. not_eos = self.char != EOS
  136. while not_eos and self.char == " ":
  137. not_eos = self.read()
  138. return not_eos
  139. def read(self):
  140. """
  141. Reads in next char.
  142. Returns true if not EOS
  143. """
  144. if self.index < self.len_text - 1: # should be unique
  145. self.index += 1
  146. self.char = self.text[self.index]
  147. return True
  148. self.char = EOS
  149. return False