import re
import sre_parse

from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree

class WithLexer:
    # Base for frontends that run a standard lexer pass before parsing.
    def __init__(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            return self.lexer_conf.postlex.process(stream)
        else:
            return stream

class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)
        self.parser_conf = parser_conf
        self.parser = lalr_parser.Parser(parser_conf)

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)

class LALR_ContextualLexer:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.parser = lalr_parser.Parser(parser_conf)

        # Map each parser state to the symbols it can accept, so the contextual
        # lexer only matches tokens that are valid in the current state.
        d = {idx: t.keys() for idx, t in self.parser.analysis.states_idx.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)

    def parse(self, text):
        tokens = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            tokens = self.lexer_conf.postlex.process(tokens)
        return self.parser.parse(tokens, self.lexer.set_parser_state)

def get_ambiguity_resolver(options):
    if not options or options.ambiguity == 'resolve':
        return resolve_ambig.standard_resolve_ambig
    elif options.ambiguity == 'resolve__antiscore_sum':
        return resolve_ambig.antiscore_sum_resolve_ambig
    elif options.ambiguity == 'explicit':
        return None
    raise ValueError(options)

def tokenize_text(text):
    # Scanless mode: wrap every character in a CHAR token, tracking line/column.
    new_text = []
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
    return new_text

class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a, o) for n, x, a, o in parser_conf.rules]

        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1, 1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield Terminal_Regexp(sym, regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        return self.parser.parse(new_text)

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a, o) for n, x, a, o in parser_conf.rules]

        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def _prepare_expansion(self, expansion):
        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)

class XEarley:
    # Earley with a dynamic lexer: terminals are matched as regexps directly
    # against the input text while parsing.
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a, o) for n, x, a, o in parser_conf.rules]

        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

        self.parser = xearley.Parser(rules,
                                     parser_conf.start,
                                     parser_conf.callback,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
                                     ignore=ignore,
                                     predict_all=options.earley__predict_all)

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                assert width
                yield Terminal_Regexp(sym, regexp)
            else:
                yield sym

    def parse(self, text):
        return self.parser.parse(text)

class CYK(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)

        # TokenDef from synthetic rule to terminal value
        self._token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(lhs, self._prepare_expansion(rhs), cb, opt) for lhs, rhs, cb, opt in parser_conf.rules]
        self._analysis = GrammarAnalyzer(rules, parser_conf.start)
        self._parser = cyk.Parser(self._analysis.rules, parser_conf.start)

        self._postprocess = {}
        for rule in self._analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))

    def _prepare_expansion(self, expansion):
        return [Terminal_Regexp(sym, self._token_by_name[sym].pattern.to_regexp())
                if is_terminal(sym) else sym
                for sym in expansion]

    def parse(self, text):
        tokenized = [token.value for token in self.lex(text)]
        parse = self._parser.parse(tokenized)
        parse = self._transform(parse)
        return parse

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        children = tree.children
        callback = self._postprocess[tree.rule.alias]
        assert callback, tree.rule.alias
        r = callback(children)
        return r

def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        elif lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'cyk':
        if lexer == 'standard':
            return CYK
        else:
            raise ValueError('The CYK parser requires the standard lexer.')
    else:
        raise ValueError('Unknown parser: %s' % parser)
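
# Usage sketch: get_frontend returns one of the frontend classes above, which is
# then instantiated with a lexer configuration and a parser configuration. Those
# objects are normally built by lark's grammar loader; the variable names
# lexer_conf, parser_conf and options below are illustrative assumptions, not
# part of this module.
#
#     frontend_cls = get_frontend('lalr', 'contextual')   # -> LALR_ContextualLexer
#     frontend = frontend_cls(lexer_conf, parser_conf, options)
#     tree = frontend.parse('some input text')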