This repo contains code for mirroring other repos, as well as the code being mirrored.

import re

from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree

class WithLexer:
    # Mixin shared by frontends that run a separate lexing stage.
    def init_traditional_lexer(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)

    def init_contextual_lexer(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        # For each parser state, the token types it can accept.
        states = {idx: list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore,
                                     always_accept=always_accept, user_callbacks=lexer_conf.callbacks)

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            return self.lexer_conf.postlex.process(stream)
        else:
            return stream

class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_traditional_lexer(lexer_conf)

    def parse(self, text):
        token_stream = self.lex(text)
        return self.parser.parse(token_stream)

class LALR_ContextualLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_contextual_lexer(lexer_conf, parser_conf)

    def parse(self, text):
        token_stream = self.lex(text)
        # The lexer tracks the parser's state, so it only matches tokens
        # that are acceptable at the current point in the parse.
        return self.parser.parse(token_stream, self.lexer.set_parser_state)

def get_ambiguity_resolver(options):
    if not options or options.ambiguity == 'resolve':
        return resolve_ambig.standard_resolve_ambig
    elif options.ambiguity == 'resolve__antiscore_sum':
        return resolve_ambig.antiscore_sum_resolve_ambig
    elif options.ambiguity == 'explicit':
        return None
    raise ValueError(options)

def tokenize_text(text):
    # Scanless mode: emit every character as a width-1 CHAR token,
    # tracking line and column for error reporting.
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self._prepare_match(lexer_conf)

        self.parser = earley.Parser(parser_conf, self.match,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def match(self, term, text, index=0):
        return self.regexps[term].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            regexp = t.pattern.to_regexp()
            width = get_regexp_width(regexp)
            if width != (1, 1):
                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        token_stream = tokenize_text(text)
        return self.parser.parse(token_stream)

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)
        self.parser = earley.Parser(parser_conf, self.match,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def match(self, term, token):
        return term == token.type

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)

class XEarley:
    # Earley with a dynamic lexer: terminals are matched by regexp directly
    # against the input text during parsing, instead of in a separate pass.
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        self._prepare_match(lexer_conf)

        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
                                     ignore=lexer_conf.ignore,
                                     predict_all=options.earley__predict_all)

    def match(self, term, text, index=0):
        return self.regexps[term].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        return self.parser.parse(text)

class CYK(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)
        self._analysis = GrammarAnalyzer(parser_conf)
        self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)

        self._postprocess = {}
        for rule in parser_conf.rules:
            a = rule.alias
            self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))

    def parse(self, text):
        tokens = list(self.lex(text))
        parse = self._parser.parse(tokens)
        parse = self._transform(parse)
        return parse

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        children = tree.children
        callback = self._postprocess[tree.rule.alias]
        assert callback, tree.rule.alias
        r = callback(children)
        return r

def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        elif lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'cyk':
        if lexer == 'standard':
            return CYK
        else:
            raise ValueError('The CYK parser requires using the standard lexer')
    else:
        raise ValueError('Unknown parser: %s' % parser)
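
For orientation, here is a minimal usage sketch. The relative imports above suggest this file is the parser_frontends module of the lark library; the import path lark.parser_frontends below is an assumption, not something stated in the file. get_frontend can be exercised on its own, since it only maps (parser, lexer) option strings to a frontend class:

# Hypothetical usage sketch -- assumes this module is importable as
# lark.parser_frontends (an assumption; the path is not stated here).
from lark.parser_frontends import get_frontend, LALR_ContextualLexer, XEarley

# get_frontend returns a class, not an instance.
assert get_frontend('lalr', 'contextual') is LALR_ContextualLexer
assert get_frontend('earley', 'dynamic') is XEarley

# Unsupported combinations raise ValueError with a descriptive message.
try:
    get_frontend('lalr', None)
except ValueError as err:
    print(err)  # The LALR parser requires use of a lexer

In normal use, the returned class is instantiated with a lexer configuration and a parser configuration, as the constructors above show, and its parse(text) method produces the parse tree; that wiring lives elsewhere in the package.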