This repo contains the code used to mirror other repos, as well as the code being mirrored.


import re
from functools import partial

from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree

###{standalone

class WithLexer(Serialize):
    """Base class for frontends that pair a lexer with a parser."""
    lexer = None
    parser = None
    lexer_conf = None

    __serialize_fields__ = 'parser', 'lexer'
    __serialize_namespace__ = Rule, ContextualLexer

    @classmethod
    def deserialize(cls, data, memo, callbacks):
        inst = super(WithLexer, cls).deserialize(data, memo)
        inst.postlex = None  # TODO
        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
        return inst

    def _serialize(self, data, memo):
        data['parser'] = data['parser'].serialize(memo)

    def init_traditional_lexer(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
        self.postlex = lexer_conf.postlex

    def init_contextual_lexer(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.postlex = lexer_conf.postlex

        # Restrict each lexer state to the terminals the parser can accept there.
        states = {idx: list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = self.postlex.always_accept if self.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, states,
                                     ignore=lexer_conf.ignore,
                                     always_accept=always_accept,
                                     user_callbacks=lexer_conf.callbacks)

    def lex(self, text):
        stream = self.lexer.lex(text)
        return self.postlex.process(stream) if self.postlex else stream

    def parse(self, text):
        token_stream = self.lex(text)
        sps = self.lexer.set_parser_state
        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

class LALR_TraditionalLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        self.init_traditional_lexer(lexer_conf)


class LALR_ContextualLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        self.init_contextual_lexer(lexer_conf)


class LALR_CustomLexer(WithLexer):
    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
        self.parser = LALR_Parser(parser_conf)
        self.lexer_conf = lexer_conf
        self.lexer = lexer_cls(lexer_conf)

def tokenize_text(text):
    # Yield each character as a CHAR token, tracking line/column positions.
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)

        resolve_ambiguity = options.ambiguity == 'resolve'
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)

    def match(self, term, token):
        return term.name == token.type

class XEarley:
    """Earley frontend with a dynamic lexer: terminals are matched
    directly against the input text during parsing."""

    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        self._prepare_match(lexer_conf)
        resolve_ambiguity = options.ambiguity == 'resolve'
        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     ignore=lexer_conf.ignore,
                                     resolve_ambiguity=resolve_ambiguity,
                                     **kw)

    def match(self, term, text, index=0):
        return self.regexps[term.name].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            if t.priority != 1:
                raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        return self.parser.parse(text)

class XEarley_CompleteLex(XEarley):
    def __init__(self, *args, **kw):
        XEarley.__init__(self, *args, complete_lex=True, **kw)

class CYK(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)
        self._analysis = GrammarAnalyzer(parser_conf)
        self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
        self.callbacks = parser_conf.callbacks

    def parse(self, text):
        tokens = list(self.lex(text))
        parse = self._parser.parse(tokens)
        parse = self._transform(parse)
        return parse

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        return self.callbacks[tree.rule](tree.children)

def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR_TraditionalLexer
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        elif issubclass(lexer, Lexer):
            return partial(LALR_CustomLexer, lexer)
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'dynamic_complete':
            return XEarley_CompleteLex
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'cyk':
        if lexer == 'standard':
            return CYK
        else:
            raise ValueError('The CYK parser requires the standard lexer')
    else:
        raise ValueError('Unknown parser: %s' % parser)

###}
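
For context, this module appears to be lark's parser frontends: get_frontend() resolves a (parser, lexer) pair into one of the frontend classes above, which is the same choice the public Lark constructor makes from its parser= and lexer= arguments. Below is a minimal usage sketch, assuming the lark package is installed; the grammar is illustrative and not part of this repo.

# Minimal usage sketch (assumes the `lark` package; grammar is illustrative).
from lark import Lark

grammar = r"""
    start: WORD ("," WORD)*
    WORD: /[a-z]+/
    %ignore " "
"""

# parser='lalr', lexer='contextual' resolves to LALR_ContextualLexer
# internally, via get_frontend('lalr', 'contextual').
parser = Lark(grammar, parser='lalr', lexer='contextual')
print(parser.parse("hello, world").pretty())

The contextual lexer is usually the better default with LALR: as init_contextual_lexer shows, it limits each lexer state to the terminals the parse table can accept there, which resolves many tokenization ambiguities that a standard lexer cannot.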