"""Parser frontends: glue classes that pair a lexer (traditional, contextual,
dynamic, or custom) with a parsing algorithm (LALR, Earley, CYK)."""

import re
from functools import partial

from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree
from .common import LexerConf

###{standalone
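# The ###{standalone ... ###} markers delimit the code that lark's
# standalone-parser generator copies into generated parser modules.
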
def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR_TraditionalLexer
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        # Guard with isinstance so unknown lexer strings fall through to the
        # ValueError below instead of raising a TypeError from issubclass.
        elif isinstance(lexer, type) and issubclass(lexer, Lexer):
            return partial(LALR_CustomLexer, lexer)
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'dynamic_complete':
            return XEarley_CompleteLex
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'cyk':
        if lexer == 'standard':
            return CYK
        else:
            raise ValueError('The CYK parser requires using the standard lexer.')
    else:
        raise ValueError('Unknown parser: %s' % parser)
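
# Usage sketch: get_frontend returns a frontend *class* for the requested
# (parser, lexer) combination; the caller instantiates it with the lexer
# and parser configurations. For example:
#     get_frontend('earley', 'dynamic')     # -> XEarley
#     get_frontend('lalr', 'contextual')    # -> LALR_ContextualLexer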

class WithLexer(Serialize):
    lexer = None
    parser = None
    lexer_conf = None

    __serialize_fields__ = 'parser', 'lexer_conf'
    __serialize_namespace__ = LexerConf,

    def __init__(self, lexer_conf, parser_conf, options=None):
        self.lexer_conf = lexer_conf
        self.postlex = lexer_conf.postlex

    @classmethod
    def deserialize(cls, data, memo, callbacks, postlex):
        inst = super(WithLexer, cls).deserialize(data, memo)
        inst.postlex = postlex
        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
        inst.init_lexer()
        return inst

    def _serialize(self, data, memo):
        data['parser'] = data['parser'].serialize(memo)

    def lex(self, text):
        stream = self.lexer.lex(text)
        return self.postlex.process(stream) if self.postlex else stream

    def parse(self, text):
        token_stream = self.lex(text)
        sps = self.lexer.set_parser_state
        # Pass the state-setter to the parser only if the lexer provides one
        # (the contextual lexer does; others leave it as NotImplemented).
        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

    def init_traditional_lexer(self):
        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
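
# Rough (de)serialization round-trip, assuming the Serialize base class
# (imported from .utils) provides a serialize(memo) counterpart to the
# deserialize() classmethod above:
#     data = frontend.serialize(memo)
#     restored = LALR_TraditionalLexer.deserialize(data, memo, callbacks, postlex)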

class LALR_WithLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)

        self.init_lexer()

    def init_lexer(self):
        raise NotImplementedError()

class LALR_TraditionalLexer(LALR_WithLexer):
    def init_lexer(self):
        self.init_traditional_lexer()

class LALR_ContextualLexer(LALR_WithLexer):
    def init_lexer(self):
        # For each LALR state, the acceptable terminals are the keys of its
        # transition table; the contextual lexer uses this to narrow matching.
        states = {idx: list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = self.postlex.always_accept if self.postlex else ()
        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                     ignore=self.lexer_conf.ignore,
                                     always_accept=always_accept,
                                     user_callbacks=self.lexer_conf.callbacks)
###}
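
# Note: LALR_CustomLexer is defined outside the ###{standalone block above,
# so custom lexers are not included in generated standalone parsers.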

class LALR_CustomLexer(LALR_WithLexer):
    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
        # Use the lexer_conf argument directly; self.lexer_conf is not set
        # until WithLexer.__init__ runs below.
        self.lexer = lexer_cls(lexer_conf)
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)


def tokenize_text(text):
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
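
# Example: list(tokenize_text('a\nb')) yields CHAR tokens for 'a' (line 1,
# column 0), '\n' (line 2, column 0) and 'b' (line 2, column 1).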

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
        self.init_traditional_lexer()

        resolve_ambiguity = options.ambiguity == 'resolve'
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)

    def match(self, term, token):
        return term.name == token.type
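
# With a standard lexer, Earley matches a terminal by comparing its name to
# the token's type. XEarley below instead matches terminal regexps directly
# against the raw input text ('dynamic' lexing), so it needs no lexer at all.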

class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        self._prepare_match(lexer_conf)
        resolve_ambiguity = options.ambiguity == 'resolve'
        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     ignore=lexer_conf.ignore,
                                     resolve_ambiguity=resolve_ambiguity,
                                     **kw)

    def match(self, term, text, index=0):
        return self.regexps[term.name].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            if t.priority != 1:
                raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        return self.parser.parse(text)
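
# 'dynamic_complete' differs from 'dynamic' only by passing complete_lex=True,
# which (as lark documents it) makes the dynamic lexer try every complete
# terminal match rather than committing to the longest one.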

class XEarley_CompleteLex(XEarley):
    def __init__(self, *args, **kw):
        XEarley.__init__(self, *args, complete_lex=True, **kw)

class CYK(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
        self.init_traditional_lexer()

        self._analysis = GrammarAnalyzer(parser_conf)
        self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)

        self.callbacks = parser_conf.callbacks

    def parse(self, text):
        tokens = list(self.lex(text))
        parse = self._parser.parse(tokens)
        parse = self._transform(parse)
        return parse

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]

        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        return self.callbacks[tree.rule](tree.children)
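
# Usage sketch (assumption: lexer_conf and parser_conf are built elsewhere,
# the way lark.Lark builds them internally from a grammar):
#     frontend_cls = get_frontend('lalr', 'standard')
#     frontend = frontend_cls(lexer_conf, parser_conf, options)
#     tree = frontend.parse('some input text')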