This repo contains the code used to mirror other repos, as well as the code being mirrored.


from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
from .common import LexerConf, ParserConf

try:
    import regex
except ImportError:
    regex = None
import re

###{standalone
def _wrap_lexer(lexer_class):
    future_interface = getattr(lexer_class, '__future_interface__', False)
    if future_interface:
        return lexer_class
    else:
        class CustomLexerWrapper(Lexer):
            def __init__(self, lexer_conf):
                self.lexer = lexer_class(lexer_conf)
            def lex(self, lexer_state, parser_state):
                return self.lexer.lex(lexer_state.text)
        return CustomLexerWrapper
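
# A minimal sketch of the two custom-lexer interfaces that _wrap_lexer
# bridges (hypothetical classes, for illustration only):
#
#     class OldStyleLexer(Lexer):
#         # legacy interface: lex() receives the raw text
#         def __init__(self, lexer_conf): pass
#         def lex(self, text):
#             yield Token('WORD', text)
#
#     class NewStyleLexer(Lexer):
#         # opts into the new interface: lex() receives lexer/parser state
#         __future_interface__ = True
#         def __init__(self, lexer_conf): pass
#         def lex(self, lexer_state, parser_state):
#             yield Token('WORD', lexer_state.text)
#
# _wrap_lexer(OldStyleLexer) returns a wrapper exposing the new signature,
# while _wrap_lexer(NewStyleLexer) returns the class unchanged.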
class MakeParsingFrontend:
    def __init__(self, parser_type, lexer_type):
        self.parser_type = parser_type
        self.lexer_type = lexer_type

    def __call__(self, lexer_conf, parser_conf, options):
        assert isinstance(lexer_conf, LexerConf)
        assert isinstance(parser_conf, ParserConf)
        parser_conf.parser_type = self.parser_type
        lexer_conf.lexer_type = self.lexer_type
        return ParsingFrontend(lexer_conf, parser_conf, options)

    @classmethod
    def deserialize(cls, data, memo, callbacks, options):
        lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
        parser_conf.callbacks = callbacks

        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]

        lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
        lexer_conf.re_module = regex if options.regex else re
        lexer_conf.use_bytes = options.use_bytes
        lexer_conf.g_regex_flags = options.g_regex_flags
        lexer_conf.skip_validation = True
        lexer_conf.postlex = options.postlex

        return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
class ParsingFrontend(Serialize):
    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'

    def __init__(self, lexer_conf, parser_conf, options, parser=None):
        self.parser_conf = parser_conf
        self.lexer_conf = lexer_conf
        self.options = options

        # Set up the parser
        if parser:  # From cache
            self.parser = parser
        else:
            create_parser = {
                'lalr': create_lalr_parser,
                'earley': create_earley_parser,
                'cyk': CYK_FrontEnd,
            }[parser_conf.parser_type]
            self.parser = create_parser(lexer_conf, parser_conf, options)

        # Set up the lexer
        lexer_type = lexer_conf.lexer_type
        self.skip_lexer = False
        if lexer_type in ('dynamic', 'dynamic_complete'):
            self.skip_lexer = True
            return

        try:
            create_lexer = {
                'standard': create_traditional_lexer,
                'contextual': create_contextual_lexer,
            }[lexer_type]
        except KeyError:
            assert issubclass(lexer_type, Lexer), lexer_type
            self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
        else:
            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)

        if lexer_conf.postlex:
            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)

    def parse(self, text, start=None):
        if start is None:
            start = self.parser_conf.start
            if len(start) > 1:
                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
            start, = start

        if self.skip_lexer:
            return self.parser.parse(text, start)

        lexer_thread = LexerThread(self.lexer, text)
        return self.parser.parse(lexer_thread, start)
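
# A hedged usage sketch: when more than one start rule is configured, parse()
# requires an explicit start (hypothetical frontend and rule name, shown for
# illustration):
#
#     frontend = get_frontend('lalr', 'contextual')(lexer_conf, parser_conf, options)
#     tree = frontend.parse("some input", start='value')
#
# With a single start rule, frontend.parse("some input") selects it automatically.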
def get_frontend(parser, lexer):
    assert_config(parser, ('lalr', 'earley', 'cyk'))
    if not isinstance(lexer, type):  # not a custom lexer?
        expected = {
            'lalr': ('standard', 'contextual'),
            'earley': ('standard', 'dynamic', 'dynamic_complete'),
            'cyk': ('standard', ),
        }[parser]
        assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
    return MakeParsingFrontend(parser, lexer)
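
# A sketch of the parser/lexer pairings enforced above:
#
#     make = get_frontend('earley', 'dynamic')   # ok
#     make = get_frontend('lalr', 'contextual')  # ok
#     make = get_frontend('cyk', 'contextual')   # raises ConfigurationError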
def _get_lexer_callbacks(transformer, terminals):
    result = {}
    for terminal in terminals:
        callback = getattr(transformer, terminal.name, None)
        if callback is not None:
            result[terminal.name] = callback
    return result
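
# Sketch: a transformer method named after a terminal becomes a lexer callback
# for tokens of that type (hypothetical transformer and terminal name; assumes
# `terminals` is a list of TerminalDef objects):
#
#     class MyTransformer:
#         def NUMBER(self, token):
#             # called for every NUMBER token as it is lexed
#             return token.update(value=int(token))
#
#     callbacks = _get_lexer_callbacks(MyTransformer(), terminals)
#     # -> {'NUMBER': <bound method MyTransformer.NUMBER>}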
class PostLexConnector:
    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def make_lexer_state(self, text):
        return self.lexer.make_lexer_state(text)

    def lex(self, lexer_state, parser_state):
        i = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(i)
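
# A minimal sketch of a post-lexer this connector can wrap: it filters or
# rewrites the token stream after lexing (hypothetical class and terminal name):
#
#     class IgnoreNewlines:
#         always_accept = ()  # extra terminals the contextual lexer should accept
#         def process(self, stream):
#             return (tok for tok in stream if tok.type != '_NEWLINE')
#
# Passing postlex=IgnoreNewlines() to Lark routes all tokens through process().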
def create_traditional_lexer(lexer_conf, parser, postlex):
    return TraditionalLexer(lexer_conf)

def create_contextual_lexer(lexer_conf, parser, postlex):
    states = {idx: list(t.keys()) for idx, t in parser._parse_table.states.items()}
    always_accept = postlex.always_accept if postlex else ()
    return ContextualLexer(lexer_conf, states, always_accept=always_accept)

def create_lalr_parser(lexer_conf, parser_conf, options=None):
    debug = options.debug if options else False
    return LALR_Parser(parser_conf, debug=debug)

create_earley_parser = NotImplemented
CYK_FrontEnd = NotImplemented
###}
class EarleyRegexpMatcher:
    def __init__(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.terminals:
            if t.priority != 1:
                raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise GrammarError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise GrammarError("Dynamic Earley doesn't allow zero-width regexps", t)
            if lexer_conf.use_bytes:
                regexp = regexp.encode('utf-8')

            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

    def match(self, term, text, index=0):
        return self.regexps[term.name].match(text, index)
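
# For instance, a terminal defined as /a*/ can match the empty string, so
# get_regexp_width() reports a minimum width of 0 and __init__ above rejects
# it with a GrammarError; /a+/ (minimum width 1) is accepted.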
def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
    earley_matcher = EarleyRegexpMatcher(lexer_conf)
    return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)

def _match_earley_basic(term, token):
    return term.name == token.type

def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
    return earley.Parser(parser_conf, _match_earley_basic, **kw)

def create_earley_parser(lexer_conf, parser_conf, options):
    resolve_ambiguity = options.ambiguity == 'resolve'
    debug = options.debug if options else False
    tree_class = (options.tree_class or Tree) if options.ambiguity != 'forest' else None

    extra = {}
    if lexer_conf.lexer_type == 'dynamic':
        f = create_earley_parser__dynamic
    elif lexer_conf.lexer_type == 'dynamic_complete':
        extra['complete_lex'] = True
        f = create_earley_parser__dynamic
    else:
        f = create_earley_parser__basic

    return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
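
# A sketch of how the Earley variants above map onto Lark's public options
# (option values are from this module; the grammar is illustrative):
#
#     Lark(grammar, parser='earley', lexer='dynamic')           # scannerless, regexp matching
#     Lark(grammar, parser='earley', lexer='dynamic_complete')  # scannerless, complete_lex=True
#     Lark(grammar, parser='earley', lexer='standard')          # token-based matching
#     Lark(grammar, parser='earley', ambiguity='forest')        # skips tree building (tree_class is None above)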
class CYK_FrontEnd:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self._analysis = GrammarAnalyzer(parser_conf)
        self.parser = cyk.Parser(parser_conf.rules)
        self.callbacks = parser_conf.callbacks

    def parse(self, lexer_thread, start):
        tokens = list(lexer_thread.lex(None))
        tree = self.parser.parse(tokens, start)
        return self._transform(tree)

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]

        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        return self.callbacks[tree.rule](tree.children)
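
# A minimal end-to-end sketch of how this module is typically driven through
# Lark's public API (hypothetical grammar, for illustration):
#
#     from lark import Lark
#
#     parser = Lark(r'''
#         start: WORD ","? WORD
#         WORD: /\w+/
#         %ignore " "
#     ''', parser='lalr', lexer='contextual')
#
#     tree = parser.parse("hello, world")
#
# Internally, Lark calls get_frontend('lalr', 'contextual') and then invokes
# the returned MakeParsingFrontend with the lexer and parser configurations.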