from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
from .common import LexerConf, ParserConf

try:
    import regex  # type: ignore
except ImportError:
    regex = None
import re

###{standalone

def _wrap_lexer(lexer_class):
    # Custom lexers that opt in to the newer interface are used as-is; older
    # ones, whose lex() takes the raw text, are adapted to the newer
    # lex(lexer_state, parser_state) signature.
    future_interface = getattr(lexer_class, '__future_interface__', False)
    if future_interface:
        return lexer_class
    else:
        class CustomLexerWrapper(Lexer):
            def __init__(self, lexer_conf):
                self.lexer = lexer_class(lexer_conf)
            def lex(self, lexer_state, parser_state):
                return self.lexer.lex(lexer_state.text)
        return CustomLexerWrapper

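# Illustrative sketch (not part of the module): a custom lexer written against
# the old interface, where lex() receives the raw text. _wrap_lexer adapts such
# a class to the newer signature. MyLexer and its tokenization are hypothetical.
#
#     class MyLexer:
#         def __init__(self, lexer_conf):
#             self.conf = lexer_conf
#         def lex(self, text):
#             for word in text.split():
#                 yield Token('WORD', word)
#
#     WrappedLexer = _wrap_lexer(MyLexer)   # -> a CustomLexerWrapper subclass
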
class MakeParsingFrontend:
    def __init__(self, parser_type, lexer_type):
        self.parser_type = parser_type
        self.lexer_type = lexer_type

    def deserialize(self, data, memo, lexer_conf, callbacks, options):
        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
        parser_conf.callbacks = callbacks
        return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)

    # ... Continued later in the module

class ParsingFrontend(Serialize):
    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'

    def __init__(self, lexer_conf, parser_conf, options, parser=None):
        self.parser_conf = parser_conf
        self.lexer_conf = lexer_conf
        self.options = options

        # Set up the parser
        if parser:  # From cache
            self.parser = parser
        else:
            create_parser = {
                'lalr': create_lalr_parser,
                'earley': create_earley_parser,
                'cyk': CYK_FrontEnd,
            }[parser_conf.parser_type]
            self.parser = create_parser(lexer_conf, parser_conf, options)

        # Set up the lexer. The dynamic Earley lexers are driven by the parser
        # itself, so no separate lexer is built for them.
        lexer_type = lexer_conf.lexer_type
        self.skip_lexer = False
        if lexer_type in ('dynamic', 'dynamic_complete'):
            assert lexer_conf.postlex is None
            self.skip_lexer = True
            return

        try:
            create_lexer = {
                'standard': create_traditional_lexer,
                'contextual': create_contextual_lexer,
            }[lexer_type]
        except KeyError:
            # Not a known lexer name, so it must be a custom lexer class
            assert issubclass(lexer_type, Lexer), lexer_type
            self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
        else:
            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)

        if lexer_conf.postlex:
            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)

    def _verify_start(self, start=None):
        if start is None:
            start_decls = self.parser_conf.start
            if len(start_decls) > 1:
                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
            start, = start_decls
        elif start not in self.parser_conf.start:
            raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
        return start

    def parse(self, text, start=None, on_error=None):
        chosen_start = self._verify_start(start)
        stream = text if self.skip_lexer else LexerThread(self.lexer, text)
        kw = {} if on_error is None else {'on_error': on_error}
        return self.parser.parse(stream, chosen_start, **kw)

    def parse_interactive(self, text=None, start=None):
        chosen_start = self._verify_start(start)
        if self.parser_conf.parser_type != 'lalr':
            raise ConfigurationError("parse_interactive() currently only works with parser='lalr'")
        stream = text if self.skip_lexer else LexerThread(self.lexer, text)
        return self.parser.parse_interactive(stream, chosen_start)

def get_frontend(parser, lexer):
    assert_config(parser, ('lalr', 'earley', 'cyk'))
    if not isinstance(lexer, type):  # not a custom lexer?
        expected = {
            'lalr': ('standard', 'contextual'),
            'earley': ('standard', 'dynamic', 'dynamic_complete'),
            'cyk': ('standard',),
        }[parser]
        assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
    return MakeParsingFrontend(parser, lexer)

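# Hedged usage sketch (not part of the module): how a caller might obtain and
# apply a frontend factory. Construction of lexer_conf/parser_conf/options is
# elided, since their signatures vary between Lark versions.
#
#     make_frontend = get_frontend('lalr', 'contextual')
#     # frontend = make_frontend(lexer_conf, parser_conf, options)
#     # tree = frontend.parse("some input")
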
def _get_lexer_callbacks(transformer, terminals):
    result = {}
    for terminal in terminals:
        callback = getattr(transformer, terminal.name, None)
        if callback is not None:
            result[terminal.name] = callback
    return result

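# Illustrative sketch (not part of the module): _get_lexer_callbacks collects
# transformer methods named after terminals. A hypothetical transformer with an
# INT method would produce {'INT': transformer.INT}:
#
#     class MyTransformer:
#         def INT(self, token):  # called for every INT token the lexer emits
#             return token.update(value=int(token))
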
class PostLexConnector:
    # Chains a lexer with a post-lexer: tokens pass through postlexer.process()
    # on their way to the parser.
    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def make_lexer_state(self, text):
        return self.lexer.make_lexer_state(text)

    def lex(self, lexer_state, parser_state):
        i = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(i)

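# Illustrative sketch (not part of the module): a minimal post-lexer that
# PostLexConnector could wrap. It drops a hypothetical COMMENT terminal; the
# interface (a process() method plus always_accept) is what this module expects.
#
#     class DropComments:
#         always_accept = ()
#         def process(self, stream):
#             return (tok for tok in stream if tok.type != 'COMMENT')
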
def create_traditional_lexer(lexer_conf, parser, postlex):
    return TraditionalLexer(lexer_conf)

def create_contextual_lexer(lexer_conf, parser, postlex):
    states = {idx: list(t.keys()) for idx, t in parser._parse_table.states.items()}
    always_accept = postlex.always_accept if postlex else ()
    return ContextualLexer(lexer_conf, states, always_accept=always_accept)

def create_lalr_parser(lexer_conf, parser_conf, options=None):
    debug = options.debug if options else False
    return LALR_Parser(parser_conf, debug=debug)

create_earley_parser = NotImplemented
CYK_FrontEnd = NotImplemented
###}

class EarleyRegexpMatcher:
    def __init__(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.terminals:
            if t.priority != 1:
                raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise GrammarError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise GrammarError("Dynamic Earley doesn't allow zero-width regexps", t)
            if lexer_conf.use_bytes:
                regexp = regexp.encode('utf-8')
            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

    def match(self, term, text, index=0):
        return self.regexps[term.name].match(text, index)

def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
    earley_matcher = EarleyRegexpMatcher(lexer_conf)
    return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)

def _match_earley_basic(term, token):
    return term.name == token.type

def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
    return earley.Parser(parser_conf, _match_earley_basic, **kw)

def create_earley_parser(lexer_conf, parser_conf, options):
    resolve_ambiguity = options.ambiguity == 'resolve'
    debug = options.debug if options else False
    tree_class = (options.tree_class or Tree) if options.ambiguity != 'forest' else None

    extra = {}
    if lexer_conf.lexer_type == 'dynamic':
        f = create_earley_parser__dynamic
    elif lexer_conf.lexer_type == 'dynamic_complete':
        extra['complete_lex'] = True
        f = create_earley_parser__dynamic
    else:
        f = create_earley_parser__basic

    return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)

class CYK_FrontEnd:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self._analysis = GrammarAnalyzer(parser_conf)
        self.parser = cyk.Parser(parser_conf.rules)
        self.callbacks = parser_conf.callbacks

    def parse(self, lexer_thread, start):
        tokens = list(lexer_thread.lex(None))
        tree = self.parser.parse(tokens, start)
        return self._transform(tree)

    def _transform(self, tree):
        subtrees = list(tree.iter_subtrees())
        for subtree in subtrees:
            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        return self.callbacks[tree.rule](tree.children)

class MakeParsingFrontend(MakeParsingFrontend):
    # Extends the MakeParsingFrontend defined above (the part "continued later
    # in the module"), adding __call__ outside the ###{standalone section.
    def __call__(self, lexer_conf, parser_conf, options):
        assert isinstance(lexer_conf, LexerConf)
        assert isinstance(parser_conf, ParserConf)
        parser_conf.parser_type = self.parser_type
        lexer_conf.lexer_type = self.lexer_type
        return ParsingFrontend(lexer_conf, parser_conf, options)
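
# Illustrative end-to-end sketch (not part of the module): the public Lark API
# chooses one of these frontends from its parser/lexer arguments. The grammar
# and input below are made up for the example.
#
#     from lark import Lark
#     parser = Lark('start: "a"+', parser='earley', lexer='dynamic')
#     tree = parser.parse("aaa")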