import re
import sre_parse

from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, old_earley, nearley, earley
from .parsers import xearley
from .tree import Transformer


class WithLexer:
    def __init__(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            return self.lexer_conf.postlex.process(stream)
        else:
            return stream
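
# WithLexer pairs a parsing frontend with the standard, context-free lexer.
# A rough, hypothetical sketch of the post-lexer interface it expects -- the
# PostlexDemo name is illustrative only and is not part of this module:
#
#     class PostlexDemo:
#         always_accept = ()              # token types accepted in any parser state
#
#         def process(self, stream):
#             for tok in stream:          # e.g. inject INDENT/DEDENT tokens here
#                 yield tok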


class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)

        self.parser_conf = parser_conf
        self.parser = lalr_parser.Parser(parser_conf)

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)


class LALR_ContextualLexer:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf

        self.parser = lalr_parser.Parser(parser_conf)

        d = {idx: t.keys() for idx, t in self.parser.analysis.states_idx.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)

    def parse(self, text):
        tokens = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            tokens = self.lexer_conf.postlex.process(tokens)
        return self.parser.parse(tokens, self.lexer.set_parser_state)
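
# The dict `d` built above maps each LALR state index to the token types that
# state can accept, e.g. (purely illustrative values):
#
#     d = {0: ['NAME', 'LPAR'], 1: ['PLUS', 'RPAR', '$END']}
#
# The ContextualLexer uses it to restrict the active terminal set per parser
# state; self.lexer.set_parser_state, passed to parse() above, is how the
# parser tells the lexer which state it is currently in.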


class Nearley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [{'name': n,
                  'symbols': self._prepare_expansion(x),
                  'postprocess': getattr(parser_conf.callback, a)}
                 for n, x, a in parser_conf.rules]

        self.parser = nearley.Parser(rules, parser_conf.start)

    def _prepare_expansion(self, expansion):
        return [(sym, None) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = list(self.lex(text))
        res = self.parser.parse(tokens)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]


class OldEarley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a) for n, x, a in parser_conf.rules]

        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        return [(sym,) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = list(self.lex(text))
        res = self.parser.parse(tokens)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]


def tokenize_text(text):
    new_text = []
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
    return new_text
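
# tokenize_text() feeds the scanless ("NoLex") frontends by turning every
# character of the input into a CHAR token carrying line/column information.
# Roughly (token attributes elided for brevity):
#
#     tokenize_text('a\nb')
#     # -> [Token('CHAR', 'a', ...), Token('CHAR', '\n', ...), Token('CHAR', 'b', ...)]
#
# The line counter is bumped on newlines, and the column is measured from the
# position of the most recent newline seen.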


class OldEarley_NoLex:
    def __init__(self, lexer_conf, parser_conf):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n, x, a in parser_conf.rules]

        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1, 1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield (re.compile(regexp).match, regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        res = self.parser.parse(new_text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]
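
# Note on the width check above: sre_parse.parse(r).getwidth() returns the
# (min, max) length of text a regexp can match, so scanless parsing accepts
# e.g. '[a-z]' (width (1, 1)) but rejects '[a-z]+', whose maximum width is
# unbounded. Quick sanity check, standard library only:
#
#     >>> import sre_parse
#     >>> sre_parse.parse('[a-z]').getwidth()
#     (1, 1)
#     >>> sre_parse.parse('[a-z]+').getwidth() == (1, 1)
#     False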


class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n, x, a in parser_conf.rules]

        resolve_ambiguity = (options.ambiguity == 'resolve') if options else True
        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=resolve_ambiguity)

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1, 1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield Terminal_Regexp(regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        return self.parser.parse(new_text)


class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a) for n, x, a in parser_conf.rules]

        resolve_ambiguity = (options.ambiguity == 'resolve') if options else True
        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=resolve_ambiguity)

    def _prepare_expansion(self, expansion):
        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)


class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n, x, a in parser_conf.rules]

        resolve_ambiguity = (options.ambiguity == 'resolve') if options else True
        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

        self.parser = xearley.Parser(rules,
                                     parser_conf.start,
                                     parser_conf.callback,
                                     resolve_ambiguity=resolve_ambiguity,
                                     ignore=ignore,
                                     )

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                yield Terminal_Regexp(regexp)
            else:
                yield sym

    def parse(self, text):
        return self.parser.parse(text)


def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        elif lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    else:
        raise ValueError('Unknown parser: %s' % parser)
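
# Typical use of get_frontend() by the library's entry point -- a sketch, not
# an exact reproduction of the calling code; lexer_conf and parser_conf stand
# for the lexer/parser configuration objects built from the grammar elsewhere:
#
#     frontend_cls = get_frontend('lalr', 'contextual')   # -> LALR_ContextualLexer
#     parsing = frontend_cls(lexer_conf, parser_conf)
#     tree = parsing.parse(text)
#
#     get_frontend('earley', None)         # -> Earley_NoLex (scanless)
#     get_frontend('earley', 'dynamic')    # -> XEarley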