import re

from .utils import get_regexp_width
from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, xearley, resolve_ambig
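

# Mixin that provides the lexing step for a parser frontend. It supports a
# traditional (context-free) lexer and a contextual lexer, which uses the
# LALR parse table to restrict each parser state to the tokens it can accept.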
class WithLexer:
    def init_traditional_lexer(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)

    def init_contextual_lexer(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        d = {idx: t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, d,
                                     ignore=lexer_conf.ignore,
                                     always_accept=always_accept)

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            return self.lexer_conf.postlex.process(stream)
        else:
            return stream
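

# LALR(1) frontend using the standard lexer.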
class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_traditional_lexer(lexer_conf)

    def parse(self, text):
        token_stream = self.lex(text)
        return self.parser.parse(token_stream)
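

# LALR(1) frontend using the contextual lexer; the lexer is told the current
# parser state before matching each token.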
class LALR_ContextualLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_contextual_lexer(lexer_conf, parser_conf)

    def parse(self, text):
        token_stream = self.lex(text)
        return self.parser.parse(token_stream, self.lexer.set_parser_state)
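

# Selects how ambiguous Earley parses are handled, based on the 'ambiguity'
# option: standard resolution, antiscore-sum resolution, or keeping the
# ambiguities explicit in the resulting tree.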
def get_ambiguity_resolver(options):
    if not options or options.ambiguity == 'resolve':
        return resolve_ambig.standard_resolve_ambig
    elif options.ambiguity == 'resolve__antiscore_sum':
        return resolve_ambig.antiscore_sum_resolve_ambig
    elif options.ambiguity == 'explicit':
        return None
    raise ValueError(options)
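

# Turns raw text into a stream of one-character CHAR tokens, tracking line
# and column, for scanless parsing.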
def tokenize_text(text):
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
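

# Scanless Earley frontend: no lexer. Every terminal must match exactly one
# character, and the input is fed to the parser character by character.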
class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self._prepare_match(lexer_conf)
        self.parser = earley.Parser(parser_conf, self.match,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def match(self, term, text, index=0):
        return self.regexps[term].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            regexp = t.pattern.to_regexp()
            width = get_regexp_width(regexp)
            if width != (1, 1):
                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)'
                                   % (t.name, regexp, width))
            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        token_stream = tokenize_text(text)
        return self.parser.parse(token_stream)
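

# Earley frontend using the standard lexer; terminals are matched by
# comparing token types.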
class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)
        self.parser = earley.Parser(parser_conf, self.match,
                                    resolve_ambiguity=get_ambiguity_resolver(options))

    def match(self, term, token):
        return term == token.type

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)
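

# Earley frontend with a dynamic lexer: terminals are matched by regexp
# directly against the input text while parsing, so no separate tokenization
# pass is needed. Zero-width terminals are rejected up front.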
class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}
        self._prepare_match(lexer_conf)
        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
                                     ignore=lexer_conf.ignore,
                                     predict_all=options.earley__predict_all if options else False)

    def match(self, term, text, index=0):
        return self.regexps[term].match(text, index)

    def _prepare_match(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
            regexp = t.pattern.to_regexp()
            try:
                width = get_regexp_width(regexp)[0]
            except ValueError:
                raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
            else:
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps")
            self.regexps[t.name] = re.compile(regexp)

    def parse(self, text):
        return self.parser.parse(text)
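

# Maps the (parser, lexer) option pair to a frontend class, rejecting
# unsupported combinations.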
def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        elif lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    else:
        raise ValueError('Unknown parser: %s' % parser)
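

# Usage sketch (illustrative only). These frontends are normally instantiated
# by lark's loader, which builds the lexer/parser configuration from a
# grammar; `lexer_conf`, `parser_conf` and `options` below are assumed to
# come from that machinery rather than constructed by hand:
#
#     frontend_cls = get_frontend('lalr', 'contextual')
#     frontend = frontend_cls(lexer_conf, parser_conf, options)
#     tree = frontend.parse('some input text')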