This repo contains the code used to mirror other repos, as well as the mirrored code itself.

172 lines · 6.2 KiB

import re
import sre_parse

from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, earley, xearley, resolve_ambig


# Base for frontends that run a standard (context-free) lexer before parsing.
class WithLexer:
    def __init__(self, lexer_conf):
        self.lexer_conf = lexer_conf
        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            return self.lexer_conf.postlex.process(stream)
        else:
            return stream


class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)
        self.parser_conf = parser_conf
        self.parser = lalr_parser.Parser(parser_conf)

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)


class LALR_ContextualLexer:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.parser = lalr_parser.Parser(parser_conf)

        # Map each parser state to the tokens it can accept, so the lexer only
        # tries the terminals that are valid in the current parser state.
        d = {idx: t.keys() for idx, t in self.parser.analysis.states_idx.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)

    def parse(self, text):
        tokens = self.lexer.lex(text)
        if self.lexer_conf.postlex:
            tokens = self.lexer_conf.postlex.process(tokens)
        return self.parser.parse(tokens, self.lexer.set_parser_state)


def get_ambiguity_resolver(options):
    if not options or options.ambiguity == 'resolve':
        return resolve_ambig.standard_resolve_ambig
    elif options.ambiguity == 'resolve__antiscore_sum':
        return resolve_ambig.antiscore_sum_resolve_ambig
    elif options.ambiguity == 'explicit':
        return None
    raise ValueError(options)


# Break a string into single-character CHAR tokens, tracking line and column;
# used by the scanless Earley frontend (Earley_NoLex) below.
def tokenize_text(text):
    new_text = []
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
    return new_text


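# A worked example of tokenize_text (illustrative; not part of the original file):
#   tokenize_text("ab\ncd") yields CHAR tokens positioned as
#   'a' (line 1, col 0), 'b' (line 1, col 1),
#   '\n' (line 2, col 0), 'c' (line 2, col 1), 'd' (line 2, col 2)

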
class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a, o) for n, x, a, o in parser_conf.rules]

        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=get_ambiguity_resolver(options),
                                    all_derivations=options.earley__all_derivations if options else True)

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1, 1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield Terminal_Regexp(sym, regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        return self.parser.parse(new_text)


class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a, o) for n, x, a, o in parser_conf.rules]

        self.parser = earley.Parser(rules,
                                    parser_conf.start,
                                    parser_conf.callback,
                                    resolve_ambiguity=get_ambiguity_resolver(options),
                                    all_derivations=options.earley__all_derivations if options else True)

    def _prepare_expansion(self, expansion):
        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = self.lex(text)
        return self.parser.parse(tokens)


class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a, o) for n, x, a, o in parser_conf.rules]
        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

        self.parser = xearley.Parser(rules,
                                     parser_conf.start,
                                     parser_conf.callback,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
                                     ignore=ignore,
                                     )

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                assert width
                yield Terminal_Regexp(sym, regexp)
            else:
                yield sym

    def parse(self, text):
        return self.parser.parse(text)


def get_frontend(parser, lexer):
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
            return LALR
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        elif lexer == 'standard':
            return Earley
        elif lexer == 'dynamic':
            return XEarley
        elif lexer == 'contextual':
            raise ValueError('The Earley parser does not support the contextual lexer')
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    else:
        raise ValueError('Unknown parser: %s' % parser)
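
A minimal usage sketch of the dispatch above, derived only from the branches of get_frontend in this file. The import path is an assumption (this file resembles Lark's parser_frontends module), and the assertions are illustrative, not part of the file:

    # Hypothetical import path; adjust to wherever this module lives in the package.
    from lark.parser_frontends import (get_frontend, LALR, LALR_ContextualLexer,
                                       Earley_NoLex, XEarley)

    assert get_frontend('lalr', 'standard') is LALR
    assert get_frontend('lalr', 'contextual') is LALR_ContextualLexer
    assert get_frontend('earley', None) is Earley_NoLex   # scanless: terminals must have width 1
    assert get_frontend('earley', 'dynamic') is XEarley   # lexes dynamically while parsing

The returned value is a frontend class, not an instance; callers are expected to construct it with (lexer_conf, parser_conf, options), matching the __init__ signatures above.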