  1. "This module implements an experimental Earley Parser with a dynamic lexer"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguations.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
  13. # uses regular expressions by necessity, achieving high-performance while maintaining all of
  14. # Earley's power in parsing any CFG.
  15. #
  16. #
  17. # Author: Erez Shinan (2017)
  18. # Email : erezshin@gmail.com
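#
# This module is normally driven through lark's public API, by selecting
# parser='earley' together with lexer='dynamic'; a minimal runnable sketch
# is included at the bottom of this file.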
from collections import defaultdict

from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal
from .earley import ApplyCallbacks, Item, Column

class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
        self.analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = [Terminal(t) for t in ignore]
        self.predict_all = predict_all
        self.complete_lex = complete_lex

        self.FIRST = self.analysis.FIRST
        self.postprocess = {}
        self.predictions = {}
        for rule in parser_conf.rules:
            self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

        self.term_matcher = term_matcher
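
    # parse() runs the Earley algorithm over `stream` one character at a time.
    # Each character position gets a Column (an Earley state set); terminals
    # are matched with self.term_matcher, and items advanced by a match are
    # parked in delayed_matches until the column where the match ends.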
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
        delayed_matches = defaultdict(list)
        match = self.term_matcher

        text_line = 1
        text_column = 1
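
        # Earley predictor: for every rule that can derive `nonterm`, add a
        # fresh dotted item starting at this column. self.predictions already
        # holds the transitively expanded rule list per nonterminal.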
        def predict(nonterm, column):
            assert not nonterm.is_term, nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
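
        # Earley completer: a finished item advances every item in its origin
        # column that was waiting on this rule's origin symbol.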
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
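
        # Run the predictor and completer to a fixpoint: keep processing the
        # column's newly added items (via get_news) until no news remain.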
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add(predict(nonterm, column))
                for item in to_reduce:
                    new_items = list(complete(item))
                    if item in new_items:
                        raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
                    column.add(new_items)
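
        # Dynamic-lexer scanner: try to regex-match each expected terminal (and
        # each %ignore terminal) at position i. Items advanced by a match are
        # scheduled in delayed_matches at the position where the match ends,
        # since a single terminal may span many characters.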
        def scan(i, column):
            to_scan = column.to_scan

            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    delayed_matches[m.end()] += set(to_scan)
                    delayed_matches[m.end()] += set(column.to_reduce)

                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan

            for item in to_scan:
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    if self.complete_lex:
                        s = m.group(0)
                        for j in range(1, len(s)):
                            m = match(item.expect, s[:-j])
                            if m:
                                t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                                delayed_matches[i+m.end()].append(item.advance(t))

            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]    # No longer needed, so unburden memory

            if not next_set and not delayed_matches:
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

            return next_set

        # Main loop starts
        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)

            column = scan(i, column)

            if token == '\n':
                text_line += 1
                text_column = 1
            else:
                text_column += 1

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
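        # (an item counts as a solution only if it is a completed item for the
        # start symbol that originated in column 0, i.e. it spans the whole input)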
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin == start_symbol and n.start is column0]

        if not solutions:
            expected_tokens = [t.expect for t in column.to_scan]
            raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
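

if __name__ == '__main__':
    # Minimal runnable sketch, not part of the parser itself. It assumes lark
    # is installed and that this file lives at lark/parsers/xearley.py; run it
    # as `python -m lark.parsers.xearley` so the relative imports above resolve.
    # The Lark(...) options used here (parser='earley', lexer='dynamic') are the
    # public API that routes parsing through this module.
    from lark import Lark

    demo_parser = Lark(r"""
        start: greeting
        greeting: WORD "," WORD "!"
        WORD: /\w+/
        %ignore " "
    """, parser='earley', lexer='dynamic')

    print(demo_parser.parse("Hello, World!").pretty())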