This repo contains code for mirroring other repos, as well as the code being mirrored.


  1. "This module implements an experimental Earley Parser with a dynamic lexer"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguations.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
  13. # uses regular expressions by necessity, achieving high-performance while maintaining all of
  14. # Earley's power in parsing any CFG.
  15. #
  16. #
  17. # Author: Erez Shinan (2017)
  18. # Email : erezshin@gmail.com
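#
# A note on the output shape (an editor's illustration, not from the original
# comments): when more than one derivation survives the parse, Parser.parse
# wraps the alternatives in an ambiguity node, roughly:
#
#     Tree('_ambig', [<tree of derivation 1>, <tree of derivation 2>, ...])
#
# The resolve_ambiguity callback, if given, is then expected to collapse it.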

from collections import defaultdict

from ..common import ParseError, is_terminal
from ..lexer import Token, UnexpectedInput
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column


class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
        self.analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)
        self.predict_all = predict_all

        self.FIRST = self.analysis.FIRST

        # Map each rule to its user callback, and each nonterminal to the list
        # of rules it can expand to (precomputed for the predictor).
        self.postprocess = {}
        self.predictions = {}
        for rule in parser_conf.rules:
            self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

        self.term_matcher = term_matcher
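
    # Note (inferred from the calls in parse()/scan() below, not stated in the
    # original): term_matcher is expected to behave like a regex matcher,
    # called as match(term_name, text, start_index) or match(term_name, text),
    # and returning a match object with .group(0)/.end(), or None on failure.
    # A minimal hypothetical implementation is sketched at the bottom of this
    # file.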

    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = start_symbol or self.parser_conf.start
        delayed_matches = defaultdict(list)
        match = self.term_matcher

        text_line = 1
        text_column = 0

        def predict(nonterm, column):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
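
        # Illustration of the two operations above (rule names hypothetical):
        # predict('sum', column) turns every rule expanding 'sum', e.g.
        # 'sum -> sum PLUS product', into a fresh dotted item at position 0,
        # while complete(item) takes a finished item and advances every item
        # in its start column that was waiting for that item's rule origin.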

        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add(predict(nonterm, column))
                for item in to_reduce:
                    new_items = list(complete(item))
                    if item in new_items:
                        raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
                    column.add(new_items)
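
        # predict_and_complete iterates to a fixpoint: each pass only visits
        # items that are new since the previous pass (via get_news), so the
        # loop ends once the column stops growing.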

        def scan(i, column):
            to_scan = column.to_scan

            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    delayed_matches[m.end()] += set(to_scan)
                    delayed_matches[m.end()] += set(column.to_reduce)

                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan

            for item in to_scan:
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    s = m.group(0)
                    for j in range(1, len(s)):
                        m = match(item.expect, s[:-j])
                        if m:
                            t = Token(item.expect, m.group(0), i, text_line, text_column)
                            delayed_matches[i+m.end()].append(item.advance(t))

            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]    # No longer needed, so unburden memory

            if not next_set and not delayed_matches:
                raise UnexpectedInput(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

            return next_set
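
        # How delayed_matches works (an editor's example, not from the
        # original comments): a token may span several characters, so an item
        # advanced at position i is filed under the match's end position and
        # only enters a Column once the input pointer reaches it. E.g. if a
        # terminal matches '==' at i=5, the advanced item is stored in
        # delayed_matches[7]; the partial-match loop above also retries the
        # truncated text '=', letting shorter candidate tokens compete.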

        # Main loop starts
        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)

            column = scan(i, column)

            if token == '\n':
                text_line += 1
                text_column = 0
            else:
                text_column += 1

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin == start_symbol and n.start is column0]

        if not solutions:
            expected_tokens = [t.expect for t in column.to_scan]
            raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
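

# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original module): a minimal term_matcher
# with the interface Parser appears to expect, built over a table of compiled
# regexps. The table and the terminal names in it are hypothetical, for
# demonstration only.

import re

_EXAMPLE_TERMINALS = {
    'NUMBER': re.compile(r'[0-9]+'),
    'PLUS': re.compile(r'\+'),
}

def _example_term_matcher(term, text, index=0):
    # Called the way parse()/scan() call it: match(term, stream, i) against
    # the full input, or match(term, s) against a truncated partial-match
    # string. Returns a regex match object, or None on failure.
    return _EXAMPLE_TERMINALS[term].match(text, index)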