  1. "This module implements an experimental Earley Parser with a dynamic lexer"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguations.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
  13. # uses regular expressions by necessity, achieving high-performance while maintaining all of
  14. # Earley's power in parsing any CFG.
  15. #
  16. #
  17. # Author: Erez Shinan (2017)
  18. # Email : erezshin@gmail.com
from collections import defaultdict

from ..common import ParseError, UnexpectedToken, is_terminal
from ..lexer import Token, UnexpectedInput
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column
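
# Illustration only (an assumption for orientation, not part of this module or
# lark's public API): the `term_matcher` argument passed to Parser is expected
# to behave like the sketch below -- a callable
# (terminal_name, text, start_pos) -> re.Match-or-None. The terminal table and
# the function name are invented for demonstration. scan() below relies on the
# returned match supporting m.end() and m.group(0).
import re

_EXAMPLE_TERMINALS = {'INT': re.compile(r'[0-9]+'), 'PLUS': re.compile(r'\+')}

def _example_term_matcher(term, text, index=0):
    # Anchor the terminal's regexp at the exact position in the text; matching
    # at an offset keeps m.end() in absolute stream coordinates.
    return _EXAMPLE_TERMINALS[term].match(text, index)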


class Parser:
    def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
        self.analysis = GrammarAnalyzer(rules, start_symbol)
        self.start_symbol = start_symbol
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)
        self.predict_all = predict_all

        self.postprocess = {}   # rule -> user callback applied after parsing
        self.predictions = {}   # nonterminal -> rules it expands to
        self.FIRST = {}

        for rule in self.analysis.rules:
            a = rule.alias
            self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
            self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]

        self.term_matcher = term_matcher

    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = start_symbol or self.start_symbol
        delayed_matches = defaultdict(list)
        match = self.term_matcher

        text_line = 1
        text_column = 0
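
        # Prediction: expand a nonterminal into fresh items (dot at position 0,
        # starting in this column) for every rule that can derive it.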
        def predict(nonterm, column):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
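
        # Completion: a finished item advances every item in its start column
        # that was waiting on this rule's nonterminal.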
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
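
        # Run prediction and completion to a fixed point: keep processing the
        # column's newly added items until no new ones appear.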
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add(predict(nonterm, column))
                for item in to_reduce:
                    new_items = list(complete(item))
                    for new_item in new_items:
                        if new_item.similar(item):
                            raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
                    column.add(new_items)
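
        # Scanning with the dynamic lexer: try each expected terminal's regexp
        # directly at stream position i. Matches are stored in delayed_matches,
        # keyed by the absolute position where the matched token ends, and are
        # added to the column for that position once the main loop reaches it.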
        def scan(i, token, column):
            to_scan = column.to_scan

            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    delayed_matches[m.end()] += set(to_scan)
                    delayed_matches[m.end()] += set(column.to_reduce)

                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan

            for item in to_scan:
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    # Also try every proper prefix of the match, in case a
                    # shorter token leads to a successful parse.
                    s = m.group(0)
                    for j in range(1, len(s)):
                        m = match(item.expect, s[:-j])
                        if m:
                            t = Token(item.expect, m.group(0), i, text_line, text_column)
                            delayed_matches[i+m.end()].append(item.advance(t))

            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]    # No longer needed, so unburden memory

            if not next_set and not delayed_matches:
                raise UnexpectedInput(stream, i, text_line, text_column, to_scan)

            return next_set

        # Main loop starts
        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            # The stream is consumed character by character; `token` is the
            # character at position i, used here only for line/column tracking.
            predict_and_complete(column)

            column = scan(i, token, column)

            if token == '\n':
                text_line += 1
                text_column = 1
            else:
                text_column += 1

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin == start_symbol and n.start is column0]

        if not solutions:
            expected_tokens = [t.expect for t in column.to_scan]
            raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
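

# A minimal standalone sketch of the "dynamic lexer" idea described in the
# header (assumptions throughout: the terminal table, the input, and the
# function name are all invented for demonstration). Instead of tokenizing up
# front, it tries the terminals' regexps directly at the current position,
# much like scan() does above. In the real parser the set of candidate
# terminals comes from the current Earley column; here we simply try all of
# them, greedily.
def _demo_dynamic_lexing(text='12+345'):
    import re
    terminals = {'INT': re.compile(r'[0-9]+'), 'PLUS': re.compile(r'\+')}
    pos = 0
    tokens = []
    while pos < len(text):
        for name, regexp in terminals.items():
            m = regexp.match(text, pos)
            if m:
                tokens.append((name, m.group(0), pos))
                pos = m.end()
                break
        else:
            raise ValueError('no terminal matches at position %d' % pos)
    return tokens   # [('INT', '12', 0), ('PLUS', '+', 2), ('INT', '345', 3)]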