This repo contains code to mirror other repos. It also contains the code that is getting mirrored.

  1. "This module implements an experimental Earley Parser with a dynamic lexer"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguations.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
  13. # uses regular expressions by necessity, achieving high-performance while maintaining all of
  14. # Earley's power in parsing any CFG.
  15. #
  16. #
  17. # Author: Erez Shinan (2017)
  18. # Email : erezshin@gmail.com
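#
# Concretely (an illustrative example, not from the original comments): given a
# terminal NUMBER = /[0-9]+/ and the input "3+45", an item expecting NUMBER at
# position 2 tries NUMBER.match("3+45", 2), which matches "45"; scan() below
# then queues the advanced item under delayed_matches[4], the match's end position.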
from collections import defaultdict

from ..common import ParseError, UnexpectedToken, Terminal
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column


class Parser:
    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()):
        self.analysis = GrammarAnalyzer(rules, start_symbol)
        self.start_symbol = start_symbol
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
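
    # postprocess maps each rule to its reduction callback (a callable alias, or
    # a method looked up on `callback` by name); predictions maps each nonterminal
    # to every rule it can expand to, precomputed by the grammar analysis so that
    # predict() below is a plain dict lookup.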
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = start_symbol or self.start_symbol
        delayed_matches = defaultdict(set)
        match_after_ignore = set()

        text_line = 1
        text_column = 0
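
        # Earley's prediction step: for a nonterminal expected at this column,
        # add a fresh item (dot at position 0) for each of its possible
        # expansions, all starting at the current column.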
        def predict(nonterm, column):
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
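
        # Earley's completion step: when an item has matched its whole rule,
        # advance every item in its origin column that was waiting for this
        # rule's nonterminal, attaching the completed subtree.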
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
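
        # Run prediction and completion to a fixpoint: get_news() yields only
        # items added since the previous call, so the loop ends once a pass
        # produces nothing new in the column.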
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add( predict(nonterm, column) )
                for item in to_reduce:
                    new_items = list(complete(item))
                    for new_item in new_items:
                        if new_item.similar(item):
                            raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
                    column.add(new_items)
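
        # The scan step doubles as the dynamic lexer: every item expecting a
        # terminal tries its regexp directly against the input at position i.
        # A match of length n advances the item and parks it in
        # delayed_matches[i+n], so multi-character tokens simply resurface when
        # the main loop reaches the column where they end. Prefixes of the match
        # are queued too, in case a shorter token leads to the only valid parse.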
        def scan(i, token, column):
            to_scan = column.to_scan.get_news()

            for x in self.ignore:
                m = x.match(stream, i)
                if m:
                    delayed_matches[m.end()] |= set(to_scan)
                    if m.end() == len(stream):
                        match_after_ignore.update(set(column.to_reduce))

                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan

            for item in to_scan:
                m = item.expect.match(stream, i)
                if m:
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].add(item.advance(t))

                    s = m.group(0)
                    for j in range(1, len(s)):
                        m = item.expect.match(s[:-j])
                        if m:
                            # i + m.end(): the key must be an absolute stream
                            # position, but this match ran on the substring s
                            delayed_matches[i+m.end()].add(item.advance(m.group(0)))

            next_set = Column(i+1)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]  # No longer needed, so unburden memory

            return next_set
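
        # One Column (Earley state set) is created per character position, and
        # delayed_matches carries advanced items forward to the column where
        # their token ends.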

        # Main loop starts
        column0 = Column(0)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)
            column = scan(i, token, column)

            if token == '\n':
                text_line += 1
                text_column = 0
            else:
                text_column += 1

        predict_and_complete(column)
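
        # (A "solution" below is an item that completed the start rule over the
        # whole input: it began at column 0 and was reduced in the final column.)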
        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin==start_symbol and n.start is column0]

        if not solutions:
            solutions = [n.tree for n in match_after_ignore
                         if n.rule.origin==start_symbol and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
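
The dynamic-lexing idea above can be seen in isolation. Here is a minimal, self-contained sketch of the scan step's bookkeeping; the terminal names and the scan_all helper are illustrative inventions, not part of lark:

import re
from collections import defaultdict

def scan_all(stream, terminals):
    # For every input position, try each terminal's regexp, and park each
    # match under its absolute end position -- the same bookkeeping that
    # delayed_matches does above (minus Earley items and partial matches).
    delayed = defaultdict(set)
    for i in range(len(stream)):
        for name, regex in terminals.items():
            m = regex.match(stream, i)
            if m:
                delayed[m.end()].add((name, m.group(0)))
    return dict(delayed)

terminals = {'NUMBER': re.compile('[0-9]+'), 'PLUS': re.compile(r'\+')}
print(scan_all('3+45', terminals))
# {1: {('NUMBER', '3')}, 2: {('PLUS', '+')}, 4: {('NUMBER', '45'), ('NUMBER', '5')}}
# Note the extra '5' under key 4: this sketch scans every position, whereas the
# parser above only scans positions where some Earley item expects a terminal.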