From 8e7395c6d6b6220303ac8d38f85606e1833fefcd Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 21 Feb 2017 16:02:09 +0200 Subject: [PATCH] Improved earley.py and added some comments to both parsers --- lark/parsers/earley.py | 70 +++++++++++++++++++++++++---------- lark/parsers/lalr_analysis.py | 8 ++++ lark/parsers/lalr_parser.py | 5 +++ 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index f5b2f5b..78b1183 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -1,10 +1,20 @@ +"This module implements an Earley Parser" + +# The algorithm keeps track of each state set, using a corresponding Column instance. +# Column keeps track of new items using NewsList instances. +# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + from ..common import ParseError, UnexpectedToken, is_terminal from .grammar_analysis import GrammarAnalyzer -class EndToken(str): +class EndToken: type = '$end' -class Item: +END_TOKEN = EndToken() + +class Item(object): def __init__(self, rule, ptr, start, data): self.rule = rule self.ptr = ptr @@ -34,6 +44,8 @@ class Item: class NewsList(list): + "Keeps track of newly added items (append-only)" + def __init__(self, initial=None): list.__init__(self, initial or []) self.last_iter = 0 @@ -45,22 +57,39 @@ class NewsList(list): class Column: + "An entry in the table, aka Earley Chart" def __init__(self): self.to_reduce = NewsList() self.to_predict = NewsList() self.to_scan = NewsList() self.item_count = 0 + self.added = set() + def add(self, items): - self.item_count += len(items) + """Sort items into scan/predict/reduce newslists + + Makes sure only unique items are added. + """ + + added = self.added for item in items: + if item.is_complete: - if item not in self.to_reduce: # Avoid infinite loop - self.to_reduce.append(item) - elif is_terminal(item.expect): - self.to_scan.append(item) + if item in added: + continue + self.to_reduce.append(item) + added.add(item) else: - self.to_predict.append(item) + if is_terminal(item.expect): + self.to_scan.append(item) + else: + if item in added: + continue + self.to_predict.append(item) + added.add(item) + + self.item_count += 1 # Only count if actually added def __nonzero__(self): return bool(self.item_count) @@ -78,8 +107,9 @@ class Parser: self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - def parse(self, stream): + def parse(self, stream, start=None): # Define parser functions + start = start or self.start def predict(nonterm, i): assert not is_terminal(nonterm), nonterm @@ -88,8 +118,7 @@ class Parser: def complete(item, table): name = item.rule.origin item.data = self.postprocess[item.rule](item.data) - return [i.advance(item.data) for i in table[item.start].to_predict - if i.expect == name] + return [i.advance(item.data) for i in table[item.start].to_predict if i.expect == name] def process_column(i, token): assert i == len(table)-1 @@ -109,29 +138,30 @@ class Parser: cur_set.add( complete(item, table) ) - for item in cur_set.to_scan.get_news(): - match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type - if match: - next_set.add([item.advance(stream[i])]) + if token is not END_TOKEN: + for item in cur_set.to_scan.get_news(): + match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type + if match: + next_set.add([item.advance(stream[i])]) - if not next_set and token.type != '$end': - expect = [i.expect for i in cur_set.to_scan] + if not next_set and token is not END_TOKEN: + expect = {i.expect for i in cur_set.to_scan} raise UnexpectedToken(token, expect, stream, i) table.append(next_set) # Main loop starts table = [Column()] - table[0].add(predict(self.start, 0)) + table[0].add(predict(start, 0)) for i, char in enumerate(stream): process_column(i, char) - process_column(len(stream), EndToken()) + process_column(len(stream), END_TOKEN) # Parse ended. Now build a parse tree solutions = [n.data for n in table[len(stream)].to_reduce - if n.rule.origin==self.start and n.start==0] + if n.rule.origin==start and n.start==0] if not solutions: raise ParseError('Incomplete parse: Could not find a solution to input') diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 83f96fc..caa41c9 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -1,3 +1,11 @@ +"""This module builds a LALR(1) transition-table for lalr_parser.py + +For now, shift/reduce conflicts are automatically resolved as shifts. +""" + +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + import logging from collections import defaultdict diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 7394f91..23554c6 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -1,3 +1,8 @@ +"""This module implements a LALR(1) Parser +""" +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + from ..common import ParseError, UnexpectedToken from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT