From 1cc4c965e87e89b686c0f8c7b01349bdcf3e8ddb Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin@gmail.com>
Date: Sun, 7 Jan 2018 00:50:40 +0200
Subject: [PATCH] Big Refactor: Grammars now build in half the time. Code
 shorter & cleaner.

---
 lark/common.py                   | 27 +------------
 lark/grammar.py                  | 16 ++++++++
 lark/lark.py                     |  1 +
 lark/lexer.py                    |  2 +-
 lark/parse_tree_builder.py       |  5 ++-
 lark/parser_frontends.py         | 67 ++++++++++++++++----------------
 lark/parsers/earley.py           | 24 ++++++------
 lark/parsers/grammar_analysis.py | 43 ++++++++------------
 lark/parsers/lalr_analysis.py    |  2 +-
 lark/parsers/lalr_parser.py      |  5 ++-
 lark/parsers/xearley.py          | 30 +++++++-------
 11 files changed, 104 insertions(+), 118 deletions(-)
 create mode 100644 lark/grammar.py

diff --git a/lark/common.py b/lark/common.py
index 55e9d28..800aa4f 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -33,7 +33,7 @@ class UnexpectedToken(ParseError):
 
 
 def is_terminal(sym):
-    return isinstance(sym, Terminal) or sym.isupper() or sym == '$end'
+    return sym.isupper()
 
 
 class LexerConf:
@@ -44,7 +44,6 @@ class ParserConf:
     def __init__(self, rules, callback, start):
-        assert all(len(r) == 4 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start
 
 
@@ -108,27 +107,3 @@ class TokenDef(object):
 
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
-
-class Terminal:
-    def __init__(self, data):
-        self.data = data
-
-    def __repr__(self):
-        return '%r' % self.data
-
-    def __eq__(self, other):
-        return isinstance(other, type(self)) and self.data == other.data
-    def __hash__(self):
-        return hash(self.data)
-
-
-class Terminal_Regexp(Terminal):
-    def __init__(self, name, regexp):
-        Terminal.__init__(self, regexp)
-        self.name = name
-        self.match = re.compile(regexp).match
-
-class Terminal_Token(Terminal):
-    def match(self, other):
-        return self.data == other.type
-
diff --git a/lark/grammar.py b/lark/grammar.py
new file mode 100644
index 0000000..281c21c
--- /dev/null
+++ b/lark/grammar.py
@@ -0,0 +1,16 @@
+
+class Rule(object):
+    """
+        origin : a symbol
+        expansion : a list of symbols
+    """
+    def __init__(self, origin, expansion, alias=None, options=None):
+        self.origin = origin
+        self.expansion = expansion
+        self.alias = alias
+        self.options = options
+
+    def __repr__(self):
+        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
+
+
diff --git a/lark/lark.py b/lark/lark.py
index d8ee186..03bd253 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -171,6 +171,7 @@ class Lark:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):
                     setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
+        parser_conf = ParserConf(rules, callback, self.options.start)
 
         return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
 
diff --git a/lark/lexer.py b/lark/lexer.py
index 2741af0..66923b0 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -204,7 +204,7 @@ class ContextualLexer:
                 lexer = lexer_by_tokens[key]
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
-                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
 
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 975121d..497af55 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -1,6 +1,7 @@
 from .common import is_terminal, GrammarError
 from .utils import suppress
 from .lexer import Token
+from .grammar import Rule
 
 class NodeBuilder:
     def __init__(self, tree_class, name):
@@ -27,7 +28,7 @@ class Factory:
 
     def __call__(self, node_builder):
         return self.cls(node_builder, *self.args)
-    
+
 
 class TokenWrapper:
     "Used for fixing the results of scanless parsing"
@@ -151,6 +152,6 @@ class ParseTreeBuilder:
                 raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
             setattr(callback, callback_name, f)
 
-            new_rules.append(( origin, expansion, callback_name, options ))
+            new_rules.append( Rule( origin, expansion, callback_name, options ))
 
         return new_rules, callback
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index ad5017b..228640f 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -3,7 +3,7 @@
 import sre_parse
 
 from .lexer import Lexer, ContextualLexer, Token
-from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
+from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, xearley, resolve_ambig
 
 class WithLexer:
@@ -70,25 +70,26 @@ def tokenize_text(text):
 
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-
-        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
+        self._prepare_match(lexer_conf)
 
-        self.parser = earley.Parser(rules,
+        self.parser = earley.Parser(parser_conf.rules,
                                     parser_conf.start,
                                     parser_conf.callback,
+                                    self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options))
 
-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                if width != (1,1):
-                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
-                yield Terminal_Regexp(sym, regexp)
-            else:
-                yield sym
+
+    def match(self, term, text, index=0):
+        return self.regexps[term].match(text, index)
+
+    def _prepare_match(self, lexer_conf):
+        self.regexps = {}
+        for t in lexer_conf.tokens:
+            regexp = t.pattern.to_regexp()
+            width = sre_parse.parse(regexp).getwidth()
+            if width != (1,1):
+                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
+            self.regexps[t.name] = re.compile(regexp)
 
     def parse(self, text):
         new_text = tokenize_text(text)
@@ -98,15 +99,14 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf)
 
-        rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
-
-        self.parser = earley.Parser(rules,
+        self.parser = earley.Parser(parser_conf.rules,
                                     parser_conf.start,
                                     parser_conf.callback,
+                                    self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options))
 
-    def _prepare_expansion(self, expansion):
-        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
+    def match(self, term, token):
+        return term == token.type
 
     def parse(self, text):
         tokens = self.lex(text)
@@ -117,27 +117,26 @@ class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
-
-        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
+        self._prepare_match(lexer_conf)
 
-        self.parser = xearley.Parser(rules,
+        self.parser = xearley.Parser(parser_conf.rules,
                                     parser_conf.start,
                                     parser_conf.callback,
+                                    self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
-                                    ignore=ignore,
+                                    ignore=lexer_conf.ignore,
                                     predict_all=options.earley__predict_all
                                     )
 
-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                assert width
-                yield Terminal_Regexp(sym, regexp)
-            else:
-                yield sym
+    def match(self, term, text, index=0):
+        return self.regexps[term].match(text, index)
+
+    def _prepare_match(self, lexer_conf):
+        self.regexps = {}
+        for t in lexer_conf.tokens:
+            regexp = t.pattern.to_regexp()
+            assert sre_parse.parse(regexp).getwidth()
+            self.regexps[t.name] = re.compile(regexp)
 
     def parse(self, text):
         return self.parser.parse(text)
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index 55893f5..e6a914d 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -13,13 +13,13 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 
-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
 from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
 from .grammar_analysis import GrammarAnalyzer
 
 
 class EndToken:
-    type = '$end'
+    type = '$END'
 
 class Derivation(Tree):
     _hash = None
@@ -135,7 +135,7 @@ class Column:
                 self.completed[item_key] = item
                 self.to_reduce.append(item)
             else:
-                if isinstance(item.expect, Terminal):
+                if is_terminal(item.expect):
                     self.to_scan.append(item)
                 else:
                     k = item_key if self.predict_all else item
@@ -152,7 +152,7 @@ class Column:
     __nonzero__ = __bool__ # Py2 backwards-compatibility
 
 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None):
+    def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None):
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
@@ -161,12 +161,13 @@
         self.predictions = {}
         self.FIRST = {}
         for rule in self.analysis.rules:
-            if rule.origin != '$root': # XXX kinda ugly
-                a = rule.alias
-                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
-                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            a = rule.alias
+            self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
+            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
 
-                self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+            self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+
+        self.term_matcher = term_matcher
 
 
     def parse(self, stream, start_symbol=None):
@@ -174,9 +175,10 @@
         start_symbol = start_symbol or self.start_symbol
 
         _Item = Item
+        match = self.term_matcher
 
         def predict(nonterm, column):
-            assert not isinstance(nonterm, Terminal), nonterm
+            assert not is_terminal(nonterm), nonterm
            return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
 
         def complete(item):
@@ -203,7 +205,7 @@
 
         def scan(i, token, column):
             next_set = Column(i, self.FIRST)
-            next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token))
+            next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))
 
             if not next_set:
                 expect = {i.expect for i in column.to_scan}
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 391e3dd..7390c58 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -1,20 +1,8 @@
 from ..utils import bfs, fzset
 from ..common import GrammarError, is_terminal
+from ..grammar import Rule
 
-class Rule(object):
-    """
-        origin : a symbol
-        expansion : a list of symbols
-    """
-    def __init__(self, origin, expansion, alias=None, options=None):
-        self.origin = origin
-        self.expansion = expansion
-        self.alias = alias
-        self.options = options
-
-    def __repr__(self):
-        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
 
 
 class RulePtr(object):
     def __init__(self, rule, index):
@@ -106,28 +94,29 @@ def calculate_sets(rules):
 
 class GrammarAnalyzer(object):
-    def __init__(self, rule_tuples, start_symbol, debug=False):
+    def __init__(self, rules, start_symbol, debug=False):
+        assert len(rules) == len(set(rules))
+
         self.start_symbol = start_symbol
         self.debug = debug
 
-        rule_tuples = list(rule_tuples)
-        rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]
-
-        self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
-        for origin, exp, alias, options in rule_tuples:
-            r = Rule( origin, exp, alias, options )
-            self.rules.add(r)
-            self.rules_by_origin[origin].append(r)
-
-        for r in self.rules:
+
+        root_rule = Rule('$root', [start_symbol, '$END'])
+
+        self.rules_by_origin = {r.origin: [] for r in rules}
+        for r in rules:
+            self.rules_by_origin[r.origin].append(r)
+
+        self.rules_by_origin[root_rule.origin] = [root_rule]
+
+        for r in rules:
             for sym in r.expansion:
                 if not (is_terminal(sym) or sym in self.rules_by_origin):
                     raise GrammarError("Using an undefined rule: %s" % sym)
 
         self.start_state = self.expand_rule('$root')
+        self.rules = rules
 
-        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
+        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule])
 
     def expand_rule(self, rule):
         "Returns all init_ptrs accessible by rule (recursive)"
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 3f2d30f..2c9e8a4 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -73,7 +73,7 @@ class LALR_Analyzer(GrammarAnalyzer):
 
                 new_state = fzset(rps)
                 lookahead[sym].append((Shift, new_state))
-                if sym == '$end':
+                if sym == '$END':
                     self.end_states.append( new_state )
 
             yield fzset(rps)
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index c913661..237619d 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -13,7 +13,8 @@ class FinalReduce:
 
 class Parser:
     def __init__(self, parser_conf):
-        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
+        assert all(r.options is None or r.options.priority is None
+                   for r in parser_conf.rules), "LALR doesn't yet support prioritization"
         self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
         analysis.compute_lookahead()
         callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
@@ -85,7 +86,7 @@ class _Parser:
                 pass
 
         while True:
-            _action, arg = get_action('$end')
+            _action, arg = get_action('$END')
             if _action is Shift:
                 assert arg == self.end_state
                 val ,= value_stack
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 9b26190..055b26e 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -20,7 +20,7 @@
 
 from collections import defaultdict
 
-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
 from ..lexer import Token, UnexpectedInput
 from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
@@ -28,7 +28,7 @@ from .grammar_analysis import GrammarAnalyzer
 from .earley import ApplyCallbacks, Item, Column
 
 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
+    def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
@@ -41,24 +41,26 @@ class Parser:
         self.FIRST = {}
 
         for rule in self.analysis.rules:
-            if rule.origin != '$root': # XXX kinda ugly
-                a = rule.alias
-                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
-                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            a = rule.alias
+            self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
+            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
 
-                self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+            self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+
+        self.term_matcher = term_matcher
 
 
     def parse(self, stream, start_symbol=None):
         # Define parser functions
         start_symbol = start_symbol or self.start_symbol
         delayed_matches = defaultdict(list)
+        match = self.term_matcher
 
         text_line = 1
         text_column = 0
 
         def predict(nonterm, column):
-            assert not isinstance(nonterm, Terminal), nonterm
+            assert not is_terminal(nonterm), nonterm
             return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
 
         def complete(item):
@@ -86,7 +88,7 @@ class Parser:
             to_scan = column.to_scan
 
             for x in self.ignore:
-                m = x.match(stream, i)
+                m = match(x, stream, i)
                 if m:
                     delayed_matches[m.end()] += set(to_scan)
                     delayed_matches[m.end()] += set(column.to_reduce)
@@ -99,16 +101,16 @@ class Parser:
                 # delayed_matches[m.end()] += to_scan
 
             for item in to_scan:
-                m = item.expect.match(stream, i)
+                m = match(item.expect, stream, i)
                 if m:
-                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                    t = Token(item.expect, m.group(0), i, text_line, text_column)
                     delayed_matches[m.end()].append(item.advance(t))
 
                     s = m.group(0)
                     for j in range(1, len(s)):
-                        m = item.expect.match(s[:-j])
+                        m = match(item.expect, s[:-j])
                         if m:
-                            t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                            t = Token(item.expect, m.group(0), i, text_line, text_column)
                             delayed_matches[i+m.end()].append(item.advance(t))
 
             next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
@@ -143,7 +145,7 @@ class Parser:
                      if n.rule.origin==start_symbol and n.start is column0]
 
         if not solutions:
-            expected_tokens = [t.expect.name for t in column.to_scan]
+            expected_tokens = [t.expect for t in column.to_scan]
             raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
         elif len(solutions) == 1:
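
The common thread in the hunks above is the new matching interface: rule expansions now carry plain terminal names (uppercase strings, per the new is_terminal), and each frontend hands the parser a term_matcher callback instead of baking Terminal_Regexp/Terminal_Token objects into the rules. Below is a minimal sketch of the two matcher shapes and of constructing the refactored Earley parser directly; the WORD terminal, its regexp and the empty Callbacks class are made up for illustration and are not part of the patch.

import re
from lark.grammar import Rule
from lark.parsers import earley

# Hypothetical one-rule grammar:  start : WORD
rules = [Rule('start', ['WORD'])]

# Standard (tokenized) mode: a terminal name is matched against Token.type,
# mirroring Earley.match() in parser_frontends.py.
def token_matcher(term, token):
    return term == token.type

# Scanless mode: the terminal's regexp is matched against the raw text,
# mirroring Earley_NoLex.match() / XEarley.match().
regexps = {'WORD': re.compile(r'\w+')}   # hypothetical terminal pattern
def text_matcher(term, text, index=0):
    return regexps[term].match(text, index)

class Callbacks:
    pass   # rule aliases would normally resolve to tree-building methods here

# The parser now takes the matcher as a positional argument (see earley.Parser.__init__).
parser = earley.Parser(rules, 'start', Callbacks(), token_matcher)

Because matching is now a callback, GrammarAnalyzer and the Earley/XEarley cores only ever see Rule objects and terminal names, which is what lets the same rule list from ParserConf be shared by every frontend without the per-frontend _prepare_expansion rewrites.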