From 0ee80e675a74720a65bd5f637328a73d48e38503 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 Jan 2018 18:49:24 +0200 Subject: [PATCH 01/21] Refactoring for LALR, added the ParseTable class --- lark/parser_frontends.py | 2 +- lark/parsers/grammar_analysis.py | 2 +- lark/parsers/lalr_analysis.py | 61 +++++++++++++++++++++++--------- lark/parsers/lalr_parser.py | 29 +++++++-------- 4 files changed, 61 insertions(+), 33 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 718a0f9..ad5017b 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -38,7 +38,7 @@ class LALR_ContextualLexer: self.parser = lalr_parser.Parser(parser_conf) - d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} + d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 9250c47..391e3dd 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -125,7 +125,7 @@ class GrammarAnalyzer(object): if not (is_terminal(sym) or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) - self.init_state = self.expand_rule('$root') + self.start_state = self.expand_rule('$root') self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index e763b08..3f2d30f 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -14,7 +14,41 @@ from ..common import GrammarError, is_terminal from .grammar_analysis import GrammarAnalyzer -ACTION_SHIFT = 0 +class Action: + def __str__(self): + return self.__name__ + def __repr__(self): + return str(self) + +class Shift(Action): pass +class Reduce(Action): pass + +class ParseTable: + def __init__(self, states, start_state, end_state): + self.states = states + self.start_state = start_state + self.end_state = end_state + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_state = state_to_idx[parse_table.start_state] + end_state = state_to_idx[parse_table.end_state] + return cls(int_states, start_state, end_state) + + + class LALR_Analyzer(GrammarAnalyzer): @@ -27,7 +61,7 @@ class LALR_Analyzer(GrammarAnalyzer): sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) for rp in sat: for term in self.FOLLOW.get(rp.rule.origin, ()): - lookahead[term].append(('reduce', rp.rule)) + lookahead[term].append((Reduce, rp.rule)) d = classify(unsat, lambda rp: rp.next) for sym, rps in d.items(): @@ -38,7 +72,7 @@ class LALR_Analyzer(GrammarAnalyzer): rps |= self.expand_rule(rp.next) new_state = fzset(rps) - lookahead[sym].append(('shift', new_state)) + lookahead[sym].append((Shift, new_state)) if sym == '$end': self.end_states.append( new_state ) yield fzset(rps) @@ -50,7 +84,7 @@ class LALR_Analyzer(GrammarAnalyzer): for x in v: # XXX resolving shift/reduce into shift, like PLY # Give a proper warning - if x[0] == 'shift': + if x[0] is Shift: lookahead[k] = [x] 
for k, v in lookahead.items(): @@ -59,22 +93,15 @@ class LALR_Analyzer(GrammarAnalyzer): self.states[state] = {k:v[0] for k, v in lookahead.items()} - for _ in bfs([self.init_state], step): + for _ in bfs([self.start_state], step): pass self.end_state ,= self.end_states - # -- - self.enum = list(self.states) - self.enum_rev = {s:i for i,s in enumerate(self.enum)} - self.states_idx = {} - - for s, la in self.states.items(): - la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' - else (v[0], (v[1], len(v[1].expansion))) # Reduce - for k,v in la.items()} - self.states_idx[ self.enum_rev[s] ] = la + self._parse_table = ParseTable(self.states, self.start_state, self.end_state) + if self.debug: + self.parse_table = self._parse_table + else: + self.parse_table = IntParseTable.from_ParseTable(self._parse_table) - self.init_state_idx = self.enum_rev[self.init_state] - self.end_state_idx = self.enum_rev[self.end_state] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f224bec..c913661 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,7 +5,7 @@ from ..common import ParseError, UnexpectedToken -from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT +from .lalr_analysis import LALR_Analyzer, Shift class FinalReduce: def __init__(self, value): @@ -19,14 +19,14 @@ class Parser: callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} - self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) + self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse class _Parser: - def __init__(self, states, init_state, end_state, callbacks): - self.states = states - self.init_state = init_state - self.end_state = end_state + def __init__(self, parse_table, callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state self.callbacks = callbacks def parse(self, seq, set_state=None): @@ -35,10 +35,10 @@ class _Parser: stream = iter(seq) states = self.states - state_stack = [self.init_state] + state_stack = [self.start_state] value_stack = [] - if set_state: set_state(self.init_state) + if set_state: set_state(self.start_state) def get_action(key): state = state_stack[-1] @@ -49,7 +49,8 @@ class _Parser: raise UnexpectedToken(token, expected, seq, i) - def reduce(rule, size): + def reduce(rule): + size = len(rule.expansion) if size: s = value_stack[-size:] del state_stack[-size:] @@ -60,7 +61,7 @@ class _Parser: value = self.callbacks[rule](s) _action, new_state = get_action(rule.origin) - assert _action == ACTION_SHIFT + assert _action is Shift state_stack.append(new_state) value_stack.append(value) @@ -72,22 +73,22 @@ class _Parser: action, arg = get_action(token.type) assert arg != self.end_state - if action == ACTION_SHIFT: + if action is Shift: state_stack.append(arg) value_stack.append(token) if set_state: set_state(arg) token = next(stream) i += 1 else: - reduce(*arg) + reduce(arg) except StopIteration: pass while True: _action, arg = get_action('$end') - if _action == ACTION_SHIFT: + if _action is Shift: assert arg == self.end_state val ,= value_stack return val else: - reduce(*arg) + reduce(arg) From 1cc4c965e87e89b686c0f8c7b01349bdcf3e8ddb Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 00:50:40 +0200 Subject: [PATCH 02/21] Big Refactor: Grammars now build in half the time. Code shorter & cleaner. 
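For orientation, here is a minimal sketch of how the refactored pieces fit
together at this point in the series: rule definitions become plain Rule
objects (lark/grammar.py, added below) instead of (origin, expansion, alias,
options) tuples, and the LALR analyzer from PATCH 01 turns them into a
ParseTable. The toy grammar below is illustrative only — it is not part of
the library or its tests — and PATCH 03 later changes the analyzers to take
a ParserConf rather than a bare rule list.

    from lark.grammar import Rule
    from lark.parsers.lalr_analysis import LALR_Analyzer, Shift

    # Toy grammar:  a : A a | A    ('A' is a terminal because it is uppercase)
    rules = [
        Rule('a', ['A', 'a']),
        Rule('a', ['A']),
    ]

    analyzer = LALR_Analyzer(rules, 'a')    # signature as of this patch
    analyzer.compute_lookahead()

    table = analyzer.parse_table            # IntParseTable(states, start_state, end_state)
    action, _next_state = table.states[table.start_state]['A']
    assert action is Shift                  # the start state shifts on 'A'
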
--- lark/common.py | 27 +------------ lark/grammar.py | 16 ++++++++ lark/lark.py | 1 + lark/lexer.py | 2 +- lark/parse_tree_builder.py | 5 ++- lark/parser_frontends.py | 67 ++++++++++++++++---------------- lark/parsers/earley.py | 24 ++++++------ lark/parsers/grammar_analysis.py | 43 ++++++++------------ lark/parsers/lalr_analysis.py | 2 +- lark/parsers/lalr_parser.py | 5 ++- lark/parsers/xearley.py | 30 +++++++------- 11 files changed, 104 insertions(+), 118 deletions(-) create mode 100644 lark/grammar.py diff --git a/lark/common.py b/lark/common.py index 55e9d28..800aa4f 100644 --- a/lark/common.py +++ b/lark/common.py @@ -33,7 +33,7 @@ class UnexpectedToken(ParseError): def is_terminal(sym): - return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' + return sym.isupper() class LexerConf: @@ -44,7 +44,6 @@ class LexerConf: class ParserConf: def __init__(self, rules, callback, start): - assert all(len(r) == 4 for r in rules) self.rules = rules self.callback = callback self.start = start @@ -108,27 +107,3 @@ class TokenDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - -class Terminal: - def __init__(self, data): - self.data = data - - def __repr__(self): - return '%r' % self.data - - def __eq__(self, other): - return isinstance(other, type(self)) and self.data == other.data - def __hash__(self): - return hash(self.data) - - -class Terminal_Regexp(Terminal): - def __init__(self, name, regexp): - Terminal.__init__(self, regexp) - self.name = name - self.match = re.compile(regexp).match - -class Terminal_Token(Terminal): - def match(self, other): - return self.data == other.type - diff --git a/lark/grammar.py b/lark/grammar.py new file mode 100644 index 0000000..281c21c --- /dev/null +++ b/lark/grammar.py @@ -0,0 +1,16 @@ + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __repr__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + diff --git a/lark/lark.py b/lark/lark.py index d8ee186..03bd253 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -171,6 +171,7 @@ class Lark: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) + parser_conf = ParserConf(rules, callback, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) diff --git a/lark/lexer.py b/lark/lexer.py index 2741af0..66923b0 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -204,7 +204,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] + state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] lexer = Lexer(state_tokens, ignore=ignore) lexer_by_tokens[key] = lexer diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 975121d..497af55 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,6 +1,7 @@ from .common import is_terminal, GrammarError from .utils import suppress from .lexer import Token +from .grammar import Rule class NodeBuilder: def __init__(self, tree_class, name): @@ -27,7 +28,7 @@ class Factory: def __call__(self, node_builder): return 
self.cls(node_builder, *self.args) - + class TokenWrapper: "Used for fixing the results of scanless parsing" @@ -151,6 +152,6 @@ class ParseTreeBuilder: raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) setattr(callback, callback_name, f) - new_rules.append(( origin, expansion, callback_name, options )) + new_rules.append( Rule( origin, expansion, callback_name, options )) return new_rules, callback diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index ad5017b..228640f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -3,7 +3,7 @@ import sre_parse from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token +from .common import is_terminal, GrammarError, ParserConf from .parsers import lalr_parser, earley, xearley, resolve_ambig class WithLexer: @@ -70,25 +70,26 @@ def tokenize_text(text): class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): - self.token_by_name = {t.name:t for t in lexer_conf.tokens} - - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] + self._prepare_match(lexer_conf) - self.parser = earley.Parser(rules, + self.parser = earley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) - yield Terminal_Regexp(sym, regexp) - else: - yield sym + + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + width = sre_parse.parse(regexp).getwidth() + if width != (1,1): + raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) + self.regexps[t.name] = re.compile(regexp) def parse(self, text): new_text = tokenize_text(text) @@ -98,15 +99,14 @@ class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): WithLexer.__init__(self, lexer_conf) - rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] - - self.parser = earley.Parser(rules, + self.parser = earley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] + def match(self, term, token): + return term == token.type def parse(self, text): tokens = self.lex(text) @@ -117,27 +117,26 @@ class XEarley: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] - - ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] + self._prepare_match(lexer_conf) - self.parser = xearley.Parser(rules, + self.parser = xearley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, 
resolve_ambiguity=get_ambiguity_resolver(options), - ignore=ignore, + ignore=lexer_conf.ignore, predict_all=options.earley__predict_all ) - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - assert width - yield Terminal_Regexp(sym, regexp) - else: - yield sym + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + assert sre_parse.parse(regexp).getwidth() + self.regexps[t.name] = re.compile(regexp) def parse(self, text): return self.parser.parse(text) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 55893f5..e6a914d 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,13 +13,13 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer class EndToken: - type = '$end' + type = '$END' class Derivation(Tree): _hash = None @@ -135,7 +135,7 @@ class Column: self.completed[item_key] = item self.to_reduce.append(item) else: - if isinstance(item.expect, Terminal): + if is_terminal(item.expect): self.to_scan.append(item) else: k = item_key if self.predict_all else item @@ -152,7 +152,7 @@ class Column: __nonzero__ = __bool__ # Py2 backwards-compatibility class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): + def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -161,12 +161,13 @@ class Parser: self.predictions = {} self.FIRST = {} for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + a = rule.alias + self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): @@ -174,9 +175,10 @@ class Parser: start_symbol = start_symbol or self.start_symbol _Item = Item + match = self.term_matcher def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -203,7 +205,7 @@ class Parser: def scan(i, token, column): next_set = Column(i, self.FIRST) - next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) + next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) if not next_set: expect = {i.expect for i in column.to_scan} diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 391e3dd..7390c58 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ 
-1,20 +1,8 @@ from ..utils import bfs, fzset from ..common import GrammarError, is_terminal +from ..grammar import Rule -class Rule(object): - """ - origin : a symbol - expansion : a list of symbols - """ - def __init__(self, origin, expansion, alias=None, options=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.options = options - - def __repr__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) class RulePtr(object): def __init__(self, rule, index): @@ -106,28 +94,29 @@ def calculate_sets(rules): class GrammarAnalyzer(object): - def __init__(self, rule_tuples, start_symbol, debug=False): + def __init__(self, rules, start_symbol, debug=False): + assert len(rules) == len(set(rules)) + self.start_symbol = start_symbol self.debug = debug - rule_tuples = list(rule_tuples) - rule_tuples.append(('$root', [start_symbol, '$end'])) - rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] - - self.rules = set() - self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} - for origin, exp, alias, options in rule_tuples: - r = Rule( origin, exp, alias, options ) - self.rules.add(r) - self.rules_by_origin[origin].append(r) - - for r in self.rules: + + root_rule = Rule('$root', [start_symbol, '$END']) + + self.rules_by_origin = {r.origin: [] for r in rules} + for r in rules: + self.rules_by_origin[r.origin].append(r) + + self.rules_by_origin[root_rule.origin] = [root_rule] + + for r in rules: for sym in r.expansion: if not (is_terminal(sym) or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) self.start_state = self.expand_rule('$root') + self.rules = rules - self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) def expand_rule(self, rule): "Returns all init_ptrs accessible by rule (recursive)" diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 3f2d30f..2c9e8a4 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -73,7 +73,7 @@ class LALR_Analyzer(GrammarAnalyzer): new_state = fzset(rps) lookahead[sym].append((Shift, new_state)) - if sym == '$end': + if sym == '$END': self.end_states.append( new_state ) yield fzset(rps) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index c913661..237619d 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -13,7 +13,8 @@ class FinalReduce: class Parser: def __init__(self, parser_conf): - assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" + assert all(r.options is None or r.options.priority is None + for r in parser_conf.rules), "LALR doesn't yet support prioritization" self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) analysis.compute_lookahead() callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) @@ -85,7 +86,7 @@ class _Parser: pass while True: - _action, arg = get_action('$end') + _action, arg = get_action('$END') if _action is Shift: assert arg == self.end_state val ,= value_stack diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 9b26190..055b26e 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -20,7 +20,7 @@ from collections import defaultdict -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal 
from ..lexer import Token, UnexpectedInput from ..tree import Tree from .grammar_analysis import GrammarAnalyzer @@ -28,7 +28,7 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): + def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -41,24 +41,26 @@ class Parser: self.FIRST = {} for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + a = rule.alias + self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = start_symbol or self.start_symbol delayed_matches = defaultdict(list) + match = self.term_matcher text_line = 1 text_column = 0 def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -86,7 +88,7 @@ class Parser: to_scan = column.to_scan for x in self.ignore: - m = x.match(stream, i) + m = match(x, stream, i) if m: delayed_matches[m.end()] += set(to_scan) delayed_matches[m.end()] += set(column.to_reduce) @@ -99,16 +101,16 @@ class Parser: # delayed_matches[m.end()] += to_scan for item in to_scan: - m = item.expect.match(stream, i) + m = match(item.expect, stream, i) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): - m = item.expect.match(s[:-j]) + m = match(item.expect, s[:-j]) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) @@ -143,7 +145,7 @@ class Parser: if n.rule.origin==start_symbol and n.start is column0] if not solutions: - expected_tokens = [t.expect.name for t in column.to_scan] + expected_tokens = [t.expect for t in column.to_scan] raise ParseError('Unexpected end of input! 
Expecting a terminal of: %s' % expected_tokens) elif len(solutions) == 1: From 39e58cb8fdb5bec30d8b44514fd75f0c70c86d10 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 11:15:30 +0200 Subject: [PATCH 03/21] Post-refactor cleanup --- lark/parser_frontends.py | 56 ++++++++++++-------------------- lark/parsers/earley.py | 53 ++++++------------------------ lark/parsers/grammar_analysis.py | 7 ++-- lark/parsers/lalr_parser.py | 2 +- lark/parsers/xearley.py | 24 +++++--------- 5 files changed, 45 insertions(+), 97 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 228640f..e8e7ab8 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,10 +7,16 @@ from .common import is_terminal, GrammarError, ParserConf from .parsers import lalr_parser, earley, xearley, resolve_ambig class WithLexer: - def __init__(self, lexer_conf): + def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) + def init_contextual_lexer(self, lexer_conf, parser_conf): + self.lexer_conf = lexer_conf + d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} + always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () + self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + def lex(self, text): stream = self.lexer.lex(text) if self.lexer_conf.postlex: @@ -21,32 +27,22 @@ class WithLexer: class LALR(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) - - self.parser_conf = parser_conf self.parser = lalr_parser.Parser(parser_conf) + self.init_traditional_lexer(lexer_conf) def parse(self, text): - tokens = self.lex(text) - return self.parser.parse(tokens) + token_stream = self.lex(text) + return self.parser.parse(token_stream) -class LALR_ContextualLexer: +class LALR_ContextualLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - self.lexer_conf = lexer_conf - self.parser_conf = parser_conf - self.parser = lalr_parser.Parser(parser_conf) - - d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} - always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + self.init_contextual_lexer(lexer_conf, parser_conf) def parse(self, text): - tokens = self.lexer.lex(text) - if self.lexer_conf.postlex: - tokens = self.lexer_conf.postlex.process(tokens) - return self.parser.parse(tokens, self.lexer.set_parser_state) + token_stream = self.lex(text) + return self.parser.parse(token_stream, self.lexer.set_parser_state) def get_ambiguity_resolver(options): if not options or options.ambiguity == 'resolve': @@ -58,24 +54,19 @@ def get_ambiguity_resolver(options): raise ValueError(options) def tokenize_text(text): - new_text = [] line = 1 col_start_pos = 0 for i, ch in enumerate(text): if '\n' in ch: line += ch.count('\n') col_start_pos = i + ch.rindex('\n') - new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) - return new_text + yield Token('CHAR', ch, line=line, column=i - col_start_pos) class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): self._prepare_match(lexer_conf) - self.parser = earley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, - self.match, + self.parser = earley.Parser(parser_conf, 
self.match, resolve_ambiguity=get_ambiguity_resolver(options)) @@ -92,17 +83,14 @@ class Earley_NoLex: self.regexps[t.name] = re.compile(regexp) def parse(self, text): - new_text = tokenize_text(text) - return self.parser.parse(new_text) + token_stream = tokenize_text(text) + return self.parser.parse(token_stream) class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) + self.init_traditional_lexer(lexer_conf) - self.parser = earley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, - self.match, + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options)) def match(self, term, token): @@ -119,9 +107,7 @@ class XEarley: self._prepare_match(lexer_conf) - self.parser = xearley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, + self.parser = xearley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options), ignore=lexer_conf.ignore, diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index e6a914d..62d3e15 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -18,9 +18,6 @@ from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer -class EndToken: - type = '$END' - class Derivation(Tree): _hash = None @@ -36,8 +33,6 @@ class Derivation(Tree): self._hash = Tree.__hash__(self) return self._hash -END_TOKEN = EndToken() - class Item(object): "An Earley Item, the atom of the algorithm." @@ -60,11 +55,8 @@ class Item(object): new_tree = Derivation(self.rule, self.tree.children + [tree]) return self.__class__(self.rule, self.ptr+1, self.start, new_tree) - def similar(self, other): - return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule - def __eq__(self, other): - return self.similar(other) #and (self.tree == other.tree) + return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule def __hash__(self): return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ @@ -152,27 +144,24 @@ class Column: __nonzero__ = __bool__ # Py2 backwards-compatibility class Parser: - def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - for rule in self.analysis.rules: - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] - self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start _Item = Item match = self.term_matcher @@ -198,9 +187,8 @@ class Parser: for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! 
(rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): @@ -252,24 +240,3 @@ class ApplyCallbacks(Transformer_NoRecurse): return callback(children) else: return Tree(rule.origin, children) - -# RULES = [ -# ('a', ['d']), -# ('d', ['b']), -# ('b', ['C']), -# ('b', ['b', 'C']), -# ('b', ['C', 'b']), -# ] -# p = Parser(RULES, 'a') -# for x in p.parse('CC'): -# print x.pretty() - -#--------------- -# RULES = [ -# ('s', ['a', 'a']), -# ('a', ['b', 'b']), -# ('b', ['C'], lambda (x,): x), -# ('b', ['b', 'C']), -# ] -# p = Parser(RULES, 's', {}) -# print p.parse('CCCCC').pretty() diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 7390c58..a8c7757 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -94,13 +94,14 @@ def calculate_sets(rules): class GrammarAnalyzer(object): - def __init__(self, rules, start_symbol, debug=False): + def __init__(self, parser_conf, debug=False): + rules = parser_conf.rules assert len(rules) == len(set(rules)) - self.start_symbol = start_symbol + self.start_symbol = parser_conf.start self.debug = debug - root_rule = Rule('$root', [start_symbol, '$END']) + root_rule = Rule('$root', [self.start_symbol, '$END']) self.rules_by_origin = {r.origin: [] for r in rules} for r in rules: diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 237619d..bc45d4e 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -15,7 +15,7 @@ class Parser: def __init__(self, parser_conf): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" - self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) + self.analysis = analysis = LALR_Analyzer(parser_conf) analysis.compute_lookahead() callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 055b26e..3cc67f3 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -28,31 +28,26 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity self.ignore = list(ignore) self.predict_all = predict_all - + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - - for rule in self.analysis.rules: - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] - self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start delayed_matches = defaultdict(list) 
match = self.term_matcher @@ -79,9 +74,8 @@ class Parser: column.add( predict(nonterm, column) ) for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): From 38c5fd244ab5c36a692a53ee9bf40881c60b5ac3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 17:20:07 +0200 Subject: [PATCH 04/21] Improved grammar validation and refactored the lexers --- lark/lexer.py | 114 +++++++++++++++++-------------------- lark/load_grammar.py | 6 +- lark/parse_tree_builder.py | 2 +- lark/parsers/xearley.py | 2 +- tests/test_parser.py | 41 +++++++++++-- 5 files changed, 95 insertions(+), 70 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 66923b0..ba920c6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) -class Lexer(object): +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculat the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + + + +class Lexer: def __init__(self, tokens, ignore=()): assert all(isinstance(t, TokenDef) for t in tokens), tokens self.ignore = ignore - self.newline_char = '\n' tokens = list(tokens) # Sanitization @@ -129,10 +152,7 @@ class Lexer(object): if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) - token_names = {t.name for t in tokens} - for t in ignore: - if t not in token_names: - raise LexError("Token '%s' was marked to ignore but it is not defined!" 
% t) + assert set(ignore) <= {t.name for t in tokens} # Init self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] @@ -147,46 +167,8 @@ class Lexer(object): self.mres = build_mres(tokens) - def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.newline_types) - ignore_types = list(self.ignore_types) - while True: - for mre, type_from_index in self.mres: - m = mre.match(stream, lex_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - to_yield = type_ not in ignore_types - - if to_yield: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) - end_col = t.column + len(value) - if t.type in self.callback: - t = self.callback[t.type](t) - - if type_ in newline_types: - newlines = value.count(self.newline_char) - if newlines: - line += newlines - last_newline_index = value.rindex(self.newline_char) + 1 - col_start_pos = lex_pos + last_newline_index - end_col = len(value) - last_newline_index - - if to_yield: - t.end_line = line - t.end_col = end_col - yield t - - lex_pos += len(value) - break - else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) - break + return _Lex(self).lex(stream, self.newline_types, self.ignore_types) class ContextualLexer: @@ -218,33 +200,39 @@ class ContextualLexer: self.parser_state = state def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.root_lexer.newline_types) - ignore_types = list(self.root_lexer.ignore_types) + l = _Lex(self.lexers[self.parser_state]) + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + while True: - lexer = self.lexers[self.parser_state] + lexer = self.lexer for mre, type_from_index in lexer.mres: - m = mre.match(stream, lex_pos) + m = mre.match(stream, line_ctr.char_pos) if m: value = m.group(0) type_ = type_from_index[m.lastindex] if type_ not in ignore_types: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) if t.type in lexer.callback: t = lexer.callback[t.type](t) - yield t - - if type_ in newline_types: - newlines = value.count(lexer.newline_char) - if newlines: - line += newlines - col_start_pos = lex_pos + value.rindex(lexer.newline_char) - lex_pos += len(value) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) break else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 72e2e22..7726845 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -411,6 +411,7 @@ class Grammar: terms_to_ignore = {name:'__'+name for name in self.ignore} if terms_to_ignore: assert set(terms_to_ignore) <= {name for name, _t in term_defs} + term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] expr = Token('RULE', '__ignore') for r, tree, _o in rule_defs: @@ -562,6 +563,7 @@ class GrammarLoader: d = {r: 
([(x.split(), None) for x in xs], o) for r, xs, o in rules} rules, callback = ParseTreeBuilder(d, T).apply() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) + parser_conf = ParserConf(rules, callback, 'start') self.parser = LALR(lexer_conf, parser_conf) @@ -636,7 +638,6 @@ class GrammarLoader: ignore_names.append(name) token_defs.append((name, (t, 0))) - # Verify correctness 2 token_names = set() for name, _ in token_defs: @@ -644,6 +645,9 @@ class GrammarLoader: raise GrammarError("Token '%s' defined more than once" % name) token_names.add(name) + if set(ignore_names) > token_names: + raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) + # Resolve token references resolve_token_references(token_defs) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 497af55..e26d287 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -121,7 +121,7 @@ class ParseTreeBuilder: for expansion, alias in expansions: if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) wrapper_chain = filter(None, [ (expand1 and not alias) and Expand1, diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 3cc67f3..420c469 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -127,7 +127,7 @@ class Parser: if token == '\n': text_line += 1 - text_column = 1 + text_column = 0 else: text_column += 1 diff --git a/tests/test_parser.py b/tests/test_parser.py index d93e33b..db28834 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("x")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b : "x" @@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("xx")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b -> c b : "x" """, parser='lalr', transformer=T()) r = g.parse("xx") self.assertEqual( r.children, [""] ) - + @@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_undefined_ignore(self): + g = """!start: "A" + + %ignore B + """ + self.assertRaises( GrammarError, _Lark, g) + + @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO + def test_line_and_column(self): + g = r"""!start: "A" bc "D" + !bc: "B\nC" + """ + l = _Lark(g) + a, bc, d = l.parse("AB\nCD").children + self.assertEqual(a.line, 1) + self.assertEqual(a.column, 0) + + bc ,= bc.children + self.assertEqual(bc.line, 1) + self.assertEqual(bc.column, 1) + + self.assertEqual(d.line, 2) + self.assertEqual(d.column, 1) + + # self.assertEqual(a.end_line, 1) + # self.assertEqual(a.end_col, 1) + # self.assertEqual(bc.end_line, 2) + # self.assertEqual(bc.end_col, 1) + # self.assertEqual(d.end_line, 2) + # self.assertEqual(d.end_col, 2) + + + def test_reduce_cycle(self): """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. @@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER): parser = _Lark(grammar) - tree = parser.parse("int 1 ! 
This is a comment\n") + tree = parser.parse("int 1 ! This is a comment\n") self.assertEqual(tree.children, ['1']) tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! From 7182ba399136bf2c0f1f74d6652e60ffeb55d448 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 22:33:37 +0200 Subject: [PATCH 05/21] Minor refactoring for the standalone tool (in progress) --- lark/grammar.py | 38 ++++++++++++++++++++++++- lark/lexer.py | 52 +++++++++++++++++------------------ lark/load_grammar.py | 28 +------------------ lark/parsers/lalr_analysis.py | 8 ++++-- lark/parsers/lalr_parser.py | 5 +--- 5 files changed, 70 insertions(+), 61 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 281c21c..f853182 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -10,7 +10,43 @@ class Rule(object): self.alias = alias self.options = options - def __repr__(self): + def __str__(self): return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + @classmethod + def from_rule(cls, name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, expansions, cls(keep_all_tokens, expand1, priority=priority) + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) diff --git a/lark/lexer.py b/lark/lexer.py index ba920c6..5ca77de 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -111,35 +111,11 @@ def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) -class LineCounter: - def __init__(self): - self.newline_char = '\n' - self.char_pos = 0 - self.line = 1 - self.column = 0 - self.line_start_pos = 0 - - def feed(self, token, test_newline=True): - """Consume a token and calculat the new line & column. - - As an optional optimization, set test_newline=False is token doesn't contain a newline. 
- """ - if test_newline: - newlines = token.count(self.newline_char) - if newlines: - self.line += newlines - self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 - - self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos - - class Lexer: def __init__(self, tokens, ignore=()): assert all(isinstance(t, TokenDef) for t in tokens), tokens - self.ignore = ignore tokens = list(tokens) # Sanitization @@ -156,7 +132,7 @@ class Lexer: # Init self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = [t for t in ignore] + self.ignore_types = list(ignore) tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) @@ -206,6 +182,30 @@ class ContextualLexer: l.lexer = self.lexers[self.parser_state] +###{lexer + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + class _Lex: "Built to serve both Lexer and ContextualLexer" def __init__(self, lexer): @@ -235,4 +235,4 @@ class _Lex: if line_ctr.char_pos < len(stream): raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break - +###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 7726845..ce4ec5a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef +from .grammar import RuleOptions from .tree import Tree as T, Transformer, InlineTransformer, Visitor @@ -494,33 +495,6 @@ class Grammar: -class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing - self.priority = priority - - self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless - @classmethod - def from_rule(cls, name, *x): - if len(x) > 1: - priority, expansions = x - priority = int(priority) - else: - expansions ,= x - priority = None - - keep_all_tokens = name.startswith('!') - name = name.lstrip('!') - expand1 = name.startswith('?') - name = name.lstrip('?') - - return name, expansions, cls(keep_all_tokens, expand1, priority=priority) - - - _imported_grammars = {} def import_grammar(grammar_path): if grammar_path not in _imported_grammars: diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 2c9e8a4..6eb3fdf 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -15,13 +15,15 @@ from ..common import GrammarError, is_terminal from .grammar_analysis import GrammarAnalyzer class Action: + def __init__(self, name): + self.name = name def __str__(self): - return self.__name__ + return self.name def __repr__(self): 
return str(self) -class Shift(Action): pass -class Reduce(Action): pass +Shift = Action('Shift') +Reduce = Action('Reduce') class ParseTable: def __init__(self, states, start_state, end_state): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index bc45d4e..b093990 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -7,10 +7,6 @@ from ..common import ParseError, UnexpectedToken from .lalr_analysis import LALR_Analyzer, Shift -class FinalReduce: - def __init__(self, value): - self.value = value - class Parser: def __init__(self, parser_conf): assert all(r.options is None or r.options.priority is None @@ -20,6 +16,7 @@ class Parser: callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} + self.parser_conf = parser_conf self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse From e072d91760b4acfb7773389f44c001b66b6221f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 17:00:53 +0200 Subject: [PATCH 06/21] Updated README --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 542977f..62a645f 100644 --- a/README.md +++ b/README.md @@ -66,8 +66,8 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) - Builds a parse-tree (AST) automagically, based on the structure of the grammar - **Earley** parser - - Can parse *ALL* context-free grammars - - Full support for ambiguity in grammar + - Can parse all context-free grammars + - Full support for ambiguous grammars - **LALR(1)** parser - Competitive with PLY - **EBNF** grammar @@ -86,7 +86,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/ #### Performance comparison -Lower is better! +Lark is the fastest and lightest (lower is better) ![Run-time Comparison](docs/comparison_runtime.png) @@ -99,14 +99,14 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison -| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? +| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | |:--------|:----------|:----|:--------|:------------|:------------ -| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | -| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | -| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | -| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | -| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | -| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | +| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! 
| +| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | +| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | +| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | +| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | +| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | (\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) From 401833536888289e3215346d275cae6aa5d5dc15 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 17:02:06 +0200 Subject: [PATCH 07/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62a645f..57ac62c 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison | Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | -|:--------|:----------|:----|:--------|:------------|:------------ +|:--------|:----------|:----|:--------|:------------|:------------|:---------- | **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | | [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | From 07b5469e8616567d85108dd237e3065b8c1e87c2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 19:16:07 +0200 Subject: [PATCH 08/21] More refactoring, untangling grammar compilation and parse-tree creation --- lark/grammar.py | 15 ----------- lark/lark.py | 5 ++-- lark/load_grammar.py | 40 +++++++++++++++++++++++------- lark/parse_tree_builder.py | 51 ++++++++++++++++++-------------------- 4 files changed, 58 insertions(+), 53 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index f853182..d257bc4 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -26,21 +26,6 @@ class RuleOptions: self.filter_out = filter_out # remove this rule from the tree # used for "token"-rules in scanless - @classmethod - def from_rule(cls, name, *x): - if len(x) > 1: - priority, expansions = x - priority = int(priority) - else: - expansions ,= x - priority = None - - keep_all_tokens = name.startswith('!') - name = name.lstrip('!') - expand1 = name.startswith('?') - name = name.lstrip('?') - - return name, expansions, cls(keep_all_tokens, expand1, priority=priority) def __repr__(self): return 'RuleOptions(%r, %r, %r, %r, %r)' % ( diff --git a/lark/lark.py b/lark/lark.py index 03bd253..a7af772 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -165,14 +165,15 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) + self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) - rules, callback = self.parse_tree_builder.apply(self.options.transformer) + callback = self.parse_tree_builder.apply(self.options.transformer) if self.profiler: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) - parser_conf = ParserConf(rules, callback, self.options.start) + parser_conf = ParserConf(self.rules, callback, self.options.start) return 
self.parser_class(self.lexer_conf, parser_conf, options=self.options) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ce4ec5a..b38a67c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions +from .grammar import RuleOptions, Rule from .tree import Tree as T, Transformer, InlineTransformer, Visitor @@ -485,13 +485,21 @@ class Grammar: dict_update_safe(rules, ebnf_to_bnf.new_rules) - for tree, _o in rules.values(): + rule_tree_to_text = RuleTreeToText() + + new_rules = [] + for origin, (tree, options) in rules.items(): simplify_rule.visit(tree) + expansions = rule_tree_to_text.transform(tree) - rule_tree_to_text = RuleTreeToText() - rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} + for expansion, alias in expansions: + if alias and origin.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) - return tokens, rules, self.ignore + rule = Rule(origin, expansion, alias, options) + new_rules.append(rule) + + return tokens, new_rules, self.ignore @@ -528,14 +536,28 @@ def resolve_token_references(token_defs): if not changed: break +def options_from_rule(name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] - rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] - d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} - rules, callback = ParseTreeBuilder(d, T).apply() + rules = [options_from_rule(name, x) for name, x in RULES.items()] + rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] + callback = ParseTreeBuilder(rules, T).apply() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, 'start') @@ -625,7 +647,7 @@ class GrammarLoader: # Resolve token references resolve_token_references(token_defs) - rules = [RuleOptions.from_rule(*x) for x in rule_defs] + rules = [options_from_rule(*x) for x in rule_defs] rule_names = set() for name, _x, _o in rules: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index e26d287..4513583 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -109,49 +109,46 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = set() - for origin, (expansions, options) in rules.items(): - if options and options.filter_out: - assert origin.startswith('_') # Just to make sure - filter_out.add(origin) + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) - for origin, (expansions, options) in rules.items(): + for rule in rules: + options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand1 = options.expand1 
if options else False create_token = options.create_token if options else False - for expansion, alias in expansions: - if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) - wrapper_chain = filter(None, [ - (expand1 and not alias) and Expand1, - create_token and Factory(TokenWrapper, create_token), - create_rule_handler(expansion, keep_all_tokens, filter_out), - self.propagate_positions and PropagatePositions, - ]) - - yield origin, expansion, options, alias or origin, wrapper_chain + yield rule, wrapper_chain def apply(self, transformer=None): callback = Callback() - new_rules = [] - for origin, expansion, options, alias, wrapper_chain in self.rule_builders: - callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + user_callback_name = rule.alias or rule.origin try: - f = transformer._get_func(alias) + f = transformer._get_func(user_callback_name) except AttributeError: - f = NodeBuilder(self.tree_class, alias) + f = NodeBuilder(self.tree_class, user_callback_name) + + rule.alias = internal_callback_name for w in wrapper_chain: f = w(f) - if hasattr(callback, callback_name): - raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) - setattr(callback, callback_name, f) - - new_rules.append( Rule( origin, expansion, callback_name, options )) + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) - return new_rules, callback + return callback From da1910f5b67b56528974aa9996abd46a103a37f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 21:08:40 +0200 Subject: [PATCH 09/21] More refactoring towards standalone --- lark/common.py | 9 ++- lark/lark.py | 4 +- lark/lexer.py | 113 ++++++++++++++++++------------------ lark/load_grammar.py | 42 +++++++------- lark/parse_tree_builder.py | 6 +- lark/parsers/lalr_parser.py | 6 +- lark/tree.py | 4 +- 7 files changed, 96 insertions(+), 88 deletions(-) diff --git a/lark/common.py b/lark/common.py index 800aa4f..ff1897a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -4,12 +4,18 @@ import sys Py36 = (sys.version_info[:2] >= (3, 6)) + +###{standalone +def is_terminal(sym): + return sym.isupper() + class GrammarError(Exception): pass class ParseError(Exception): pass +###} class UnexpectedToken(ParseError): def __init__(self, token, expected, seq, index): @@ -32,9 +38,6 @@ class UnexpectedToken(ParseError): -def is_terminal(sym): - return sym.isupper() - class LexerConf: def __init__(self, tokens, ignore=(), postlex=None): diff --git a/lark/lark.py b/lark/lark.py index a7af772..58a6ff7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -166,8 +166,8 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) - callback = self.parse_tree_builder.apply(self.options.transformer) + 
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) + callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): diff --git a/lark/lexer.py b/lark/lexer.py index 5ca77de..4f673f6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -5,6 +5,7 @@ import re from .utils import Str, classify from .common import is_terminal, PatternStr, PatternRE, TokenDef +###{standalone class LexError(Exception): pass @@ -48,10 +49,60 @@ class Token(Str): __hash__ = Str.__hash__ -class Regex: - def __init__(self, pattern, flags=()): - self.pattern = pattern - self.flags = flags + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break +###} + def _regexp_has_newline(r): return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) @@ -182,57 +233,3 @@ class ContextualLexer: l.lexer = self.lexers[self.parser_state] -###{lexer - -class LineCounter: - def __init__(self): - self.newline_char = '\n' - self.char_pos = 0 - self.line = 1 - self.column = 0 - self.line_start_pos = 0 - - def feed(self, token, test_newline=True): - """Consume a token and calculate the new line & column. - - As an optional optimization, set test_newline=False is token doesn't contain a newline. 
- """ - if test_newline: - newlines = token.count(self.newline_char) - if newlines: - self.line += newlines - self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 - - self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos - -class _Lex: - "Built to serve both Lexer and ContextualLexer" - def __init__(self, lexer): - self.lexer = lexer - - def lex(self, stream, newline_types, ignore_types): - newline_types = list(newline_types) - newline_types = list(newline_types) - line_ctr = LineCounter() - - while True: - lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - lexer = yield t - - line_ctr.feed(value, type_ in newline_types) - break - else: - if line_ctr.char_pos < len(stream): - raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) - break -###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py index b38a67c..2086591 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -128,7 +128,7 @@ RULES = { class EBNF_to_BNF(InlineTransformer): def __init__(self): - self.new_rules = {} + self.new_rules = [] self.rules_by_expr = {} self.prefix = 'anon' self.i = 0 @@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 t = Token('RULE', new_name, -1) - self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options + tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) + self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t return t @@ -390,12 +391,6 @@ def _interleave(l, item): def _choice_of_rules(rules): return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) -def dict_update_safe(d1, d2): - for k, v in d2.items(): - assert k not in d1 - d1[k] = v - - class Grammar: def __init__(self, rule_defs, token_defs, ignore): self.token_defs = token_defs @@ -468,38 +463,41 @@ class Grammar: # ================= # Compile Rules # ================= - ebnf_to_bnf = EBNF_to_BNF() - simplify_rule = SimplifyRule_Visitor() + # 1. Pre-process terminals transformer = PrepareLiterals() if not lexer: transformer *= SplitLiterals() transformer *= ExtractAnonTokens(tokens) # Adds to tokens - rules = {} + # 2. Convert EBNF to BNF (and apply step 1) + ebnf_to_bnf = EBNF_to_BNF() + rules = [] for name, rule_tree, options in rule_defs: - assert name not in rules, name ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None tree = transformer.transform(rule_tree) - rules[name] = ebnf_to_bnf.transform(tree), options + rules.append((name, ebnf_to_bnf.transform(tree), options)) + rules += ebnf_to_bnf.new_rules - dict_update_safe(rules, ebnf_to_bnf.new_rules) + assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" + # 3. 
Compile tree to Rule objects rule_tree_to_text = RuleTreeToText() - new_rules = [] - for origin, (tree, options) in rules.items(): + simplify_rule = SimplifyRule_Visitor() + compiled_rules = [] + for name, tree, options in rules: simplify_rule.visit(tree) expansions = rule_tree_to_text.transform(tree) for expansion, alias in expansions: - if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + if alias and name.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - rule = Rule(origin, expansion, alias, options) - new_rules.append(rule) + rule = Rule(name, expansion, alias, options) + compiled_rules.append(rule) - return tokens, new_rules, self.ignore + return tokens, compiled_rules, self.ignore @@ -557,7 +555,7 @@ class GrammarLoader: rules = [options_from_rule(name, x) for name, x in RULES.items()] rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] - callback = ParseTreeBuilder(rules, T).apply() + callback = ParseTreeBuilder(rules, T).create_callback() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, 'start') diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 4513583..f960931 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -3,6 +3,8 @@ from .utils import suppress from .lexer import Token from .grammar import Rule +###{standalone + class NodeBuilder: def __init__(self, tree_class, name): self.tree_class = tree_class @@ -130,7 +132,7 @@ class ParseTreeBuilder: yield rule, wrapper_chain - def apply(self, transformer=None): + def create_callback(self, transformer=None): callback = Callback() for rule, wrapper_chain in self.rule_builders: @@ -152,3 +154,5 @@ class ParseTreeBuilder: setattr(callback, internal_callback_name, f) return callback + +###} diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index b093990..eafc4ea 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,7 +3,7 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken +from ..common import UnexpectedToken from .lalr_analysis import LALR_Analyzer, Shift @@ -20,6 +20,8 @@ class Parser: self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse +###{standalone + class _Parser: def __init__(self, parse_table, callbacks): self.states = parse_table.states @@ -90,3 +92,5 @@ class _Parser: return val else: reduce(arg) + +###} diff --git a/lark/tree.py b/lark/tree.py index f832857..1639bb1 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -7,6 +7,7 @@ from copy import deepcopy from .utils import inline_args +###{standalone class Tree(object): def __init__(self, data, children): self.data = data @@ -33,6 +34,7 @@ class Tree(object): def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) +###} def expand_kids_by_index(self, *indices): for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices @@ -138,7 +140,7 @@ class TransformerChain(object): def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) - + class InlineTransformer(Transformer): From 5ac4120b71d9481eccba04e7c9a746c50be38fdc Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 00:50:12 +0200 Subject: [PATCH 
10/21] Stand-alone tool working for LALR+traditional lexer (first commit) --- lark/parse_tree_builder.py | 3 + lark/tools/standalone.py | 184 +++++++++++++++++++++++++++++++++++++ lark/tree.py | 2 + lark/utils.py | 36 ++++---- 4 files changed, 208 insertions(+), 17 deletions(-) create mode 100644 lark/tools/standalone.py diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index f960931..7e52125 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -109,6 +109,8 @@ class ParseTreeBuilder: self.rule_builders = list(self._init_builders(rules)) + self.user_aliases = {} + def _init_builders(self, rules): filter_out = set() for rule in rules: @@ -144,6 +146,7 @@ class ParseTreeBuilder: except AttributeError: f = NodeBuilder(self.tree_class, user_callback_name) + self.user_aliases[rule] = rule.alias rule.alias = internal_callback_name for w in wrapper_chain: diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py new file mode 100644 index 0000000..54dc69a --- /dev/null +++ b/lark/tools/standalone.py @@ -0,0 +1,184 @@ +###{standalone +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . 
+# +# +###} + +import codecs +import sys +import os +from pprint import pprint +from os import path +from collections import defaultdict + +import lark +from lark import Lark + +from ..grammar import Rule + +__dir__ = path.dirname(__file__) +__larkdir__ = path.join(__dir__, path.pardir) + + +EXTRACT_STANDALONE_FILES = [ + 'tools/standalone.py', + 'utils.py', + 'common.py', + 'tree.py', + 'lexer.py', + 'parse_tree_builder.py', + 'parsers/lalr_parser.py', +] + + +def extract_sections(lines): + section = None + text = [] + sections = defaultdict(list) + for l in lines: + if l.startswith('###'): + if l[3] == '{': + section = l[4:].strip() + elif l[3] == '}': + sections[section] += text + section = None + text = [] + else: + raise ValueError(l) + elif section: + text.append(l) + + return {name:''.join(text) for name, text in sections.items()} + +class LexerAtoms: + def __init__(self, lexer): + assert not lexer.callback + self.mres = [(p.pattern,d) for p,d in lexer.mres] + self.newline_types = lexer.newline_types + self.ignore_types = lexer.ignore_types + + def print_python(self): + print('import re') + print('MRES = (') + pprint(self.mres) + print(')') + print('NEWLINE_TYPES = %s' % self.newline_types) + print('IGNORE_TYPES = %s' % self.ignore_types) + print('class LexerRegexps: pass') + print('lexer_regexps = LexerRegexps()') + print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') + print('lexer_regexps.callback = {}') + print('lexer = _Lex(lexer_regexps)') + print('def lex(stream):') + print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') + + +class GetRule: + def __init__(self, rule_id): + self.rule_id = rule_id + + def __repr__(self): + return 'RULE_ID[%d]' % self.rule_id + + +def get_rule_ids(x): + if isinstance(x, (tuple, list)): + return type(x)(map(get_rule_ids, x)) + elif isinstance(x, dict): + return {get_rule_ids(k):get_rule_ids(v) for k, v in x.items()} + elif isinstance(x, Rule): + return GetRule(id(x)) + return x + +class ParserAtoms: + def __init__(self, parser): + self.parse_table = parser.analysis.parse_table + + def print_python(self): + print('class ParseTable: pass') + print('parse_table = ParseTable()') + print('parse_table.states = (') + pprint(get_rule_ids(self.parse_table.states)) + print(')') + print('parse_table.start_state = %s' % self.parse_table.start_state) + print('parse_table.end_state = %s' % self.parse_table.end_state) + print('class Lark_StandAlone:') + print(' def __init__(self, transformer=None):') + print(' callback = parse_tree_builder.create_callback(transformer=transformer)') + print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') + print(' self.parser = _Parser(parse_table, callbacks)') + print(' def parse(self, stream):') + print(' return self.parser.parse(lex(stream))') + +class TreeBuilderAtoms: + def __init__(self, lark): + self.rules = lark.rules + self.ptb = lark._parse_tree_builder + + def print_python(self): + print('RULE_ID = {') + for r in self.rules: + print(' %d: Rule(%r, %r, %r, %r),' % (id(r), r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print('}') + print('RULES = list(RULE_ID.values())') + print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') + +def main(fn): + with codecs.open(fn, encoding='utf8') as f: + lark_inst = Lark(f, parser="lalr") + + lexer_atoms = LexerAtoms(lark_inst.parser.lexer) + parser_atoms = ParserAtoms(lark_inst.parser.parser) + tree_builder_atoms = TreeBuilderAtoms(lark_inst) + + print('# Generated by Lark v%s' % 
lark.__version__) + + + for pyfile in EXTRACT_STANDALONE_FILES: + print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) + + print(open(os.path.join(__larkdir__, 'grammar.py')).read()) + print('Shift = 0') + print('Reduce = 1') + lexer_atoms.print_python() + tree_builder_atoms.print_python() + parser_atoms.print_python() + + # print('print(parser.parse(lex("1+2")).pretty())') + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Generates a stand-alone lalr parser") + print("Usage: %s " % sys.argv[0]) + sys.exit(1) + + fn ,= sys.argv[1:] + + main(fn) diff --git a/lark/tree.py b/lark/tree.py index 1639bb1..9c8e7da 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -101,6 +101,7 @@ class Tree(object): +###{standalone class Transformer(object): def _get_func(self, name): return getattr(self, name) @@ -197,6 +198,7 @@ class Transformer_NoRecurse(Transformer): def __default__(self, t): return t +###} def pydot__tree_to_png(tree, filename): diff --git a/lark/utils.py b/lark/utils.py index d984400..01c70a1 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,7 +1,4 @@ -import functools -import types from collections import deque -from contextlib import contextmanager class fzset(frozenset): def __repr__(self): @@ -49,8 +46,13 @@ try: except NameError: # Python 3 STRING_TYPE = str -Str = type(u'') +###{standalone +import types +import functools +from contextlib import contextmanager + +Str = type(u'') def inline_args(f): # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) @@ -76,19 +78,6 @@ def inline_args(f): return _f - -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - else: - return -1 - - try: from contextlib import suppress # Python 3 except ImportError: @@ -107,6 +96,19 @@ except ImportError: except excs: pass +###} +try: + compare = cmp +except NameError: + def compare(a, b): + if a == b: + return 0 + elif a > b: + return 1 + else: + return -1 + + From a409f2835c9715be41c37c9426a4e0afa634556c Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 00:56:09 +0200 Subject: [PATCH 11/21] Corrections to the standalone tool --- lark/tools/standalone.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 54dc69a..45bd18d 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -3,6 +3,8 @@ # # Lark Stand-alone Generator Tool # ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# # Git: https://github.com/erezsh/lark # Author: Erez Shinan (erezshin@gmail.com) # @@ -158,8 +160,7 @@ def main(fn): parser_atoms = ParserAtoms(lark_inst.parser.parser) tree_builder_atoms = TreeBuilderAtoms(lark_inst) - print('# Generated by Lark v%s' % lark.__version__) - + print('# The file was automatically generated by Lark v%s' % lark.__version__) for pyfile in EXTRACT_STANDALONE_FILES: print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) @@ -171,12 +172,10 @@ def main(fn): tree_builder_atoms.print_python() parser_atoms.print_python() - # print('print(parser.parse(lex("1+2")).pretty())') - if __name__ == '__main__': if len(sys.argv) < 2: - print("Generates a stand-alone lalr parser") - print("Usage: %s " % sys.argv[0]) + print("Lark Stand-alone Generator Tool") + print("Usage: python -m lark.tools.standalone ") sys.exit(1) fn ,= sys.argv[1:] From 
9b0672fda646c6bbe662e4e51d2d5e3bdc700d77 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 10:28:25 +0200 Subject: [PATCH 12/21] Standalone tools now supports postlex --- lark/common.py | 4 ++-- lark/indenter.py | 3 +++ lark/lexer.py | 25 ++++++++++++------------- lark/tools/standalone.py | 32 +++++++++++++++++++++++--------- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/lark/common.py b/lark/common.py index ff1897a..1717fe7 100644 --- a/lark/common.py +++ b/lark/common.py @@ -15,8 +15,6 @@ class GrammarError(Exception): class ParseError(Exception): pass -###} - class UnexpectedToken(ParseError): def __init__(self, token, expected, seq, index): self.token = token @@ -37,6 +35,8 @@ class UnexpectedToken(ParseError): super(UnexpectedToken, self).__init__(message) +###} + class LexerConf: diff --git a/lark/indenter.py b/lark/indenter.py index a5f107d..34e61a0 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -2,6 +2,7 @@ from .lexer import Token +###{standalone class Indenter: def __init__(self): self.paren_level = 0 @@ -50,3 +51,5 @@ class Indenter: @property def always_accept(self): return (self.NL_type,) + +###} diff --git a/lark/lexer.py b/lark/lexer.py index 4f673f6..844025d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -101,25 +101,23 @@ class _Lex: if line_ctr.char_pos < len(stream): raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break -###} - -def _regexp_has_newline(r): - return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) +class UnlessCallback: + def __init__(self, mres): + self.mres = mres -def _create_unless_callback(strs): - mres = build_mres(strs, match_whole=True) - def unless_callback(t): - # if t in strs: - # t.type = strs[t] - for mre, type_from_index in mres: + def __call__(self, t): + for mre, type_from_index in self.mres: m = mre.match(t.value) if m: value = m.group(0) t.type = type_from_index[m.lastindex] break return t - return unless_callback + +###} + + def _create_unless(tokens): tokens_by_type = classify(tokens, lambda t: type(t.pattern)) @@ -136,7 +134,7 @@ def _create_unless(tokens): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = _create_unless_callback(unless) + callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) tokens = [t for t in tokens if t not in embedded_strs] return tokens, callback @@ -161,7 +159,8 @@ def _build_mres(tokens, max_size, match_whole): def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) - +def _regexp_has_newline(r): + return '\n' in r or '\\n' in r or ('(?s)' in r and '.' 
in r) class Lexer: def __init__(self, tokens, ignore=()): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 45bd18d..7a9f5a2 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -54,6 +54,7 @@ EXTRACT_STANDALONE_FILES = [ 'utils.py', 'common.py', 'tree.py', + 'indenter.py', 'lexer.py', 'parse_tree_builder.py', 'parsers/lalr_parser.py', @@ -81,22 +82,27 @@ def extract_sections(lines): class LexerAtoms: def __init__(self, lexer): - assert not lexer.callback self.mres = [(p.pattern,d) for p,d in lexer.mres] self.newline_types = lexer.newline_types self.ignore_types = lexer.ignore_types + self.callback = {name:[(p.pattern,d) for p,d in c.mres] + for name, c in lexer.callback.items()} def print_python(self): print('import re') print('MRES = (') pprint(self.mres) print(')') + print('LEXER_CALLBACK = (') + pprint(self.callback) + print(')') print('NEWLINE_TYPES = %s' % self.newline_types) print('IGNORE_TYPES = %s' % self.ignore_types) print('class LexerRegexps: pass') print('lexer_regexps = LexerRegexps()') print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') - print('lexer_regexps.callback = {}') + print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') + print(' for n, mres in LEXER_CALLBACK.items()}') print('lexer = _Lex(lexer_regexps)') print('def lex(stream):') print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') @@ -132,12 +138,15 @@ class ParserAtoms: print('parse_table.start_state = %s' % self.parse_table.start_state) print('parse_table.end_state = %s' % self.parse_table.end_state) print('class Lark_StandAlone:') - print(' def __init__(self, transformer=None):') + print(' def __init__(self, transformer=None, postlex=None):') print(' callback = parse_tree_builder.create_callback(transformer=transformer)') print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') print(' self.parser = _Parser(parse_table, callbacks)') + print(' self.postlex = postlex') print(' def parse(self, stream):') - print(' return self.parser.parse(lex(stream))') + print(' tokens = lex(stream)') + print(' if self.postlex: tokens = self.postlex.process(tokens)') + print(' return self.parser.parse(tokens)') class TreeBuilderAtoms: def __init__(self, lark): @@ -152,9 +161,9 @@ class TreeBuilderAtoms: print('RULES = list(RULE_ID.values())') print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') -def main(fn): +def main(fn, start): with codecs.open(fn, encoding='utf8') as f: - lark_inst = Lark(f, parser="lalr") + lark_inst = Lark(f, parser="lalr", start=start) lexer_atoms = LexerAtoms(lark_inst.parser.lexer) parser_atoms = ParserAtoms(lark_inst.parser.parser) @@ -175,9 +184,14 @@ def main(fn): if __name__ == '__main__': if len(sys.argv) < 2: print("Lark Stand-alone Generator Tool") - print("Usage: python -m lark.tools.standalone ") + print("Usage: python -m lark.tools.standalone []") sys.exit(1) - fn ,= sys.argv[1:] + if len(sys.argv) == 3: + fn, start = sys.argv[1:] + elif len(sys.argv) == 2: + fn, start = sys.argv[1], 'start' + else: + assert False, sys.argv - main(fn) + main(fn, start) From e697c266a7cab63ba79e5bc9eb547e59e05f118e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 13:11:15 +0200 Subject: [PATCH 13/21] Standalone: Significantly reduced generated code size --- lark/tools/standalone.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py 
index 7a9f5a2..0444614 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -42,6 +42,7 @@ from collections import defaultdict import lark from lark import Lark +from lark.parsers.lalr_analysis import Shift, Reduce from ..grammar import Rule @@ -113,17 +114,15 @@ class GetRule: self.rule_id = rule_id def __repr__(self): - return 'RULE_ID[%d]' % self.rule_id + return 'RULES[%d]' % self.rule_id +rule_ids = {} +token_types = {} -def get_rule_ids(x): - if isinstance(x, (tuple, list)): - return type(x)(map(get_rule_ids, x)) - elif isinstance(x, dict): - return {get_rule_ids(k):get_rule_ids(v) for k, v in x.items()} - elif isinstance(x, Rule): - return GetRule(id(x)) - return x +def _get_token_type(token_type): + if token_type not in token_types: + token_types[token_type] = len(token_types) + return token_types[token_type] class ParserAtoms: def __init__(self, parser): @@ -132,15 +131,22 @@ class ParserAtoms: def print_python(self): print('class ParseTable: pass') print('parse_table = ParseTable()') - print('parse_table.states = (') - pprint(get_rule_ids(self.parse_table.states)) + print('STATES = {') + for state, actions in self.parse_table.states.items(): + print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()})) + print('}') + print('TOKEN_TYPES = (') + pprint({v:k for k, v in token_types.items()}) print(')') + print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') + print(' for s, acts in STATES.items()}') print('parse_table.start_state = %s' % self.parse_table.start_state) print('parse_table.end_state = %s' % self.parse_table.end_state) print('class Lark_StandAlone:') print(' def __init__(self, transformer=None, postlex=None):') print(' callback = parse_tree_builder.create_callback(transformer=transformer)') - print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') + print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}') print(' self.parser = _Parser(parse_table, callbacks)') print(' self.postlex = postlex') print(' def parse(self, stream):') @@ -154,12 +160,12 @@ class TreeBuilderAtoms: self.ptb = lark._parse_tree_builder def print_python(self): - print('RULE_ID = {') - for r in self.rules: - print(' %d: Rule(%r, %r, %r, %r),' % (id(r), r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print('RULES = {') + for i, r in enumerate(self.rules): + rule_ids[r] = i + print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) print('}') - print('RULES = list(RULE_ID.values())') - print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') + print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fn, start): with codecs.open(fn, encoding='utf8') as f: From f9b02c1f13d9ffe462d0d646dbcec51d7e6b68dc Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 14:45:56 +0200 Subject: [PATCH 14/21] Updated README to mention standalone --- README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 57ac62c..794a203 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Lark can: - Build a parse-tree automagically, no construction code required - Outperform all other Python libraries when using LALR(1) (Yes, including PLY) - Run on every Python interpreter (it's pure-python) + - 
Generate a stand-alone parser (for LALR(1) grammars) And many more features. Read ahead and find out. @@ -69,7 +70,8 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) - Can parse all context-free grammars - Full support for ambiguous grammars - **LALR(1)** parser - - Competitive with PLY + - Fast and light, competitive with PLY + - Can generate a stand-alone parser - **EBNF** grammar - **Unicode** fully supported - **Python 2 & 3** compatible @@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison -| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | -|:--------|:----------|:----|:--------|:------------|:------------|:---------- -| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | -| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | -| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | -| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | -| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | -| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | +| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone +|:--------|:----------|:----|:--------|:------------|:------------|:----------|:---------- +| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) | +| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | +| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | +| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | +| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No | +| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | -(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) +(\* *PEGs cannot handle non-deterministic grammars. 
Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) ### Projects using Lark From 4679a348cea97f633a486a5b14cc32ba59c72f2e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 14:46:25 +0200 Subject: [PATCH 15/21] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 930fa01..1637a75 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError from .lark import Lark from .utils import inline_args -__version__ = "0.5.1" +__version__ = "0.5.2" From 4d219ae837aaf15c6d1c533358683e30abf837c1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 11 Jan 2018 16:02:02 +0200 Subject: [PATCH 16/21] Added standalone example --- examples/standalone/create_standalone.sh | 1 + examples/standalone/json.g | 21 + examples/standalone/json_parser.py | 794 +++++++++++++++++++++++ examples/standalone/json_parser_main.py | 25 + 4 files changed, 841 insertions(+) create mode 100755 examples/standalone/create_standalone.sh create mode 100644 examples/standalone/json.g create mode 100644 examples/standalone/json_parser.py create mode 100644 examples/standalone/json_parser_main.py diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh new file mode 100755 index 0000000..1eba3a4 --- /dev/null +++ b/examples/standalone/create_standalone.sh @@ -0,0 +1 @@ +python -m lark.tools.standalone json.g > json_parser.py diff --git a/examples/standalone/json.g b/examples/standalone/json.g new file mode 100644 index 0000000..243a230 --- /dev/null +++ b/examples/standalone/json.g @@ -0,0 +1,21 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py new file mode 100644 index 0000000..f249f61 --- /dev/null +++ b/examples/standalone/json_parser.py @@ -0,0 +1,794 @@ +# The file was automatically generated by Lark v0.5.2 +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . 
+# +# + + +import types +import functools +from contextlib import contextmanager + +Str = type(u'') + +def inline_args(f): + # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) + if isinstance(f, types.FunctionType): + @functools.wraps(f) + def _f_func(self, args): + return f(self, *args) + return _f_func + elif isinstance(f, (type, types.BuiltinFunctionType)): + @functools.wraps(f) + def _f_builtin(_self, args): + return f(*args) + return _f_builtin + elif isinstance(f, types.MethodType): + @functools.wraps(f.__func__) + def _f(self, args): + return f.__func__(self, *args) + return _f + else: + @functools.wraps(f.__call__.__func__) + def _f(self, args): + return f.__call__.__func__(self, *args) + return _f + + +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + +def is_terminal(sym): + return sym.isupper() + +class GrammarError(Exception): + pass + +class ParseError(Exception): + pass + +class UnexpectedToken(ParseError): + def __init__(self, token, expected, seq, index): + self.token = token + self.expected = expected + self.line = getattr(token, 'line', '?') + self.column = getattr(token, 'column', '?') + + try: + context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) + except AttributeError: + context = seq[index:index+5] + except TypeError: + context = "" + message = ("Unexpected token %r at line %s, column %s.\n" + "Expected: %s\n" + "Context: %s" % (token, self.line, self.column, expected, context)) + + super(UnexpectedToken, self).__init__(message) + + + +class Tree(object): + def __init__(self, data, children): + self.data = data + self.children = list(children) + + def __repr__(self): + return 'Tree(%s, %s)' % (self.data, self.children) + + def _pretty_label(self): + return self.data + + def _pretty(self, level, indent_str): + if len(self.children) == 1 and not isinstance(self.children[0], Tree): + return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n'] + + l = [ indent_str*level, self._pretty_label(), '\n' ] + for n in self.children: + if isinstance(n, Tree): + l += n._pretty(level+1, indent_str) + else: + l += [ indent_str*(level+1), '%s' % n, '\n' ] + + return l + + def pretty(self, indent_str=' '): + return ''.join(self._pretty(0, indent_str)) +class Transformer(object): + def _get_func(self, name): + return getattr(self, name) + + def transform(self, tree): + items = [] + for c in tree.children: + try: + items.append(self.transform(c) if isinstance(c, Tree) else c) + except Discard: + pass + try: + f = self._get_func(tree.data) + except AttributeError: + return self.__default__(tree.data, items) + else: + return f(items) + + def __default__(self, data, children): + return Tree(data, children) + + def __mul__(self, other): + return TransformerChain(self, other) + + +class Discard(Exception): + pass + +class TransformerChain(object): + def __init__(self, *transformers): + self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + + +class InlineTransformer(Transformer): + def _get_func(self, name): # use super()._get_func + return 
inline_args(getattr(self, name)).__get__(self) + + +class Visitor(object): + def visit(self, tree): + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + + f = getattr(self, tree.data, self.__default__) + f(tree) + return tree + + def __default__(self, tree): + pass + + +class Visitor_NoRecurse(Visitor): + def visit(self, tree): + subtrees = list(tree.iter_subtrees()) + + for subtree in (subtrees): + getattr(self, subtree.data, self.__default__)(subtree) + return tree + + +class Transformer_NoRecurse(Transformer): + def transform(self, tree): + subtrees = list(tree.iter_subtrees()) + + def _t(t): + # Assumes t is already transformed + try: + f = self._get_func(t.data) + except AttributeError: + return self.__default__(t) + else: + return f(t) + + for subtree in subtrees: + children = [] + for c in subtree.children: + try: + children.append(_t(c) if isinstance(c, Tree) else c) + except Discard: + pass + subtree.children = children + + return _t(tree) + + def __default__(self, t): + return t + +class Indenter: + def __init__(self): + self.paren_level = 0 + self.indent_level = [0] + + def handle_NL(self, token): + if self.paren_level > 0: + return + + yield token + + indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len + + if indent > self.indent_level[-1]: + self.indent_level.append(indent) + yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) + else: + while indent < self.indent_level[-1]: + self.indent_level.pop() + yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) + + assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) + + def process(self, stream): + for token in stream: + if token.type == self.NL_type: + for t in self.handle_NL(token): + yield t + else: + yield token + + if token.type in self.OPEN_PAREN_types: + self.paren_level += 1 + elif token.type in self.CLOSE_PAREN_types: + self.paren_level -= 1 + assert self.paren_level >= 0 + + while len(self.indent_level) > 1: + self.indent_level.pop() + yield Token(self.DEDENT_type, '') + + assert self.indent_level == [0], self.indent_level + + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? 
+ @property + def always_accept(self): + return (self.NL_type,) + + +class LexError(Exception): + pass + +class UnexpectedInput(LexError): + def __init__(self, seq, lex_pos, line, column, allowed=None): + context = seq[lex_pos:lex_pos+5] + message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) + + super(UnexpectedInput, self).__init__(message) + + self.line = line + self.column = column + self.context = context + self.allowed = allowed + +class Token(Str): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): + inst = Str.__new__(cls, value) + inst.type = type_ + inst.pos_in_stream = pos_in_stream + inst.value = value + inst.line = line + inst.column = column + return inst + + @classmethod + def new_borrow_pos(cls, type_, value, borrow_t): + return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + + def __repr__(self): + return 'Token(%s, %r)' % (self.type, self.value) + + def __deepcopy__(self, memo): + return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break + +class UnlessCallback: + def __init__(self, mres): + self.mres = mres + + def __call__(self, t): + for mre, type_from_index in self.mres: + m = mre.match(t.value) + if m: + value = m.group(0) + t.type = type_from_index[m.lastindex] + break + return t + + + +class NodeBuilder: + def __init__(self, tree_class, name): + self.tree_class = tree_class + self.name = name + + def __call__(self, children): + return self.tree_class(self.name, children) + +class Expand1: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + if len(children) == 1: + return children[0] + else: + return self.node_builder(children) + +class Factory: + def __init__(self, cls, *args): + self.cls = cls + self.args = args + + def __call__(self, node_builder): + return 
self.cls(node_builder, *self.args) + + +class TokenWrapper: + "Used for fixing the results of scanless parsing" + + def __init__(self, node_builder, token_name): + self.node_builder = node_builder + self.token_name = token_name + + def __call__(self, children): + return self.node_builder( [Token(self.token_name, ''.join(children))] ) + +def identity(node_builder): + return node_builder + + +class ChildFilter: + def __init__(self, node_builder, to_include): + self.node_builder = node_builder + self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + return self.node_builder(filtered) + +def create_rule_handler(expansion, keep_all_tokens, filter_out): + # if not keep_all_tokens: + to_include = [(i, not is_terminal(sym) and sym.startswith('_')) + for i, sym in enumerate(expansion) + if keep_all_tokens + or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out) + ] + + if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): + return Factory(ChildFilter, to_include) + + # else, if no filtering required.. + return identity + +class PropagatePositions: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + res = self.node_builder(children) + + if children: + for a in children: + with suppress(AttributeError): + res.line = a.line + res.column = a.column + break + + for a in reversed(children): + with suppress(AttributeError): + res.end_line = a.end_line + res.end_col = a.end_col + break + + return res + + +class Callback(object): + pass + +class ParseTreeBuilder: + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): + self.tree_class = tree_class + self.propagate_positions = propagate_positions + self.always_keep_all_tokens = keep_all_tokens + + self.rule_builders = list(self._init_builders(rules)) + + self.user_aliases = {} + + def _init_builders(self, rules): + filter_out = set() + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) + + for rule in rules: + options = rule.options + keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) + expand1 = options.expand1 if options else False + create_token = options.create_token if options else False + + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) + + yield rule, wrapper_chain + + + def create_callback(self, transformer=None): + callback = Callback() + + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + + user_callback_name = rule.alias or rule.origin + try: + f = transformer._get_func(user_callback_name) + except AttributeError: + f = NodeBuilder(self.tree_class, user_callback_name) + + self.user_aliases[rule] = rule.alias + rule.alias = internal_callback_name + + for w in wrapper_chain: + f = w(f) + + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) + + return callback + + + +class _Parser: + def __init__(self, parse_table, 
callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state + self.callbacks = callbacks + + def parse(self, seq, set_state=None): + i = 0 + token = None + stream = iter(seq) + states = self.states + + state_stack = [self.start_state] + value_stack = [] + + if set_state: set_state(self.start_state) + + def get_action(key): + state = state_stack[-1] + try: + return states[state][key] + except KeyError: + expected = states[state].keys() + + raise UnexpectedToken(token, expected, seq, i) + + def reduce(rule): + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.callbacks[rule](s) + + _action, new_state = get_action(rule.origin) + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + # Main LALR-parser loop + try: + token = next(stream) + i += 1 + while True: + action, arg = get_action(token.type) + assert arg != self.end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + token = next(stream) + i += 1 + else: + reduce(arg) + except StopIteration: + pass + + while True: + _action, arg = get_action('$END') + if _action is Shift: + assert arg == self.end_state + val ,= value_stack + return val + else: + reduce(arg) + + + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __str__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) + +Shift = 0 +Reduce = 1 +import re +MRES = ( +[('(?P(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P(?:[ \t\x0c' + '\r\n' + '])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', + {1: 'SIGNED_NUMBER', + 2: 'ESCAPED_STRING', + 3: 'WS', + 4: '__FALSE1', + 5: '__NULL2', + 6: '__TRUE0', + 7: '__COLON', + 8: '__COMMA', + 9: '__LBRACE', + 10: '__LSQB', + 11: '__RBRACE', + 12: '__RSQB'})] +) +LEXER_CALLBACK = ( +{} +) +NEWLINE_TYPES = ['WS'] +IGNORE_TYPES = ['WS'] +class LexerRegexps: pass +lexer_regexps = LexerRegexps() +lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] +lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) + for n, mres in LEXER_CALLBACK.items()} +lexer = _Lex(lexer_regexps) +def lex(stream): + return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) +RULES = { + 0: Rule('start', ['value'], 
None, RuleOptions(False, True, None, None, False)), + 1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)), + 2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)), + 3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)), + 4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)), + 5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)), + 6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)), + 7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)), + 8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)), + 15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), + 16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None), + 17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None), + 18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None), + 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None), +} +parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) +class ParseTable: pass +parse_table = ParseTable() +STATES = { + 0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)}, + 1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)}, + 2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)}, + 3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)}, + 4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)}, + 5: {12: (0, 16)}, + 6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)}, + 7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)}, + 8: {12: (1, 0)}, + 9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)}, + 10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)}, + 11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)}, + 12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)}, + 13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)}, + 14: {14: (0, 19), 13: (0, 20), 18: (0, 21)}, + 15: {17: (0, 22)}, + 16: {}, + 17: {19: (0, 23), 15: (0, 24), 13: (0, 25)}, + 18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)}, + 19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)}, + 20: {9: (0, 10), 11: (0, 15), 16: (0, 26)}, + 21: {14: (0, 27), 13: (0, 28)}, + 22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)}, + 23: {15: (0, 30), 13: (0, 31)}, + 24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)}, + 25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 26: {13: (1, 18), 14: (1, 18)}, + 27: {13: 
(1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)}, + 28: {16: (0, 33), 9: (0, 10), 11: (0, 15)}, + 29: {13: (1, 14), 14: (1, 14)}, + 30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)}, + 31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 32: {15: (1, 16), 13: (1, 16)}, + 33: {13: (1, 19), 14: (1, 19)}, + 34: {15: (1, 17), 13: (1, 17)}, +} +TOKEN_TYPES = ( +{0: '__TRUE0', + 1: '__LBRACE', + 2: 'array', + 3: 'object', + 4: 'start', + 5: '__LSQB', + 6: 'SIGNED_NUMBER', + 7: 'value', + 8: '__NULL2', + 9: 'ESCAPED_STRING', + 10: '__FALSE1', + 11: 'string', + 12: '$END', + 13: '__COMMA', + 14: '__RBRACE', + 15: '__RSQB', + 16: 'pair', + 17: '__COLON', + 18: '__anon_star_1', + 19: '__anon_star_0'} +) +parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} + for s, acts in STATES.items()} +parse_table.start_state = 0 +parse_table.end_state = 16 +class Lark_StandAlone: + def __init__(self, transformer=None, postlex=None): + callback = parse_tree_builder.create_callback(transformer=transformer) + callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()} + self.parser = _Parser(parse_table, callbacks) + self.postlex = postlex + def parse(self, stream): + tokens = lex(stream) + if self.postlex: tokens = self.postlex.process(tokens) + return self.parser.parse(tokens) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py new file mode 100644 index 0000000..47c1bb1 --- /dev/null +++ b/examples/standalone/json_parser_main.py @@ -0,0 +1,25 @@ +import sys + +from json_parser import Lark_StandAlone, Transformer, inline_args + +class TreeToJson(Transformer): + @inline_args + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = inline_args(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + + +parser = Lark_StandAlone(transformer=TreeToJson()) + +if __name__ == '__main__': + with open(sys.argv[1]) as f: + print(parser.parse(f.read())) + From 8acd77d7ffb546e2aa55c80042a8908d7e7e2fc9 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 16 Jan 2018 00:51:30 +0200 Subject: [PATCH 17/21] Minor fixes in lexer --- lark/grammars/common.g | 1 + lark/lexer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lark/grammars/common.g b/lark/grammars/common.g index a54d49d..c38f485 100644 --- a/lark/grammars/common.g +++ b/lark/grammars/common.g @@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT // float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ _EXP: ("e"|"E") SIGNED_INT FLOAT: INT _EXP | DECIMAL _EXP? 
+SIGNED_FLOAT: ["+"|"-"] INT NUMBER: FLOAT | INT SIGNED_NUMBER: ["+"|"-"] NUMBER diff --git a/lark/lexer.py b/lark/lexer.py index 844025d..64cfb46 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -79,7 +79,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = list(newline_types) - newline_types = list(newline_types) + ignore_types = list(ignore_types) line_ctr = LineCounter() while True: @@ -93,7 +93,7 @@ class _Lex: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) if t.type in lexer.callback: t = lexer.callback[t.type](t) - lexer = yield t + yield t line_ctr.feed(value, type_ in newline_types) break From 5fd331be542c586f96feacacc5163e75a533ac2e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 16 Jan 2018 00:52:31 +0200 Subject: [PATCH 18/21] BUGFIX: Internally repetitive rules are now handled silently (Issue #60) --- lark/load_grammar.py | 2 +- tests/test_parser.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2086591..16dc0d9 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -196,7 +196,7 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = [self.visit(T('expansion', [option if i==j else other for j, other in enumerate(tree.children)])) - for option in child.children] + for option in set(child.children)] break else: break diff --git a/tests/test_parser.py b/tests/test_parser.py index db28834..1c7cfcf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -796,6 +796,16 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_twice_empty(self): + g = """!start: [["A"]] + """ + l = _Lark(g) + tree = l.parse('A') + self.assertEqual(tree.children, ['A']) + + tree = l.parse('') + self.assertEqual(tree.children, []) + def test_undefined_ignore(self): g = """!start: "A" @@ -1016,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) + @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/") From 37c1c0f65f40473007b0d32a941e684fb1119822 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 00:05:24 +0200 Subject: [PATCH 19/21] Better error message for bad regexps (Issue #62) --- lark/common.py | 7 ++++--- lark/parser_frontends.py | 9 ++++++--- lark/utils.py | 7 +++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lark/common.py b/lark/common.py index 1717fe7..f745018 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,7 +1,8 @@ import re -import sre_parse import sys +from .utils import get_regexp_width + Py36 = (sys.version_info[:2] >= (3, 6)) @@ -95,10 +96,10 @@ class PatternRE(Pattern): @property def min_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[0] + return get_regexp_width(self.to_regexp())[0] @property def max_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[1] + return get_regexp_width(self.to_regexp())[1] class TokenDef(object): def __init__(self, name, pattern, priority=1): diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e8e7ab8..db6cdcc 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,5 +1,5 @@ import re -import sre_parse +from .utils import get_regexp_width from .lexer import Lexer, ContextualLexer, Token @@ -77,7 +77,7 @@ class Earley_NoLex: self.regexps = {} for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() - 
width = sre_parse.parse(regexp).getwidth() + width = get_regexp_width(regexp) if width != (1,1): raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) self.regexps[t.name] = re.compile(regexp) @@ -121,7 +121,10 @@ class XEarley: self.regexps = {} for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() - assert sre_parse.parse(regexp).getwidth() + try: + assert get_regexp_width(regexp) + except ValueError: + raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) self.regexps[t.name] = re.compile(regexp) def parse(self, text): diff --git a/lark/utils.py b/lark/utils.py index 01c70a1..abe036f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -112,3 +112,10 @@ except NameError: return -1 +import sre_parse +import sre_constants +def get_regexp_width(regexp): + try: + return sre_parse.parse(regexp).getwidth() + except sre_constants.error: + raise ValueError(regexp) From d173d6d66bc43b0000d7aa1e00b29426636e0b96 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 10:38:51 +0200 Subject: [PATCH 20/21] Validate against zero-width terminals in XEarley (Issue #63) --- lark/parser_frontends.py | 6 +++++- tests/test_parser.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index db6cdcc..3865679 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -122,9 +122,13 @@ class XEarley: for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() try: - assert get_regexp_width(regexp) + width = get_regexp_width(regexp)[0] except ValueError: raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) + else: + if width == 0: + raise ValueError("Dynamic Earley doesn't allow zero-width regexps") + self.regexps[t.name] = re.compile(regexp) def parse(self, text): diff --git a/tests/test_parser.py b/tests/test_parser.py index 1c7cfcf..8e954e2 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. g = Lark(r"""start: B - B: ("ab"|/[^b]/)* + B: ("ab"|/[^b]/)+ """, lexer=LEXER) self.assertEqual( g.parse('abc').children[0], 'abc') From b002ec47fb7879cafd1cf5abd56b4860241efe81 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 10:49:52 +0200 Subject: [PATCH 21/21] BUGFIX: Repeating subrules are now allowed (Issue #61) --- lark/load_grammar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 16dc0d9..2d01277 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -176,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): break tree.expand_kids_by_index(*to_expand) - def expansion(self, tree): # rules_list unpacking # a : b (c|d) e @@ -210,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = aliases - expansions = _flatten + def expansions(self, tree): + self._flatten(tree) + tree.children = list(set(tree.children)) + class RuleTreeToText(Transformer): def expansions(self, x):
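# Illustrative sketch (an assumed grammar, not the one from Issue #61): with
# two optional "b" items, expanding the optionals yields the alternative
# "a" "b" "c" twice.  Patches 18 and 21 make SimplifyRule_Visitor deduplicate
# such internally generated alternatives via set(), so a grammar of this shape
# should now build and parse cleanly instead of tripping over its own
# duplicated expansions.
from lark import Lark

grammar = r'''
start: "a" ["b"] ["b"] "c"
'''

parser = Lark(grammar, parser='lalr')
print(parser.parse('ac'))
print(parser.parse('abc'))    # the duplicated expansion is collapsed silently
print(parser.parse('abbc'))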
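# Illustrative sketch, not part of the patches themselves: the helper below
# repeats get_regexp_width() as added to lark/utils.py in patch 19, so its
# behaviour can be checked in isolation.  Patch 19 turns a malformed pattern
# into a ValueError that names the offending regexp (Issue #62), and patch 20
# reads the minimum width to reject zero-width terminals in the dynamic
# Earley front-end (Issue #63).
import sre_parse
import sre_constants

def get_regexp_width(regexp):
    try:
        return sre_parse.parse(regexp).getwidth()
    except sre_constants.error:
        raise ValueError(regexp)

print(get_regexp_width(r'[ab]'))     # (1, 1): fixed width, accepted by every lexer
print(get_regexp_width(r'a*')[0])    # 0: zero minimum width, rejected by XEarley
try:
    get_regexp_width(r'[')           # malformed pattern
except ValueError as e:
    print('Bad regexp:', e)          # the error now carries the pattern itself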