diff --git a/README.md b/README.md
index 542977f..794a203 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ Lark can:
  - Build a parse-tree automagically, no construction code required
  - Outperform all other Python libraries when using LALR(1) (Yes, including PLY)
  - Run on every Python interpreter (it's pure-python)
+ - Generate a stand-alone parser (for LALR(1) grammars)
 
 And many more features. Read ahead and find out.
 
@@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
 
 - Builds a parse-tree (AST) automagically, based on the structure of the grammar
 - **Earley** parser
-    - Can parse *ALL* context-free grammars
-    - Full support for ambiguity in grammar
+    - Can parse all context-free grammars
+    - Full support for ambiguous grammars
 - **LALR(1)** parser
-    - Competitive with PLY
+    - Fast and light, competitive with PLY
+    - Can generate a stand-alone parser
 - **EBNF** grammar
 - **Unicode** fully supported
 - **Python 2 & 3** compatible
@@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/
 
 #### Performance comparison
 
-Lower is better!
+Lark is the fastest and lightest (lower is better)
 
 ![Run-time Comparison](docs/comparison_runtime.png)
 
@@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
 
 #### Feature comparison
 
-| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG?
-|:--------|:----------|:----|:--------|:------------|:------------
-| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! |
-| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No |
-| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* |
-| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* |
-| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No |
-| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* |
+| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone
+|:--------|:----------|:----|:--------|:------------|:------------|:----------|:----------
+| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
+| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
+| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
+| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
+| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No |
+| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |
 
-(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)
+(\* *PEGs cannot handle non-deterministic grammars.
Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) ### Projects using Lark diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh new file mode 100755 index 0000000..1eba3a4 --- /dev/null +++ b/examples/standalone/create_standalone.sh @@ -0,0 +1 @@ +python -m lark.tools.standalone json.g > json_parser.py diff --git a/examples/standalone/json.g b/examples/standalone/json.g new file mode 100644 index 0000000..243a230 --- /dev/null +++ b/examples/standalone/json.g @@ -0,0 +1,21 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py new file mode 100644 index 0000000..f249f61 --- /dev/null +++ b/examples/standalone/json_parser.py @@ -0,0 +1,794 @@ +# The file was automatically generated by Lark v0.5.2 +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . +# +# + + +import types +import functools +from contextlib import contextmanager + +Str = type(u'') + +def inline_args(f): + # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) + if isinstance(f, types.FunctionType): + @functools.wraps(f) + def _f_func(self, args): + return f(self, *args) + return _f_func + elif isinstance(f, (type, types.BuiltinFunctionType)): + @functools.wraps(f) + def _f_builtin(_self, args): + return f(*args) + return _f_builtin + elif isinstance(f, types.MethodType): + @functools.wraps(f.__func__) + def _f(self, args): + return f.__func__(self, *args) + return _f + else: + @functools.wraps(f.__call__.__func__) + def _f(self, args): + return f.__call__.__func__(self, *args) + return _f + + +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... 
x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + +def is_terminal(sym): + return sym.isupper() + +class GrammarError(Exception): + pass + +class ParseError(Exception): + pass + +class UnexpectedToken(ParseError): + def __init__(self, token, expected, seq, index): + self.token = token + self.expected = expected + self.line = getattr(token, 'line', '?') + self.column = getattr(token, 'column', '?') + + try: + context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) + except AttributeError: + context = seq[index:index+5] + except TypeError: + context = "" + message = ("Unexpected token %r at line %s, column %s.\n" + "Expected: %s\n" + "Context: %s" % (token, self.line, self.column, expected, context)) + + super(UnexpectedToken, self).__init__(message) + + + +class Tree(object): + def __init__(self, data, children): + self.data = data + self.children = list(children) + + def __repr__(self): + return 'Tree(%s, %s)' % (self.data, self.children) + + def _pretty_label(self): + return self.data + + def _pretty(self, level, indent_str): + if len(self.children) == 1 and not isinstance(self.children[0], Tree): + return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n'] + + l = [ indent_str*level, self._pretty_label(), '\n' ] + for n in self.children: + if isinstance(n, Tree): + l += n._pretty(level+1, indent_str) + else: + l += [ indent_str*(level+1), '%s' % n, '\n' ] + + return l + + def pretty(self, indent_str=' '): + return ''.join(self._pretty(0, indent_str)) +class Transformer(object): + def _get_func(self, name): + return getattr(self, name) + + def transform(self, tree): + items = [] + for c in tree.children: + try: + items.append(self.transform(c) if isinstance(c, Tree) else c) + except Discard: + pass + try: + f = self._get_func(tree.data) + except AttributeError: + return self.__default__(tree.data, items) + else: + return f(items) + + def __default__(self, data, children): + return Tree(data, children) + + def __mul__(self, other): + return TransformerChain(self, other) + + +class Discard(Exception): + pass + +class TransformerChain(object): + def __init__(self, *transformers): + self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + + +class InlineTransformer(Transformer): + def _get_func(self, name): # use super()._get_func + return inline_args(getattr(self, name)).__get__(self) + + +class Visitor(object): + def visit(self, tree): + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + + f = getattr(self, tree.data, self.__default__) + f(tree) + return tree + + def __default__(self, tree): + pass + + +class Visitor_NoRecurse(Visitor): + def visit(self, tree): + subtrees = list(tree.iter_subtrees()) + + for subtree in (subtrees): + getattr(self, subtree.data, self.__default__)(subtree) + return tree + + +class Transformer_NoRecurse(Transformer): + def transform(self, tree): + subtrees = list(tree.iter_subtrees()) + + def _t(t): + # Assumes t is already transformed + try: + f = self._get_func(t.data) + except AttributeError: + return self.__default__(t) + else: + return f(t) + + for subtree in subtrees: + children = [] + for c in subtree.children: + try: + children.append(_t(c) if isinstance(c, Tree) else c) + except Discard: + pass + subtree.children = children + + return _t(tree) + + def __default__(self, t): + return t 
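# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the generated parser file):
# the Transformer classes above dispatch on method names or class attributes
# that match rule names or rule aliases from the grammar, in this case the
# rules and aliases defined in json.g earlier in this diff. A minimal usage
# sketch, assuming this generated module is importable as `json_parser`; the
# class name `ToFloats` is a hypothetical example:
#
#     from json_parser import Lark_StandAlone, Transformer, inline_args
#
#     class ToFloats(Transformer):
#         # called for the `number` alias from json.g; rules without a
#         # matching attribute fall back to plain Tree nodes via NodeBuilder
#         number = inline_args(float)
#
#     parser = Lark_StandAlone(transformer=ToFloats())
#     tree = parser.parse('[1, 2.5, 3e4]')
#
# The complete version of this pattern is json_parser_main.py, later in this
# diff.
# ---------------------------------------------------------------------------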
+ +class Indenter: + def __init__(self): + self.paren_level = 0 + self.indent_level = [0] + + def handle_NL(self, token): + if self.paren_level > 0: + return + + yield token + + indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len + + if indent > self.indent_level[-1]: + self.indent_level.append(indent) + yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) + else: + while indent < self.indent_level[-1]: + self.indent_level.pop() + yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) + + assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) + + def process(self, stream): + for token in stream: + if token.type == self.NL_type: + for t in self.handle_NL(token): + yield t + else: + yield token + + if token.type in self.OPEN_PAREN_types: + self.paren_level += 1 + elif token.type in self.CLOSE_PAREN_types: + self.paren_level -= 1 + assert self.paren_level >= 0 + + while len(self.indent_level) > 1: + self.indent_level.pop() + yield Token(self.DEDENT_type, '') + + assert self.indent_level == [0], self.indent_level + + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? + @property + def always_accept(self): + return (self.NL_type,) + + +class LexError(Exception): + pass + +class UnexpectedInput(LexError): + def __init__(self, seq, lex_pos, line, column, allowed=None): + context = seq[lex_pos:lex_pos+5] + message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) + + super(UnexpectedInput, self).__init__(message) + + self.line = line + self.column = column + self.context = context + self.allowed = allowed + +class Token(Str): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): + inst = Str.__new__(cls, value) + inst.type = type_ + inst.pos_in_stream = pos_in_stream + inst.value = value + inst.line = line + inst.column = column + return inst + + @classmethod + def new_borrow_pos(cls, type_, value, borrow_t): + return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + + def __repr__(self): + return 'Token(%s, %r)' % (self.type, self.value) + + def __deepcopy__(self, memo): + return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. 
+ """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break + +class UnlessCallback: + def __init__(self, mres): + self.mres = mres + + def __call__(self, t): + for mre, type_from_index in self.mres: + m = mre.match(t.value) + if m: + value = m.group(0) + t.type = type_from_index[m.lastindex] + break + return t + + + +class NodeBuilder: + def __init__(self, tree_class, name): + self.tree_class = tree_class + self.name = name + + def __call__(self, children): + return self.tree_class(self.name, children) + +class Expand1: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + if len(children) == 1: + return children[0] + else: + return self.node_builder(children) + +class Factory: + def __init__(self, cls, *args): + self.cls = cls + self.args = args + + def __call__(self, node_builder): + return self.cls(node_builder, *self.args) + + +class TokenWrapper: + "Used for fixing the results of scanless parsing" + + def __init__(self, node_builder, token_name): + self.node_builder = node_builder + self.token_name = token_name + + def __call__(self, children): + return self.node_builder( [Token(self.token_name, ''.join(children))] ) + +def identity(node_builder): + return node_builder + + +class ChildFilter: + def __init__(self, node_builder, to_include): + self.node_builder = node_builder + self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + return self.node_builder(filtered) + +def create_rule_handler(expansion, keep_all_tokens, filter_out): + # if not keep_all_tokens: + to_include = [(i, not is_terminal(sym) and sym.startswith('_')) + for i, sym in enumerate(expansion) + if keep_all_tokens + or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out) + ] + + if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): + return Factory(ChildFilter, to_include) + + # else, if no filtering required.. 
+ return identity + +class PropagatePositions: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + res = self.node_builder(children) + + if children: + for a in children: + with suppress(AttributeError): + res.line = a.line + res.column = a.column + break + + for a in reversed(children): + with suppress(AttributeError): + res.end_line = a.end_line + res.end_col = a.end_col + break + + return res + + +class Callback(object): + pass + +class ParseTreeBuilder: + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): + self.tree_class = tree_class + self.propagate_positions = propagate_positions + self.always_keep_all_tokens = keep_all_tokens + + self.rule_builders = list(self._init_builders(rules)) + + self.user_aliases = {} + + def _init_builders(self, rules): + filter_out = set() + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) + + for rule in rules: + options = rule.options + keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) + expand1 = options.expand1 if options else False + create_token = options.create_token if options else False + + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) + + yield rule, wrapper_chain + + + def create_callback(self, transformer=None): + callback = Callback() + + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + + user_callback_name = rule.alias or rule.origin + try: + f = transformer._get_func(user_callback_name) + except AttributeError: + f = NodeBuilder(self.tree_class, user_callback_name) + + self.user_aliases[rule] = rule.alias + rule.alias = internal_callback_name + + for w in wrapper_chain: + f = w(f) + + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) + + return callback + + + +class _Parser: + def __init__(self, parse_table, callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state + self.callbacks = callbacks + + def parse(self, seq, set_state=None): + i = 0 + token = None + stream = iter(seq) + states = self.states + + state_stack = [self.start_state] + value_stack = [] + + if set_state: set_state(self.start_state) + + def get_action(key): + state = state_stack[-1] + try: + return states[state][key] + except KeyError: + expected = states[state].keys() + + raise UnexpectedToken(token, expected, seq, i) + + def reduce(rule): + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.callbacks[rule](s) + + _action, new_state = get_action(rule.origin) + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + # Main LALR-parser loop + try: + token = next(stream) + i += 1 + while True: + action, arg = get_action(token.type) + assert arg != self.end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + token = next(stream) + i += 1 + else: + reduce(arg) + except 
StopIteration: + pass + + while True: + _action, arg = get_action('$END') + if _action is Shift: + assert arg == self.end_state + val ,= value_stack + return val + else: + reduce(arg) + + + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __str__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) + +Shift = 0 +Reduce = 1 +import re +MRES = ( +[('(?P(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P(?:[ \t\x0c' + '\r\n' + '])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', + {1: 'SIGNED_NUMBER', + 2: 'ESCAPED_STRING', + 3: 'WS', + 4: '__FALSE1', + 5: '__NULL2', + 6: '__TRUE0', + 7: '__COLON', + 8: '__COMMA', + 9: '__LBRACE', + 10: '__LSQB', + 11: '__RBRACE', + 12: '__RSQB'})] +) +LEXER_CALLBACK = ( +{} +) +NEWLINE_TYPES = ['WS'] +IGNORE_TYPES = ['WS'] +class LexerRegexps: pass +lexer_regexps = LexerRegexps() +lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] +lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) + for n, mres in LEXER_CALLBACK.items()} +lexer = _Lex(lexer_regexps) +def lex(stream): + return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) +RULES = { + 0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)), + 1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)), + 2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)), + 3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)), + 4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)), + 5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)), + 6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)), + 7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)), + 8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 13: 
Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)), + 15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), + 16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None), + 17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None), + 18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None), + 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None), +} +parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) +class ParseTable: pass +parse_table = ParseTable() +STATES = { + 0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)}, + 1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)}, + 2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)}, + 3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)}, + 4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)}, + 5: {12: (0, 16)}, + 6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)}, + 7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)}, + 8: {12: (1, 0)}, + 9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)}, + 10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)}, + 11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)}, + 12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)}, + 13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)}, + 14: {14: (0, 19), 13: (0, 20), 18: (0, 21)}, + 15: {17: (0, 22)}, + 16: {}, + 17: {19: (0, 23), 15: (0, 24), 13: (0, 25)}, + 18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)}, + 19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)}, + 20: {9: (0, 10), 11: (0, 15), 16: (0, 26)}, + 21: {14: (0, 27), 13: (0, 28)}, + 22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)}, + 23: {15: (0, 30), 13: (0, 31)}, + 24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)}, + 25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 26: {13: (1, 18), 14: (1, 18)}, + 27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)}, + 28: {16: (0, 33), 9: (0, 10), 11: (0, 15)}, + 29: {13: (1, 14), 14: (1, 14)}, + 30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)}, + 31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 32: {15: (1, 16), 13: (1, 16)}, + 33: {13: (1, 19), 14: (1, 19)}, + 34: {15: (1, 17), 13: (1, 17)}, +} +TOKEN_TYPES = ( +{0: '__TRUE0', + 1: '__LBRACE', + 2: 'array', + 3: 'object', + 4: 'start', + 5: '__LSQB', + 6: 'SIGNED_NUMBER', + 7: 'value', + 8: '__NULL2', + 9: 'ESCAPED_STRING', + 10: '__FALSE1', + 11: 'string', + 12: '$END', + 13: '__COMMA', + 14: '__RBRACE', + 15: '__RSQB', + 16: 'pair', + 17: '__COLON', + 18: '__anon_star_1', + 19: '__anon_star_0'} +) +parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} + for s, acts in STATES.items()} +parse_table.start_state = 0 +parse_table.end_state = 16 +class Lark_StandAlone: + def __init__(self, transformer=None, postlex=None): + callback = parse_tree_builder.create_callback(transformer=transformer) + callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) 
for rule in RULES.values()} + self.parser = _Parser(parse_table, callbacks) + self.postlex = postlex + def parse(self, stream): + tokens = lex(stream) + if self.postlex: tokens = self.postlex.process(tokens) + return self.parser.parse(tokens) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py new file mode 100644 index 0000000..47c1bb1 --- /dev/null +++ b/examples/standalone/json_parser_main.py @@ -0,0 +1,25 @@ +import sys + +from json_parser import Lark_StandAlone, Transformer, inline_args + +class TreeToJson(Transformer): + @inline_args + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = inline_args(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + + +parser = Lark_StandAlone(transformer=TreeToJson()) + +if __name__ == '__main__': + with open(sys.argv[1]) as f: + print(parser.parse(f.read())) + diff --git a/lark/__init__.py b/lark/__init__.py index 930fa01..1637a75 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError from .lark import Lark from .utils import inline_args -__version__ = "0.5.1" +__version__ = "0.5.2" diff --git a/lark/common.py b/lark/common.py index 55e9d28..f745018 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,16 +1,21 @@ import re -import sre_parse import sys +from .utils import get_regexp_width + Py36 = (sys.version_info[:2] >= (3, 6)) + +###{standalone +def is_terminal(sym): + return sym.isupper() + class GrammarError(Exception): pass class ParseError(Exception): pass - class UnexpectedToken(ParseError): def __init__(self, token, expected, seq, index): self.token = token @@ -31,9 +36,8 @@ class UnexpectedToken(ParseError): super(UnexpectedToken, self).__init__(message) +###} -def is_terminal(sym): - return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' class LexerConf: @@ -44,7 +48,6 @@ class LexerConf: class ParserConf: def __init__(self, rules, callback, start): - assert all(len(r) == 4 for r in rules) self.rules = rules self.callback = callback self.start = start @@ -93,10 +96,10 @@ class PatternRE(Pattern): @property def min_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[0] + return get_regexp_width(self.to_regexp())[0] @property def max_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[1] + return get_regexp_width(self.to_regexp())[1] class TokenDef(object): def __init__(self, name, pattern, priority=1): @@ -108,27 +111,3 @@ class TokenDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - -class Terminal: - def __init__(self, data): - self.data = data - - def __repr__(self): - return '%r' % self.data - - def __eq__(self, other): - return isinstance(other, type(self)) and self.data == other.data - def __hash__(self): - return hash(self.data) - - -class Terminal_Regexp(Terminal): - def __init__(self, name, regexp): - Terminal.__init__(self, regexp) - self.name = name - self.match = re.compile(regexp).match - -class Terminal_Token(Terminal): - def match(self, other): - return self.data == other.type - diff --git a/lark/grammar.py b/lark/grammar.py new file mode 100644 index 0000000..d257bc4 --- /dev/null +++ b/lark/grammar.py @@ -0,0 +1,37 @@ + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = 
expansion + self.alias = alias + self.options = options + + def __str__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) diff --git a/lark/grammars/common.g b/lark/grammars/common.g index a54d49d..c38f485 100644 --- a/lark/grammars/common.g +++ b/lark/grammars/common.g @@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT // float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ _EXP: ("e"|"E") SIGNED_INT FLOAT: INT _EXP | DECIMAL _EXP? +SIGNED_FLOAT: ["+"|"-"] INT NUMBER: FLOAT | INT SIGNED_NUMBER: ["+"|"-"] NUMBER diff --git a/lark/indenter.py b/lark/indenter.py index a5f107d..34e61a0 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -2,6 +2,7 @@ from .lexer import Token +###{standalone class Indenter: def __init__(self): self.paren_level = 0 @@ -50,3 +51,5 @@ class Indenter: @property def always_accept(self): return (self.NL_type,) + +###} diff --git a/lark/lark.py b/lark/lark.py index 8029638..fa564ed 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -169,13 +169,15 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) - rules, callback = self.parse_tree_builder.apply(self.options.transformer) + + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) + callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) - parser_conf = ParserConf(rules, callback, self.options.start) + + parser_conf = ParserConf(self.rules, callback, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) diff --git a/lark/lexer.py b/lark/lexer.py index 2741af0..64cfb46 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -5,6 +5,7 @@ import re from .utils import Str, classify from .common import is_terminal, PatternStr, PatternRE, TokenDef +###{standalone class LexError(Exception): pass @@ -48,27 +49,75 @@ class Token(Str): __hash__ = Str.__hash__ -class Regex: - def __init__(self, pattern, flags=()): - self.pattern = pattern - self.flags = flags -def _regexp_has_newline(r): - return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. 
+ """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + ignore_types = list(ignore_types) + line_ctr = LineCounter() -def _create_unless_callback(strs): - mres = build_mres(strs, match_whole=True) - def unless_callback(t): - # if t in strs: - # t.type = strs[t] - for mre, type_from_index in mres: + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break + +class UnlessCallback: + def __init__(self, mres): + self.mres = mres + + def __call__(self, t): + for mre, type_from_index in self.mres: m = mre.match(t.value) if m: value = m.group(0) t.type = type_from_index[m.lastindex] break return t - return unless_callback + +###} + + def _create_unless(tokens): tokens_by_type = classify(tokens, lambda t: type(t.pattern)) @@ -85,7 +134,7 @@ def _create_unless(tokens): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = _create_unless_callback(unless) + callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) tokens = [t for t in tokens if t not in embedded_strs] return tokens, callback @@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole): def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) +def _regexp_has_newline(r): + return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) -class Lexer(object): +class Lexer: def __init__(self, tokens, ignore=()): assert all(isinstance(t, TokenDef) for t in tokens), tokens - self.ignore = ignore - self.newline_char = '\n' tokens = list(tokens) # Sanitization @@ -129,14 +178,11 @@ class Lexer(object): if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) - token_names = {t.name for t in tokens} - for t in ignore: - if t not in token_names: - raise LexError("Token '%s' was marked to ignore but it is not defined!" 
% t) + assert set(ignore) <= {t.name for t in tokens} # Init self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = [t for t in ignore] + self.ignore_types = list(ignore) tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) @@ -147,46 +193,8 @@ class Lexer(object): self.mres = build_mres(tokens) - def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.newline_types) - ignore_types = list(self.ignore_types) - while True: - for mre, type_from_index in self.mres: - m = mre.match(stream, lex_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - to_yield = type_ not in ignore_types - - if to_yield: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) - end_col = t.column + len(value) - if t.type in self.callback: - t = self.callback[t.type](t) - - if type_ in newline_types: - newlines = value.count(self.newline_char) - if newlines: - line += newlines - last_newline_index = value.rindex(self.newline_char) + 1 - col_start_pos = lex_pos + last_newline_index - end_col = len(value) - last_newline_index - - if to_yield: - t.end_line = line - t.end_col = end_col - yield t - - lex_pos += len(value) - break - else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) - break + return _Lex(self).lex(stream, self.newline_types, self.ignore_types) class ContextualLexer: @@ -204,7 +212,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] + state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] lexer = Lexer(state_tokens, ignore=ignore) lexer_by_tokens[key] = lexer @@ -218,33 +226,9 @@ class ContextualLexer: self.parser_state = state def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.root_lexer.newline_types) - ignore_types = list(self.root_lexer.ignore_types) - while True: - lexer = self.lexers[self.parser_state] - for mre, type_from_index in lexer.mres: - m = mre.match(stream, lex_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - yield t + l = _Lex(self.lexers[self.parser_state]) + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] - if type_ in newline_types: - newlines = value.count(lexer.newline_char) - if newlines: - line += newlines - col_start_pos = lex_pos + value.rindex(lexer.newline_char) - lex_pos += len(value) - break - else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) - break diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 72e2e22..2d01277 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef +from .grammar import RuleOptions, Rule from .tree import Tree as T, Transformer, InlineTransformer, Visitor @@ -127,7 +128,7 @@ RULES = { class 
EBNF_to_BNF(InlineTransformer): def __init__(self): - self.new_rules = {} + self.new_rules = [] self.rules_by_expr = {} self.prefix = 'anon' self.i = 0 @@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 t = Token('RULE', new_name, -1) - self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options + tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) + self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t return t @@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): break tree.expand_kids_by_index(*to_expand) - def expansion(self, tree): # rules_list unpacking # a : b (c|d) e @@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = [self.visit(T('expansion', [option if i==j else other for j, other in enumerate(tree.children)])) - for option in child.children] + for option in set(child.children)] break else: break @@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = aliases - expansions = _flatten + def expansions(self, tree): + self._flatten(tree) + tree.children = list(set(tree.children)) + class RuleTreeToText(Transformer): def expansions(self, x): @@ -389,12 +393,6 @@ def _interleave(l, item): def _choice_of_rules(rules): return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) -def dict_update_safe(d1, d2): - for k, v in d2.items(): - assert k not in d1 - d1[k] = v - - class Grammar: def __init__(self, rule_defs, token_defs, ignore): self.token_defs = token_defs @@ -411,6 +409,7 @@ class Grammar: terms_to_ignore = {name:'__'+name for name in self.ignore} if terms_to_ignore: assert set(terms_to_ignore) <= {name for name, _t in term_defs} + term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] expr = Token('RULE', '__ignore') for r, tree, _o in rule_defs: @@ -466,57 +465,41 @@ class Grammar: # ================= # Compile Rules # ================= - ebnf_to_bnf = EBNF_to_BNF() - simplify_rule = SimplifyRule_Visitor() + # 1. Pre-process terminals transformer = PrepareLiterals() if not lexer: transformer *= SplitLiterals() transformer *= ExtractAnonTokens(tokens) # Adds to tokens - rules = {} + # 2. Convert EBNF to BNF (and apply step 1) + ebnf_to_bnf = EBNF_to_BNF() + rules = [] for name, rule_tree, options in rule_defs: - assert name not in rules, name ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None tree = transformer.transform(rule_tree) - rules[name] = ebnf_to_bnf.transform(tree), options + rules.append((name, ebnf_to_bnf.transform(tree), options)) + rules += ebnf_to_bnf.new_rules - dict_update_safe(rules, ebnf_to_bnf.new_rules) - - for tree, _o in rules.values(): - simplify_rule.visit(tree) + assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" + # 3. 
Compile tree to Rule objects rule_tree_to_text = RuleTreeToText() - rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} - - return tokens, rules, self.ignore + simplify_rule = SimplifyRule_Visitor() + compiled_rules = [] + for name, tree, options in rules: + simplify_rule.visit(tree) + expansions = rule_tree_to_text.transform(tree) + for expansion, alias in expansions: + if alias and name.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) -class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing - self.priority = priority - - self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless - @classmethod - def from_rule(cls, name, *x): - if len(x) > 1: - priority, expansions = x - priority = int(priority) - else: - expansions ,= x - priority = None - - keep_all_tokens = name.startswith('!') - name = name.lstrip('!') - expand1 = name.startswith('?') - name = name.lstrip('?') + rule = Rule(name, expansion, alias, options) + compiled_rules.append(rule) - return name, expansions, cls(keep_all_tokens, expand1, priority=priority) + return tokens, compiled_rules, self.ignore @@ -553,15 +536,30 @@ def resolve_token_references(token_defs): if not changed: break +def options_from_rule(name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] - rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] - d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} - rules, callback = ParseTreeBuilder(d, T).apply() + rules = [options_from_rule(name, x) for name, x in RULES.items()] + rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] + callback = ParseTreeBuilder(rules, T).create_callback() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) + parser_conf = ParserConf(rules, callback, 'start') self.parser = LALR(lexer_conf, parser_conf) @@ -636,7 +634,6 @@ class GrammarLoader: ignore_names.append(name) token_defs.append((name, (t, 0))) - # Verify correctness 2 token_names = set() for name, _ in token_defs: @@ -644,10 +641,13 @@ class GrammarLoader: raise GrammarError("Token '%s' defined more than once" % name) token_names.add(name) + if set(ignore_names) > token_names: + raise GrammarError("Tokens %s were marked to ignore but were not defined!" 
% (set(ignore_names) - token_names)) + # Resolve token references resolve_token_references(token_defs) - rules = [RuleOptions.from_rule(*x) for x in rule_defs] + rules = [options_from_rule(*x) for x in rule_defs] rule_names = set() for name, _x, _o in rules: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 975121d..7e52125 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,6 +1,9 @@ from .common import is_terminal, GrammarError from .utils import suppress from .lexer import Token +from .grammar import Rule + +###{standalone class NodeBuilder: def __init__(self, tree_class, name): @@ -27,7 +30,7 @@ class Factory: def __call__(self, node_builder): return self.cls(node_builder, *self.args) - + class TokenWrapper: "Used for fixing the results of scanless parsing" @@ -106,51 +109,53 @@ class ParseTreeBuilder: self.rule_builders = list(self._init_builders(rules)) + self.user_aliases = {} + def _init_builders(self, rules): filter_out = set() - for origin, (expansions, options) in rules.items(): - if options and options.filter_out: - assert origin.startswith('_') # Just to make sure - filter_out.add(origin) + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) - for origin, (expansions, options) in rules.items(): + for rule in rules: + options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand1 = options.expand1 if options else False create_token = options.create_token if options else False - for expansion, alias in expansions: - if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) - wrapper_chain = filter(None, [ - (expand1 and not alias) and Expand1, - create_token and Factory(TokenWrapper, create_token), - create_rule_handler(expansion, keep_all_tokens, filter_out), - self.propagate_positions and PropagatePositions, - ]) + yield rule, wrapper_chain - yield origin, expansion, options, alias or origin, wrapper_chain - - def apply(self, transformer=None): + def create_callback(self, transformer=None): callback = Callback() - new_rules = [] - for origin, expansion, options, alias, wrapper_chain in self.rule_builders: - callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + user_callback_name = rule.alias or rule.origin try: - f = transformer._get_func(alias) + f = transformer._get_func(user_callback_name) except AttributeError: - f = NodeBuilder(self.tree_class, alias) + f = NodeBuilder(self.tree_class, user_callback_name) + + self.user_aliases[rule] = rule.alias + rule.alias = internal_callback_name for w in wrapper_chain: f = w(f) - if hasattr(callback, callback_name): - raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) - setattr(callback, callback_name, f) + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, 
internal_callback_name, f) - new_rules.append(( origin, expansion, callback_name, options )) + return callback - return new_rules, callback +###} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 37c6dd0..bc87921 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,5 +1,5 @@ import re -import sre_parse +from .utils import get_regexp_width from parsers.grammar_analysis import GrammarAnalyzer from .lexer import Lexer, ContextualLexer, Token @@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree class WithLexer: - def __init__(self, lexer_conf): + def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) + def init_contextual_lexer(self, lexer_conf, parser_conf): + self.lexer_conf = lexer_conf + d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} + always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () + self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + def lex(self, text): stream = self.lexer.lex(text) if self.lexer_conf.postlex: @@ -23,32 +29,22 @@ class WithLexer: class LALR(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) - - self.parser_conf = parser_conf self.parser = lalr_parser.Parser(parser_conf) + self.init_traditional_lexer(lexer_conf) def parse(self, text): - tokens = self.lex(text) - return self.parser.parse(tokens) + token_stream = self.lex(text) + return self.parser.parse(token_stream) -class LALR_ContextualLexer: +class LALR_ContextualLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - self.lexer_conf = lexer_conf - self.parser_conf = parser_conf - self.parser = lalr_parser.Parser(parser_conf) - - d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} - always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + self.init_contextual_lexer(lexer_conf, parser_conf) def parse(self, text): - tokens = self.lexer.lex(text) - if self.lexer_conf.postlex: - tokens = self.lexer_conf.postlex.process(tokens) - return self.parser.parse(tokens, self.lexer.set_parser_state) + token_stream = self.lex(text) + return self.parser.parse(token_stream, self.lexer.set_parser_state) def get_ambiguity_resolver(options): if not options or options.ambiguity == 'resolve': @@ -60,55 +56,47 @@ def get_ambiguity_resolver(options): raise ValueError(options) def tokenize_text(text): - new_text = [] line = 1 col_start_pos = 0 for i, ch in enumerate(text): if '\n' in ch: line += ch.count('\n') col_start_pos = i + ch.rindex('\n') - new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) - return new_text + yield Token('CHAR', ch, line=line, column=i - col_start_pos) class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): - self.token_by_name = {t.name:t for t in lexer_conf.tokens} + self._prepare_match(lexer_conf) - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] - - self.parser = earley.Parser(rules, - parser_conf.start, - parser_conf.callback, + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - for sym in expansion: - if 
is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) - yield Terminal_Regexp(sym, regexp) - else: - yield sym + + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + width = get_regexp_width(regexp) + if width != (1,1): + raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) + self.regexps[t.name] = re.compile(regexp) def parse(self, text): - new_text = tokenize_text(text) - return self.parser.parse(new_text) + token_stream = tokenize_text(text) + return self.parser.parse(token_stream) class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) - - rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] + self.init_traditional_lexer(lexer_conf) - self.parser = earley.Parser(rules, - parser_conf.start, - parser_conf.callback, + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] + def match(self, term, token): + return term == token.type def parse(self, text): tokens = self.lex(text) @@ -119,27 +107,31 @@ class XEarley: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] + self._prepare_match(lexer_conf) - ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] - - self.parser = xearley.Parser(rules, - parser_conf.start, - parser_conf.callback, + self.parser = xearley.Parser(parser_conf, + self.match, resolve_ambiguity=get_ambiguity_resolver(options), - ignore=ignore, + ignore=lexer_conf.ignore, predict_all=options.earley__predict_all ) - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - assert width - yield Terminal_Regexp(sym, regexp) + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + try: + width = get_regexp_width(regexp)[0] + except ValueError: + raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) else: - yield sym + if width == 0: + raise ValueError("Dynamic Earley doesn't allow zero-width regexps") + + self.regexps[t.name] = re.compile(regexp) def parse(self, text): return self.parser.parse(text) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index dbe6834..3586c22 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,14 +13,11 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer 
-class EndToken: - type = '$end' - class Derivation(Tree): _hash = None @@ -35,8 +32,6 @@ class Derivation(Tree): self._hash = Tree.__hash__(self) return self._hash -END_TOKEN = EndToken() - class Item(object): "An Earley Item, the atom of the algorithm." @@ -59,11 +54,8 @@ class Item(object): new_tree = Derivation(self.rule, self.tree.children + [tree]) return self.__class__(self.rule, self.ptr+1, self.start, new_tree) - def similar(self, other): - return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule - def __eq__(self, other): - return self.similar(other) #and (self.tree == other.tree) + return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule def __hash__(self): return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ @@ -134,7 +126,7 @@ class Column: self.completed[item_key] = item self.to_reduce.append(item) else: - if isinstance(item.expect, Terminal): + if is_terminal(item.expect): self.to_scan.append(item) else: k = item_key if self.predict_all else item @@ -151,31 +143,30 @@ class Column: __nonzero__ = __bool__ # Py2 backwards-compatibility class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start _Item = Item + match = self.term_matcher def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -195,14 +186,13 @@ class Parser: for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! 
(rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): next_set = Column(i, self.FIRST) - next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) + next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) if not next_set: expect = {i.expect for i in column.to_scan} @@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse): return callback(children) else: return Tree(rule.origin, children) - -# RULES = [ -# ('a', ['d']), -# ('d', ['b']), -# ('b', ['C']), -# ('b', ['b', 'C']), -# ('b', ['C', 'b']), -# ] -# p = Parser(RULES, 'a') -# for x in p.parse('CC'): -# print x.pretty() - -#--------------- -# RULES = [ -# ('s', ['a', 'a']), -# ('a', ['b', 'b']), -# ('b', ['C'], lambda (x,): x), -# ('b', ['b', 'C']), -# ] -# p = Parser(RULES, 's', {}) -# print p.parse('CCCCC').pretty() diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 9250c47..a8c7757 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -1,20 +1,8 @@ from ..utils import bfs, fzset from ..common import GrammarError, is_terminal +from ..grammar import Rule -class Rule(object): - """ - origin : a symbol - expansion : a list of symbols - """ - def __init__(self, origin, expansion, alias=None, options=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.options = options - - def __repr__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) class RulePtr(object): def __init__(self, rule, index): @@ -106,28 +94,30 @@ def calculate_sets(rules): class GrammarAnalyzer(object): - def __init__(self, rule_tuples, start_symbol, debug=False): - self.start_symbol = start_symbol + def __init__(self, parser_conf, debug=False): + rules = parser_conf.rules + assert len(rules) == len(set(rules)) + + self.start_symbol = parser_conf.start self.debug = debug - rule_tuples = list(rule_tuples) - rule_tuples.append(('$root', [start_symbol, '$end'])) - rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] - - self.rules = set() - self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} - for origin, exp, alias, options in rule_tuples: - r = Rule( origin, exp, alias, options ) - self.rules.add(r) - self.rules_by_origin[origin].append(r) - - for r in self.rules: + + root_rule = Rule('$root', [self.start_symbol, '$END']) + + self.rules_by_origin = {r.origin: [] for r in rules} + for r in rules: + self.rules_by_origin[r.origin].append(r) + + self.rules_by_origin[root_rule.origin] = [root_rule] + + for r in rules: for sym in r.expansion: if not (is_terminal(sym) or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) - self.init_state = self.expand_rule('$root') + self.start_state = self.expand_rule('$root') + self.rules = rules - self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) def expand_rule(self, rule): "Returns all init_ptrs accessible by rule (recursive)" diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index e763b08..6eb3fdf 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal from .grammar_analysis import GrammarAnalyzer -ACTION_SHIFT = 0 +class Action: + def __init__(self, name): + self.name = name + def __str__(self): + return self.name + def __repr__(self): + 
return str(self) + +Shift = Action('Shift') +Reduce = Action('Reduce') + +class ParseTable: + def __init__(self, states, start_state, end_state): + self.states = states + self.start_state = start_state + self.end_state = end_state + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_state = state_to_idx[parse_table.start_state] + end_state = state_to_idx[parse_table.end_state] + return cls(int_states, start_state, end_state) + + + class LALR_Analyzer(GrammarAnalyzer): @@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer): sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) for rp in sat: for term in self.FOLLOW.get(rp.rule.origin, ()): - lookahead[term].append(('reduce', rp.rule)) + lookahead[term].append((Reduce, rp.rule)) d = classify(unsat, lambda rp: rp.next) for sym, rps in d.items(): @@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer): rps |= self.expand_rule(rp.next) new_state = fzset(rps) - lookahead[sym].append(('shift', new_state)) - if sym == '$end': + lookahead[sym].append((Shift, new_state)) + if sym == '$END': self.end_states.append( new_state ) yield fzset(rps) @@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer): for x in v: # XXX resolving shift/reduce into shift, like PLY # Give a proper warning - if x[0] == 'shift': + if x[0] is Shift: lookahead[k] = [x] for k, v in lookahead.items(): @@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer): self.states[state] = {k:v[0] for k, v in lookahead.items()} - for _ in bfs([self.init_state], step): + for _ in bfs([self.start_state], step): pass self.end_state ,= self.end_states - # -- - self.enum = list(self.states) - self.enum_rev = {s:i for i,s in enumerate(self.enum)} - self.states_idx = {} - - for s, la in self.states.items(): - la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' - else (v[0], (v[1], len(v[1].expansion))) # Reduce - for k,v in la.items()} - self.states_idx[ self.enum_rev[s] ] = la + self._parse_table = ParseTable(self.states, self.start_state, self.end_state) + if self.debug: + self.parse_table = self._parse_table + else: + self.parse_table = IntParseTable.from_ParseTable(self._parse_table) - self.init_state_idx = self.enum_rev[self.init_state] - self.end_state_idx = self.enum_rev[self.end_state] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f224bec..eafc4ea 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,30 +3,30 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken +from ..common import UnexpectedToken -from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT - -class FinalReduce: - def __init__(self, value): - self.value = value +from .lalr_analysis import LALR_Analyzer, Shift class Parser: def __init__(self, parser_conf): - assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" - self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) + assert all(r.options is None or r.options.priority is None + for r in parser_conf.rules), "LALR doesn't yet support prioritization" + self.analysis = analysis = LALR_Analyzer(parser_conf) analysis.compute_lookahead() callbacks 
= {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} - self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) + self.parser_conf = parser_conf + self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse +###{standalone + class _Parser: - def __init__(self, states, init_state, end_state, callbacks): - self.states = states - self.init_state = init_state - self.end_state = end_state + def __init__(self, parse_table, callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state self.callbacks = callbacks def parse(self, seq, set_state=None): @@ -35,10 +35,10 @@ class _Parser: stream = iter(seq) states = self.states - state_stack = [self.init_state] + state_stack = [self.start_state] value_stack = [] - if set_state: set_state(self.init_state) + if set_state: set_state(self.start_state) def get_action(key): state = state_stack[-1] @@ -49,7 +49,8 @@ class _Parser: raise UnexpectedToken(token, expected, seq, i) - def reduce(rule, size): + def reduce(rule): + size = len(rule.expansion) if size: s = value_stack[-size:] del state_stack[-size:] @@ -60,7 +61,7 @@ class _Parser: value = self.callbacks[rule](s) _action, new_state = get_action(rule.origin) - assert _action == ACTION_SHIFT + assert _action is Shift state_stack.append(new_state) value_stack.append(value) @@ -72,22 +73,24 @@ class _Parser: action, arg = get_action(token.type) assert arg != self.end_state - if action == ACTION_SHIFT: + if action is Shift: state_stack.append(arg) value_stack.append(token) if set_state: set_state(arg) token = next(stream) i += 1 else: - reduce(*arg) + reduce(arg) except StopIteration: pass while True: - _action, arg = get_action('$end') - if _action == ACTION_SHIFT: + _action, arg = get_action('$END') + if _action is Shift: assert arg == self.end_state val ,= value_stack return val else: - reduce(*arg) + reduce(arg) + +###} diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 9b26190..420c469 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -20,7 +20,7 @@ from collections import defaultdict -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal from ..lexer import Token, UnexpectedInput from ..tree import Tree from .grammar_analysis import GrammarAnalyzer @@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity self.ignore = list(ignore) self.predict_all = predict_all - + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - - for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) + 
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start delayed_matches = defaultdict(list) + match = self.term_matcher text_line = 1 text_column = 0 def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -77,16 +74,15 @@ class Parser: column.add( predict(nonterm, column) ) for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): to_scan = column.to_scan for x in self.ignore: - m = x.match(stream, i) + m = match(x, stream, i) if m: delayed_matches[m.end()] += set(to_scan) delayed_matches[m.end()] += set(column.to_reduce) @@ -99,16 +95,16 @@ class Parser: # delayed_matches[m.end()] += to_scan for item in to_scan: - m = item.expect.match(stream, i) + m = match(item.expect, stream, i) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): - m = item.expect.match(s[:-j]) + m = match(item.expect, s[:-j]) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) @@ -131,7 +127,7 @@ class Parser: if token == '\n': text_line += 1 - text_column = 1 + text_column = 0 else: text_column += 1 @@ -143,7 +139,7 @@ class Parser: if n.rule.origin==start_symbol and n.start is column0] if not solutions: - expected_tokens = [t.expect.name for t in column.to_scan] + expected_tokens = [t.expect for t in column.to_scan] raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) elif len(solutions) == 1: diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py new file mode 100644 index 0000000..0444614 --- /dev/null +++ b/lark/tools/standalone.py @@ -0,0 +1,203 @@ +###{standalone +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. 
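# A small, self-contained illustration (not part of this patch) of the
# ###{name ... ###} markers that this generator relies on: blocks tagged
# "standalone" in Lark's own modules (see lalr_parser.py, tree.py and utils.py
# elsewhere in this patch) are collected verbatim and emitted into the generated
# parser. The toy extractor and SAMPLE input here are simplified stand-ins for
# the real extract_sections() defined further down in this file.
from collections import defaultdict

def toy_extract_sections(lines):
    sections = defaultdict(list)
    current = None
    for line in lines:
        if line.startswith('###{'):
            current = line[4:].strip()          # '###{standalone' -> 'standalone'
        elif line.startswith('###}'):
            current = None                      # close the open section
        elif current:
            sections[current].append(line)      # keep only lines inside a section
    return {name: ''.join(body) for name, body in sections.items()}

SAMPLE = [
    "import os\n",
    "###{standalone\n",
    "def keep_me(): pass\n",
    "###}\n",
    "def drop_me(): pass\n",
]

assert toy_extract_sections(SAMPLE) == {'standalone': 'def keep_me(): pass\n'}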
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . +# +# +###} + +import codecs +import sys +import os +from pprint import pprint +from os import path +from collections import defaultdict + +import lark +from lark import Lark +from lark.parsers.lalr_analysis import Shift, Reduce + +from ..grammar import Rule + +__dir__ = path.dirname(__file__) +__larkdir__ = path.join(__dir__, path.pardir) + + +EXTRACT_STANDALONE_FILES = [ + 'tools/standalone.py', + 'utils.py', + 'common.py', + 'tree.py', + 'indenter.py', + 'lexer.py', + 'parse_tree_builder.py', + 'parsers/lalr_parser.py', +] + + +def extract_sections(lines): + section = None + text = [] + sections = defaultdict(list) + for l in lines: + if l.startswith('###'): + if l[3] == '{': + section = l[4:].strip() + elif l[3] == '}': + sections[section] += text + section = None + text = [] + else: + raise ValueError(l) + elif section: + text.append(l) + + return {name:''.join(text) for name, text in sections.items()} + +class LexerAtoms: + def __init__(self, lexer): + self.mres = [(p.pattern,d) for p,d in lexer.mres] + self.newline_types = lexer.newline_types + self.ignore_types = lexer.ignore_types + self.callback = {name:[(p.pattern,d) for p,d in c.mres] + for name, c in lexer.callback.items()} + + def print_python(self): + print('import re') + print('MRES = (') + pprint(self.mres) + print(')') + print('LEXER_CALLBACK = (') + pprint(self.callback) + print(')') + print('NEWLINE_TYPES = %s' % self.newline_types) + print('IGNORE_TYPES = %s' % self.ignore_types) + print('class LexerRegexps: pass') + print('lexer_regexps = LexerRegexps()') + print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') + print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') + print(' for n, mres in LEXER_CALLBACK.items()}') + print('lexer = _Lex(lexer_regexps)') + print('def lex(stream):') + print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') + + +class GetRule: + def __init__(self, rule_id): + self.rule_id = rule_id + + def __repr__(self): + return 'RULES[%d]' % self.rule_id + +rule_ids = {} +token_types = {} + +def _get_token_type(token_type): + if token_type not in token_types: + token_types[token_type] = len(token_types) + return token_types[token_type] + +class ParserAtoms: + def __init__(self, parser): + self.parse_table = parser.analysis.parse_table + + def print_python(self): + print('class ParseTable: pass') + print('parse_table = ParseTable()') + print('STATES = {') + for state, actions in self.parse_table.states.items(): + print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()})) + print('}') + print('TOKEN_TYPES = (') + pprint({v:k for k, v in token_types.items()}) + print(')') + print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') + print(' for s, acts in STATES.items()}') + print('parse_table.start_state = %s' % self.parse_table.start_state) + print('parse_table.end_state = %s' % self.parse_table.end_state) + print('class Lark_StandAlone:') + print(' def __init__(self, transformer=None, postlex=None):') + print(' callback = parse_tree_builder.create_callback(transformer=transformer)') + print(' callbacks = {rule: getattr(callback, 
rule.alias or rule.origin, None) for rule in RULES.values()}') + print(' self.parser = _Parser(parse_table, callbacks)') + print(' self.postlex = postlex') + print(' def parse(self, stream):') + print(' tokens = lex(stream)') + print(' if self.postlex: tokens = self.postlex.process(tokens)') + print(' return self.parser.parse(tokens)') + +class TreeBuilderAtoms: + def __init__(self, lark): + self.rules = lark.rules + self.ptb = lark._parse_tree_builder + + def print_python(self): + print('RULES = {') + for i, r in enumerate(self.rules): + rule_ids[r] = i + print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print('}') + print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') + +def main(fn, start): + with codecs.open(fn, encoding='utf8') as f: + lark_inst = Lark(f, parser="lalr", start=start) + + lexer_atoms = LexerAtoms(lark_inst.parser.lexer) + parser_atoms = ParserAtoms(lark_inst.parser.parser) + tree_builder_atoms = TreeBuilderAtoms(lark_inst) + + print('# The file was automatically generated by Lark v%s' % lark.__version__) + + for pyfile in EXTRACT_STANDALONE_FILES: + print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) + + print(open(os.path.join(__larkdir__, 'grammar.py')).read()) + print('Shift = 0') + print('Reduce = 1') + lexer_atoms.print_python() + tree_builder_atoms.print_python() + parser_atoms.print_python() + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Lark Stand-alone Generator Tool") + print("Usage: python -m lark.tools.standalone []") + sys.exit(1) + + if len(sys.argv) == 3: + fn, start = sys.argv[1:] + elif len(sys.argv) == 2: + fn, start = sys.argv[1], 'start' + else: + assert False, sys.argv + + main(fn, start) diff --git a/lark/tree.py b/lark/tree.py index 7251ce6..28f4fb6 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -7,6 +7,7 @@ from copy import deepcopy from .utils import inline_args +###{standalone class Tree(object): def __init__(self, data, children, rule=None): self.data = data @@ -34,6 +35,7 @@ class Tree(object): def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) +###} def expand_kids_by_index(self, *indices): for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices @@ -100,6 +102,7 @@ class Tree(object): +###{standalone class Transformer(object): def _get_func(self, name): return getattr(self, name) @@ -139,7 +142,7 @@ class TransformerChain(object): def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) - + class InlineTransformer(Transformer): @@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer): def __default__(self, t): return t +###} def pydot__tree_to_png(tree, filename): diff --git a/lark/utils.py b/lark/utils.py index d984400..abe036f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,7 +1,4 @@ -import functools -import types from collections import deque -from contextlib import contextmanager class fzset(frozenset): def __repr__(self): @@ -49,8 +46,13 @@ try: except NameError: # Python 3 STRING_TYPE = str -Str = type(u'') +###{standalone +import types +import functools +from contextlib import contextmanager + +Str = type(u'') def inline_args(f): # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) @@ -76,19 +78,6 @@ def inline_args(f): return _f - -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - 
return 1 - else: - return -1 - - try: from contextlib import suppress # Python 3 except ImportError: @@ -107,6 +96,26 @@ except ImportError: except excs: pass +###} +try: + compare = cmp +except NameError: + def compare(a, b): + if a == b: + return 0 + elif a > b: + return 1 + else: + return -1 + + +import sre_parse +import sre_constants +def get_regexp_width(regexp): + try: + return sre_parse.parse(regexp).getwidth() + except sre_constants.error: + raise ValueError(regexp) diff --git a/tests/test_parser.py b/tests/test_parser.py index d93e33b..8e954e2 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("x")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b : "x" @@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("xx")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b -> c b : "x" """, parser='lalr', transformer=T()) r = g.parse("xx") self.assertEqual( r.children, [""] ) - + @@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. g = Lark(r"""start: B - B: ("ab"|/[^b]/)* + B: ("ab"|/[^b]/)+ """, lexer=LEXER) self.assertEqual( g.parse('abc').children[0], 'abc') @@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_twice_empty(self): + g = """!start: [["A"]] + """ + l = _Lark(g) + tree = l.parse('A') + self.assertEqual(tree.children, ['A']) + + tree = l.parse('') + self.assertEqual(tree.children, []) + + def test_undefined_ignore(self): + g = """!start: "A" + + %ignore B + """ + self.assertRaises( GrammarError, _Lark, g) + + @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO + def test_line_and_column(self): + g = r"""!start: "A" bc "D" + !bc: "B\nC" + """ + l = _Lark(g) + a, bc, d = l.parse("AB\nCD").children + self.assertEqual(a.line, 1) + self.assertEqual(a.column, 0) + + bc ,= bc.children + self.assertEqual(bc.line, 1) + self.assertEqual(bc.column, 1) + + self.assertEqual(d.line, 2) + self.assertEqual(d.column, 1) + + # self.assertEqual(a.end_line, 1) + # self.assertEqual(a.end_col, 1) + # self.assertEqual(bc.end_line, 2) + # self.assertEqual(bc.end_col, 1) + # self.assertEqual(d.end_line, 2) + # self.assertEqual(d.end_col, 2) + + + def test_reduce_cycle(self): """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. @@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER): parser = _Lark(grammar) - tree = parser.parse("int 1 ! This is a comment\n") + tree = parser.parse("int 1 ! This is a comment\n") self.assertEqual(tree.children, ['1']) tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! @@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) + @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/")
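# A quick demo (not part of this patch) of the new get_regexp_width() helper used
# by the frontends above: scanless Earley insists on terminals that are exactly one
# character wide, while the dynamic (XEarley) frontend only rejects terminals whose
# minimum width is zero. The example patterns are made up.
import sre_parse

def get_regexp_width(regexp):
    # Same idea as lark.utils.get_regexp_width: (min, max) width of the pattern.
    return sre_parse.parse(regexp).getwidth()

assert get_regexp_width(r'[ab]') == (1, 1)    # OK for scanless parsing (lexer=None)
assert get_regexp_width(r'[0-9]+')[0] == 1    # variable width: dynamic Earley only
assert get_regexp_width(r'[0-9]*')[0] == 0    # can match '': rejected by XEarley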