- Merging updated upstream into branch for file extension changes.
- Will push so the Pull Request has no remaining conflicts.
- Also will change the file type of the lark example grammar.
| @@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m | |||
| If you have any questions or want my assistance, you can email me at erezshin at gmail com. | |||
| I'm also available for contract work. | |||
| -- [Erez](https://github.com/erezsh) | |||
| @@ -7,9 +7,11 @@ | |||
| - [indented\_tree.py](indented_tree.py) - A demonstration of parsing indentation ("whitespace significant" language) | |||
| - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity | |||
| - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter. | |||
| - [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer) | |||
| ### Advanced | |||
| - [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser | |||
| - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) | |||
| - [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language | |||
| - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature | |||
| @@ -0,0 +1,81 @@ | |||
| # | |||
| # This demonstrates example-driven error reporting with the LALR parser | |||
| # | |||
| from lark import Lark, UnexpectedToken | |||
| from .json_parser import json_grammar # Using the grammar from the json_parser example | |||
| json_parser = Lark(json_grammar, parser='lalr') | |||
| class JsonSyntaxError(SyntaxError): | |||
| def __str__(self): | |||
| context, line, column = self.args | |||
| return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context) | |||
| class JsonMissingValue(JsonSyntaxError): | |||
| label = 'Missing Value' | |||
| class JsonMissingOpening(JsonSyntaxError): | |||
| label = 'Missing Opening' | |||
| class JsonMissingClosing(JsonSyntaxError): | |||
| label = 'Missing Closing' | |||
| class JsonMissingComma(JsonSyntaxError): | |||
| label = 'Missing Comma' | |||
| class JsonTrailingComma(JsonSyntaxError): | |||
| label = 'Trailing Comma' | |||
| def parse(json_text): | |||
| try: | |||
| j = json_parser.parse(json_text) | |||
| except UnexpectedToken as ut: | |||
| exc_class = ut.match_examples(json_parser.parse, { | |||
| JsonMissingValue: ['{"foo": }'], | |||
| JsonMissingOpening: ['{"foo": ]}', | |||
| '{"foor": }}'], | |||
| JsonMissingClosing: ['{"foo": [}', | |||
| '{', | |||
| '{"a": 1', | |||
| '[1'], | |||
| JsonMissingComma: ['[1 2]', | |||
| '[false 1]', | |||
| '["b" 1]', | |||
| '{"a":true 1:4}', | |||
| '{"a":1 1:4}', | |||
| '{"a":"b" 1:4}'], | |||
| JsonTrailingComma: ['[,]', | |||
| '[1,]', | |||
| '[1,2,]', | |||
| '{"foo":1,}', | |||
| '{"foo":false,"bar":true,}'] | |||
| }) | |||
| if not exc_class: | |||
| raise | |||
| raise exc_class(ut.get_context(json_text), ut.line, ut.column) | |||
| def test(): | |||
| try: | |||
| parse('{"key":') | |||
| except JsonMissingValue: | |||
| pass | |||
| try: | |||
| parse('{"key": "value"') | |||
| except JsonMissingClosing: | |||
| pass | |||
| try: | |||
| parse('{"key": ] ') | |||
| except JsonMissingOpening: | |||
| pass | |||
| if __name__ == '__main__': | |||
| test() | |||
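To make the new behavior concrete, here is a brief usage sketch (not part of the diff). It assumes it runs in the same module as the example above; `'[1 2]'` is one of the registered `JsonMissingComma` examples, so `match_examples` finds an exact state/token match and `parse()` raises the labeled error with a caret-marked context:

```python
# Usage sketch for the example above (line/column values are illustrative).
try:
    parse('[1 2]')          # missing comma between array items
except JsonMissingComma as e:
    print(e)
    # Expected shape of the output:
    #   Missing Comma at line 1, column 4.
    #
    #   [1 2]
    #      ^
```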
| @@ -0,0 +1,49 @@ | |||
| start: (_item | _NL)* | |||
| _item: rule | |||
| | token | |||
| | statement | |||
| rule: RULE priority? ":" expansions _NL | |||
| token: TOKEN priority? ":" expansions _NL | |||
| priority: "." NUMBER | |||
| statement: "%ignore" expansions _NL -> ignore | |||
| | "%import" import_args ["->" TOKEN] _NL -> import | |||
| import_args: name ("." name)* | |||
| ?expansions: alias (_VBAR alias)* | |||
| ?alias: expansion ["->" RULE] | |||
| ?expansion: expr* | |||
| ?expr: atom [OP | "~" NUMBER [".." NUMBER]] | |||
| ?atom: "(" expansions ")" | |||
| | "[" expansions "]" -> maybe | |||
| | STRING ".." STRING -> literal_range | |||
| | name | |||
| | (REGEXP | STRING) -> literal | |||
| name: RULE | |||
| | TOKEN | |||
| _VBAR: _NL? "|" | |||
| OP: /[+*][?]?|[?](?![a-z])/ | |||
| RULE: /!?[_?]?[a-z][_a-z0-9]*/ | |||
| TOKEN: /_?[A-Z][_A-Z0-9]*/ | |||
| STRING: _STRING "i"? | |||
| REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/ | |||
| _NL: /(\r?\n)+\s*/ | |||
| %import common.ESCAPED_STRING -> _STRING | |||
| %import common.INT -> NUMBER | |||
| %import common.WS_INLINE | |||
| COMMENT: "//" /[^\n]/* | |||
| %ignore WS_INLINE | |||
| %ignore COMMENT | |||
| @@ -0,0 +1,18 @@ | |||
| from lark import Lark | |||
| parser = Lark(open('examples/lark.g'), parser="lalr") | |||
| grammar_files = [ | |||
| 'examples/python2.g', | |||
| 'examples/python3.g', | |||
| 'examples/lark.g', | |||
| 'lark/grammars/common.g', | |||
| ] | |||
| def test(): | |||
| for grammar_file in grammar_files: | |||
| tree = parser.parse(open(grammar_file).read()) | |||
| print("All grammars parsed successfully") | |||
| if __name__ == '__main__': | |||
| test() | |||
| @@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError | |||
| from .lark import Lark | |||
| from .utils import inline_args | |||
| __version__ = "0.5.5" | |||
| __version__ = "0.5.6" | |||
| @@ -1,7 +1,7 @@ | |||
| import re | |||
| import sys | |||
| from .utils import get_regexp_width | |||
| from .utils import get_regexp_width, STRING_TYPE | |||
| Py36 = (sys.version_info[:2] >= (3, 6)) | |||
| @@ -17,12 +17,13 @@ class ParseError(Exception): | |||
| pass | |||
| class UnexpectedToken(ParseError): | |||
| def __init__(self, token, expected, seq, index, considered_rules=None): | |||
| def __init__(self, token, expected, seq, index, considered_rules=None, state=None): | |||
| self.token = token | |||
| self.expected = expected | |||
| self.line = getattr(token, 'line', '?') | |||
| self.column = getattr(token, 'column', '?') | |||
| self.considered_rules = considered_rules | |||
| self.state = state | |||
| try: | |||
| context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) | |||
| @@ -36,7 +37,36 @@ class UnexpectedToken(ParseError): | |||
| super(UnexpectedToken, self).__init__(message) | |||
| def match_examples(self, parse_fn, examples): | |||
| """ Given a parser instance and a dictionary mapping some label with | |||
| some malformed syntax examples, it'll return the label for the | |||
| example that bests matches the current error. | |||
| """ | |||
| assert self.state, "Not supported for this exception" | |||
| candidate = None | |||
| for label, example in examples.items(): | |||
| assert not isinstance(example, STRING_TYPE) | |||
| for malformed in example: | |||
| try: | |||
| parse_fn(malformed) | |||
| except UnexpectedToken as ut: | |||
| if ut.state == self.state: | |||
| if ut.token == self.token: # Try exact match first | |||
| return label | |||
| elif not candidate: | |||
| candidate = label | |||
| return candidate | |||
| def get_context(self, text, span=10): | |||
| pos = self.token.pos_in_stream | |||
| start = max(pos - span, 0) | |||
| end = pos + span | |||
| before = text[start:pos].rsplit('\n', 1)[-1] | |||
| after = text[pos:end].split('\n', 1)[0] | |||
| return before + after + '\n' + ' ' * len(before) + '^\n' | |||
| ###} | |||
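For orientation, a minimal self-contained sketch of how the two new methods fit together. The toy grammar and the label below are assumptions for illustration only, not code from this diff:

```python
from lark import Lark, UnexpectedToken

# Toy grammar (an assumption for illustration): input must be the two tokens "a" "b".
parser = Lark('start: "a" "b"', parser='lalr')

try:
    parser.parse('aa')                          # second token is wrong
except UnexpectedToken as ut:
    print(ut.get_context('aa'))                 # offending line plus a '^' marker under the error
    label = ut.match_examples(parser.parse, {
        'second item must be "b"': ['aa'],      # hypothetical label -> malformed examples
    })
    print(label)                                # -> 'second item must be "b"'
```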
| @@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER | |||
| // | |||
| // Strings | |||
| // | |||
| //STRING: /"(\\\"|\\\\|[^"\n])*?"i?/ | |||
| STRING_INNER: ("\\\""|/[^"]/) | |||
| ESCAPED_STRING: "\"" STRING_INNER* "\"" | |||
| @@ -172,7 +172,7 @@ class Lark: | |||
| def _build_parser(self): | |||
| self.parser_class = get_frontend(self.options.parser, self.options.lexer) | |||
| self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) | |||
| self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') | |||
| callback = self._parse_tree_builder.create_callback(self.options.transformer) | |||
| if self.profiler: | |||
| for f in dir(callback): | |||
| @@ -25,6 +25,8 @@ class UnexpectedInput(LexError): | |||
| self.considered_rules = considered_rules | |||
| class Token(Str): | |||
| __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') | |||
| def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): | |||
| self = super(Token, cls).__new__(cls, value) | |||
| self.type = type_ | |||
| @@ -39,7 +41,7 @@ class Token(Str): | |||
| return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) | |||
| def __reduce__(self): | |||
| return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, )) | |||
| return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) | |||
| def __repr__(self): | |||
| return 'Token(%s, %r)' % (self.type, self.value) | |||
| @@ -141,6 +143,8 @@ def _create_unless(tokens): | |||
| for retok in tokens_by_type.get(PatternRE, []): | |||
| unless = [] # {} | |||
| for strtok in tokens_by_type.get(PatternStr, []): | |||
| if strtok.priority > retok.priority: | |||
| continue | |||
| s = strtok.pattern.value | |||
| m = re.match(retok.pattern.to_regexp(), s) | |||
| if m and m.group(0) == s: | |||
| @@ -14,7 +14,7 @@ from .parsers.lalr_parser import UnexpectedToken | |||
| from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef | |||
| from .grammar import RuleOptions, Rule | |||
| from .tree import Tree as T, Transformer, InlineTransformer, Visitor | |||
| from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST | |||
| __path__ = os.path.dirname(__file__) | |||
| IMPORT_PATHS = [os.path.join(__path__, 'grammars')] | |||
| @@ -122,7 +122,7 @@ RULES = { | |||
| 'statement': ['ignore', 'import'], | |||
| 'ignore': ['_IGNORE expansions _NL'], | |||
| 'import': ['_IMPORT import_args _NL', | |||
| '_IMPORT import_args _TO TOKEN'], | |||
| '_IMPORT import_args _TO TOKEN _NL'], | |||
| 'import_args': ['_import_args'], | |||
| '_import_args': ['name', '_import_args _DOT name'], | |||
| @@ -145,14 +145,14 @@ class EBNF_to_BNF(InlineTransformer): | |||
| new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
| self.i += 1 | |||
| t = Token('RULE', new_name, -1) | |||
| tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||
| tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) | |||
| self.new_rules.append((new_name, tree, self.rule_options)) | |||
| self.rules_by_expr[expr] = t | |||
| return t | |||
| def expr(self, rule, op, *args): | |||
| if op.value == '?': | |||
| return T('expansions', [rule, T('expansion', [])]) | |||
| return ST('expansions', [rule, ST('expansion', [])]) | |||
| elif op.value == '+': | |||
| # a : b c+ d | |||
| # --> | |||
| @@ -165,7 +165,7 @@ class EBNF_to_BNF(InlineTransformer): | |||
| # a : b _c? d | |||
| # _c : _c c | c; | |||
| new_name = self._add_recurse_rule('star', rule) | |||
| return T('expansions', [new_name, T('expansion', [])]) | |||
| return ST('expansions', [new_name, ST('expansion', [])]) | |||
| elif op.value == '~': | |||
| if len(args) == 1: | |||
| mn = mx = int(args[0]) | |||
| @@ -173,7 +173,7 @@ class EBNF_to_BNF(InlineTransformer): | |||
| mn, mx = map(int, args) | |||
| if mx < mn: | |||
| raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) | |||
| return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)]) | |||
| return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) | |||
| assert False, op | |||
| @@ -183,7 +183,7 @@ class SimplifyRule_Visitor(Visitor): | |||
| def _flatten(tree): | |||
| while True: | |||
| to_expand = [i for i, child in enumerate(tree.children) | |||
| if isinstance(child, T) and child.data == tree.data] | |||
| if isinstance(child, Tree) and child.data == tree.data] | |||
| if not to_expand: | |||
| break | |||
| tree.expand_kids_by_index(*to_expand) | |||
| @@ -203,9 +203,9 @@ class SimplifyRule_Visitor(Visitor): | |||
| self._flatten(tree) | |||
| for i, child in enumerate(tree.children): | |||
| if isinstance(child, T) and child.data == 'expansions': | |||
| if isinstance(child, Tree) and child.data == 'expansions': | |||
| tree.data = 'expansions' | |||
| tree.children = [self.visit(T('expansion', [option if i==j else other | |||
| tree.children = [self.visit(ST('expansion', [option if i==j else other | |||
| for j, other in enumerate(tree.children)])) | |||
| for option in set(child.children)] | |||
| break | |||
| @@ -217,7 +217,7 @@ class SimplifyRule_Visitor(Visitor): | |||
| if rule.data == 'expansions': | |||
| aliases = [] | |||
| for child in tree.children[0].children: | |||
| aliases.append(T('alias', [child, alias_name])) | |||
| aliases.append(ST('alias', [child, alias_name])) | |||
| tree.data = 'expansions' | |||
| tree.children = aliases | |||
| @@ -239,7 +239,7 @@ class RuleTreeToText(Transformer): | |||
| class CanonizeTree(InlineTransformer): | |||
| def maybe(self, expr): | |||
| return T('expr', [expr, Token('OP', '?', -1)]) | |||
| return ST('expr', [expr, Token('OP', '?', -1)]) | |||
| def tokenmods(self, *args): | |||
| if len(args) == 1: | |||
| @@ -353,7 +353,7 @@ def _literal_to_pattern(literal): | |||
| class PrepareLiterals(InlineTransformer): | |||
| def literal(self, literal): | |||
| return T('pattern', [_literal_to_pattern(literal)]) | |||
| return ST('pattern', [_literal_to_pattern(literal)]) | |||
| def range(self, start, end): | |||
| assert start.type == end.type == 'STRING' | |||
| @@ -361,13 +361,13 @@ class PrepareLiterals(InlineTransformer): | |||
| end = end.value[1:-1] | |||
| assert len(start) == len(end) == 1, (start, end, len(start), len(end)) | |||
| regexp = '[%s-%s]' % (start, end) | |||
| return T('pattern', [PatternRE(regexp)]) | |||
| return ST('pattern', [PatternRE(regexp)]) | |||
| class SplitLiterals(InlineTransformer): | |||
| def pattern(self, p): | |||
| if isinstance(p, PatternStr) and len(p.value)>1: | |||
| return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) | |||
| return T('pattern', [p]) | |||
| return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) | |||
| return ST('pattern', [p]) | |||
| class TokenTreeToPattern(Transformer): | |||
| def pattern(self, ps): | |||
| @@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer): | |||
| return p | |||
| def expansion(self, items): | |||
| assert items | |||
| if len(items) == 1: | |||
| return items[0] | |||
| if len({i.flags for i in items}) > 1: | |||
| @@ -402,18 +403,20 @@ class TokenTreeToPattern(Transformer): | |||
| assert len(args) == 2 | |||
| return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) | |||
| def alias(self, t): | |||
| raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") | |||
| def _interleave(l, item): | |||
| for e in l: | |||
| yield e | |||
| if isinstance(e, T): | |||
| if isinstance(e, Tree): | |||
| if e.data in ('literal', 'range'): | |||
| yield item | |||
| elif is_terminal(e): | |||
| yield item | |||
| def _choice_of_rules(rules): | |||
| return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | |||
| return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) | |||
| class Grammar: | |||
| def __init__(self, rule_defs, token_defs, ignore): | |||
| @@ -440,9 +443,9 @@ class Grammar: | |||
| if r == start: | |||
| exp.children = [expr] + exp.children | |||
| for exp in tree.find_data('expr'): | |||
| exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr))) | |||
| exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr))) | |||
| _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')]) | |||
| _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')]) | |||
| rule_defs.append(('__ignore', _ignore_tree, None)) | |||
| # Convert all tokens to rules | |||
| @@ -455,6 +458,9 @@ class Grammar: | |||
| exp.children[i] = Token(sym.type, new_terminal_names[sym]) | |||
| for name, (tree, priority) in term_defs: # TODO transfer priority to rule? | |||
| if any(tree.find_data('alias')): | |||
| raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") | |||
| if name.startswith('_'): | |||
| options = RuleOptions(filter_out=True, priority=-priority) | |||
| else: | |||
| @@ -481,6 +487,11 @@ class Grammar: | |||
| # Convert token-trees to strings/regexps | |||
| transformer = PrepareLiterals() * TokenTreeToPattern() | |||
| for name, (token_tree, priority) in token_defs: | |||
| for t in token_tree.find_data('expansion'): | |||
| if not t.children: | |||
| raise GrammarError("Tokens cannot be empty (%s)" % name) | |||
| tokens = [TokenDef(name, transformer.transform(token_tree), priority) | |||
| for name, (token_tree, priority) in token_defs] | |||
| @@ -516,7 +527,7 @@ class Grammar: | |||
| for expansion, alias in expansions: | |||
| if alias and name.startswith('_'): | |||
| raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) | |||
| raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) | |||
| rule = Rule(name, expansion, alias, options) | |||
| compiled_rules.append(rule) | |||
| @@ -579,7 +590,7 @@ class GrammarLoader: | |||
| rules = [options_from_rule(name, x) for name, x in RULES.items()] | |||
| rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] | |||
| callback = ParseTreeBuilder(rules, T).create_callback() | |||
| callback = ParseTreeBuilder(rules, ST).create_callback() | |||
| lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) | |||
| parser_conf = ParserConf(rules, callback, 'start') | |||
| @@ -595,14 +606,22 @@ class GrammarLoader: | |||
| except UnexpectedInput as e: | |||
| raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name)) | |||
| except UnexpectedToken as e: | |||
| if e.expected == ['_COLON']: | |||
| raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column)) | |||
| elif e.expected == ['RULE']: | |||
| raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column)) | |||
| context = e.get_context(grammar_text) | |||
| error = e.match_examples(self.parser.parse, { | |||
| 'Unclosed parenthesis': ['a: (\n'], | |||
| 'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], | |||
| 'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], | |||
| 'Alias expects lowercase name': ['a: -> "a"\n'], | |||
| 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], | |||
| 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], | |||
| 'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'], | |||
| '%import expects a name': ['%import "a"\n'], | |||
| '%ignore expects a value': ['%ignore %import\n'], | |||
| }) | |||
| if error: | |||
| raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context)) | |||
| elif 'STRING' in e.expected: | |||
| raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column)) | |||
| elif e.expected == ['_OR']: | |||
| raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column)) | |||
| raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) | |||
| raise | |||
| # Extract grammar items | |||
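Roughly, the effect for grammar authors is that a malformed grammar should now fail with one of the example-matched messages plus a context snippet. A sketch (exact message text and positions are illustrative; `GrammarError` is imported from `lark.common`, consistent with the imports shown earlier in this file's hunk):

```python
from lark import Lark
from lark.common import GrammarError

# Each malformed grammar below is one of the registered examples above,
# so loading it should fail with the matched, human-readable message.
for bad_grammar in ['a: (\n', 'a: )\n', 'a\n']:
    try:
        Lark(bad_grammar)
    except GrammarError as e:
        print(e)   # e.g. "Unclosed parenthesis at line ... column ...\n\n<context with ^>"
```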
| @@ -57,6 +57,19 @@ class ChildFilter: | |||
| self.node_builder = node_builder | |||
| self.to_include = to_include | |||
| def __call__(self, children): | |||
| filtered = [] | |||
| for i, to_expand in self.to_include: | |||
| if to_expand: | |||
| filtered += children[i].children | |||
| else: | |||
| filtered.append(children[i]) | |||
| return self.node_builder(filtered) | |||
| class ChildFilterLALR(ChildFilter): | |||
| "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" | |||
| def __call__(self, children): | |||
| filtered = [] | |||
| for i, to_expand in self.to_include: | |||
| @@ -73,21 +86,22 @@ class ChildFilter: | |||
| def _should_expand(sym): | |||
| return not is_terminal(sym) and sym.startswith('_') | |||
| def maybe_create_child_filter(expansion, filter_out): | |||
| def maybe_create_child_filter(expansion, filter_out, ambiguous): | |||
| to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] | |||
| if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): | |||
| return partial(ChildFilter, to_include) | |||
| return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) | |||
| class Callback(object): | |||
| pass | |||
| class ParseTreeBuilder: | |||
| def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): | |||
| def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): | |||
| self.tree_class = tree_class | |||
| self.propagate_positions = propagate_positions | |||
| self.always_keep_all_tokens = keep_all_tokens | |||
| self.ambiguous = ambiguous | |||
| self.rule_builders = list(self._init_builders(rules)) | |||
| @@ -107,7 +121,7 @@ class ParseTreeBuilder: | |||
| wrapper_chain = filter(None, [ | |||
| create_token and partial(CreateToken, create_token), | |||
| (expand_single_child and not rule.alias) and ExpandSingleChild, | |||
| maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out), | |||
| maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous), | |||
| self.propagate_positions and PropagatePositions, | |||
| ]) | |||
| @@ -15,9 +15,9 @@ class WithLexer: | |||
| def init_contextual_lexer(self, lexer_conf, parser_conf): | |||
| self.lexer_conf = lexer_conf | |||
| d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | |||
| states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} | |||
| always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||
| self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks) | |||
| self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks) | |||
| def lex(self, text): | |||
| stream = self.lexer.lex(text) | |||
| @@ -145,16 +145,16 @@ class Column: | |||
| class Parser: | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||
| self.analysis = GrammarAnalyzer(parser_conf) | |||
| analysis = GrammarAnalyzer(parser_conf) | |||
| self.parser_conf = parser_conf | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.FIRST = self.analysis.FIRST | |||
| self.FIRST = analysis.FIRST | |||
| self.postprocess = {} | |||
| self.predictions = {} | |||
| for rule in parser_conf.rules: | |||
| self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
| self.term_matcher = term_matcher | |||
| @@ -5,6 +5,8 @@ from ..grammar import Rule | |||
| class RulePtr(object): | |||
| __slots__ = ('rule', 'index') | |||
| def __init__(self, rule, index): | |||
| assert isinstance(rule, Rule) | |||
| assert index <= len(rule.expansion) | |||
| @@ -134,7 +136,8 @@ class GrammarAnalyzer(object): | |||
| if not is_terminal(new_r): | |||
| yield new_r | |||
| _ = list(bfs([rule], _expand_rule)) | |||
| for _ in bfs([rule], _expand_rule): | |||
| pass | |||
| return fzset(init_ptrs) | |||
| @@ -2,7 +2,6 @@ | |||
| """ | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| from ..common import UnexpectedToken | |||
| from .lalr_analysis import LALR_Analyzer, Shift | |||
| @@ -11,11 +10,12 @@ class Parser: | |||
| def __init__(self, parser_conf): | |||
| assert all(r.options is None or r.options.priority is None | |||
| for r in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
| self.analysis = analysis = LALR_Analyzer(parser_conf) | |||
| analysis = LALR_Analyzer(parser_conf) | |||
| analysis.compute_lookahead() | |||
| callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
| for rule in parser_conf.rules} | |||
| self._parse_table = analysis.parse_table | |||
| self.parser_conf = parser_conf | |||
| self.parser = _Parser(analysis.parse_table, callbacks) | |||
| self.parse = self.parser.parse | |||
| @@ -46,8 +46,7 @@ class _Parser: | |||
| return states[state][key] | |||
| except KeyError: | |||
| expected = states[state].keys() | |||
| raise UnexpectedToken(token, expected, seq, i) | |||
| raise UnexpectedToken(token, expected, seq, i, state=state) | |||
| def reduce(rule): | |||
| size = len(rule.expansion) | |||
| @@ -9,11 +9,7 @@ from ..tree import Tree, Visitor_NoRecurse | |||
| # Author: Erez Sh | |||
| def _compare_rules(rule1, rule2): | |||
| c = -compare( len(rule1.expansion), len(rule2.expansion)) | |||
| if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here | |||
| c = -c | |||
| return c | |||
| return -compare( len(rule1.expansion), len(rule2.expansion)) | |||
| def _sum_priority(tree): | |||
| p = 0 | |||
| @@ -126,7 +126,7 @@ def _get_token_type(token_type): | |||
| class ParserAtoms: | |||
| def __init__(self, parser): | |||
| self.parse_table = parser.analysis.parse_table | |||
| self.parse_table = parser._parse_table | |||
| def print_python(self): | |||
| print('class ParseTable: pass') | |||
| @@ -99,6 +99,8 @@ class Tree(object): | |||
| self.data = data | |||
| self.children = children | |||
| class SlottedTree(Tree): | |||
| __slots__ = 'data', 'children', 'rule' | |||
| ###{standalone | |||
| @@ -172,6 +174,30 @@ class Visitor_NoRecurse(Visitor): | |||
| return tree | |||
| from functools import wraps | |||
| def visit_children_decor(func): | |||
| @wraps(func) | |||
| def inner(cls, tree): | |||
| values = cls.visit_children(tree) | |||
| return func(cls, values) | |||
| return inner | |||
| class Interpreter(object): | |||
| def visit(self, tree): | |||
| return getattr(self, tree.data)(tree) | |||
| def visit_children(self, tree): | |||
| return [self.visit(child) if isinstance(child, Tree) else child | |||
| for child in tree.children] | |||
| def __getattr__(self, name): | |||
| return self.__default__ | |||
| def __default__(self, tree): | |||
| return self.visit_children(tree) | |||
| class Transformer_NoRecurse(Transformer): | |||
| def transform(self, tree): | |||
| subtrees = list(tree.iter_subtrees()) | |||
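For context, a small self-contained sketch of the new `Interpreter` (top-down; each method receives the tree node and decides whether and when to visit children, unlike `Transformer`) together with `visit_children_decor`. The toy "calculator" tree and class are assumptions for illustration; the PR's own `test_interp` below exercises the same API:

```python
from lark.tree import Tree, Interpreter, visit_children_decor

t = Tree('add', [Tree('num', ['1']), Tree('num', ['2'])])

class Calc(Interpreter):
    def num(self, tree):
        # Methods get the tree node itself, not pre-transformed children
        return int(tree.children[0])

    @visit_children_decor
    def add(self, values):
        # With the decorator, children are visited first and their results passed in
        return sum(values)

print(Calc().visit(t))   # -> 3
```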
| @@ -187,17 +187,22 @@ def _make_full_earley_test(LEXER): | |||
| l.parse(program) | |||
| def test_earley_scanless3(self): | |||
| "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" | |||
| # XXX Fails for scanless mode | |||
| # XXX Decided not to fix, because | |||
| # a) It's a subtle bug | |||
| # b) Scanless is intended for deprecation | |||
| # | |||
| # def test_earley_scanless3(self): | |||
| # "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" | |||
| grammar = """ | |||
| start: A A | |||
| A: "a"+ | |||
| """ | |||
| # grammar = """ | |||
| # start: A A | |||
| # A: "a"+ | |||
| # """ | |||
| l = Lark(grammar, parser='earley', lexer=LEXER) | |||
| res = l.parse("aaa") | |||
| self.assertEqual(res.children, ['aa', 'a']) | |||
| # l = Lark(grammar, parser='earley', lexer=LEXER) | |||
| # res = l.parse("aaa") | |||
| # self.assertEqual(res.children, ['aa', 'a']) | |||
| def test_earley_scanless4(self): | |||
| grammar = """ | |||
| @@ -293,6 +298,39 @@ def _make_full_earley_test(LEXER): | |||
| self.assertEqual(res, expected) | |||
| def test_explicit_ambiguity2(self): | |||
| grammar = r""" | |||
| start: NAME+ | |||
| NAME: /\w+/ | |||
| %ignore " " | |||
| """ | |||
| text = """cat""" | |||
| parser = Lark(grammar, start='start', ambiguity='explicit') | |||
| tree = parser.parse(text) | |||
| self.assertEqual(tree.data, '_ambig') | |||
| combinations = {tuple(str(s) for s in t.children) for t in tree.children} | |||
| self.assertEqual(combinations, { | |||
| ('cat',), | |||
| ('ca', 't'), | |||
| ('c', 'at'), | |||
| ('c', 'a' ,'t') | |||
| }) | |||
| def test_term_ambig_resolve(self): | |||
| grammar = r""" | |||
| !start: NAME+ | |||
| NAME: /\w+/ | |||
| %ignore " " | |||
| """ | |||
| text = """foo bar""" | |||
| parser = Lark(grammar) | |||
| tree = parser.parse(text) | |||
| self.assertEqual(tree.children, ['foo', 'bar']) | |||
| @@ -822,6 +860,12 @@ def _make_parser_test(LEXER, PARSER): | |||
| """ | |||
| self.assertRaises( GrammarError, _Lark, g) | |||
| def test_alias_in_terminal(self): | |||
| g = """start: TERM | |||
| TERM: "a" -> alias | |||
| """ | |||
| self.assertRaises( GrammarError, _Lark, g) | |||
| @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO | |||
| def test_line_and_column(self): | |||
| g = r"""!start: "A" bc "D" | |||
| @@ -1129,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER): | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | |||
| @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX | |||
| def test_priority_vs_embedded(self): | |||
| g = """ | |||
| A.2: "a" | |||
| WORD: ("a".."z")+ | |||
| start: (A | WORD)+ | |||
| """ | |||
| l = _Lark(g) | |||
| t = l.parse('abc') | |||
| self.assertEqual(t.children, ['a', 'bc']) | |||
| self.assertEqual(t.children[0].type, 'A') | |||
| @@ -5,7 +5,7 @@ from unittest import TestCase | |||
| import copy | |||
| import pickle | |||
| from lark.tree import Tree | |||
| from lark.tree import Tree, Interpreter, visit_children_decor | |||
| class TestTrees(TestCase): | |||
| @@ -21,6 +21,45 @@ class TestTrees(TestCase): | |||
| assert pickle.loads(data) == s | |||
| def test_interp(self): | |||
| t = Tree('a', [Tree('b', []), Tree('c', []), 'd']) | |||
| class Interp1(Interpreter): | |||
| def a(self, tree): | |||
| return self.visit_children(tree) + ['e'] | |||
| def b(self, tree): | |||
| return 'B' | |||
| def c(self, tree): | |||
| return 'C' | |||
| self.assertEqual(Interp1().visit(t), list('BCde')) | |||
| class Interp2(Interpreter): | |||
| @visit_children_decor | |||
| def a(self, values): | |||
| return values + ['e'] | |||
| def b(self, tree): | |||
| return 'B' | |||
| def c(self, tree): | |||
| return 'C' | |||
| self.assertEqual(Interp2().visit(t), list('BCde')) | |||
| class Interp3(Interpreter): | |||
| def b(self, tree): | |||
| return 'B' | |||
| def c(self, tree): | |||
| return 'C' | |||
| self.assertEqual(Interp3().visit(t), list('BCd')) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||