From 138f1d5d76d1a98e2489cb12503ec907fb6f6ed5 Mon Sep 17 00:00:00 2001 From: Parker <6646825+psboyce@users.noreply.github.com> Date: Sat, 14 Apr 2018 23:10:28 -0600 Subject: [PATCH 01/21] Fix order of members when pickling Token I found this while porting Token to C, essentially the value and pos_in_stream members of Token were swapped in ``__reduce__``, which means running ``pickle.loads`` and ``pickle.dumps`` would result in unpickled tokens whose value was the original's position in stream, and vice versa. In my C extension this caused a TypeError exception, but the behavior will have to be corrected in both. --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 0a46ee1..938d22b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -41,7 +41,7 @@ class Token(Str): return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) def __reduce__(self): - return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, )) + return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) def __repr__(self): return 'Token(%s, %r)' % (self.type, self.value) From e69d567bce7376fd968471f3907c0d99bb9a46d3 Mon Sep 17 00:00:00 2001 From: DrSlump Date: Sun, 15 Apr 2018 12:42:13 +0200 Subject: [PATCH 02/21] example driven parser errors --- lark/common.py | 30 ++++++++++++++++++++++++++++-- lark/parsers/lalr_parser.py | 4 +--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/lark/common.py b/lark/common.py index 7611a2c..4091136 100644 --- a/lark/common.py +++ b/lark/common.py @@ -17,12 +17,13 @@ class ParseError(Exception): pass class UnexpectedToken(ParseError): - def __init__(self, token, expected, seq, index, considered_rules=None): + def __init__(self, token, expected, seq, index, considered_rules=None, state=None): self.token = token self.expected = expected self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.considered_rules = considered_rules + self.state = state try: context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) @@ -36,7 +37,32 @@ class UnexpectedToken(ParseError): super(UnexpectedToken, self).__init__(message) - + def match_examples(self, parse_fn, examples): + """ Given a parser instance and a dictionary mapping some label with + some malformed syntax examples, it'll return the label for the + example that bests matches the current error. 
+ """ + if not self.state: + return None + + candidate = None + for label,example in examples.items(): + if not isinstance(example, (tuple, list)): + example = [example] + + for malformed in example: + try: + parse_fn(malformed) + except UnexpectedToken as ut: + if ut.state == self.state: + if ut.token == self.token: + return label + elif not candidate: + candidate = label + except: + pass + + return candidate ###} diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index a20db07..baea614 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -2,7 +2,6 @@ """ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com - from ..common import UnexpectedToken from .lalr_analysis import LALR_Analyzer, Shift @@ -47,8 +46,7 @@ class _Parser: return states[state][key] except KeyError: expected = states[state].keys() - - raise UnexpectedToken(token, expected, seq, i) + raise UnexpectedToken(token, expected, seq, i, state=state) def reduce(rule): size = len(rule.expansion) From 880f42dd1273f30f76f9f2c9ab116b26d923a684 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 18 Apr 2018 12:33:47 +0300 Subject: [PATCH 03/21] Corrections to PR and added get_context --- lark/common.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lark/common.py b/lark/common.py index 4091136..84a4139 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,7 +1,7 @@ import re import sys -from .utils import get_regexp_width +from .utils import get_regexp_width, STRING_TYPE Py36 = (sys.version_info[:2] >= (3, 6)) @@ -42,27 +42,31 @@ class UnexpectedToken(ParseError): some malformed syntax examples, it'll return the label for the example that bests matches the current error. """ - if not self.state: - return None + assert self.state, "Not supported for this exception" candidate = None - for label,example in examples.items(): - if not isinstance(example, (tuple, list)): - example = [example] + for label, example in examples.items(): + assert not isinstance(example, STRING_TYPE) for malformed in example: try: parse_fn(malformed) except UnexpectedToken as ut: if ut.state == self.state: - if ut.token == self.token: + if ut.token == self.token: # Try exact match first return label elif not candidate: candidate = label - except: - pass return candidate + + def get_context(self, text, span=10): + pos = self.token.pos_in_stream + start = max(pos - span, 0) + end = pos + span + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' ###} From 599b80e30af85fb49f0621ff5d6c808770584c22 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 18 Apr 2018 12:37:57 +0300 Subject: [PATCH 04/21] Added example for error reporting with LALR --- examples/README.md | 1 + examples/error_reporting_lalr.py | 81 ++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 examples/error_reporting_lalr.py diff --git a/examples/README.md b/examples/README.md index 88d3bb0..3fbe3ea 100644 --- a/examples/README.md +++ b/examples/README.md @@ -10,6 +10,7 @@ ### Advanced +- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) 
- [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature diff --git a/examples/error_reporting_lalr.py b/examples/error_reporting_lalr.py new file mode 100644 index 0000000..a1055fd --- /dev/null +++ b/examples/error_reporting_lalr.py @@ -0,0 +1,81 @@ +# +# This demonstrates example-driven error reporting with the LALR parser +# + +from lark import Lark, UnexpectedToken + +from .json_parser import json_grammar # Using the grammar from the json_parser example + +json_parser = Lark(json_grammar, parser='lalr') + +class JsonSyntaxError(SyntaxError): + def __str__(self): + context, line, column = self.args + return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context) + +class JsonMissingValue(JsonSyntaxError): + label = 'Missing Value' + +class JsonMissingOpening(JsonSyntaxError): + label = 'Missing Opening' + +class JsonMissingClosing(JsonSyntaxError): + label = 'Missing Closing' + +class JsonMissingComma(JsonSyntaxError): + label = 'Missing Comma' + +class JsonTrailingComma(JsonSyntaxError): + label = 'Trailing Comma' + + +def parse(json_text): + try: + j = json_parser.parse(json_text) + except UnexpectedToken as ut: + exc_class = ut.match_examples(json_parser.parse, { + JsonMissingValue: ['{"foo": }'], + JsonMissingOpening: ['{"foo": ]}', + '{"foor": }}'], + JsonMissingClosing: ['{"foo": [}', + '{', + '{"a": 1', + '[1'], + JsonMissingComma: ['[1 2]', + '[false 1]', + '["b" 1]', + '{"a":true 1:4}', + '{"a":1 1:4}', + '{"a":"b" 1:4}'], + JsonTrailingComma: ['[,]', + '[1,]', + '[1,2,]', + '{"foo":1,}', + '{"foo":false,"bar":true,}'] + }) + if not exc_class: + raise + raise exc_class(ut.get_context(json_text), ut.line, ut.column) + + +def test(): + try: + parse('{"key":') + except JsonMissingValue: + pass + + try: + parse('{"key": "value"') + except JsonMissingClosing: + pass + + try: + parse('{"key": ] ') + except JsonMissingOpening: + pass + + +if __name__ == '__main__': + test() + + From 9848cac9f0b1e988214d12b1d4cd8972a2b5444f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 18 Apr 2018 13:27:49 +0300 Subject: [PATCH 05/21] Improved Lark's error reporting for grammar syntax errors (Based on PR #129) --- lark/load_grammar.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a6b2d82..43d1bf5 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -600,14 +600,22 @@ class GrammarLoader: except UnexpectedInput as e: raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name)) except UnexpectedToken as e: - if e.expected == ['_COLON']: - raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column)) - elif e.expected == ['RULE']: - raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column)) + context = e.get_context(grammar_text) + error = e.match_examples(self.parser.parse, { + 'Unclosed parenthesis': ['a: (\n'], + 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], + 'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], + 'Alias expects lowercase name': ['a: -> "a"\n'], + 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], + 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], + 'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'], + 
'%import expects a name': ['%import "a"\n'], + '%ignore expects a value': ['%ignore %import\n'], + }) + if error: + raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context)) elif 'STRING' in e.expected: - raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column)) - elif e.expected == ['_OR']: - raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column)) + raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) raise # Extract grammar items From f5550b30403996bb5d1f24abc5b36c8e58b0c84f Mon Sep 17 00:00:00 2001 From: Ramon Klass Date: Thu, 19 Apr 2018 16:57:43 +0200 Subject: [PATCH 06/21] Implemented a new visitor class (Interpreter) that works top-down (PR #130) It emulates antlr's visitor behavior for a dynamic evaluation order of subtrees --- lark/tree.py | 24 ++++++++++++++++++++++++ tests/test_trees.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/lark/tree.py b/lark/tree.py index d496d75..e6d5ed7 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -174,6 +174,30 @@ class Visitor_NoRecurse(Visitor): return tree +from functools import wraps +def visit_children_decor(func): + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + +class Interpreter(object): + + def visit(self, tree): + return getattr(self, tree.data)(tree) + + def visit_children(self, tree): + return [self.visit(child) if isinstance(child, Tree) else child + for child in tree.children] + + def __getattr__(self, name): + return self.__default__ + + def __default__(self, tree): + self.visit_children(tree) + + class Transformer_NoRecurse(Transformer): def transform(self, tree): subtrees = list(tree.iter_subtrees()) diff --git a/tests/test_trees.py b/tests/test_trees.py index c90cc7d..c83b5ef 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -5,7 +5,7 @@ from unittest import TestCase import copy import pickle -from lark.tree import Tree +from lark.tree import Tree, Interpreter, visit_children_decor class TestTrees(TestCase): @@ -21,6 +21,37 @@ class TestTrees(TestCase): assert pickle.loads(data) == s + def test_interp(self): + t = Tree('a', [Tree('b', []), Tree('c', []), 'd']) + + class Interp1(Interpreter): + def a(self, tree): + return self.visit_children(tree) + ['e'] + + def b(self, tree): + return 'B' + + def c(self, tree): + return 'C' + + self.assertEqual(Interp1().visit(t), list('BCde')) + + class Interp2(Interpreter): + @visit_children_decor + def a(self, values): + return values + ['e'] + + def b(self, tree): + return 'B' + + def c(self, tree): + return 'C' + + self.assertEqual(Interp2().visit(t), list('BCde')) + + + + if __name__ == '__main__': unittest.main() From 4c89d69d97f00da0c0ab043508ad2843ad230954 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 23 Apr 2018 10:20:43 +0300 Subject: [PATCH 07/21] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9e077cf..619be58 100644 --- a/README.md +++ b/README.md @@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m If you have any questions or want my assistance, you can email me at erezshin at gmail com. I'm also available for contract work. 
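The Interpreter class added in PATCH 06/21 differs from Transformer in that traversal is top-down and under the handler's control: a method receives the whole subtree and decides whether and when to visit its children. A minimal sketch of the short-circuit evaluation this enables, assuming the patched lark.tree module; the rule names (if_expr, true, num) are illustrative only:

    from lark.tree import Tree, Interpreter

    class CondEval(Interpreter):
        def if_expr(self, tree):
            cond, then_branch, else_branch = tree.children
            # Only the branch that is actually needed gets visited
            return self.visit(then_branch) if self.visit(cond) else self.visit(else_branch)

        def true(self, tree):
            return True

        def num(self, tree):
            return int(tree.children[0])

    t = Tree('if_expr', [Tree('true', []), Tree('num', ['1']), Tree('num', ['2'])])
    assert CondEval().visit(t) == 1

A Transformer would have evaluated both branches bottom-up before if_expr ever ran; with the Interpreter the untaken branch is never visited.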
+ + -- [Erez](https://github.com/erezsh) From 1854b81ebcdd05bf53175fa57615964d03218f56 Mon Sep 17 00:00:00 2001 From: Ramon Klass Date: Tue, 24 Apr 2018 00:14:03 +0200 Subject: [PATCH 08/21] interpreter: default behavior changed to return the values instead of discarding them, added test showcasing the behavior --- lark/tree.py | 2 +- tests/test_trees.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lark/tree.py b/lark/tree.py index e6d5ed7..ad086d2 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -195,7 +195,7 @@ class Interpreter(object): return self.__default__ def __default__(self, tree): - self.visit_children(tree) + return self.visit_children(tree) class Transformer_NoRecurse(Transformer): diff --git a/tests/test_trees.py b/tests/test_trees.py index c83b5ef..6017386 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -49,6 +49,14 @@ class TestTrees(TestCase): self.assertEqual(Interp2().visit(t), list('BCde')) + class Interp3(Interpreter): + def b(self, tree): + return 'B' + + def c(self, tree): + return 'C' + + self.assertEqual(Interp3().visit(t), list('BCd')) From 0f0776c0fa552aa74708570a7fa86f4bb6d54921 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 24 Apr 2018 15:36:53 +0300 Subject: [PATCH 09/21] BUGIX in lexer: Embedding strings overwrote priority (Issue #121) --- lark/lexer.py | 2 ++ tests/test_parser.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/lark/lexer.py b/lark/lexer.py index 938d22b..19e1be4 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -143,6 +143,8 @@ def _create_unless(tokens): for retok in tokens_by_type.get(PatternRE, []): unless = [] # {} for strtok in tokens_by_type.get(PatternStr, []): + if strtok.priority > retok.priority: + continue s = strtok.pattern.value m = re.match(retok.pattern.to_regexp(), s) if m and m.group(0) == s: diff --git a/tests/test_parser.py b/tests/test_parser.py index d4d63ca..5c68bec 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1173,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX + def test_priority_vs_embedded(self): + g = """ + A.2: "a" + WORD: ("a".."z")+ + + start: (A | WORD)+ + """ + l = _Lark(g) + t = l.parse('abc') + self.assertEqual(t.children, ['a', 'bc']) + self.assertEqual(t.children[0].type, 'A') From 209a3fe8fd1b1f6cd9267c84178cfbfb496065a3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 25 Apr 2018 01:54:16 +0300 Subject: [PATCH 10/21] Interface improvements for the Lark instance --- examples/python_parser.py | 18 +++++++----------- lark/lark.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index d953a79..ddbd5c4 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -10,7 +10,7 @@ import glob, time from lark import Lark from lark.indenter import Indenter -__path__ = os.path.dirname(__file__) +# __path__ = os.path.dirname(__file__) class PythonIndenter(Indenter): NL_type = '_NEWLINE' @@ -20,18 +20,14 @@ class PythonIndenter(Indenter): DEDENT_type = '_DEDENT' tab_len = 8 +kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input') -grammar2_filename = os.path.join(__path__, 'python2.g') -grammar3_filename = os.path.join(__path__, 'python3.g') -with 
open(grammar2_filename) as f: - python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') -with open(grammar3_filename) as f: - python_parser3 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') +python_parser2 = Lark.open('python2.g', parser='lalr', **kwargs) +python_parser3 = Lark.open('python3.g',parser='lalr', **kwargs) +python_parser2_earley = Lark.open('python2.g', parser='earley', lexer='standard', **kwargs) +print(python_parser3) -with open(grammar2_filename) as f: - python_parser2_earley = Lark(f, parser='lalr', lexer='standard', postlex=PythonIndenter(), start='file_input') - def _read(fn, *args): kwargs = {'encoding': 'iso-8859-1'} with open(fn, *args, **kwargs) as f: @@ -82,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - # test_earley_equals_lalr() + test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/lark.py b/lark/lark.py index 2660bd7..3641a40 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -3,6 +3,7 @@ from __future__ import absolute_import import os import time from collections import defaultdict +from io import open from .utils import STRING_TYPE from .load_grammar import load_grammar @@ -105,12 +106,12 @@ class Lark: # Some, but not all file-like objects have a 'name' attribute try: - source = grammar.name + self.source = grammar.name except AttributeError: - source = '' + self.source = '' cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) else: - cache_file = "larkcache_%s" % os.path.basename(source) + cache_file = "larkcache_%s" % os.path.basename(self.source) # Drain file-like objects to get their contents try: @@ -150,7 +151,7 @@ class Lark: assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum') # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, source) + self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) @@ -183,6 +184,27 @@ class Lark: return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + """Create an instance of Lark with the grammar given by its filename + + If rel_to is provided, the function will find the grammar filename in relation to it. + + Example: + + >>> Lark.open("grammar_file.g", rel_to=__file__, parser="lalr") + Lark(...) + + """ + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename) as f: + return cls(f, **options) + + def __repr__(self): + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) + def lex(self, text): if not hasattr(self, 'lexer'): From 4a7a66d77359954e86753e9467544ccb22af951e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 25 Apr 2018 01:55:10 +0300 Subject: [PATCH 11/21] .lark (preparing) --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 3641a40..8ab2227 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -192,7 +192,7 @@ class Lark: Example: - >>> Lark.open("grammar_file.g", rel_to=__file__, parser="lalr") + >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") Lark(...) 
""" From 67f372c994599c19978d0e0ad36b80dc17983b27 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 13:16:15 +0300 Subject: [PATCH 12/21] Symbols instead of strings - initial --- examples/python_parser.py | 2 +- lark/grammar.py | 22 ++++++++++++++++++++++ lark/lexer.py | 4 ++-- lark/load_grammar.py | 14 ++++++++++---- lark/parse_tree_builder.py | 10 +++++----- lark/parsers/grammar_analysis.py | 20 ++++++++++---------- lark/parsers/lalr_analysis.py | 10 +++++----- lark/parsers/lalr_parser.py | 2 +- 8 files changed, 56 insertions(+), 28 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index ddbd5c4..f738a35 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -78,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - test_earley_equals_lalr() + # test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/grammar.py b/lark/grammar.py index d257bc4..2689389 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,3 +1,25 @@ +class Symbol(object): + is_term = NotImplemented + + def __init__(self, name): + self.name = name + + def __eq__(self, other): + assert isinstance(other, Symbol), other + return self.is_term == other.is_term and self.name == other.name + + def __hash__(self): + return hash(self.name) + +class Terminal(Symbol): + is_term = True + + @property + def filter_out(self): + return self.name.startswith('_') + +class NonTerminal(Symbol): + is_term = False class Rule(object): """ diff --git a/lark/lexer.py b/lark/lexer.py index 19e1be4..e7af2a2 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from .utils import Str, classify -from .common import is_terminal, PatternStr, PatternRE, TokenDef +from .common import PatternStr, PatternRE, TokenDef ###{standalone class LexError(Exception): @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if n.is_term and n.name!='$END'] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 43d1bf5..6800801 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions, Rule +from .grammar import RuleOptions, Rule, Terminal, NonTerminal from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -523,7 +523,9 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - rule = Rule(name, expansion, alias, options) + expansion = [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + + rule = Rule(NonTerminal(name), expansion, alias, options) compiled_rules.append(rule) return tokens, compiled_rules, self.ignore @@ -578,12 +580,16 @@ def options_from_rule(name, *x): return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) + +def symbols_from_strcase(expansion): + return 
[Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] - rules = [options_from_rule(name, x) for name, x in RULES.items()] - rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] + rules = [options_from_rule(name, x) for name, x in RULES.items()] + rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs] callback = ParseTreeBuilder(rules, ST).create_callback() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7c74178..54a1bac 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -84,7 +84,7 @@ class ChildFilterLALR(ChildFilter): return self.node_builder(filtered) def _should_expand(sym): - return not is_terminal(sym) and sym.startswith('_') + return not sym.is_term and sym.name.startswith('_') def maybe_create_child_filter(expansion, filter_out, ambiguous): to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] @@ -109,8 +109,8 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if is_terminal(sym) and sym.startswith('_')} - assert all(x.startswith('_') for x in filter_out) + filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} + assert all(t.filter_out for t in filter_out) for rule in rules: options = rule.options @@ -132,9 +132,9 @@ class ParseTreeBuilder: callback = Callback() for rule, wrapper_chain in self.rule_builders: - internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(x.name for x in rule.expansion)) - user_callback_name = rule.alias or rule.origin + user_callback_name = rule.alias or rule.origin.name try: f = transformer._get_func(user_callback_name) except AttributeError: diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index f34d5c1..f49e4bc 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -1,7 +1,7 @@ from ..utils import bfs, fzset, classify -from ..common import GrammarError, is_terminal -from ..grammar import Rule +from ..common import GrammarError +from ..grammar import Rule, Terminal, NonTerminal class RulePtr(object): @@ -67,7 +67,7 @@ def calculate_sets(rules): FIRST = {} FOLLOW = {} for sym in symbols: - FIRST[sym]={sym} if is_terminal(sym) else set() + FIRST[sym]={sym} if sym.is_term else set() FOLLOW[sym]=set() # Calculate NULLABLE and FIRST @@ -108,16 +108,16 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - rules = parser_conf.rules + [Rule('$root', [parser_conf.start, '$END'])] + rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])] self.rules_by_origin = classify(rules, lambda r: r.origin) assert len(rules) == len(set(rules)) for r in rules: for sym in r.expansion: - if not (is_terminal(sym) or sym in self.rules_by_origin): + if not (sym.is_term or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation - self.start_state = self.expand_rule('$root') + self.start_state = 
self.expand_rule(NonTerminal('$root')) self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) @@ -125,7 +125,7 @@ class GrammarAnalyzer(object): "Returns all init_ptrs accessible by rule (recursive)" init_ptrs = set() def _expand_rule(rule): - assert not is_terminal(rule), rule + assert not rule.is_term, rule for r in self.rules_by_origin[rule]: init_ptr = RulePtr(r, 0) @@ -133,7 +133,7 @@ class GrammarAnalyzer(object): if r.expansion: # if not empty rule new_r = init_ptr.next - if not is_terminal(new_r): + if not new_r.is_term: yield new_r for _ in bfs([rule], _expand_rule): @@ -142,8 +142,8 @@ class GrammarAnalyzer(object): return fzset(init_ptrs) def _first(self, r): - if is_terminal(r): + if r.is_term: return {r} else: - return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)} + return {rp.next for rp in self.expand_rule(r) if rp.next.is_term} diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 4af28f9..6903be9 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -10,9 +10,9 @@ import logging from collections import defaultdict from ..utils import classify, classify_bool, bfs, fzset -from ..common import GrammarError, is_terminal +from ..common import GrammarError -from .grammar_analysis import GrammarAnalyzer +from .grammar_analysis import GrammarAnalyzer, Terminal class Action: def __init__(self, name): @@ -70,12 +70,12 @@ class LALR_Analyzer(GrammarAnalyzer): rps = {rp.advance(sym) for rp in rps} for rp in set(rps): - if not rp.is_satisfied and not is_terminal(rp.next): + if not rp.is_satisfied and not rp.next.is_term: rps |= self.expand_rule(rp.next) new_state = fzset(rps) lookahead[sym].append((Shift, new_state)) - if sym == '$END': + if sym == Terminal('$END'): self.end_states.append( new_state ) yield new_state @@ -93,7 +93,7 @@ class LALR_Analyzer(GrammarAnalyzer): if not len(v) == 1: raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v]))) - self.states[state] = {k:v[0] for k, v in lookahead.items()} + self.states[state] = {k.name:v[0] for k, v in lookahead.items()} for _ in bfs([self.start_state], step): pass diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index baea614..164a227 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,7 +59,7 @@ class _Parser: value = self.callbacks[rule](s) - _action, new_state = get_action(rule.origin) + _action, new_state = get_action(rule.origin.name) assert _action is Shift state_stack.append(new_state) value_stack.append(value) From cf7ddeee8863096309b69fda63e5ba04610e7286 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 13:26:28 +0300 Subject: [PATCH 13/21] Earley working too --- examples/python_parser.py | 2 +- lark/parse_tree_builder.py | 2 +- lark/parser_frontends.py | 7 ++++--- lark/parsers/earley.py | 9 +++++---- lark/parsers/xearley.py | 9 +++++---- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index f738a35..ddbd5c4 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -78,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - # test_earley_equals_lalr() + test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 54a1bac..e81569f 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,4 +1,4 @@ -from .common import 
is_terminal, GrammarError +from .common import GrammarError from .utils import suppress from .lexer import Token from .grammar import Rule diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index a36252c..24c3622 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,9 +4,10 @@ from .utils import get_regexp_width from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError +from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree +from .grammar import Terminal class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -96,7 +97,7 @@ class Earley(WithLexer): resolve_ambiguity=get_ambiguity_resolver(options)) def match(self, term, token): - return term == token.type + return term.name == token.type def parse(self, text): tokens = self.lex(text) @@ -117,7 +118,7 @@ class XEarley: ) def match(self, term, text, index=0): - return self.regexps[term].match(text, index) + return self.regexps[term.name].match(text, index) def _prepare_match(self, lexer_conf): self.regexps = {} diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index d119e41..f6397dd 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,9 +13,10 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, is_terminal +from ..common import ParseError, UnexpectedToken from ..tree import Tree, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal class Derivation(Tree): @@ -127,7 +128,7 @@ class Column: self.completed[item_key] = item self.to_reduce.append(item) else: - if is_terminal(item.expect): + if item.expect.is_term: self.to_scan.append(item) else: k = item_key if self.predict_all else item @@ -161,13 +162,13 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.parser_conf.start + start_symbol = NonTerminal(start_symbol or self.parser_conf.start) _Item = Item match = self.term_matcher def predict(nonterm, column): - assert not is_terminal(nonterm), nonterm + assert not nonterm.is_term, nonterm return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index d710f34..321b829 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -20,10 +20,11 @@ from collections import defaultdict -from ..common import ParseError, is_terminal +from ..common import ParseError from ..lexer import Token, UnexpectedInput from ..tree import Tree from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal, Terminal from .earley import ApplyCallbacks, Item, Column @@ -32,7 +33,7 @@ class Parser: self.analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity - self.ignore = list(ignore) + self.ignore = [Terminal(t) for t in ignore] self.predict_all = predict_all self.FIRST = self.analysis.FIRST @@ -47,7 +48,7 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.parser_conf.start + start_symbol = NonTerminal(start_symbol or self.parser_conf.start) delayed_matches = defaultdict(list) match = self.term_matcher @@ -55,7 +56,7 @@ class Parser: text_column = 0 def predict(nonterm, column): - assert not 
is_terminal(nonterm), nonterm + assert not nonterm.is_term, nonterm return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): From 4a5aa745ea99af62db563cd9170d2a432eb061f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 14:53:05 +0300 Subject: [PATCH 14/21] All tests passing --- lark/grammar.py | 6 ++++ lark/lexer.py | 4 +-- lark/parse_tree_builder.py | 2 +- lark/parser_frontends.py | 13 +++++++-- lark/parsers/cyk.py | 58 +++++++++++--------------------------- lark/parsers/xearley.py | 4 +-- 6 files changed, 37 insertions(+), 50 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 2689389..b555c34 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -8,9 +8,15 @@ class Symbol(object): assert isinstance(other, Symbol), other return self.is_term == other.is_term and self.name == other.name + def __ne__(self, other): + return not (self == other) + def __hash__(self): return hash(self.name) + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.name) + class Terminal(Symbol): is_term = True diff --git a/lark/lexer.py b/lark/lexer.py index e7af2a2..19e1be4 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from .utils import Str, classify -from .common import PatternStr, PatternRE, TokenDef +from .common import is_terminal, PatternStr, PatternRE, TokenDef ###{standalone class LexError(Exception): @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if n.is_term and n.name!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index e81569f..1d4e2b8 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -110,7 +110,7 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} - assert all(t.filter_out for t in filter_out) + assert all(t.name.startswith('_') for t in filter_out) for rule in rules: options = rule.options diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 24c3622..e4401c1 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,7 +7,11 @@ from .lexer import Lexer, ContextualLexer, Token from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree -from .grammar import Terminal +from .grammar import Terminal, NonTerminal + +def terminals(seq): + # return [Terminal(t) for t in seq] + return seq class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -18,7 +22,10 @@ class WithLexer: self.lexer_conf = lexer_conf states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks) + self.lexer = ContextualLexer(lexer_conf.tokens, states, + ignore=terminals(lexer_conf.ignore), + always_accept=terminals(always_accept), + user_callbacks=lexer_conf.callbacks) def lex(self, text): 
stream = self.lexer.lex(text) @@ -74,7 +81,7 @@ class Earley_NoLex: def match(self, term, text, index=0): - return self.regexps[term].match(text, index) + return self.regexps[term.name].match(text, index) def _prepare_match(self, lexer_conf): self.regexps = {} diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index 9d643aa..e2bcd83 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -8,47 +8,19 @@ from collections import defaultdict import itertools -from ..common import ParseError, is_terminal +from ..common import ParseError from ..lexer import Token from ..tree import Tree +from ..grammar import Terminal as T, NonTerminal as NT, Symbol try: xrange except NameError: xrange = range -class Symbol(object): - """Any grammar symbol.""" - - def __init__(self, s): - self.s = s - - def __repr__(self): - return '%s(%s)' % (type(self).__name__, str(self)) - - def __str__(self): - return str(self.s) - - def __eq__(self, other): - return self.s == str(other) - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return hash((type(self), str(self.s))) - - -class T(Symbol): - """Terminal.""" - - def match(self, s): - return self.s == s.type - - -class NT(Symbol): - """Non-terminal.""" - pass +def match(t, s): + assert isinstance(t, T) + return t.name == s.type class Rule(object): @@ -121,10 +93,12 @@ class Parser(object): def _to_rule(self, lark_rule): """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + assert isinstance(lark_rule.origin, NT) + assert all(isinstance(x, Symbol) for x in lark_rule.expansion) return Rule( - NT(lark_rule.origin), [ - T(x) if is_terminal(x) else NT(x) for x in lark_rule.expansion - ], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias) + lark_rule.origin, lark_rule.expansion, + weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, + alias=lark_rule.alias) def parse(self, tokenized): # pylint: disable=invalid-name """Parses input, which is a list of tokens.""" @@ -132,7 +106,7 @@ class Parser(object): # Check if the parse succeeded. if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): raise ParseError('Parsing failed.') - parse = trees[(0, len(tokenized) - 1)][NT(self.start)] + parse = trees[(0, len(tokenized) - 1)][self.start] return self._to_tree(revert_cnf(parse)) def _to_tree(self, rule_node): @@ -143,8 +117,8 @@ class Parser(object): if isinstance(child, RuleNode): children.append(self._to_tree(child)) else: - assert isinstance(child.s, Token) - children.append(child.s) + assert isinstance(child.name, Token) + children.append(child.name) t = Tree(orig_rule.origin, children) t.rule=orig_rule return t @@ -169,7 +143,7 @@ def _parse(s, g): # Populate base case with existing terminal production rules for i, w in enumerate(s): for terminal, rules in g.terminal_rules.items(): - if terminal.match(w): + if match(terminal, w): for rule in rules: table[(i, i)].add(rule) if (rule.lhs not in trees[(i, i)] or @@ -349,13 +323,13 @@ def revert_cnf(node): if isinstance(node, T): return node # Reverts TERM rule. - if node.rule.lhs.s.startswith('__T_'): + if node.rule.lhs.name.startswith('__T_'): return node.children[0] else: children = [] for child in map(revert_cnf, node.children): # Reverts BIN rule. 
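# A small usage sketch (not part of the patch) of the Symbol semantics that the
# refactor in PATCH 12-14 relies on: symbols compare by kind and name, hash by
# name alone, and can therefore replace the old plain strings as keys in the
# analysis tables.
from lark.grammar import Terminal, NonTerminal

assert Terminal('NAME') == Terminal('NAME')
assert Terminal('NAME') != NonTerminal('NAME')              # same name, different kind
assert hash(Terminal('NAME')) == hash(NonTerminal('NAME'))  # __hash__ ignores the kind
FIRST = {NonTerminal('start'): {Terminal('NAME')}}          # usable as dict keys / set members
print(Terminal('NAME'))                                     # -> Terminal('NAME')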
- if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): + if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'): children += child.children else: children.append(child) diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 321b829..c64bfee 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -98,14 +98,14 @@ class Parser: for item in to_scan: m = match(item.expect, stream, i) if m: - t = Token(item.expect, m.group(0), i, text_line, text_column) + t = Token(item.expect.name, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): m = match(item.expect, s[:-j]) if m: - t = Token(item.expect, m.group(0), i, text_line, text_column) + t = Token(item.expect.name, m.group(0), i, text_line, text_column) delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) From 33caa391d544bb079902b6bf735d295e4ac13a4a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 16:52:39 +0300 Subject: [PATCH 15/21] Breaking backwards compatibility: * Removed the scanless parsing feature (dynamic lexing is king) * Default LALR lexer is now contextual --- docs/reference.md | 176 --------------------- examples/README.md | 3 +- examples/{conf_nolex.py => conf_earley.py} | 13 +- examples/{conf.py => conf_lalr.py} | 9 +- lark/grammar.py | 7 +- lark/lark.py | 8 +- lark/load_grammar.py | 76 +-------- lark/parse_tree_builder.py | 13 -- lark/parser_frontends.py | 28 +--- lark/tools/standalone.py | 2 +- tests/__main__.py | 4 +- tests/test_parser.py | 53 ++----- 12 files changed, 43 insertions(+), 349 deletions(-) delete mode 100644 docs/reference.md rename examples/{conf_nolex.py => conf_earley.py} (73%) rename examples/{conf.py => conf_lalr.py} (77%) diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index 90553f5..0000000 --- a/docs/reference.md +++ /dev/null @@ -1,176 +0,0 @@ -# Lark Reference - -## What is Lark? - -Lark is a general-purpose parsing library. It's written in Python, and supports two parsing algorithms: Earley (default) and LALR(1). - -Lark also supports scanless parsing (with Earley), contextual lexing (with LALR), and regular lexing for both parsers. - -Lark is a re-write of my previous parsing library, [PlyPlus](https://github.com/erezsh/plyplus). - -## Grammar - -Lark accepts its grammars in [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form) form. - -The grammar is a list of rules and terminals, each in their own line. - -Rules and terminals can be defined on multiple lines when using the *OR* operator ( | ). - -Comments start with // and last to the end of the line (C++ style) - -Lark begins the parse with the rule 'start', unless specified otherwise in the options. - -It might help to think of Rules and Terminals as existing in two separate layers, so that all the terminals are recognized first, and all the rules are recognized afterwards. This is not always how things happen (depending on your choice of parser & lexer), but the concept is relevant in all cases. - -### Rules - -Each rule is defined in terms of: - - name : list of items to match - | another list of items -> optional_alias - | etc. - -An alias is a name for the specific rule alternative. It affects tree construction. - -An item is a: - - - rule - - terminal - - (item item ..) - Group items - - [item item ..] - Maybe. Same as: "(item item ..)?" - - item? 
- Zero or one instances of item ("maybe") - - item\* - Zero or more instances of item - - item+ - One or more instances of item - - -Example: - - float: "-"? DIGIT* "." DIGIT+ exp - | "-"? DIGIT+ exp - - exp: "-"? ("e" | "E") DIGIT+ - - DIGIT: /[0-9]/ - -### Terminals - -Terminals are defined just like rules, but cannot contain rules: - - NAME : list of items to match - -Example: - - IF: "if" - INTEGER : /[0-9]+/ - DECIMAL: INTEGER "." INTEGER - WHITESPACE: (" " | /\t/ )+ - -## Tree Construction - -Lark builds a tree automatically based on the structure of the grammar. Is also accepts some hints. - -In general, Lark will place each rule as a branch, and its matches as the children of the branch. - -Terminals are always values in the tree, never branches. - -In grammar rules, using item+ or item\* will result in a list of items. - -Example: - - expr: "(" expr ")" - | NAME+ - - NAME: /\w+/ - - %ignore " " - -Lark will parse "(((hello world)))" as: - - expr - expr - expr - "hello" - "world" - -The brackets do not appear in the tree by design. - -Terminals that won't appear in the tree are: - - - Unnamed literals (like "keyword" or "+") - - Terminals whose name starts with an underscore (like \_DIGIT) - -Terminals that *will* appear in the tree are: - - - Unnamed regular expressions (like /[0-9]/) - - Named terminals whose name starts with a letter (like DIGIT) - -## Shaping the tree - -a. Rules whose name begins with an underscore will be inlined into their containing rule. - -Example: - - start: "(" _greet ")" - _greet: /\w+/ /\w+/ - -Lark will parse "(hello world)" as: - - start - "hello" - "world" - - -b. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child. - -Example: - - start: greet greet - ?greet: "(" /\w+/ ")" - | /\w+ /\w+/ - -Lark will parse "hello world (planet)" as: - - start - greet - "hello" - "world" - "planet" - -c. Rules that begin with an exclamation mark will keep all their terminals (they won't get filtered). - -d. Aliases - options in a rule can receive an alias. It will be then used as the branch name for the option. - -Example: - - start: greet greet - greet: "hello" -> hello - | "world" - -Lark will parse "hello world" as: - - start - hello - greet - -## Lark Options - -When initializing the Lark object, you can provide it with keyword options: - -- start - The start symbol (Default: "start") -- parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") - Note: "lalr" requires a lexer -- lexer - Decides whether or not to use a lexer stage - - None: Don't use a lexer - - "standard": Use a standard lexer - - "contextual": Stronger lexer (only works with parser="lalr") - - "auto" (default): Choose for me based on grammar and parser - -- transformer - Applies the transformer to every parse tree (only allowed with parser="lalr") -- postlex - Lexer post-processing (Default: None) - -To be supported: - -- debug -- cache\_grammar -- keep\_all\_tokens -- profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False) diff --git a/examples/README.md b/examples/README.md index 3fbe3ea..37076d5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,5 +12,6 @@ - [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) 
-- [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language +- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language +- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature diff --git a/examples/conf_nolex.py b/examples/conf_earley.py similarity index 73% rename from examples/conf_nolex.py rename to examples/conf_earley.py index 8634a46..71517f0 100644 --- a/examples/conf_nolex.py +++ b/examples/conf_earley.py @@ -1,16 +1,14 @@ # -# This example demonstrates scanless parsing using the dynamic-lexer earley frontend +# This example demonstrates parsing using the dynamic-lexer earley frontend # # Using a lexer for configuration files is tricky, because values don't # have to be surrounded by delimiters. Using a standard lexer for this just won't work. # # In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity. # -# Future versions of lark will make it easier to write these kinds of grammars. -# # Another approach is to use the contextual lexer with LALR. It is less powerful than Earley, # but it can handle some ambiguity when lexing and it's much faster. -# See examples/conf.py for an example of that approach. +# See examples/conf_lalr.py for an example of that approach. # @@ -19,14 +17,14 @@ from lark import Lark parser = Lark(r""" start: _NL? section+ section: "[" NAME "]" _NL item+ - item: NAME "=" VALUE _NL - VALUE: /./* + item: NAME "=" VALUE? _NL + VALUE: /./+ %import common.CNAME -> NAME %import common.NEWLINE -> _NL %import common.WS_INLINE %ignore WS_INLINE - """, lexer='dynamic') + """, parser="earley") def test(): sample_conf = """ @@ -34,6 +32,7 @@ def test(): a=Hello this="that",4 +empty= """ r = parser.parse(sample_conf) diff --git a/examples/conf.py b/examples/conf_lalr.py similarity index 77% rename from examples/conf.py rename to examples/conf_lalr.py index ac5a4a2..417d2af 100644 --- a/examples/conf.py +++ b/examples/conf_lalr.py @@ -1,16 +1,16 @@ # # This example demonstrates the power of the contextual lexer, by parsing a config file. # -# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily +# The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily # choose one over the other, which would lead to a (confusing) parse error. -# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows +# However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows # which one of them to expect at each point during the parse. # The lexer then only matches the tokens that the parser expects. # The result is a correct parse, something that is impossible with a regular lexer. # # Another approach is to discard a lexer altogether and use the Earley algorithm. # It will handle more cases than the contextual lexer, but at the cost of performance. -# See examples/conf_nolex.py for an example of that approach. +# See examples/conf_earley.py for an example of that approach. 
# from lark import Lark @@ -25,13 +25,14 @@ parser = Lark(r""" %import common.WS_INLINE %ignore WS_INLINE - """, parser="lalr", lexer="contextual") + """, parser="lalr") sample_conf = """ [bla] a=Hello this="that",4 +empty= """ print(parser.parse(sample_conf).pretty()) diff --git a/lark/grammar.py b/lark/grammar.py index b555c34..bf12b10 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -46,20 +46,17 @@ class Rule(object): class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing self.priority = priority self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r, %r)' % ( self.keep_all_tokens, self.expand1, - self.create_token, self.priority, self.filter_out ) diff --git a/lark/lark.py b/lark/lark.py index 8ab2227..4fc0062 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -23,9 +23,9 @@ class LarkOptions(object): Note: "lalr" requires a lexer lexer - Decides whether or not to use a lexer stage - None: Don't use a lexer (scanless, only works with parser="earley") "standard": Use a standard lexer "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") "auto" (default): Choose for me based on grammar and parser ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" @@ -131,7 +131,7 @@ class Lark: if self.options.lexer == 'auto': if self.options.parser == 'lalr': - self.options.lexer = 'standard' + self.options.lexer = 'contextual' elif self.options.parser == 'earley': self.options.lexer = 'dynamic' elif self.options.parser == 'cyk': @@ -139,7 +139,7 @@ class Lark: else: assert False, self.options.parser lexer = self.options.lexer - assert lexer in ('standard', 'contextual', 'dynamic', None) + assert lexer in ('standard', 'contextual', 'dynamic') if self.options.ambiguity == 'auto': if self.options.parser == 'earley': @@ -154,7 +154,7 @@ class Lark: self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) + tokens, self.rules, self.ignore_tokens = self.grammar.compile() self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 6800801..be96b1b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -363,12 +363,6 @@ class PrepareLiterals(InlineTransformer): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) -class SplitLiterals(InlineTransformer): - def pattern(self, p): - if isinstance(p, PatternStr) and len(p.value)>1: - return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) - return ST('pattern', [p]) - class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -405,15 +399,6 @@ class TokenTreeToPattern(Transformer): def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") -def _interleave(l, item): - for e in l: - yield e - if isinstance(e, Tree): - if e.data in ('literal', 
'range'): - yield item - elif is_terminal(e): - yield item - def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -423,62 +408,9 @@ class Grammar: self.rule_defs = rule_defs self.ignore = ignore - def _prepare_scanless_grammar(self, start): - # XXX Pretty hacky! There should be a better way to write this method.. - - rule_defs = deepcopy(self.rule_defs) - term_defs = self.token_defs - - # Implement the "%ignore" feature without a lexer.. - terms_to_ignore = {name:'__'+name for name in self.ignore} - if terms_to_ignore: - assert set(terms_to_ignore) <= {name for name, _t in term_defs} - - term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] - expr = Token('RULE', '__ignore') - for r, tree, _o in rule_defs: - for exp in tree.find_data('expansion'): - exp.children = list(_interleave(exp.children, expr)) - if r == start: - exp.children = [expr] + exp.children - for exp in tree.find_data('expr'): - exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr))) - - _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')]) - rule_defs.append(('__ignore', _ignore_tree, None)) - - # Convert all tokens to rules - new_terminal_names = {name: '__token_'+name for name, _t in term_defs} - - for name, tree, options in rule_defs: - for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): - for i, sym in enumerate(exp.children): - if sym in new_terminal_names: - exp.children[i] = Token(sym.type, new_terminal_names[sym]) - - for name, (tree, priority) in term_defs: # TODO transfer priority to rule? - if any(tree.find_data('alias')): - raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") - - if name.startswith('_'): - options = RuleOptions(filter_out=True, priority=-priority) - else: - options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority) - - name = new_terminal_names[name] - inner_name = name + '_inner' - rule_defs.append((name, _choice_of_rules([inner_name]), None)) - rule_defs.append((inner_name, tree, options)) - - return [], rule_defs - - - def compile(self, lexer=False, start=None): - if not lexer: - token_defs, rule_defs = self._prepare_scanless_grammar(start) - else: - token_defs = list(self.token_defs) - rule_defs = self.rule_defs + def compile(self): + token_defs = list(self.token_defs) + rule_defs = self.rule_defs # ================= # Compile Tokens @@ -495,8 +427,6 @@ class Grammar: # 1. Pre-process terminals transformer = PrepareLiterals() - if not lexer: - transformer *= SplitLiterals() transformer *= ExtractAnonTokens(tokens) # Adds to tokens # 2. 
Convert EBNF to BNF (and apply step 1) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 1d4e2b8..94fcdb9 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -18,17 +18,6 @@ class ExpandSingleChild: return self.node_builder(children) -class CreateToken: - "Used for fixing the results of scanless parsing" - - def __init__(self, token_name, node_builder): - self.node_builder = node_builder - self.token_name = token_name - - def __call__(self, children): - return self.node_builder( [Token(self.token_name, ''.join(children))] ) - - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -116,10 +105,8 @@ class ParseTreeBuilder: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand_single_child = options.expand1 if options else False - create_token = options.create_token if options else False wrapper_chain = filter(None, [ - create_token and partial(CreateToken, create_token), (expand_single_child and not rule.alias) and ExpandSingleChild, maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous), self.propagate_positions and PropagatePositions, diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e4401c1..b7a9225 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -72,30 +72,6 @@ def tokenize_text(text): col_start_pos = i + ch.rindex('\n') yield Token('CHAR', ch, line=line, column=i - col_start_pos) -class Earley_NoLex: - def __init__(self, lexer_conf, parser_conf, options=None): - self._prepare_match(lexer_conf) - - self.parser = earley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options)) - - - def match(self, term, text, index=0): - return self.regexps[term.name].match(text, index) - - def _prepare_match(self, lexer_conf): - self.regexps = {} - for t in lexer_conf.tokens: - regexp = t.pattern.to_regexp() - width = get_regexp_width(regexp) - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width)) - self.regexps[t.name] = re.compile(regexp) - - def parse(self, text): - token_stream = tokenize_text(text) - return self.parser.parse(token_stream) - class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): self.init_traditional_lexer(lexer_conf) @@ -190,9 +166,7 @@ def get_frontend(parser, lexer): else: raise ValueError('Unknown lexer: %s' % lexer) elif parser=='earley': - if lexer is None: - return Earley_NoLex - elif lexer=='standard': + if lexer=='standard': return Earley elif lexer=='dynamic': return XEarley diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 61ce94e..2e155f7 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -168,7 +168,7 @@ class TreeBuilderAtoms: print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fobj, start): - lark_inst = Lark(fobj, parser="lalr", start=start) + lark_inst = Lark(fobj, parser="lalr", lexer="standard", start=start) lexer_atoms = LexerAtoms(lark_inst.parser.lexer) parser_atoms = ParserAtoms(lark_inst.parser.parser) diff --git a/tests/__main__.py b/tests/__main__.py index eebf0d9..5a30a4e 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -19,10 +19,10 @@ from .test_parser import ( TestEarleyStandard, TestCykStandard, TestLalrContextual, - TestEarleyScanless, + # TestEarleyScanless, 
TestEarleyDynamic, - TestFullEarleyScanless, + # TestFullEarleyScanless, TestFullEarleyDynamic, TestParsers, diff --git a/tests/test_parser.py b/tests/test_parser.py index 5c68bec..76ad509 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -48,9 +48,6 @@ class TestParsers(unittest.TestCase): self.assertRaises(GrammarError, Lark, g, parser='lalr') - l = Lark(g, parser='earley', lexer=None) - self.assertRaises(ParseError, l.parse, 'a') - l = Lark(g, parser='earley', lexer='dynamic') self.assertRaises(ParseError, l.parse, 'a') @@ -155,7 +152,7 @@ class TestParsers(unittest.TestCase): def _make_full_earley_test(LEXER): class _TestFullEarley(unittest.TestCase): - def test_anon_in_scanless(self): + def test_anon(self): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. g = Lark(r"""start: B @@ -164,14 +161,14 @@ def _make_full_earley_test(LEXER): self.assertEqual( g.parse('abc').children[0], 'abc') - def test_earley_scanless(self): + def test_earley(self): g = Lark("""start: A "b" c A: "a"+ c: "abc" """, parser="earley", lexer=LEXER) x = g.parse('aaaababc') - def test_earley_scanless2(self): + def test_earley2(self): grammar = """ start: statement+ @@ -187,24 +184,19 @@ def _make_full_earley_test(LEXER): l.parse(program) - # XXX Fails for scanless mode - # XXX Decided not to fix, because - # a) It's a subtle bug - # b) Scanless is intended for deprecation - # - # def test_earley_scanless3(self): - # "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" + def test_earley3(self): + "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" - # grammar = """ - # start: A A - # A: "a"+ - # """ + grammar = """ + start: A A + A: "a"+ + """ - # l = Lark(grammar, parser='earley', lexer=LEXER) - # res = l.parse("aaa") - # self.assertEqual(res.children, ['aa', 'a']) + l = Lark(grammar, parser='earley', lexer=LEXER) + res = l.parse("aaa") + self.assertEqual(res.children, ['aa', 'a']) - def test_earley_scanless4(self): + def test_earley4(self): grammar = """ start: A A? A: "a"+ @@ -259,7 +251,6 @@ def _make_full_earley_test(LEXER): assert x.data == '_ambig', x assert len(x.children) == 2 - @unittest.skipIf(LEXER==None, "BUG in scanless parsing!") # TODO fix bug! 
def test_fruitflies_ambig(self): grammar = """ start: noun verb noun -> simple @@ -350,7 +341,7 @@ def _make_full_earley_test(LEXER): # assert x.data != '_ambig', x # assert len(x.children) == 1 - _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize() + _NAME = "TestFullEarley" + LEXER.capitalize() _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley @@ -402,7 +393,6 @@ def _make_parser_test(LEXER, PARSER): """) g.parse(u'\xa3\u0101\u00a3') - @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing") def test_unicode2(self): g = _Lark(r"""start: UNIA UNIB UNIA UNIC UNIA: /\xa3/ @@ -614,11 +604,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual(x.children, ['HelloWorld']) - @unittest.skipIf(LEXER is None, "Known bug with scanless parsing") # TODO def test_token_collision2(self): - # NOTE: This test reveals a bug in token reconstruction in Scanless Earley - # I probably need to re-write grammar transformation - g = _Lark(""" !start: "starts" @@ -662,7 +648,6 @@ def _make_parser_test(LEXER, PARSER): x = g.parse('aaaab') x = g.parse('b') - @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing") # TODO def test_token_not_anon(self): """Tests that "a" is matched as A, rather than an anonymous token. @@ -755,7 +740,6 @@ def _make_parser_test(LEXER, PARSER): """) x = g.parse('AB') - @unittest.skipIf(LEXER == None, "Scanless can't handle regexps") def test_regex_quote(self): g = r""" start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING @@ -866,7 +850,6 @@ def _make_parser_test(LEXER, PARSER): """ self.assertRaises( GrammarError, _Lark, g) - @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO def test_line_and_column(self): g = r"""!start: "A" bc "D" !bc: "B\nC" @@ -1054,7 +1037,6 @@ def _make_parser_test(LEXER, PARSER): - @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_ignore(self): grammar = r""" @@ -1081,7 +1063,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) - @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/") g.parse('a') @@ -1188,7 +1169,7 @@ def _make_parser_test(LEXER, PARSER): - _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize() + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME globals()[_NAME] = _TestParser @@ -1199,13 +1180,13 @@ _TO_TEST = [ ('dynamic', 'earley'), ('standard', 'lalr'), ('contextual', 'lalr'), - (None, 'earley'), + # (None, 'earley'), ] for _LEXER, _PARSER in _TO_TEST: _make_parser_test(_LEXER, _PARSER) -for _LEXER in (None, 'dynamic'): +for _LEXER in ('dynamic',): _make_full_earley_test(_LEXER) if __name__ == '__main__': From 1839c324d3c34fe57f0b03711c6814f042b8e8da Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 17:32:46 +0300 Subject: [PATCH 16/21] Small refactoring step --- lark/load_grammar.py | 26 +++++++++++++++++++------- lark/utils.py | 7 ++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index be96b1b..e87870f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,6 +13,7 @@ from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, 
Rule, Terminal, NonTerminal +from .utils import classify from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -108,10 +109,14 @@ RULES = { '?atom': ['_LPAR expansions _RPAR', 'maybe', - 'name', + 'terminal', + 'nonterminal', 'literal', 'range'], + 'terminal': ['TOKEN'], + 'nonterminal': ['RULE'], + '?name': ['RULE', 'TOKEN'], 'maybe': ['_LBRA expansions _RBRA'], @@ -514,6 +519,12 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] +class PrepareGrammar(InlineTransformer): + def terminal(self, name): + return name + def nonterminal(self, name): + return name + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] @@ -554,15 +565,16 @@ class GrammarLoader: raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) raise - # Extract grammar items + tree = PrepareGrammar().transform(tree) - token_defs = [c.children for c in tree.children if c.data=='token'] - rule_defs = [c.children for c in tree.children if c.data=='rule'] - statements = [c.children for c in tree.children if c.data=='statement'] - assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children) + # Extract grammar items + defs = classify(tree.children, lambda c: c.data, lambda c: c.children) + token_defs = defs.pop('token', []) + rule_defs = defs.pop('rule', []) + statements = defs.pop('statement', []) + assert not defs token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs] - token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs] # Execute statements diff --git a/lark/utils.py b/lark/utils.py index f606704..0018e49 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -17,14 +17,15 @@ def classify_bool(seq, pred): return true_elems, false_elems -def classify(seq, key=None): +def classify(seq, key=None, value=None): d = {} for item in seq: k = key(item) if (key is not None) else item + v = value(item) if (value is not None) else item if k in d: - d[k].append(item) + d[k].append(v) else: - d[k] = [item] + d[k] = [v] return d def bfs(initial, expand): From c5e6cf0954d49b592b162a1870223965742da84a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 11:26:53 +0300 Subject: [PATCH 17/21] Refactoring to introduce Symbol instances before creating anons --- lark/load_grammar.py | 62 ++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e87870f..6af12d0 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions, Rule, Terminal, NonTerminal +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -108,11 +108,13 @@ RULES = { ], '?atom': ['_LPAR expansions _RPAR', - 'maybe', - 'terminal', - 'nonterminal', - 'literal', - 'range'], + 'maybe', + 'value'], + + 'value': ['terminal', + 'nonterminal', + 'literal', + 'range'], 'terminal': ['TOKEN'], 'nonterminal': ['RULE'], @@ -149,7 +151,7 @@ class EBNF_to_BNF(InlineTransformer): new_name = 
'__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 - t = Token('RULE', new_name, -1) + t = NonTerminal(Token('RULE', new_name, -1)) tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t @@ -235,7 +237,7 @@ class RuleTreeToText(Transformer): def expansions(self, x): return x def expansion(self, symbols): - return [sym.value for sym in symbols], None + return symbols, None def alias(self, x): (expansion, _alias), alias = x assert _alias is None, (alias, expansion, '-', _alias) @@ -305,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Token('TOKEN', token_name, -1) + return Terminal(Token('TOKEN', token_name, -1)) def _rfind(s, choices): @@ -349,7 +351,7 @@ def _literal_to_pattern(literal): s = _fix_escaping(x) - if v[0] == '"': + if literal.type == 'STRING': s = s.replace('\\\\', '\\') return { 'STRING': PatternStr, @@ -368,6 +370,7 @@ class PrepareLiterals(InlineTransformer): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) + class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -404,6 +407,17 @@ class TokenTreeToPattern(Transformer): def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") + def value(self, v): + return v[0] + +class PrepareSymbols(Transformer): + def value(self, v): + v ,= v + if isinstance(v, Tree): + return v + return {'TOKEN': Terminal, + 'RULE': NonTerminal}[v.type](v.value) + def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -432,6 +446,7 @@ class Grammar: # 1. Pre-process terminals transformer = PrepareLiterals() + transformer *= PrepareSymbols() transformer *= ExtractAnonTokens(tokens) # Adds to tokens # 2. 
Convert EBNF to BNF (and apply step 1) @@ -458,7 +473,7 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - expansion = [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + assert all(isinstance(x, Symbol) for x in expansion), expansion rule = Rule(NonTerminal(name), expansion, alias, options) compiled_rules.append(rule) @@ -489,14 +504,14 @@ def resolve_token_references(token_defs): while True: changed = False for name, (token_tree, _p) in token_defs: - for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')): - for i, item in enumerate(exp.children): - if isinstance(item, Token): - if item.type == 'RULE': - raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) - if item.type == 'TOKEN': - exp.children[i] = token_dict[item] - changed = True + for exp in token_tree.find_data('value'): + item ,= exp.children + if isinstance(item, Token): + if item.type == 'RULE': + raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) + if item.type == 'TOKEN': + exp.children[0] = token_dict[item] + changed = True if not changed: break @@ -525,6 +540,7 @@ class PrepareGrammar(InlineTransformer): def nonterminal(self, name): return name + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] @@ -609,9 +625,11 @@ class GrammarLoader: t2 ,= t.children if t2.data=='expansion' and len(t2.children) == 1: item ,= t2.children - if isinstance(item, Token) and item.type == 'TOKEN': - ignore_names.append(item.value) - continue + if item.data == 'value': + item ,= item.children + if isinstance(item, Token) and item.type == 'TOKEN': + ignore_names.append(item.value) + continue name = '__IGNORE_%d'% len(ignore_names) ignore_names.append(name) From 7b32ffd83a9d682c5c71df49467a6a481702d3f8 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:05:11 +0300 Subject: [PATCH 18/21] Fixed token visibility rules (Issue #109) Anonymous tokens would become visible if they had the same value as named tokens. That's because they are merged for the lexer. But after this change, the rules for visibility are based on their use in the rule, and not their name or identity. 
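As a quick illustration of the intended behaviour (mirroring the updated
test_token_not_anon below; this sketch is not part of the diff and assumes
the default Lark() configuration):

    from lark import Lark

    g = Lark('''start: "a"
                A: "a" ''')
    # The inline "a" is anonymous in this rule, so it is filtered out of the
    # tree even though the named terminal A has the same value.
    assert len(g.parse('a').children) == 0

    g = Lark('''start: "a" A
                A: "a" ''')
    # Only the terminal used by name in the rule is kept.
    tree = g.parse('aa')
    assert len(tree.children) == 1 and tree.children[0].type == 'A'
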
--- lark/grammar.py | 14 ++++++-------- lark/load_grammar.py | 11 +++++++---- lark/parse_tree_builder.py | 11 ++++------- lark/parser_frontends.py | 9 ++------- tests/test_parser.py | 19 ++++++++----------- 5 files changed, 27 insertions(+), 37 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index bf12b10..37c2997 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -20,9 +20,10 @@ class Symbol(object): class Terminal(Symbol): is_term = True - @property - def filter_out(self): - return self.name.startswith('_') + def __init__(self, name, filter_out=False): + self.name = name + self.filter_out = filter_out + class NonTerminal(Symbol): is_term = False @@ -46,17 +47,14 @@ class Rule(object): class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None): + def __init__(self, keep_all_tokens=False, expand1=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority - self.filter_out = filter_out # remove this rule from the tree - def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r)' % ( self.keep_all_tokens, self.expand1, self.priority, - self.filter_out ) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 6af12d0..9ebacb1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -307,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Terminal(Token('TOKEN', token_name, -1)) + return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr)) def _rfind(s, choices): @@ -415,8 +415,11 @@ class PrepareSymbols(Transformer): v ,= v if isinstance(v, Tree): return v - return {'TOKEN': Terminal, - 'RULE': NonTerminal}[v.type](v.value) + elif v.type == 'RULE': + return NonTerminal(v.value) + elif v.type == 'TOKEN': + return Terminal(v.value, filter_out=v.startswith('_')) + assert False def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -532,7 +535,7 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): - return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] class PrepareGrammar(InlineTransformer): def terminal(self, name): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 94fcdb9..59bbc86 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -75,8 +75,9 @@ class ChildFilterLALR(ChildFilter): def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, filter_out, ambiguous): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): + to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) + if keep_all_tokens or not (sym.is_term and sym.filter_out)] if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) @@ -97,10 +98,6 @@ class ParseTreeBuilder: self.user_aliases = {} def _init_builders(self, rules): - filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} 
- assert all(t.name.startswith('_') for t in filter_out) - for rule in rules: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) @@ -108,7 +105,7 @@ class ParseTreeBuilder: wrapper_chain = filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), self.propagate_positions and PropagatePositions, ]) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index b7a9225..f322524 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,11 +7,6 @@ from .lexer import Lexer, ContextualLexer, Token from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree -from .grammar import Terminal, NonTerminal - -def terminals(seq): - # return [Terminal(t) for t in seq] - return seq class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -23,8 +18,8 @@ class WithLexer: states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () self.lexer = ContextualLexer(lexer_conf.tokens, states, - ignore=terminals(lexer_conf.ignore), - always_accept=terminals(always_accept), + ignore=lexer_conf.ignore, + always_accept=always_accept, user_callbacks=lexer_conf.callbacks) def lex(self, text): diff --git a/tests/test_parser.py b/tests/test_parser.py index 76ad509..21a3dc6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -649,28 +649,25 @@ def _make_parser_test(LEXER, PARSER): x = g.parse('b') def test_token_not_anon(self): - """Tests that "a" is matched as A, rather than an anonymous token. - - That means that "a" is not filtered out, despite being an 'immediate string'. - Whether or not this is the intuitive behavior, I'm not sure yet. - - Perhaps the right thing to do is report a collision (if such is relevant) - - -Erez + """Tests that "a" is matched as an anonymous token, and not A. 
""" g = _Lark("""start: "a" A: "a" """) x = g.parse('a') + self.assertEqual(len(x.children), 0, '"a" should be considered anonymous') - self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous') + g = _Lark("""start: "a" A + A: "a" """) + x = g.parse('aa') + self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous') self.assertEqual(x.children[0].type, "A") g = _Lark("""start: /a/ A: /a/ """) x = g.parse('a') - self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous') - self.assertEqual(x.children[0].type, "A") + self.assertEqual(len(x.children), 1) + self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/") @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_maybe(self): From 0d56b0cf303f71c64fa68c7a061c5e69cef788f6 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:13:22 +0300 Subject: [PATCH 19/21] Anon terminals no longer need to start with _ --- lark/load_grammar.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 9ebacb1..95a96f5 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -254,7 +254,7 @@ class CanonizeTree(InlineTransformer): tokenmods, value = args return tokenmods + [value] -class ExtractAnonTokens(InlineTransformer): +class PrepareAnonTerminals(InlineTransformer): "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" def __init__(self, tokens): @@ -278,7 +278,7 @@ class ExtractAnonTokens(InlineTransformer): try: token_name = _TOKEN_NAMES[value] except KeyError: - if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set: + if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: token_name = '%s%d' % (value.upper(), self.i) try: # Make sure we don't have unicode in our token names @@ -289,8 +289,6 @@ class ExtractAnonTokens(InlineTransformer): token_name = 'ANONSTR_%d' % self.i self.i += 1 - token_name = '__' + token_name - elif isinstance(p, PatternRE): if p in self.token_reverse: # Kind of a wierd placement.name token_name = self.token_reverse[p].name @@ -448,9 +446,7 @@ class Grammar: # ================= # 1. Pre-process terminals - transformer = PrepareLiterals() - transformer *= PrepareSymbols() - transformer *= ExtractAnonTokens(tokens) # Adds to tokens + transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens # 2. 
Convert EBNF to BNF (and apply step 1) ebnf_to_bnf = EBNF_to_BNF() From ea413fd648f219dd3336130563e5b8e725fee453 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:24:01 +0300 Subject: [PATCH 20/21] Simplify PrepareAnonTerminals --- lark/load_grammar.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 95a96f5..3aa9827 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol -from .utils import classify +from .utils import classify, suppress from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -269,35 +269,32 @@ class PrepareAnonTerminals(InlineTransformer): if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) + token_name = None + if isinstance(p, PatternStr): try: # If already defined, use the user-defined token name token_name = self.token_reverse[p].name except KeyError: - # Try to assign an indicative anon-token name, otherwise use a numbered name + # Try to assign an indicative anon-token name try: token_name = _TOKEN_NAMES[value] except KeyError: if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: - token_name = '%s%d' % (value.upper(), self.i) - try: - # Make sure we don't have unicode in our token names - token_name.encode('ascii') - except UnicodeEncodeError: - token_name = 'ANONSTR_%d' % self.i - else: - token_name = 'ANONSTR_%d' % self.i - self.i += 1 + with suppress(UnicodeEncodeError): + value.upper().encode('ascii') # Make sure we don't have unicode in our token names + token_name = value.upper() elif isinstance(p, PatternRE): if p in self.token_reverse: # Kind of a wierd placement.name token_name = self.token_reverse[p].name - else: - token_name = 'ANONRE_%d' % self.i - self.i += 1 else: assert False, p + if token_name is None: + token_name = '__ANON_%d' % self.i + self.i += 1 + if token_name not in self.token_set: assert p not in self.token_reverse self.token_set.add(token_name) From 2b4ef11ebf1770871d5b03f1f245701beab43072 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 10 May 2018 12:34:19 +0300 Subject: [PATCH 21/21] Columns now start at 1 --- lark/lexer.py | 4 ++-- lark/parsers/xearley.py | 4 ++-- tests/test_parser.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 19e1be4..51ccf6c 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -63,7 +63,7 @@ class LineCounter: self.newline_char = '\n' self.char_pos = 0 self.line = 1 - self.column = 0 + self.column = 1 self.line_start_pos = 0 def feed(self, token, test_newline=True): @@ -78,7 +78,7 @@ class LineCounter: self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos + self.column = self.char_pos - self.line_start_pos + 1 class _Lex: "Built to serve both Lexer and ContextualLexer" diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index c64bfee..5e8fb28 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -53,7 +53,7 @@ class Parser: match = self.term_matcher text_line = 1 - text_column 
= 0 + text_column = 1 def predict(nonterm, column): assert not nonterm.is_term, nonterm @@ -128,7 +128,7 @@ class Parser: if token == '\n': text_line += 1 - text_column = 0 + text_column = 1 else: text_column += 1 diff --git a/tests/test_parser.py b/tests/test_parser.py index 21a3dc6..6823f0c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -854,22 +854,22 @@ def _make_parser_test(LEXER, PARSER): l = _Lark(g) a, bc, d = l.parse("AB\nCD").children self.assertEqual(a.line, 1) - self.assertEqual(a.column, 0) + self.assertEqual(a.column, 1) bc ,= bc.children self.assertEqual(bc.line, 1) - self.assertEqual(bc.column, 1) + self.assertEqual(bc.column, 2) self.assertEqual(d.line, 2) - self.assertEqual(d.column, 1) + self.assertEqual(d.column, 2) if LEXER != 'dynamic': self.assertEqual(a.end_line, 1) - self.assertEqual(a.end_column, 1) + self.assertEqual(a.end_column, 2) self.assertEqual(bc.end_line, 2) - self.assertEqual(bc.end_column, 1) + self.assertEqual(bc.end_column, 2) self.assertEqual(d.end_line, 2) - self.assertEqual(d.end_column, 2) + self.assertEqual(d.end_column, 3)
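
A short usage sketch of the new 1-based column numbering (illustration only,
not part of the patch; assumes the standard lexer with the LALR parser):

    from lark import Lark

    p = Lark('!start: "A" "B"', parser='lalr', lexer='standard')
    a, b = p.parse('AB').children
    assert (a.line, a.column) == (1, 1)   # column was previously 0-based
    assert (b.line, b.column) == (1, 2)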