@@ -4,6 +4,7 @@
/lark_parser.egg-info/**
tags
.vscode
.idea
.ropeproject
.cache
/dist
@@ -72,7 +72,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba

See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)
@@ -95,7 +95,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Extensive test suite [](https://codecov.io/gh/erezsh/lark)
- And much more!
See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)
### Comparison to other libraries
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
from .lexer import Token
from .lark import Lark
__version__ = "0.7.2"
__version__ = "0.7.4"
@@ -8,7 +8,6 @@ from .exceptions import UnexpectedCharacters, LexError
###{standalone
class Pattern(Serialize):
__serialize_fields__ = 'value', 'flags'
def __init__(self, value, flags=()):
self.value = value
@@ -41,6 +40,8 @@ class Pattern(Serialize):
class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags'
type = "str"
def to_regexp(self):
@@ -52,6 +53,8 @@ class PatternStr(Pattern):
max_width = min_width
class PatternRE(Pattern):
__serialize_fields__ = 'value', 'flags', '_width'
type = "re"
def to_regexp(self):
@@ -98,7 +101,7 @@ class Token(Str):
self.type = type_
self.pos_in_stream = pos_in_stream
self.value = Str(value)
self.value = value
self.line = line
self.column = column
self.end_line = end_line
@@ -265,13 +268,14 @@ def build_mres(terminals, match_whole=False):
return _build_mres(terminals, len(terminals), match_whole)
def _regexp_has_newline(r):
"""Expressions that may indicate newlines in a regexp:
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
- escaped newline (\\n)
- anything but ([^...])
- any-char (.) when the flag (?s) exists
- spaces (\s)
"""
return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
class Lexer(object):
"""Lexer interface
@@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
from .tree import Tree, SlottedTree as ST
@@ -351,7 +351,10 @@ def _fix_escaping(s):
for n in i:
w += n
if n == '\\':
n2 = next(i)
try:
n2 = next(i)
except StopIteration:
raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'uxnftr':
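For context, the guarded `next(i)` only fails when a literal ends with a lone backslash, leaving the escape with nothing to consume. A tiny sketch of that failure mode (the helper name is made up; it just mirrors the loop above):

```python
# Mirrors the escape-scanning loop above: a trailing lone backslash exhausts the
# iterator mid-escape, which previously leaked a bare StopIteration and now
# surfaces as a readable ValueError.
def check_escapes(s):
    i = iter(s)
    for n in i:
        if n == '\\':
            try:
                next(i)
            except StopIteration:
                raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)

check_escapes(r'ok\n')         # fine: the backslash is followed by 'n'
try:
    check_escapes('broken\\')  # nothing follows the escape
except ValueError as e:
    print(e)
```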
@@ -451,9 +454,9 @@ class PrepareSymbols(Transformer_InPlace):
if isinstance(v, Tree):
return v
elif v.type == 'RULE':
return NonTerminal(v.value)
return NonTerminal(Str(v.value))
elif v.type == 'TERMINAL':
return Terminal(v.value, filter_out=v.startswith('_'))
return Terminal(Str(v.value), filter_out=v.startswith('_'))
assert False
def _choice_of_rules(rules):
@@ -511,12 +514,12 @@ class Grammar:
simplify_rule = SimplifyRule_Visitor()
compiled_rules = []
for i, rule_content in enumerate(rules):
for rule_content in rules:
name, tree, options = rule_content
simplify_rule.visit(tree)
expansions = rule_tree_to_text.transform(tree)
for expansion, alias in expansions:
for i, (expansion, alias) in enumerate(expansions):
if alias and name.startswith('_'):
raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
@@ -538,7 +541,7 @@ class Grammar:
for dups in duplicates.values():
if len(dups) > 1:
if dups[0].expansion:
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups))
# Empty rule; assert all other attributes are equal
assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
@@ -605,7 +608,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
_, tree, _ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))
return _find_used_symbols(tree)
def get_namespace_name(name):
try:
@@ -682,6 +687,11 @@ class PrepareGrammar(Transformer_InPlace):
return name
def _find_used_symbols(tree):
assert tree.data == 'expansions'
return {t for x in tree.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
class GrammarLoader:
def __init__(self):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -843,9 +853,7 @@ class GrammarLoader:
rule_names.add(name)
for name, expansions, _o in rules:
used_symbols = {t for x in expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
for sym in used_symbols:
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
@@ -118,7 +118,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.lexer = lexer_cls(self.lexer_conf)
self.lexer = lexer_cls(lexer_conf)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -139,7 +139,8 @@ class Earley(WithLexer):
self.init_traditional_lexer()
resolve_ambiguity = options.ambiguity == 'resolve'
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
debug = options.debug if options else False
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)
def match(self, term, token):
return term.name == token.type
@@ -152,10 +153,12 @@ class XEarley(_ParserFrontend):
self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
self.parser = xearley.Parser(parser_conf,
self.match,
ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity,
debug=debug,
**kw
)
@@ -20,10 +20,11 @@ from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.debug = debug
self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
@@ -296,6 +297,10 @@ class Parser:
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
if self.debug:
from .earley_forest import ForestToPyDotVisitor
debug_walker = ForestToPyDotVisitor()
debug_walker.visit(solutions[0], "sppf.png")
if not solutions:
expected_tokens = [t.expect for t in to_scan]
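Reading this together with the parser_frontends.py hunks above, `Lark(..., debug=True)` on an Earley grammar should now dump the SPPF of the first solution to `sppf.png` as a side effect of parsing. A hedged usage sketch (it assumes `pydot` is installed so `ForestToPyDotVisitor` can render, and the toy grammar is mine):

```python
# Hedged sketch: assumes the debug flag is forwarded from the Lark constructor to
# earley.Parser as in the frontend hunks, and that pydot is available so the
# SPPF can be rendered to "sppf.png".
from lark import Lark

grammar = r"""
start: WORD+
%import common.WORD
%import common.WS
%ignore WS
"""
parser = Lark(grammar, parser='earley', debug=True)
parser.parse("fruit flies like bananas")  # writes sppf.png when debug is on
```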
@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
ambiguously. Hence, we use the sort order to identify
the order in which ambiguous children should be considered.
"""
return self.is_empty, -self.priority, -self.rule.order
return self.is_empty, -self.priority, self.rule.order
def __iter__(self):
return iter([self.left, self.right])
@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode
class Parser(BaseParser):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
self.ignore = [Terminal(t) for t in ignore]
self.complete_lex = complete_lex
@@ -0,0 +1,39 @@
import codecs
import sys
import json
from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef
import argparse
argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')
argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')
def serialize(infile, outfile, lexer, start):
lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start) # TODO contextual
data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
outfile.write('{\n')
outfile.write(' "data": %s,\n' % json.dumps(data))
outfile.write(' "memo": %s\n' % json.dumps(memo))
outfile.write('}\n')
def main():
if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
print("")
argparser.print_help()
else:
args = argparser.parse_args()
serialize(args.grammar_file, args.out, args.lexer, args.start)
if __name__ == '__main__':
main()
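Per the argparse `prog` above, the intended CLI entry point is `python -m lark.tools.serialize grammar.lark`. Calling `serialize()` directly from Python also works; a hedged sketch with in-memory files (the toy grammar is an assumption, the data/memo JSON layout comes straight from the writes above):

```python
# Hedged usage sketch: drive the new serialize() helper with in-memory files.
import io
import json
from lark.tools.serialize import serialize

grammar = io.StringIO(r"""
start: WORD+
%import common.WORD
%import common.WS
%ignore WS
""")
out = io.StringIO()
serialize(grammar, out, lexer='standard', start='start')
payload = json.loads(out.getvalue())
print(sorted(payload))  # ['data', 'memo']
```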
@@ -56,30 +56,6 @@ class Tree(object):
def __hash__(self):
return hash((self.data, tuple(self.children)))
###}
def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children
def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())
def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)
def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c
def iter_subtrees(self):
# TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
yield x
seen.add(id(x))
def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())
def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)
###}
def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children
def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c
def iter_subtrees_topdown(self):
stack = [self]
while stack:
@@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator):
elif isinstance(f, partial):
# wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
return create_decorator(f.__func__, True)
return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
else:
return create_decorator(f.__func__.__call__, True)
@@ -172,7 +172,7 @@ import sre_parse
import sre_constants
def get_regexp_width(regexp):
try:
return sre_parse.parse(regexp).getwidth()
return [int(x) for x in sre_parse.parse(regexp).getwidth()]
except sre_constants.error:
raise ValueError(regexp)
@@ -0,0 +1,10 @@
version: 2
mkdocs:
configuration: mkdocs.yml
fail_on_warning: false
formats: all
python:
version: 3.5
@@ -21,6 +21,7 @@ from .test_parser import (
TestCykStandard,
TestLalrContextual,
TestEarleyDynamic,
TestLalrCustom,
# TestFullEarleyStandard,
TestFullEarleyDynamic,
@@ -22,7 +22,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
from lark.tree import Tree
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.lexer import TerminalDef, Lexer, TraditionalLexer
__path__ = os.path.dirname(__file__)
def _read(n, *args):
@@ -431,12 +431,22 @@ def _make_full_earley_test(LEXER):
_TestFullEarley.__name__ = _NAME
globals()[_NAME] = _TestFullEarley
class CustomLexer(Lexer):
"""
The purpose of this custom lexer is to test the integration,
so it uses TraditionalLexer as its implementation, without custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)
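Outside the test harness, the same pattern is what a user would write: pass the lexer class itself as `lexer=` together with `parser='lalr'`, which is exactly what `_make_parser_test` does below via `lexer_class_or_name`. A hedged sketch (the pass-through lexer mirrors CustomLexer above; the grammar and class name are mine):

```python
# Hedged sketch of plugging a custom lexer class into Lark, mirroring the
# CustomLexer defined in the test above.
from lark import Lark
from lark.lexer import Lexer, TraditionalLexer

class PassThroughLexer(Lexer):
    """Delegates to TraditionalLexer; a real custom lexer would tokenize itself."""
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore,
                                      user_callbacks=lexer_conf.callbacks)

    def lex(self, data):
        return self.lexer.lex(data)

grammar = r"""
start: WORD+
%import common.WORD
%import common.WS
%ignore WS
"""
parser = Lark(grammar, parser='lalr', lexer=PassThroughLexer)
print(parser.parse("custom lexers work").children)
```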
def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -1532,7 +1542,7 @@ def _make_parser_test(LEXER, PARSER):
parser = _Lark(grammar)
@unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
@unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
def test_serialize(self):
grammar = """
start: _ANY b "C"
@@ -1558,6 +1568,28 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))
def test_lexer_detect_newline_tokens(self):
# Detect newlines in regular tokens
g = _Lark(r"""start: "go" tail*
!tail : SA "@" | SB "@" | SC "@" | SD "@"
SA : "a" /\n/
SB : /b./s
SC : "c" /[^a-z]/
SD : "d" /\s/
""")
a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
self.assertEqual(a.line, 2)
self.assertEqual(b.line, 3)
self.assertEqual(c.line, 4)
self.assertEqual(d.line, 5)
# Detect newlines in ignored tokens
for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
g = _Lark('''!start: "a" "a"
%ignore {}'''.format(re))
a, b = g.parse('a\na').children
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)
_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1572,6 +1604,7 @@ _TO_TEST = [
('dynamic_complete', 'earley'),
('standard', 'lalr'),
('contextual', 'lalr'),
('custom', 'lalr'),
# (None, 'earley'),
]
@@ -4,6 +4,7 @@ import unittest
from unittest import TestCase
import copy
import pickle
import functools
from lark.tree import Tree
from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
@@ -146,6 +147,22 @@ class TestTrees(TestCase):
res = T().transform(t)
self.assertEqual(res, 2.9)
def test_partial(self):
tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])
def test(prefix, s, postfix):
return prefix + s.upper() + postfix
@v_args(inline=True)
class T(Transformer):
a = functools.partial(test, "@", postfix="!")
b = functools.partial(lambda s: s + "!")
res = T().transform(tree)
assert res.children == ["@TEST1!", "test2!"]
def test_discard(self):
class MyTransformer(Transformer):
def a(self, args):