diff --git a/examples/conf.py b/examples/conf.py index c872b09..f9cc84a 100644 --- a/examples/conf.py +++ b/examples/conf.py @@ -34,4 +34,4 @@ a=Hello this="that",4 """ -print parser.parse(sample_conf).pretty() +print(parser.parse(sample_conf).pretty()) diff --git a/lark/common.py b/lark/common.py index 53b6a86..06220f0 100644 --- a/lark/common.py +++ b/lark/common.py @@ -39,6 +39,7 @@ class LexerConf: class ParserConf: def __init__(self, rules, callback, start): + assert all(len(r)==3 for r in rules) self.rules = rules self.callback = callback self.start = start diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 453ed45..fe09a61 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -61,7 +61,7 @@ TOKENS = { '_COLON': ':', '_OR': r'\|', '_DOT': r'\.', - 'RULE': '[_?*]?[a-z][_a-z0-9]*', + 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TOKEN': '_?[A-Z][_A-Z0-9]*', 'STRING': r'".*?[^\\]"', 'REGEXP': r"/(?!/).*?[^\\]/", @@ -302,6 +302,8 @@ class GrammarLoader: raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column)) elif 'tokenvalue' in e.expected: raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column)) + elif e.expected == ['_OR']: + raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column)) raise # ================= @@ -363,7 +365,7 @@ class GrammarLoader: used_symbols = {symbol for expansions in rules.values() for expansion, _alias in expansions for symbol in expansion} - rule_set = {r.lstrip('?') for r in rules} + rule_set = {r.lstrip('!').lstrip('?') for r in rules} for sym in used_symbols: if is_terminal(sym): if sym not in token_set: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 060fc76..1b1b2cd 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -12,24 +12,25 @@ def create_expand1_tree_builder_function(tree_builder): return tree_builder(children) return expand1 -def create_rule_handler(expansion, usermethod): - to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion) - if not (is_terminal(sym) and sym.startswith('_'))] - - if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - def _build_ast(match): - children = [] - for i, to_expand in to_include: - if to_expand: - children += match[i].children - else: - children.append(match[i]) +def create_rule_handler(expansion, usermethod, keep_all_tokens): + if not keep_all_tokens: + to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion) + if not (is_terminal(sym) and sym.startswith('_'))] + + if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): + def _build_ast(match): + children = [] + for i, to_expand in to_include: + if to_expand: + children += match[i].children + else: + children.append(match[i]) - return usermethod(children) - else: - _build_ast = usermethod + return usermethod(children) + return _build_ast - return _build_ast + # else, if no filtering required.. + return usermethod class ParseTreeBuilder: @@ -48,6 +49,11 @@ class ParseTreeBuilder: callback = Callback() new_rules = [] for origin, expansions in rules.items(): + keep_all_tokens = False + if origin.startswith('!'): + origin=origin.lstrip('!') + keep_all_tokens = True + expand1 = origin.startswith('?') _origin = origin.lstrip('?') @@ -69,7 +75,7 @@ class ParseTreeBuilder: if expand1: f = create_expand1_tree_builder_function(f) - alias_handler = create_rule_handler(expansion, f) + alias_handler = create_rule_handler(expansion, f, keep_all_tokens) if hasattr(callback, _alias): raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 668815c..54b67bb 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -22,6 +22,7 @@ class WithLexer: class LALR(WithLexer): def __init__(self, lexer_conf, parser_conf): WithLexer.__init__(self, lexer_conf) + self.parser_conf = parser_conf analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start) analyzer.analyze() @@ -95,7 +96,7 @@ class Earley_NoLex: regexp = self.token_by_name[sym].to_regexp() width = sre_parse.parse(regexp).getwidth() if not width == (1,1): - raise GrammarError('Dynamic lexing requires all tokens have the width 1 (%s is %s)' % (regexp, width)) + raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width)) yield sym, re.compile(regexp) else: yield sym diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tools/nearley.py b/tools/nearley.py deleted file mode 100644 index b78d3b8..0000000 --- a/tools/nearley.py +++ /dev/null @@ -1,150 +0,0 @@ -"Converts between Lark and Nearley grammars. Work in progress!" - -import os.path -import sys - -from lark import Lark, InlineTransformer - -nearley_grammar = r""" - start: (ruledef|directive)+ - - directive: "@" NAME (STRING|NAME) - | "@" _JS -> js_code - ruledef: NAME "->" expansions - | NAME REGEXP "->" expansions -> macro - expansions: expansion ("|" expansion)* - - expansion: expr+ _JS? - - ?expr: item [":" /[+*?]/] - - ?item: rule|string|regexp - | "(" expansions ")" - - rule: NAME - string: STRING - regexp: REGEXP - _JS: /(?s){%.*?%}/ - - NAME: /[a-zA-Z_$]\w*/ - WS.ignore: /[\t \f\n]+/ - COMMENT.ignore: /\#[^\n]*/ - REGEXP: /\[.*?\]/ - STRING: /".*?"/ - - """ - - - -class NearleyToLark(InlineTransformer): - def __init__(self, builtin_path): - self.builtin_path = builtin_path - - def rule(self, name): - # return {'_': '_WS?', '__':'_WS'}.get(name, name) - return {'_': '_ws_maybe', '__':'_ws'}.get(name, name) - - def ruledef(self, name, exps): - name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name) - return '%s: %s' % (name, exps) - - def expr(self, item, op): - return '(%s)%s' % (item, op) - - def regexp(self, r): - return '/%s/' % r - - def string(self, s): - # TODO allow regular strings, and split them in the parser frontend - return ' '.join('"%s"'%ch for ch in s[1:-1]) - - def expansion(self, *x): - return ' '.join(x) - - def expansions(self, *x): - return '(%s)' % ('\n |'.join(x)) - - def js_code(self): - return '' - - def macro(self, *args): - return '' # TODO support macros?! - - def directive(self, name, *args): - if name == 'builtin': - arg = args[0][1:-1] - with open(os.path.join(self.builtin_path, arg)) as f: - text = f.read() - return nearley_to_lark(text, self.builtin_path) - elif name == 'preprocessor': - return '' - - raise Exception('Unknown directive: %s' % name) - - def start(self, *rules): - return '\n'.join(filter(None, rules)) - -def nearley_to_lark(g, builtin_path): - parser = Lark(nearley_grammar) - tree = parser.parse(g) - return NearleyToLark(builtin_path).transform(tree) - - -def test(): - css_example_grammar = """ -# http://www.w3.org/TR/css3-color/#colorunits - - @builtin "whitespace.ne" - @builtin "number.ne" - @builtin "postprocessors.ne" - - csscolor -> "#" hexdigit hexdigit hexdigit hexdigit hexdigit hexdigit {% - function(d) { - return { - "r": parseInt(d[1]+d[2], 16), - "g": parseInt(d[3]+d[4], 16), - "b": parseInt(d[5]+d[6], 16), - } - } - %} - | "#" hexdigit hexdigit hexdigit {% - function(d) { - return { - "r": parseInt(d[1]+d[1], 16), - "g": parseInt(d[2]+d[2], 16), - "b": parseInt(d[3]+d[3], 16), - } - } - %} - | "rgb" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"r": 4, "g": 8, "b": 12}) %} - | "hsl" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"h": 4, "s": 8, "l": 12}) %} - | "rgba" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"r": 4, "g": 8, "b": 12, "a": 16}) %} - | "hsla" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"h": 4, "s": 8, "l": 12, "a": 16}) %} - - hexdigit -> [a-fA-F0-9] - colnum -> unsigned_int {% id %} | percentage {% - function(d) {return Math.floor(d[0]*255); } - %} - """ - converted_grammar = nearley_to_lark(css_example_grammar, '/home/erez/nearley/builtin') - print(converted_grammar) - - l = Lark(converted_grammar, start='csscolor', parser='earley_nolex') - print(l.parse('#a199ff').pretty()) - print(l.parse('rgb(255, 70%, 3)').pretty()) - - -def main(): - try: - nearley_lib = sys.argv[1] - except IndexError: - print("Reads Nearley grammar from stdin and outputs a lark grammar.") - print("Usage: %s " % sys.argv[0]) - return - - grammar = sys.stdin.read() - print(nearley_to_lark(grammar, os.path.join(nearley_lib, 'builtin'))) - - -if __name__ == '__main__': - main()