@@ -34,4 +34,4 @@ a=Hello
 this="that",4
 """
-print parser.parse(sample_conf).pretty()
+print(parser.parse(sample_conf).pretty())
@@ -39,6 +39,7 @@ class LexerConf:
 class ParserConf:
     def __init__(self, rules, callback, start):
+        assert all(len(r)==3 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start
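A note on the new assert: it guards the shape of the rules passed in. A minimal sketch of data that would satisfy it, assuming the (origin, expansion, alias) triples that the rest of this diff unpacks (names are illustrative):

    # Hypothetical rules list -- the triple shape is an assumption based on
    # how ParseTreeBuilder unpacks `expansion, _alias` below.
    rules = [
        ('start', ['value'], None),
        ('value', ['STRING'], 'string_value'),
    ]
    assert all(len(r) == 3 for r in rules)   # passes; a 2-tuple would fail fast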
@@ -61,7 +61,7 @@ TOKENS = {
     '_COLON': ':',
     '_OR': r'\|',
     '_DOT': r'\.',
-    'RULE': '[_?*]?[a-z][_a-z0-9]*',
+    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TOKEN': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'".*?[^\\]"',
     'REGEXP': r"/(?!/).*?[^\\]/",
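The new RULE pattern allows an optional leading '!' (the keep-all-tokens flag introduced by this change) before the existing '_'/'?' prefix, and drops the old '*' alternative. A quick sketch of what now matches:

    import re

    RULE = re.compile(r'!?[_?]?[a-z][_a-z0-9]*')
    assert RULE.fullmatch('!statement')     # keep all tokens
    assert RULE.fullmatch('?expr')          # inline when single child
    assert RULE.fullmatch('!?expr')         # both flags combined
    assert not RULE.fullmatch('*rule')      # '*' prefix is no longer valid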
@@ -302,6 +302,8 @@ class GrammarLoader:
                 raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
             elif 'tokenvalue' in e.expected:
                 raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
+            elif e.expected == ['_OR']:
+                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
             raise

 # =================
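The new branch gives a readable message for a common mistake: continuing a rule on a fresh line without the leading '|', at which point '|' is the only token the grammar parser can accept. A hedged sketch of the kind of input it targets:

    start: "a"
           "b"     <- newline hit where only '|' could continue

Previously this surfaced as a raw parser exception; now it raises the GrammarError above.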
@@ -363,7 +365,7 @@ class GrammarLoader:
         used_symbols = {symbol for expansions in rules.values()
                         for expansion, _alias in expansions
                         for symbol in expansion}
-        rule_set = {r.lstrip('?') for r in rules}
+        rule_set = {r.lstrip('!').lstrip('?') for r in rules}
         for sym in used_symbols:
             if is_terminal(sym):
                 if sym not in token_set:
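Since '!' precedes '?' in rule names (see the RULE pattern above), stripping '!' first and then '?' normalizes every flagged origin to its bare name:

    for name in ('expr', '!expr', '?expr', '!?expr'):
        assert name.lstrip('!').lstrip('?') == 'expr'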
@@ -12,24 +12,25 @@ def create_expand1_tree_builder_function(tree_builder):
         return tree_builder(children)
     return expand1

-def create_rule_handler(expansion, usermethod):
-    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
-                  if not (is_terminal(sym) and sym.startswith('_'))]
-
-    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
-        def _build_ast(match):
-            children = []
-            for i, to_expand in to_include:
-                if to_expand:
-                    children += match[i].children
-                else:
-                    children.append(match[i])
-            return usermethod(children)
-        return _build_ast
+def create_rule_handler(expansion, usermethod, keep_all_tokens):
+    if not keep_all_tokens:
+        to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
+                      if not (is_terminal(sym) and sym.startswith('_'))]
+
+        if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
+            def _build_ast(match):
+                children = []
+                for i, to_expand in to_include:
+                    if to_expand:
+                        children += match[i].children
+                    else:
+                        children.append(match[i])
+                return usermethod(children)
+        else:
+            _build_ast = usermethod
+        return _build_ast

     # else, if no filtering required..
     return usermethod

 class ParseTreeBuilder:
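To make the new flag concrete, a hedged sketch of how the two modes behave (illustrative stand-ins only: real matches are lark Tree/Token objects, and is_terminal must be in scope as in the module above):

    def callback(children):                  # stand-in for the user method
        return ('bracketed', children)

    expansion = ['_LPAR', 'NAME', '_RPAR']   # '_'-prefixed terminals get filtered

    default = create_rule_handler(expansion, callback, keep_all_tokens=False)
    # default(['(', 'x', ')'])  -> ('bracketed', ['x'])            parens dropped
    keep_all = create_rule_handler(expansion, callback, keep_all_tokens=True)
    # keep_all(['(', 'x', ')']) -> ('bracketed', ['(', 'x', ')'])  parens kept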
@@ -48,6 +49,11 @@ class ParseTreeBuilder:
         callback = Callback()
         new_rules = []
         for origin, expansions in rules.items():
+            keep_all_tokens = False
+            if origin.startswith('!'):
+                origin = origin.lstrip('!')
+                keep_all_tokens = True
+
             expand1 = origin.startswith('?')
             _origin = origin.lstrip('?')
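Flag parsing composes with the existing '?' handling, so an origin like '!?expr' sets both flags; step by step:

    origin = '!?expr'
    keep_all_tokens = origin.startswith('!')   # True
    origin = origin.lstrip('!')                # '?expr'
    expand1 = origin.startswith('?')           # True
    _origin = origin.lstrip('?')               # 'expr'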
@@ -69,7 +75,7 @@ class ParseTreeBuilder:
             if expand1:
                 f = create_expand1_tree_builder_function(f)

-            alias_handler = create_rule_handler(expansion, f)
+            alias_handler = create_rule_handler(expansion, f, keep_all_tokens)

             if hasattr(callback, _alias):
                 raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
@@ -22,6 +22,7 @@ class WithLexer:
 class LALR(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
+        self.parser_conf = parser_conf

         analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
         analyzer.analyze()
@@ -95,7 +96,7 @@ class Earley_NoLex:
                 regexp = self.token_by_name[sym].to_regexp()
                 width = sre_parse.parse(regexp).getwidth()
                 if not width == (1,1):
-                    raise GrammarError('Dynamic lexing requires all tokens have the width 1 (%s is %s)' % (regexp, width))
+                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
                 yield sym, re.compile(regexp)
             else:
                 yield sym
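For reference, sre_parse.parse(...).getwidth() returns the (min, max) match length of a regexp, which is exactly what the width check inspects:

    import sre_parse

    assert sre_parse.parse('[a-z]').getwidth() == (1, 1)   # accepted
    assert sre_parse.parse('ab').getwidth() == (2, 2)      # rejected
    assert sre_parse.parse('a?').getwidth() == (0, 1)      # rejected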
@@ -1,150 +0,0 @@
-"Converts between Lark and Nearley grammars. Work in progress!"
-
-import os.path
-import sys
-
-from lark import Lark, InlineTransformer
-
-nearley_grammar = r"""
-    start: (ruledef|directive)+
-
-    directive: "@" NAME (STRING|NAME)
-             | "@" _JS  -> js_code
-    ruledef: NAME "->" expansions
-           | NAME REGEXP "->" expansions -> macro
-    expansions: expansion ("|" expansion)*
-
-    expansion: expr+ _JS?
-
-    ?expr: item [":" /[+*?]/]
-
-    ?item: rule|string|regexp
-         | "(" expansions ")"
-
-    rule: NAME
-    string: STRING
-    regexp: REGEXP
-
-    _JS: /(?s){%.*?%}/
-    NAME: /[a-zA-Z_$]\w*/
-    WS.ignore: /[\t \f\n]+/
-    COMMENT.ignore: /\#[^\n]*/
-    REGEXP: /\[.*?\]/
-    STRING: /".*?"/
-    """
-
-class NearleyToLark(InlineTransformer):
-    def __init__(self, builtin_path):
-        self.builtin_path = builtin_path
-
-    def rule(self, name):
-        # return {'_': '_WS?', '__':'_WS'}.get(name, name)
-        return {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
-
-    def ruledef(self, name, exps):
-        name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
-        return '%s: %s' % (name, exps)
-
-    def expr(self, item, op):
-        return '(%s)%s' % (item, op)
-
-    def regexp(self, r):
-        return '/%s/' % r
-
-    def string(self, s):
-        # TODO allow regular strings, and split them in the parser frontend
-        return ' '.join('"%s"'%ch for ch in s[1:-1])
-
-    def expansion(self, *x):
-        return ' '.join(x)
-
-    def expansions(self, *x):
-        return '(%s)' % ('\n |'.join(x))
-
-    def js_code(self):
-        return ''
-
-    def macro(self, *args):
-        return ''   # TODO support macros?!
-
-    def directive(self, name, *args):
-        if name == 'builtin':
-            arg = args[0][1:-1]
-            with open(os.path.join(self.builtin_path, arg)) as f:
-                text = f.read()
-            return nearley_to_lark(text, self.builtin_path)
-        elif name == 'preprocessor':
-            return ''
-        raise Exception('Unknown directive: %s' % name)
-
-    def start(self, *rules):
-        return '\n'.join(filter(None, rules))
-
-def nearley_to_lark(g, builtin_path):
-    parser = Lark(nearley_grammar)
-    tree = parser.parse(g)
-    return NearleyToLark(builtin_path).transform(tree)
-
-def test():
-    css_example_grammar = """
-# http://www.w3.org/TR/css3-color/#colorunits
-@builtin "whitespace.ne"
-@builtin "number.ne"
-@builtin "postprocessors.ne"
-
-csscolor -> "#" hexdigit hexdigit hexdigit hexdigit hexdigit hexdigit {%
-    function(d) {
-        return {
-            "r": parseInt(d[1]+d[2], 16),
-            "g": parseInt(d[3]+d[4], 16),
-            "b": parseInt(d[5]+d[6], 16),
-        }
-    }
-%}
-         | "#" hexdigit hexdigit hexdigit {%
-    function(d) {
-        return {
-            "r": parseInt(d[1]+d[1], 16),
-            "g": parseInt(d[2]+d[2], 16),
-            "b": parseInt(d[3]+d[3], 16),
-        }
-    }
-%}
-         | "rgb" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"r": 4, "g": 8, "b": 12}) %}
-         | "hsl" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"h": 4, "s": 8, "l": 12}) %}
-         | "rgba" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"r": 4, "g": 8, "b": 12, "a": 16}) %}
-         | "hsla" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"h": 4, "s": 8, "l": 12, "a": 16}) %}
-
-hexdigit -> [a-fA-F0-9]
-
-colnum -> unsigned_int {% id %} | percentage {%
-    function(d) {return Math.floor(d[0]*255); }
-%}
-"""
-    converted_grammar = nearley_to_lark(css_example_grammar, '/home/erez/nearley/builtin')
-    print(converted_grammar)
-
-    l = Lark(converted_grammar, start='csscolor', parser='earley_nolex')
-    print(l.parse('#a199ff').pretty())
-    print(l.parse('rgb(255, 70%, 3)').pretty())
-
-def main():
-    try:
-        nearley_lib = sys.argv[1]
-    except IndexError:
-        print("Reads Nearley grammar from stdin and outputs a lark grammar.")
-        print("Usage: %s <nearley_lib_path>" % sys.argv[0])
-        return
-
-    grammar = sys.stdin.read()
-    print(nearley_to_lark(grammar, os.path.join(nearley_lib, 'builtin')))
-
-if __name__ == '__main__':
-    main()
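For the record, the deleted converter ran as a stdin-to-stdout filter, per its main(). A hypothetical invocation, assuming the script was saved as nearley.py (the filename and paths are illustrative):

    python nearley.py /path/to/nearley < grammar.ne > grammar.lark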