Added the '!' rule prefix, which keeps all tokens in a rule. Also removed the top-level tools directory (it now lives in lark/tools).

8 years ago · 0b902b1d03
--- a/examples/conf.py
+++ b/examples/conf.py
@@ -34,4 +34,4 @@ a=Hello
 this="that",4
 """

 print parser.parse(sample_conf).pretty()
 print(parser.parse(sample_conf).pretty())
--- a/lark/common.py
+++ b/lark/common.py
@@ -39,6 +39,7 @@ class LexerConf:

 class ParserConf:
    def __init__(self, rules, callback, start):
        assert all(len(r)==3 for r in rules)
        self.rules = rules
        self.callback = callback
        self.start = start
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -61,7 +61,7 @@ TOKENS = {
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
@@ -302,6 +302,8 @@ class GrammarLoader:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # =================
@@ -363,7 +365,7 @@ class GrammarLoader:
        used_symbols = {symbol for expansions in rules.values()
                               for expansion, _alias in expansions
                               for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        rule_set = {r.lstrip('!').lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -12,24 +12,25 @@ def create_expand1_tree_builder_function(tree_builder):
            return tree_builder(children)
    return expand1

 def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
        def _build_ast(match):
            children = []
            for i, to_expand in to_include:
                if to_expand:
                    children += match[i].children
                else:
                    children.append(match[i])
 def create_rule_handler(expansion, usermethod, keep_all_tokens):
    if not keep_all_tokens:
        to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                      if not (is_terminal(sym) and sym.startswith('_'))]

        if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
            def _build_ast(match):
                children = []
                for i, to_expand in to_include:
                    if to_expand:
                        children += match[i].children
                    else:
                        children.append(match[i])

            return usermethod(children)
    else:
        _build_ast = usermethod
                return usermethod(children)
            return _build_ast

    return _build_ast
    # else, if no filtering required..
    return usermethod


 class ParseTreeBuilder:
@@ -48,6 +49,11 @@ class ParseTreeBuilder:
        callback = Callback()
        new_rules = []
        for origin, expansions in rules.items():
            keep_all_tokens = False
            if origin.startswith('!'):
                origin=origin.lstrip('!')
                keep_all_tokens = True

            expand1 = origin.startswith('?')
            _origin = origin.lstrip('?')

@@ -69,7 +75,7 @@ class ParseTreeBuilder:
                        if expand1:
                            f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)
                alias_handler = create_rule_handler(expansion, f, keep_all_tokens)

                if hasattr(callback, _alias):
                    raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -22,6 +22,7 @@ class WithLexer:
 class LALR(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)
        self.parser_conf = parser_conf

        analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        analyzer.analyze()
@@ -95,7 +96,7 @@ class Earley_NoLex:
                regexp = self.token_by_name[sym].to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if not width == (1,1):
                    raise GrammarError('Dynamic lexing requires all tokens have the width 1 (%s is %s)' % (regexp, width))
                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
                yield sym, re.compile(regexp)
            else:
                yield sym
--- a/tools/init.py
+++ b/tools/init.py
--- a/tools/nearley.py
+++ b/tools/nearley.py
@@ -1,150 +0,0 @@
 "Converts between Lark and Nearley grammars. Work in progress!"

 import os.path
 import sys

 from lark import Lark, InlineTransformer

 # Lark grammar used by nearley_to_lark() to parse Nearley grammar text.
 # It recognises "@" directives, "name -> expansions" rule definitions
 # (the NAME REGEXP "->" form is tagged as a macro), ":"-suffixed operators
 # (+, *, ?) on items, and "{% ... %}" JavaScript blocks via the _JS token.
 nearley_grammar = r"""
    start: (ruledef|directive)+

    directive: "@" NAME (STRING|NAME)
             | "@" _JS  -> js_code
    ruledef: NAME "->" expansions
           | NAME REGEXP "->" expansions -> macro
    expansions: expansion ("|" expansion)*

    expansion: expr+ _JS?

    ?expr: item [":" /[+*?]/]

    ?item: rule|string|regexp
         | "(" expansions ")"

    rule: NAME
    string: STRING
    regexp: REGEXP
    _JS: /(?s){%.*?%}/

    NAME: /[a-zA-Z_$]\w*/
    WS.ignore: /[\t \f\n]+/
    COMMENT.ignore: /\#[^\n]*/
    REGEXP: /\[.*?\]/
    STRING: /".*?"/

    """



 class NearleyToLark(InlineTransformer):
    """Render a parsed Nearley grammar tree as Lark grammar text.

    Method names mirror the rule names and aliases of ``nearley_grammar``.
    Presumably lark's ``InlineTransformer`` dispatches each tree node to the
    method of the same name, passing the node's children positionally
    (verify against the lark version in use). Every handler returns a string.
    """

    def __init__(self, builtin_path):
        # Directory in which files named by "@builtin" directives are opened.
        self.builtin_path = builtin_path

    def rule(self, name):
        """Return the Lark name for a rule reference, renaming ``_`` and ``__``."""
        whitespace_aliases = {'_': '_ws_maybe', '__':'_ws'}
        if name in whitespace_aliases:
            return whitespace_aliases[name]
        return name

    def ruledef(self, name, exps):
        """Format a rule definition as ``name: expansions``."""
        whitespace_aliases = {'_': '_ws_maybe', '__':'_ws'}
        lark_name = whitespace_aliases.get(name, name)
        return '%s: %s' % (lark_name, exps)

    def expr(self, item, op):
        """Parenthesise ``item`` and append its repetition operator."""
        pieces = (item, op)
        return '(%s)%s' % pieces

    def regexp(self, r):
        """Wrap the matched text in ``/.../``."""
        return '/%s/' % r

    def string(self, s):
        """Split a quoted string into one quoted literal per character."""
        # TODO allow regular strings, and split them in the parser frontend
        inner = s[1:-1]  # drop the surrounding quote characters
        quoted_chars = ['"%s"' % ch for ch in inner]
        return ' '.join(quoted_chars)

    def expansion(self, *x):
        """Join the items of one alternative with single spaces."""
        return ' '.join(x)

    def expansions(self, *x):
        """Join alternatives with ``|`` on separate lines, inside parentheses."""
        alternatives = '\n    |'.join(x)
        return '(%s)' % alternatives

    def js_code(self):
        """Drop a standalone JavaScript block."""
        return ''

    def macro(self, *args):
        """Drop macro definitions."""
        return ''   # TODO support macros?!

    def directive(self, name, *args):
        """Handle an ``@`` directive.

        ``preprocessor`` yields an empty string; ``builtin`` reads the named
        file from ``self.builtin_path`` and returns its converted text.
        Any other directive name raises ``Exception``.
        """
        if name == 'preprocessor':
            return ''
        if name != 'builtin':
            raise Exception('Unknown directive: %s' % name)

        filename = args[0][1:-1]  # strip the first and last character (the quotes of a STRING)
        with open(os.path.join(self.builtin_path, filename)) as f:
            text = f.read()
        return nearley_to_lark(text, self.builtin_path)

    def start(self, *rules):
        """Join the non-empty converted rules, one per line."""
        return '\n'.join(converted for converted in rules if converted)

 def nearley_to_lark(g, builtin_path):
    """Convert Nearley grammar text ``g`` to Lark grammar text.

    ``builtin_path`` is handed to ``NearleyToLark``, which uses it to locate
    files named by ``@builtin`` directives.
    """
    tree = Lark(nearley_grammar).parse(g)
    converter = NearleyToLark(builtin_path)
    return converter.transform(tree)


 def test():
    """Manual smoke test: convert a sample Nearley grammar and parse with it.

    Converts the CSS colour grammar below, prints the resulting Lark grammar,
    builds a parser from it and prints the parse trees of two sample inputs.
    """
    css_example_grammar = """
 # http://www.w3.org/TR/css3-color/#colorunits

    @builtin "whitespace.ne"
    @builtin "number.ne"
    @builtin "postprocessors.ne"

    csscolor -> "#" hexdigit hexdigit hexdigit hexdigit hexdigit hexdigit {%
        function(d) {
            return {
                "r": parseInt(d[1]+d[2], 16),
                "g": parseInt(d[3]+d[4], 16),
                "b": parseInt(d[5]+d[6], 16),
            }
        }
    %}
              | "#" hexdigit hexdigit hexdigit {%
        function(d) {
            return {
                "r": parseInt(d[1]+d[1], 16),
                "g": parseInt(d[2]+d[2], 16),
                "b": parseInt(d[3]+d[3], 16),
            }
        }
    %}
              | "rgb"  _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"r": 4, "g": 8, "b": 12}) %}
              | "hsl"  _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ ")" {% $({"h": 4, "s": 8, "l": 12}) %}
              | "rgba" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"r": 4, "g": 8, "b": 12, "a": 16}) %}
              | "hsla" _ "(" _ colnum _ "," _ colnum _ "," _ colnum _ "," _ decimal _ ")" {% $({"h": 4, "s": 8, "l": 12, "a": 16}) %}

    hexdigit -> [a-fA-F0-9]
    colnum -> unsigned_int {% id %} | percentage {%
        function(d) {return Math.floor(d[0]*255); }
    %}
    """
    # NOTE(review): the builtin directory is a hard-coded, machine-specific
    # path; the @builtin directives above are read from it, so this only runs
    # where that directory exists.
    converted_grammar = nearley_to_lark(css_example_grammar, '/home/erez/nearley/builtin')
    print(converted_grammar)

    # Build a parser from the converted grammar, starting at the csscolor rule.
    l = Lark(converted_grammar, start='csscolor', parser='earley_nolex')
    print(l.parse('#a199ff').pretty())
    print(l.parse('rgb(255, 70%, 3)').pretty())


 def main():
    """Command-line entry point.

    Expects the Nearley library directory as the first argument. Without it,
    prints a usage message and returns. Otherwise reads a Nearley grammar
    from stdin and prints the converted Lark grammar.
    """
    if len(sys.argv) < 2:
        print("Reads Nearley grammar from stdin and outputs a lark grammar.")
        print("Usage: %s <nearley_lib_path>" % sys.argv[0])
        return

    nearley_lib = sys.argv[1]
    source = sys.stdin.read()
    builtin_dir = os.path.join(nearley_lib, 'builtin')
    print(nearley_to_lark(source, builtin_dir))


 # Run the command-line converter only when executed as a script.
 if __name__ == '__main__':
    main()