diff --git a/examples/conf.py b/examples/conf.py
index 13b928b..4a6f388 100644
--- a/examples/conf.py
+++ b/examples/conf.py
@@ -19,13 +19,12 @@
 parser = Lark(r"""
         start: _NL? section+
         section: "[" NAME "]" _NL item+
         item: NAME "=" VALUE _NL
-        NAME: /[a-zA-Z_]\w*/
-        VALUE: /.*/
+        VALUE: /./*
 
+        %import common.CNAME -> NAME
+        %import common.NEWLINE -> _NL
-        _NL: /(\r?\n)+/
-
-        %ignore /[\t \f]+/
-        %ignore /\#[^\n]*/
+        %import common.WS_INLINE
+        %ignore WS_INLINE
     """, parser="lalr", lexer="contextual")
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
index 2b0f6d9..02a82a6 100644
--- a/examples/conf_nolex.py
+++ b/examples/conf_nolex.py
@@ -12,25 +12,21 @@
 # See examples/conf.py for an example of that approach.
 #
 
-from lark import Lark, Transformer
+
+from lark import Lark
 
 parser = Lark(r"""
-        start: _nl? section+
-        section: "[" name "]" _nl item+
-        item: name "=" value _nl
-        name: /[a-zA-Z_]/ /\w/*
-        value: /./+
-        _nl: (_CR? _LF)+
-
-        _CR : /\r/
-        _LF : /\n/
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+        VALUE: /./*
+        %import common.CNAME -> NAME
+        %import common.NEWLINE -> _NL
+
+        %import common.WS_INLINE
+        %ignore WS_INLINE
     """, lexer=None)
 
-class RestoreTokens(Transformer):
-    value = ''.join
-    name = ''.join
-
-
 def test():
     sample_conf = """
 [bla]
@@ -40,7 +36,7 @@
 this="that",4
 """
     r = parser.parse(sample_conf)
-    print(RestoreTokens().transform(r).pretty())
+    print r.pretty()
 
 if __name__ == '__main__':
     test()
diff --git a/lark/grammars/common.g b/lark/grammars/common.g
index 3db6ec1..c7b2733 100644
--- a/lark/grammars/common.g
+++ b/lark/grammars/common.g
@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
 
 WS_INLINE: (" "|/\t/)+
 WS: /[ \t\f\r\n]/+
+CR : /\r/
+LF : /\n/
+NEWLINE: (CR? LF)+
+
diff --git a/lark/lark.py b/lark/lark.py
index 67aeb96..3fb4d52 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -119,21 +119,23 @@ class Lark:
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None
 
+        lexer = self.options.lexer
+        if lexer == 'auto':
+            if self.options.parser == 'lalr':
+                lexer = 'standard'
+            elif self.options.parser == 'earley':
+                lexer = 'standard'
+        self.options.lexer = lexer
+
         self.grammar = load_grammar(grammar)
-        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
+        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
         self.ignore_tokens = self.grammar.extra['ignore']
 
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
 
-        if self.options.lexer == 'auto':
-            if self.options.parser == 'lalr':
-                self.options.lexer = 'standard'
-            elif self.options.parser == 'earley':
-                self.options.lexer = 'standard'
-
         if self.options.parser:
             self.parser = self._build_parser()
-        elif self.options.lexer:
+        elif lexer:
             self.lexer = self._build_lexer()
 
         if self.profiler: self.profiler.enter_section('outside_lark')
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index f210e3d..9ab4cac 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
         self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0
 
+    def range(self, start, end):
+        assert start.type == end.type == 'STRING'
+        start = start.value[1:-1]
+        end = end.value[1:-1]
+        assert len(start) == len(end) == 1
+        regexp = '/[%s-%s]/' % (start, end)
+        t = Token('REGEXP', regexp)
+        return self.tokenvalue(t)
+
     def tokenvalue(self, token):
         value = token.value[1:-1]
         if token.type == 'STRING':
@@ -325,8 +334,19 @@ class Grammar:
         self.extra = extra
 
     def compile(self, lexer=False):
-        assert lexer
-
+        # assert lexer
+        if not lexer:
+            self.rule_defs += self.token_defs
+            self.token_defs = []
+
+            for name, tree in self.rule_defs:
+                for tokenvalue in tree.find_data('tokenvalue'):
+                    value ,= tokenvalue.children
+                    if value.type == 'STRING':
+                        assert value[0] == value[-1] == '"'
+                        if len(value)>3:
+                            tokenvalue.data = 'expansion'
+                            tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
         tokendefs = list(self.token_defs)
 
         # =================
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index dfe35e8..5007375 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
 from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, nearley
 from .parsers.grammar_analysis import Rule
+from .tree import Transformer
 
 class WithLexer:
     def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:
 
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
+        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
+        rules = []
+        for name, exp, alias in parser_conf.rules:
+            name = self.tokens_to_convert.get(name, name)
+            exp = [self.tokens_to_convert.get(x, x) for x in exp]
+            rules.append((name, exp, alias))
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]
 
         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
 
@@ -142,7 +149,16 @@
     def parse(self, text):
         res = self.parser.parse(text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        return res[0]
+        res = res[0]
+
+        class RestoreTokens(Transformer):
+            pass
+
+        for t in self.tokens_to_convert:
+            setattr(RestoreTokens, t, ''.join)
+
+        res = RestoreTokens().transform(res)
+        return res
 
 
 def get_frontend(parser, lexer):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1ab8cfe..5e2f34e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
         l2 = g.parse('(a,b,c,*x)')
         assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
 
+
+    def test_earley_nolex(self):
+        g = Lark("""start: A "b" c
+                    A: "a"+
+                    c: "abc"
+                    """, parser="earley", lexer=None)
+        x = g.parse('aaaababc')
+
+
 class TestEarley(unittest.TestCase):
     pass
 
+
 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
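
Note: the heart of this patch is the scanless ("nolex") mode. When no lexer is configured,
Grammar.compile() moves the token definitions into the rule definitions and splits every
multi-character string literal into single-character tokenvalues, so the Earley parser can match
the input one character at a time; Earley_NoLex then builds a RestoreTokens transformer that
joins those per-character matches back into whole token strings. The standalone sketch below
illustrates those two steps outside of lark; the helpers expand_literal and restore_token are
invented for illustration and are not part of lark's API.

    # Minimal sketch of the scanless convert/restore idea (hypothetical helpers, not lark code).

    def expand_literal(value):
        # Split a quoted literal such as '"abc"' into single-character literals,
        # mirroring what compile(lexer=False) does to STRING tokenvalues.
        assert value[0] == value[-1] == '"'
        return ['"%s"' % ch for ch in value[1:-1]]

    def restore_token(chars):
        # Rejoin the per-character matches of a converted token, the same way the
        # dynamically built RestoreTokens transformer does with ''.join.
        return ''.join(chars)

    if __name__ == '__main__':
        print(expand_literal('"abc"'))         # ['"a"', '"b"', '"c"']
        print(restore_token(['a', 'b', 'c']))  # abc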