From da15f99edb23a916cc2c1eca87a87c653c4654f7 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 14 Feb 2017 13:12:00 +0200 Subject: [PATCH] Added the earley_nolex frontend, and a conf_nolex example to use it --- examples/calc.py | 4 ++++ examples/conf_nolex.py | 42 +++++++++++++++++++++++++++++++++++++++ examples/indented_tree.py | 14 ++++++++----- examples/json_parser.py | 6 ++++++ lark/parser_frontends.py | 13 ++++++++---- lark/parsers/earley.py | 6 ++++-- 6 files changed, 74 insertions(+), 11 deletions(-) create mode 100644 examples/conf_nolex.py diff --git a/examples/calc.py b/examples/calc.py index 02574a8..dc936cf 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -1,3 +1,7 @@ +# +# This example shows how to write a basic calculator with variables. +# + from lark import Lark, InlineTransformer calc_grammar = """ diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py new file mode 100644 index 0000000..6ae7340 --- /dev/null +++ b/examples/conf_nolex.py @@ -0,0 +1,42 @@ +# +# This example demonstrates lex-less parsing using the earley_nolex frontend +# +# Using a lexer for configuration files is tricky, because values don't +# have to be surrounded by delimiters. +# In this example we skip lexing and let the Earley parser resolve the ambiguity. +# +# Future versions of lark will make it easier to write these kinds of grammars. +# + +from lark import Lark, Transformer + +parser = Lark(r""" + start: _nl? section+ + section: "[" name "]" _nl item+ + item: name "=" value _nl + name: /[a-zA-Z_]/ /\w/* + value: /./+ + _nl: (_CR? 
_LF)+ + + _CR : /\r/ + _LF : /\n/ + """, parser="earley_nolex") + +class RestoreTokens(Transformer): + value = ''.join + name = ''.join + + +def test(): + sample_conf = """ +[bla] + +a=Hello +this="that",4 +""" + + r = parser.parse(sample_conf) + print(RestoreTokens().transform(r).pretty()) + +if __name__ == '__main__': + test() diff --git a/examples/indented_tree.py b/examples/indented_tree.py index 1a0a202..dc42086 100644 --- a/examples/indented_tree.py +++ b/examples/indented_tree.py @@ -1,8 +1,12 @@ -"""This example demonstrates usage of the Indenter class. - -Since indentation is context-sensitive, a postlex stage is introduced to manufacture INDENT/DEDENT tokens. -It is crucial for the indenter that the NL_type matches the spaces (and tabs) after the newline. -""" +# +# This example demonstrates usage of the Indenter class. +# +# Since indentation is context-sensitive, a postlex stage is introduced to +# manufacture INDENT/DEDENT tokens. +# +# It is crucial for the indenter that the NL_type matches +# the spaces (and tabs) after the newline. +# from lark.lark import Lark from lark.indenter import Indenter diff --git a/examples/json_parser.py b/examples/json_parser.py index 2d520db..b29e7ab 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -1,3 +1,9 @@ +# +# This example shows how to write a basic JSON parser +# +# The code is short and clear, but has good performance. +# + import sys from lark import Lark, inline_args, Transformer diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index fad3ed5..af1275d 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,9 +1,10 @@ import re +import sre_parse from .lexer import Lexer from .parsers.lalr_analysis import GrammarAnalyzer -from .common import is_terminal +from .common import is_terminal, GrammarError from .parsers import lalr_parser, earley class WithLexer: @@ -54,7 +55,7 @@ class Earley(WithLexer): assert len(res) ==1 , 'Ambiguious Parse! 
Not handled yet' return res[0] -class Earley2: +class Earley_NoLex: def __init__(self, lexer_conf, parser_conf): self.token_by_name = {t.name:t for t in lexer_conf.tokens} @@ -68,7 +69,11 @@ class Earley2: def _prepare_expansion(self, expansion): for sym in expansion: if is_terminal(sym): - yield sym, re.compile(self.token_by_name[sym].to_regexp()) + regexp = self.token_by_name[sym].to_regexp() + width = sre_parse.parse(regexp).getwidth() + if not width == (1,1): + raise GrammarError('Dynamic lexing requires all tokens have the width 1 (%s is %s)' % (regexp, width)) + yield sym, re.compile(regexp) else: yield sym @@ -77,4 +82,4 @@ class Earley2: assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' return res[0] -ENGINE_DICT = { 'lalr': LALR, 'earley': Earley } +ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex } diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index b8fb2ab..b2a511e 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -43,9 +43,11 @@ class State(object): # PORT: originally tests regexp if self.expect_symbol[1] is not None: - match = self.expect_symbol[1].match(stream, pos) + match = self.expect_symbol[1].match(inp) + if match: + return self.next_state(inp) - if self.expect_symbol[0] == inp.type: + elif self.expect_symbol[0] == inp.type: return self.next_state(inp) def consume_nonterminal(self, inp):