
Better support for scanless parsing

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
parent commit c9b45823ed
7 changed files with 82 additions and 35 deletions
  1. examples/conf.py           +5   -6
  2. examples/conf_nolex.py    +12  -16
  3. lark/grammars/common.g     +4   -0
  4. lark/lark.py              +10   -8
  5. lark/load_grammar.py      +22   -2
  6. lark/parser_frontends.py  +19   -3
  7. tests/test_parser.py      +10   -0

examples/conf.py  (+5 -6)

@@ -19,13 +19,12 @@ parser = Lark(r"""
         start: _NL? section+
         section: "[" NAME "]" _NL item+
         item: NAME "=" VALUE _NL
-        NAME: /[a-zA-Z_]\w*/
-        VALUE: /.*/
+        VALUE: /./*

-        _NL: /(\r?\n)+/
-
-        %ignore /[\t \f]+/
-        %ignore /\#[^\n]*/
+        %import common.CNAME -> NAME
+        %import common.NEWLINE -> _NL
+        %import common.WS_INLINE
+        %ignore WS_INLINE
     """, parser="lalr", lexer="contextual")






examples/conf_nolex.py  (+12 -16)

@@ -12,25 +12,21 @@
 # See examples/conf.py for an example of that approach.
 #

-from lark import Lark, Transformer
+from lark import Lark

 parser = Lark(r"""
-        start: _nl? section+
-        section: "[" name "]" _nl item+
-        item: name "=" value _nl
-        name: /[a-zA-Z_]/ /\w/*
-        value: /./+
-        _nl: (_CR? _LF)+
-        _CR : /\r/
-        _LF : /\n/
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+        VALUE: /./*
+
+        %import common.CNAME -> NAME
+        %import common.NEWLINE -> _NL
+        %import common.WS_INLINE
+        %ignore WS_INLINE
     """, lexer=None)

-class RestoreTokens(Transformer):
-    value = ''.join
-    name = ''.join
-
 def test():
     sample_conf = """
 [bla]
@@ -40,7 +36,7 @@ this="that",4
 """

     r = parser.parse(sample_conf)
-    print(RestoreTokens().transform(r).pretty())
+    print r.pretty()

 if __name__ == '__main__':
     test()
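
A hedged run-through of the updated scanless example: with lexer=None, the terminals are matched by Earley character by character and rejoined behind the scenes (see lark/parser_frontends.py below), so the user-level RestoreTokens transformer is no longer needed. Self-contained sketch, reusing the grammar and sample from the diff:

    from lark import Lark

    parser = Lark(r"""
        start: _NL? section+
        section: "[" NAME "]" _NL item+
        item: NAME "=" VALUE _NL
        VALUE: /./*

        %import common.CNAME -> NAME
        %import common.NEWLINE -> _NL
        %import common.WS_INLINE
        %ignore WS_INLINE
    """, lexer=None)

    # Sample lines taken from the example's own test function.
    print(parser.parse('\n[bla]\nthis="that",4\n').pretty())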

lark/grammars/common.g  (+4 -0)

@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
 WS_INLINE: (" "|/\t/)+
 WS: /[ \t\f\r\n]/+
+
+CR : /\r/
+LF : /\n/
+NEWLINE: (CR? LF)+
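
The new NEWLINE terminal is what both examples import as _NL. A minimal sketch of using it directly (hypothetical grammar, assuming this commit's API):

    from lark import Lark

    # NEWLINE, defined above as (CR? LF)+, absorbs one or more line
    # breaks, Unix or Windows style.
    parser = Lark(r"""
        start: line+
        line: WORD _NL
        WORD: /\w+/
        %import common.NEWLINE -> _NL
    """, parser="lalr")

    print(parser.parse("hello\r\nworld\n").pretty())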


lark/lark.py  (+10 -8)

@@ -119,21 +119,23 @@ class Lark:
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None

+        lexer = self.options.lexer
+        if lexer == 'auto':
+            if self.options.parser == 'lalr':
+                lexer = 'standard'
+            elif self.options.parser == 'earley':
+                lexer = 'standard'
+        self.options.lexer = lexer
+
         self.grammar = load_grammar(grammar)
-        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
+        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
         self.ignore_tokens = self.grammar.extra['ignore']

         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)

-        if self.options.lexer == 'auto':
-            if self.options.parser == 'lalr':
-                self.options.lexer = 'standard'
-            elif self.options.parser == 'earley':
-                self.options.lexer = 'standard'
-
         if self.options.parser:
             self.parser = self._build_parser()
-        elif self.options.lexer:
+        elif lexer:
             self.lexer = self._build_lexer()

         if self.profiler: self.profiler.enter_section('outside_lark')
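
The net effect: 'auto' still resolves to the standard lexer for both parsers, but the resolution now happens before load_grammar, so an explicit lexer=None flows through to compile(lexer=False) and triggers the scanless path. A standalone sketch of just that resolution logic:

    # Mirrors the branch added above; not lark's actual function.
    def resolve_lexer(lexer, parser):
        if lexer == 'auto':
            if parser == 'lalr':
                lexer = 'standard'
            elif parser == 'earley':
                lexer = 'standard'
        return lexer

    assert resolve_lexer('auto', 'lalr') == 'standard'
    assert resolve_lexer('auto', 'earley') == 'standard'
    assert resolve_lexer(None, 'earley') is None   # scanless: compile(lexer=False)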


lark/load_grammar.py  (+22 -2)

@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
         self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0

+    def range(self, start, end):
+        assert start.type == end.type == 'STRING'
+        start = start.value[1:-1]
+        end = end.value[1:-1]
+        assert len(start) == len(end) == 1
+        regexp = '/[%s-%s]/' % (start, end)
+        t = Token('REGEXP', regexp)
+        return self.tokenvalue(t)
+
     def tokenvalue(self, token):
         value = token.value[1:-1]
         if token.type == 'STRING':
@@ -325,8 +334,19 @@ class Grammar:
         self.extra = extra

     def compile(self, lexer=False):
-        assert lexer
-
+        # assert lexer
+        if not lexer:
+            self.rule_defs += self.token_defs
+            self.token_defs = []
+
+            for name, tree in self.rule_defs:
+                for tokenvalue in tree.find_data('tokenvalue'):
+                    value ,= tokenvalue.children
+                    if value.type == 'STRING':
+                        assert value[0] == value[-1] == '"'
+                        if len(value)>3:
+                            tokenvalue.data = 'expansion'
+                            tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
+
         tokendefs = list(self.token_defs)

 # =================
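
A standalone sketch (not lark's real classes) of the string-splitting compile() now performs when lexer is false: token definitions are merged into the rules, and any quoted literal longer than one character is rewritten as an expansion of single-character literals, which the character-level Earley parser can match. The new range() handler does the analogous job for "a".."z" ranges, turning them into a /[a-z]/ regexp.

    def split_string_token(value):
        # Mirrors the len(value) > 3 branch above: '"abc"' has more than
        # one character inside the quotes, so it gets split.
        assert value[0] == value[-1] == '"'
        if len(value) > 3:
            return ['"%s"' % ch for ch in value[1:-1]]
        return [value]

    print(split_string_token('"abc"'))  # ['"a"', '"b"', '"c"']
    print(split_string_token('"a"'))    # ['"a"'] - left as a single token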


lark/parser_frontends.py  (+19 -3)

@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
 from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, nearley
 from .parsers.grammar_analysis import Rule
+from .tree import Transformer

 class WithLexer:
     def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:

 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
+        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
+        rules = []
+        for name, exp, alias in parser_conf.rules:
+            name = self.tokens_to_convert.get(name, name)
+            exp = [self.tokens_to_convert.get(x, x) for x in exp]
+            rules.append((name, exp, alias))
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}

-        rules = [(n, list(self._prepare_expansion(x)), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]

         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

@@ -142,7 +149,16 @@ class Earley_NoLex:
     def parse(self, text):
         res = self.parser.parse(text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        return res[0]
+        res = res[0]
+
+        class RestoreTokens(Transformer):
+            pass
+
+        for t in self.tokens_to_convert:
+            setattr(RestoreTokens, t, ''.join)
+
+        res = RestoreTokens().transform(res)
+        return res

 def get_frontend(parser, lexer):
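
The RestoreTokens trick above works because ''.join is a builtin bound method: builtins don't participate in the descriptor protocol, so stashing one on a class and looking it up on an instance returns it unchanged, and the Transformer then calls it with the list of matched characters. A standalone sketch with hypothetical rule names:

    class RestoreTokens(object):
        pass

    # Hypothetical names; the real keys come from tokens_to_convert.
    for rule_name in ('A', 'c'):
        setattr(RestoreTokens, rule_name, ''.join)

    r = RestoreTokens()
    print(r.A(['a', 'a', 'a', 'a']))  # 'aaaa' - characters joined back together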


tests/test_parser.py  (+10 -0)

@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
         l2 = g.parse('(a,b,c,*x)')
         assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

+    def test_earley_nolex(self):
+        g = Lark("""start: A "b" c
+                    A: "a"+
+                    c: "abc"
+                    """, parser="earley", lexer=None)
+        x = g.parse('aaaababc')
+
 class TestEarley(unittest.TestCase):
     pass

 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
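
A note on what the new test exercises (a hedged reading, not part of the commit): with lexer=None the terminals A and c are parsed character by character and rejoined, so the input must decompose exactly as the rule sequence dictates.

    # start: A "b" c   with   A: "a"+   and   c: "abc"
    text = 'aaaababc'
    assert text == 'aaaa' + 'b' + 'abc'   # A='aaaa', literal 'b', c='abc'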

