
Better support for scanless parsing

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan · 7 years ago
commit c9b45823ed
7 changed files with 82 additions and 35 deletions
  1. examples/conf.py (+5, -6)
  2. examples/conf_nolex.py (+12, -16)
  3. lark/grammars/common.g (+4, -0)
  4. lark/lark.py (+10, -8)
  5. lark/load_grammar.py (+22, -2)
  6. lark/parser_frontends.py (+19, -3)
  7. tests/test_parser.py (+10, -0)

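In short, this commit improves Lark's scanless mode: with lexer=None, terminal definitions are compiled into the grammar itself and the Earley parser consumes the input character by character. A minimal usage sketch, adapted from the new test added in tests/test_parser.py below (the exact tree shape depends on this version of the library):

    from lark import Lark

    # Scanless parsing: no lexer is built; terminals are expanded into rules
    # and matched character by character by the Earley parser.
    parser = Lark("""start: A "b" c
                     A: "a"+
                     c: "abc"
                     """, parser="earley", lexer=None)

    tree = parser.parse('aaaababc')
    print(tree.pretty())
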
examples/conf.py (+5, -6)

@@ -19,13 +19,12 @@ parser = Lark(r"""
 start: _NL? section+
 section: "[" NAME "]" _NL item+
 item: NAME "=" VALUE _NL
-NAME: /[a-zA-Z_]\w*/
-VALUE: /.*/
+VALUE: /./*
+%import common.CNAME -> NAME
+%import common.NEWLINE -> _NL
 
-_NL: /(\r?\n)+/
-
-%ignore /[\t \f]+/
-%ignore /\#[^\n]*/
+%import common.WS_INLINE
+%ignore WS_INLINE
 """, parser="lalr", lexer="contextual")




examples/conf_nolex.py (+12, -16)

@@ -12,25 +12,21 @@
 # See examples/conf.py for an example of that approach.
 #
 
-from lark import Lark, Transformer
+from lark import Lark
 
 parser = Lark(r"""
-start: _nl? section+
-section: "[" name "]" _nl item+
-item: name "=" value _nl
-name: /[a-zA-Z_]/ /\w/*
-value: /./+
-_nl: (_CR? _LF)+
-_CR : /\r/
-_LF : /\n/
+start: _NL? section+
+section: "[" NAME "]" _NL item+
+item: NAME "=" VALUE _NL
+VALUE: /./*
+%import common.CNAME -> NAME
+%import common.NEWLINE -> _NL
+%import common.WS_INLINE
+%ignore WS_INLINE
 """, lexer=None)
 
-class RestoreTokens(Transformer):
-    value = ''.join
-    name = ''.join
-
 
 def test():
     sample_conf = """
 [bla]
@@ -40,7 +36,7 @@ this="that",4
 """
 
     r = parser.parse(sample_conf)
-    print(RestoreTokens().transform(r).pretty())
+    print r.pretty()
 
 if __name__ == '__main__':
     test()

lark/grammars/common.g (+4, -0)

@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
 WS_INLINE: (" "|/\t/)+
 WS: /[ \t\f\r\n]/+
 
+CR : /\r/
+LF : /\n/
+NEWLINE: (CR? LF)+
+

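The new NEWLINE terminal (with its CR/LF building blocks) is meant to be pulled in with %import, as examples/conf.py above now does. An illustrative sketch (hypothetical grammar, may need tweaks for the Lark version of this commit):

    from lark import Lark

    parser = Lark(r"""
        start: (NAME _NL)+
        %import common.CNAME -> NAME
        %import common.NEWLINE -> _NL
    """, parser="lalr")

    print(parser.parse("foo\nbar\n").pretty())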

lark/lark.py (+10, -8)

@@ -119,21 +119,23 @@ class Lark:
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None
 
+        lexer = self.options.lexer
+        if lexer == 'auto':
+            if self.options.parser == 'lalr':
+                lexer = 'standard'
+            elif self.options.parser == 'earley':
+                lexer = 'standard'
+        self.options.lexer = lexer
+
         self.grammar = load_grammar(grammar)
-        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
+        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
         self.ignore_tokens = self.grammar.extra['ignore']
 
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
 
-        if self.options.lexer == 'auto':
-            if self.options.parser == 'lalr':
-                self.options.lexer = 'standard'
-            elif self.options.parser == 'earley':
-                self.options.lexer = 'standard'
-
         if self.options.parser:
             self.parser = self._build_parser()
-        elif self.options.lexer:
+        elif lexer:
             self.lexer = self._build_lexer()
 
         if self.profiler: self.profiler.enter_section('outside_lark')

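The option handling above boils down to: resolve lexer='auto' to a concrete lexer first, then tell Grammar.compile() whether any lexer will exist at all (False selects the scanless path). A standalone sketch of that resolution step (hypothetical helper, not part of the diff):

    def resolve_lexer(parser, lexer):
        # Mirrors the 'auto' handling added to Lark.__init__ above.
        if lexer == 'auto':
            if parser in ('lalr', 'earley'):
                lexer = 'standard'
        return lexer

    assert resolve_lexer('lalr', 'auto') == 'standard'
    assert resolve_lexer('earley', None) is None  # scanless: grammar.compile(lexer=False)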

lark/load_grammar.py (+22, -2)

@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
         self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0
 
+    def range(self, start, end):
+        assert start.type == end.type == 'STRING'
+        start = start.value[1:-1]
+        end = end.value[1:-1]
+        assert len(start) == len(end) == 1
+        regexp = '/[%s-%s]/' % (start, end)
+        t = Token('REGEXP', regexp)
+        return self.tokenvalue(t)
+
     def tokenvalue(self, token):
         value = token.value[1:-1]
         if token.type == 'STRING':
@@ -325,8 +334,19 @@ class Grammar:
         self.extra = extra
 
     def compile(self, lexer=False):
-        assert lexer
-
+        # assert lexer
+        if not lexer:
+            self.rule_defs += self.token_defs
+            self.token_defs = []
+
+            for name, tree in self.rule_defs:
+                for tokenvalue in tree.find_data('tokenvalue'):
+                    value ,= tokenvalue.children
+                    if value.type == 'STRING':
+                        assert value[0] == value[-1] == '"'
+                        if len(value)>3:
+                            tokenvalue.data = 'expansion'
+                            tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
         tokendefs = list(self.token_defs)
 
         # =================

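The heart of the scanless path in compile() is splitting multi-character string literals into one tokenvalue per character, so they can be matched without a lexer. A standalone sketch of that splitting step (hypothetical helper mirroring the list comprehension above):

    def split_string_literal(value):
        # '"abc"' -> ['"a"', '"b"', '"c"']. Single-character strings are left
        # alone by the code above (the len(value) > 3 check counts the quotes).
        assert value[0] == value[-1] == '"'
        return ['"%s"' % ch for ch in value[1:-1]]

    assert split_string_literal('"abc"') == ['"a"', '"b"', '"c"']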

lark/parser_frontends.py (+19, -3)

@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
 from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, nearley
 from .parsers.grammar_analysis import Rule
+from .tree import Transformer
 
 class WithLexer:
     def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:
 
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
+        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
+        rules = []
+        for name, exp, alias in parser_conf.rules:
+            name = self.tokens_to_convert.get(name, name)
+            exp = [self.tokens_to_convert.get(x, x) for x in exp]
+            rules.append((name, exp, alias))
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]
 
         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
 
@@ -142,7 +149,16 @@
     def parse(self, text):
         res = self.parser.parse(text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        return res[0]
+        res = res[0]
+
+        class RestoreTokens(Transformer):
+            pass
+
+        for t in self.tokens_to_convert:
+            setattr(RestoreTokens, t, ''.join)
+
+        res = RestoreTokens().transform(res)
+        return res
 
 
 def get_frontend(parser, lexer):

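Earley_NoLex now renames every terminal that was compiled into a rule to a '__token_'-prefixed name, parses without a lexer, and then joins the per-character children of those rules back into plain strings via the dynamically built RestoreTokens transformer. A standalone sketch of the renaming step (hypothetical helper using the same mapping as the __init__ code above):

    def rename_terminal_rules(rules, is_terminal):
        # Terminals that became rules get a '__token_' prefix, both where they
        # are defined and wherever they appear inside an expansion.
        mapping = {name: '__token_' + name for name, _exp, _alias in rules if is_terminal(name)}
        renamed = [(mapping.get(name, name),
                    [mapping.get(sym, sym) for sym in exp],
                    alias)
                   for name, exp, alias in rules]
        return mapping, renamed

    mapping, rules = rename_terminal_rules(
        [('A', ['a'], None), ('start', ['A', 'b', 'c'], None)],
        is_terminal=lambda name: name.isupper())
    assert mapping == {'A': '__token_A'}
    assert rules == [('__token_A', ['a'], None), ('start', ['__token_A', 'b', 'c'], None)]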

tests/test_parser.py (+10, -0)

@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
         l2 = g.parse('(a,b,c,*x)')
         assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
 
+
+    def test_earley_nolex(self):
+        g = Lark("""start: A "b" c
+                    A: "a"+
+                    c: "abc"
+                    """, parser="earley", lexer=None)
+        x = g.parse('aaaababc')
+
+
 class TestEarley(unittest.TestCase):
     pass
 
 
 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)

