
Merge branch 'superlexer2' (Contextual Lexer)

Erez Shinan, 7 years ago
Parent
Revision 137536b6a6
9 files changed, 158 additions and 24 deletions
  1. +37  -0   examples/conf.py
  2. +5   -1   examples/conf_nolex.py
  3. +2   -0   lark/common.py
  4. +5   -2   lark/indenter.py
  5. +62  -0   lark/lexer.py
  6. +25  -2   lark/parser_frontends.py
  7. +20  -16  lark/parsers/lalr_parser.py
  8. +1   -1   tests/__main__.py
  9. +1   -2   tests/test_parser.py

+37  -0   examples/conf.py

@@ -0,0 +1,37 @@
#
# This example demonstrates the power of the contextual lexer, by parsing a config file.
#
# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
# choose one over the other, which would lead to a (confusing) parse error.
# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
# which one of them to expect at each point during the parse.
# The lexer then only matches the tokens that the parser expects.
# The result is a correct parse, something that is impossible with a regular lexer.
#
# Another approach is to discard a lexer altogether and use the Earley algorithm.
# It will handle more cases than the contextual lexer, but at the cost of performance.
# See examples/conf_nolex.py for an example of that approach.
#

from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/

WS.ignore: /[\t \f]+/
COMMENT.ignore: /\#[^\n]*/
_NL: /(\r?\n)+/
""", parser="lalr_contextual_lexer")


sample_conf = """
[bla]
a=Hello
this="that",4
"""

print(parser.parse(sample_conf).pretty())
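To make the NAME/VALUE overlap concrete, here is a small standalone sketch (not part of the commit; plain re, hypothetical variable names). Both terminal patterns match the text after '=', which is exactly the ambiguity a context-free lexer cannot resolve and the contextual lexer resolves by asking the parser which terminal it expects:

import re

NAME = re.compile(r'[a-zA-Z_]\w*')
VALUE = re.compile(r'.*')

rhs = 'a=Hello'.split('=', 1)[1]

# Both terminals match 'Hello'. Only the LALR state knows that a VALUE is
# expected after '=', so the contextual lexer never even tries NAME there.
print(NAME.match(rhs).group(0))   # -> 'Hello'
print(VALUE.match(rhs).group(0))  # -> 'Hello'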

+5  -1   examples/conf_nolex.py

@@ -1,5 +1,5 @@
#
- # This example demonstrates lex-less parsing using the earley_nolex frontend
+ # This example demonstrates scanless parsing using the earley_nolex frontend
#
# Using a lexer for configuration files is tricky, because values don't
# have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
#
# Future versions of lark will make it easier to write these kinds of grammars.
#
# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1).
# See examples/conf.py for an example of that approach.
#

from lark import Lark, Transformer



+2  -0   lark/common.py

@@ -17,6 +17,8 @@ class UnexpectedToken(ParseError):
context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
except AttributeError:
context = seq[index:index+5]
except TypeError:
context = "<no context>"
message = ("Unexpected token %r at line %s, column %s.\n"
"Expected: %s\n"
"Context: %s" % (token, self.line, self.column, expected, context))


+5  -2   lark/indenter.py

@@ -26,7 +26,6 @@ class Indenter:

assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])


def process(self, stream):
for token in stream:
if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:

if token.type in self.OPEN_PAREN_types:
self.paren_level += 1
- if token.type in self.CLOSE_PAREN_types:
+ elif token.type in self.CLOSE_PAREN_types:
self.paren_level -= 1
assert self.paren_level >= 0

@@ -47,3 +46,7 @@ class Indenter:

assert self.indent_level == [0], self.indent_level

# XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
@property
def always_accept(self):
return (self.NL_type,)
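The always_accept hook is needed because the contextual lexer only matches terminals the parser currently expects, while a post-lexer such as Indenter must see every newline token to maintain its indent stack, even in states where the parser would not accept one. A hedged sketch of the contract a post-lexer fulfils (hypothetical class; the _NEWLINE name is only illustrative):

class MyPostLexer:
    # Terminals the contextual lexer must always try to match, regardless of
    # parser state, because process() needs them as raw material.
    always_accept = ('_NEWLINE',)

    def process(self, stream):
        for token in stream:
            # e.g. turn _NEWLINE runs into indent/dedent tokens, as Indenter does
            yield token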

+62  -0   lark/lexer.py

@@ -3,6 +3,7 @@
import re

from .utils import Str, classify
from .common import is_terminal

class LexError(Exception):
pass
@@ -169,3 +170,64 @@ class Lexer(object):
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break


class ContextualLexer:
def __init__(self, tokens, states, ignore=(), always_accept=()):
tokens_by_name = {}
for t in tokens:
assert t.name not in tokens_by_name
tokens_by_name[t.name] = t

lexer_by_tokens = {}
self.lexers = {}
for state, accepts in states.items():
key = frozenset(accepts)
try:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) # For python3
accepts |= set(ignore)
accepts |= set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
lexer = Lexer(state_tokens, ignore=ignore)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

self.root_lexer = Lexer(tokens, ignore=ignore)

def lex(self, stream, parser):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.root_lexer.newline_types)
ignore_types = list(self.root_lexer.ignore_types)
while True:
lexer = self.lexers[parser.state]
for mre, type_from_index in lexer.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t

if type_ in newline_types:
newlines = value.count(lexer.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(lexer.newline_char)
lex_pos += len(value)
break
else:
if lex_pos < len(stream):
print("Allowed tokens:", lexer.tokens)
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break
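For orientation, a standalone sketch (hypothetical state table, not lark code) of the constructor's sharing logic above: states maps each LALR state index to the names acceptable there, and states with identical accept-sets reuse a single sub-lexer via the frozenset key:

# Hypothetical accept-sets for three parser states of a conf-style grammar.
states = {
    0: ['NAME', '_NL'],   # start of an item: a key name may follow
    1: ['VALUE'],         # just shifted '=': only a VALUE can follow
    2: ['NAME', '_NL'],   # same accept-set as state 0
}

lexer_by_key = {}
for state, accepts in states.items():
    key = frozenset(accepts)
    # Stand-in for building one real Lexer(state_tokens, ignore=...) per key.
    lexer_by_key.setdefault(key, 'lexer for %s' % sorted(key))

print(len(lexer_by_key))   # -> 2: two sub-lexers serve three states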


+25  -2   lark/parser_frontends.py

@@ -1,7 +1,7 @@
import re
import sre_parse

- from .lexer import Lexer
+ from .lexer import Lexer, ContextualLexer
from .parsers.lalr_analysis import GrammarAnalyzer

from .common import is_terminal, GrammarError
@@ -31,6 +31,29 @@ class LALR(WithLexer):
tokens = list(self.lex(text))
return self.parser.parse(tokens)

class LALR_ContextualLexer:
def __init__(self, lexer_conf, parser_conf):
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf

self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
self.analyzer.analyze()

d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
always_accept=lexer_conf.postlex.always_accept
if lexer_conf.postlex else ())


def parse(self, text):
parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
tokens = self.lexer.lex(text, parser)
if self.lexer_conf.postlex:
tokens = self.lexer_conf.postlex.process(tokens)
return parser.parse(tokens, True)



class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf):
WithLexer.__init__(self, lexer_conf)
@@ -82,4 +105,4 @@ class Earley_NoLex:
assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
return res[0]

- ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
+ ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
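From the user's side the new frontend is reached through the string key just added to ENGINE_DICT, the same key examples/conf.py uses. A minimal hedged sketch with a made-up grammar, written in the 0.x-era syntax shown above (TOKEN.ignore rather than the later %ignore directive):

from lark import Lark

parser = Lark(r"""
start: WORD+
WORD: /[a-z]+/
WS.ignore: /[ \t]+/
""", parser="lalr_contextual_lexer")

print(parser.parse("hello world").pretty())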

+20  -16   lark/parsers/lalr_parser.py

@@ -7,13 +7,15 @@ class Parser(object):
self.analysis = analysis
self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
for rule in analysis.rules}
self.state = self.analysis.init_state_idx

- def parse(self, seq):
+ def parse(self, seq, set_state=False):
i = 0
stream = iter(seq)
states_idx = self.analysis.states_idx

state_stack = [self.analysis.init_state_idx]
value_stack = []
i = 0

def get_action(key):
state = state_stack[-1]
@@ -21,11 +23,6 @@ class Parser(object):
return states_idx[state][key]
except KeyError:
expected = states_idx[state].keys()
- try:
- token = seq[i]
- except IndexError:
- assert key == '$end'
- token = seq[-1]

raise UnexpectedToken(token, expected, seq, i)

@@ -48,15 +45,22 @@ class Parser(object):
value_stack.append(res)

# Main LALR-parser loop
- while i < len(seq):
- action, arg = get_action(seq[i].type)
-
- if action == ACTION_SHIFT:
- state_stack.append(arg)
- value_stack.append(seq[i])
- i+= 1
- else:
- reduce(*arg)
+ try:
+ token = next(stream)
+ i += 1
+ while True:
+ action, arg = get_action(token.type)
+
+ if action == ACTION_SHIFT:
+ state_stack.append(arg)
+ value_stack.append(token)
+ if set_state: self.state = arg
+ token = next(stream)
+ i += 1
+ else:
+ reduce(*arg)
+ except StopIteration:
+ pass

while True:
_action, rule = get_action('$end')
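The switch from indexing seq to pulling tokens from iter(seq) is what lets the lexer and parser run in lock-step: the contextual lexer's generator reads parser.state just before matching each token, so the parser has to publish every SHIFT target (via set_state) before requesting the next token. A toy stand-in for that feedback loop (not lark code):

class Consumer:
    def __init__(self):
        self.state = 0

    def run(self, tokens):
        for tok in tokens:
            self.state += 1     # analogous to `if set_state: self.state = arg`
            print(tok, 'consumed; state is now', self.state)

def producer(consumer, words):
    for w in words:
        # The real ContextualLexer picks self.lexers[parser.state] here.
        yield '%s@state%d' % (w, consumer.state)

c = Consumer()
c.run(producer(c, ['a', 'b', 'c']))
# Each token is produced while seeing the state left behind by the previous shift.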


+1  -1   tests/__main__.py

@@ -5,7 +5,7 @@ import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
- from .test_parser import TestLalr, TestEarley, TestParsers
+ from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)


+1  -2   tests/test_parser.py

@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
_TestParser.__name__ = _NAME
globals()[_NAME] = _TestParser

- for PARSER in ['lalr', 'earley']:
+ for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
_make_parser_test(PARSER)



if __name__ == '__main__':
unittest.main()

