Added conf.py example and indenter support in contextual lexing

Erez Shinan · 7 years ago · parent commit d4425887d6
5 changed files with 59 additions and 11 deletions:

  1. examples/conf.py          +37  -0
  2. examples/conf_nolex.py     +5  -1
  3. lark/indenter.py           +5  -2
  4. lark/lexer.py              +5  -5
  5. lark/parser_frontends.py   +7  -3

examples/conf.py (+37, -0)

@@ -0,0 +1,37 @@
#
# This example demonstrates the power of the contextual lexer by parsing a config file.
#
# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
# choose one over the other, which would lead to a (confusing) parse error.
# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
# which one of them to expect at each point during the parse.
# The lexer then only matches the tokens that the parser expects.
# The result is a correct parse, something that is impossible with a regular lexer.
#
# Another approach is to discard a lexer altogether and use the Earley algorithm.
# It will handle more cases than the contextual lexer, but at the cost of performance.
# See examples/conf_nolex.py for an example of that approach.
#

from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/

WS.ignore: /[\t \f]+/
COMMENT.ignore: /\#[^\n]*/
_NL: /(\r?\n)+/
""", parser="lalr_contextual_lexer")


sample_conf = """
[bla]
a=Hello
this="that",4
"""

print parser.parse(sample_conf).pretty()
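Editor's note: the example above targets Python 2 and the 0.x Lark API (token flags like WS.ignore, the "lalr_contextual_lexer" frontend name). A rough equivalent against the modern API might look like the sketch below. This is an editor's illustration, not part of the commit, assuming Lark 1.x, where %ignore replaces the .ignore flag, terminals may not match the empty string (hence VALUE: /.+/), and the contextual lexer is selected with lexer="contextual" (the default for LALR):

from lark import Lark

# Same grammar as above, ported to %ignore directives.
parser = Lark(r"""
    start: _NL? section+
    section: "[" NAME "]" _NL item+
    item: NAME "=" VALUE _NL

    NAME: /[a-zA-Z_]\w*/
    VALUE: /.+/
    _NL: /(\r?\n)+/

    WS: /[\t \f]+/
    COMMENT: /\#[^\n]*/
    %ignore WS
    %ignore COMMENT
""", parser="lalr", lexer="contextual")

sample_conf = """
[bla]
a=Hello
this="that",4
"""

print(parser.parse(sample_conf).pretty())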

examples/conf_nolex.py (+5, -1)

@@ -1,5 +1,5 @@
#
-# This example demonstrates lex-less parsing using the earley_nolex frontend
+# This example demonstrates scanless parsing using the earley_nolex frontend
#
# Using a lexer for configuration files is tricky, because values don't
# have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
#
# Future versions of lark will make it easier to write these kinds of grammars.
#
+# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
+# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1).
+# See examples/conf.py for an example of that approach.
+#

from lark import Lark, Transformer
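Editor's note: the body of conf_nolex.py is not shown in this view. For orientation, a minimal sketch of the scanless/Earley counterpart under the modern API; an editor's illustration, assuming parser="earley" with lexer="dynamic", which replaced the old earley_nolex frontend:

from lark import Lark

# Earley with the dynamic lexer resolves the NAME/VALUE overlap during
# parsing rather than restricting the lexer by parser state, at some
# cost in speed compared to LALR(1) with a contextual lexer.
parser = Lark(r"""
    start: _NL? section+
    section: "[" NAME "]" _NL item+
    item: NAME "=" VALUE _NL

    NAME: /[a-zA-Z_]\w*/
    VALUE: /.+/
    _NL: /(\r?\n)+/

    WS: /[\t \f]+/
    %ignore WS
""", parser="earley", lexer="dynamic")

print(parser.parse("[bla]\na=Hello\n").pretty())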



lark/indenter.py (+5, -2)

@@ -26,7 +26,6 @@ class Indenter:

        assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

-
    def process(self, stream):
        for token in stream:
            if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:

            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
-            if token.type in self.CLOSE_PAREN_types:
+            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0

@@ -47,3 +46,7 @@ class Indenter:

        assert self.indent_level == [0], self.indent_level

+    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+    @property
+    def always_accept(self):
+        return (self.NL_type,)
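Editor's note: the always_accept property added here is what lets a post-lexer cooperate with the contextual lexer: the Indenter must see every newline token to track indentation, even in parser states whose grammar doesn't expect one. A usage sketch against the modern API; an editor's illustration with a hypothetical grammar and class name, but postlex=, %declare, and the Indenter fields shown are the current interface:

from lark import Lark
from lark.indenter import Indenter

class TreeIndenter(Indenter):
    NL_type = '_NL'            # the newline terminal this post-lexer consumes
    OPEN_PAREN_types = []      # inside parens, indentation would be ignored
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'    # terminals the post-lexer emits;
    DEDENT_type = '_DEDENT'    # declared, not defined, in the grammar
    tab_len = 8

parser = Lark(r"""
    start: _NL? node+
    node: NAME _NL [_INDENT node+ _DEDENT]
    NAME: /\w+/
    _NL: /(\r?\n[\t ]*)+/

    %declare _INDENT _DEDENT
    %ignore /[\t ]+/
""", parser="lalr", postlex=TreeIndenter())

print(parser.parse("a\n\tb\n\tc\n").pretty())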

lark/lexer.py (+5, -5)

@@ -173,7 +173,7 @@ class Lexer(object):


class ContextualLexer:
-    def __init__(self, tokens, states, ignore=()):
+    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
@@ -186,10 +186,9 @@ class ContextualLexer:
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
-                accepts = list(accepts) # For python3
-                accepts += ignore
-                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
-                #     accepts.append('_NEWLINE')   # XXX hack for now
+                accepts = set(accepts) # For python3
+                accepts |= set(ignore)
+                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer
@@ -228,6 +227,7 @@ class ContextualLexer:
                    break
            else:
                if lex_pos < len(stream):
+                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
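Editor's note: the change above replaces list concatenation with set union when computing which terminals each LALR state's lexer may match, and folds in the new always_accept set (replacing the commented-out _NEWLINE hack). A toy illustration of the resulting rule; names are illustrative, not the actual lark internals:

# Terminals the lexer for one parser state is built from, after this change:
expects = {'NAME', 'VALUE', '$end'}   # what the parser can shift in this state
ignore = ('WS', 'COMMENT')            # skipped everywhere
always_accept = ('_NL',)              # demanded by the post-lexer

accepts = set(expects) | set(ignore) | set(always_accept)
state_terminals = [t for t in accepts if t != '$end']
print(sorted(state_terminals))        # ['COMMENT', 'NAME', 'VALUE', 'WS', '_NL']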


lark/parser_frontends.py (+7, -3)

@@ -40,13 +40,17 @@ class LALR_ContextualLexer:
        self.analyzer.analyze()

        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
+                                     always_accept=lexer_conf.postlex.always_accept
+                                     if lexer_conf.postlex else ())


    def parse(self, text):
        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
-        l = self.lexer.lex(text, parser)
-        return parser.parse(l, True)
+        tokens = self.lexer.lex(text, parser)
+        if self.lexer_conf.postlex:
+            tokens = self.lexer_conf.postlex.process(tokens)
+        return parser.parse(tokens, True)
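Editor's note: taken together, this commit wires the frontend into a three-stage pipeline. In outline (editor's pseudocode with hypothetical names; the real classes are the ones in the diff above):

def parse(text, contextual_lexer, postlex, parser):
    # 1. Contextual lexing: the lexer consults the parser's current state
    #    and only matches its expected terminals, plus `ignore` and
    #    `always_accept`.
    tokens = contextual_lexer.lex(text, parser)
    # 2. Post-lexing (optional): e.g. the Indenter consumes newline tokens
    #    and emits _INDENT/_DEDENT pairs for the parser.
    if postlex:
        tokens = postlex.process(tokens)
    # 3. LALR(1) parsing of the final token stream.
    return parser.parse(tokens)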




