@@ -0,0 +1,54 @@
+#
+# This example demonstrates using Lark with a custom lexer.
+#
+# You can use a custom lexer to tokenize text when the lexers offered by Lark
+# are too slow, or not flexible enough.
+#
+# You can also use it (as shown in this example) to tokenize streams of objects.
+#
+
+from lark import Lark, Transformer, v_args
+from lark.lexer import Lexer, Token
+
+class TypeLexer(Lexer):
+    def __init__(self, lexer_conf):
+        pass
+
+    def lex(self, data):
+        print(data)
+        for obj in data:
+            if isinstance(obj, int):
+                yield Token('INT', obj)
+            elif isinstance(obj, (type(''), type(u''))):
+                yield Token('STR', obj)
+            else:
+                raise TypeError(obj)
+
+parser = Lark("""
+        start: data_item+
+        data_item: STR INT*
+
+        %declare STR INT
+        """, parser='lalr', lexer=TypeLexer)
+
+
+class ParseToDict(Transformer):
+    @v_args(inline=True)
+    def data_item(self, name, *numbers):
+        return name.value, [n.value for n in numbers]
+
+    start = dict
+
+
+def test():
+    data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
+
+    tree = parser.parse(data)
+
+    res = ParseToDict().transform(tree)
+    print(res)  # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
+
+
+if __name__ == '__main__':
+    test()
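
(For reference only, not part of the diff: the header comments also mention using a custom lexer over plain text when the built-in lexers are too slow or not flexible enough. Below is a minimal sketch of that case, relying only on the interface shown above; the SpaceLexer class, the WORD/INT terminals and the grammar are illustrative and not taken from this change.)

    from lark import Lark
    from lark.lexer import Lexer, Token

    class SpaceLexer(Lexer):
        def __init__(self, lexer_conf):
            pass

        def lex(self, text):
            # Split on whitespace and classify each chunk ourselves.
            for word in text.split():
                if word.isdigit():
                    yield Token('INT', word)
                else:
                    yield Token('WORD', word)

    space_parser = Lark("""
            start: (WORD | INT)+

            %declare WORD INT
            """, parser='lalr', lexer=SpaceLexer)

    print(space_parser.parse("route 66 is a road").pretty())
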
@@ -8,7 +8,7 @@
 # the spaces (and tabs) after the newline.
 #
 
-from lark.lark import Lark
+from lark import Lark
 from lark.indenter import Indenter
 
 tree_grammar = r"""
@@ -10,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
 
-from .lexer import Lexer
+from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
@@ -142,7 +142,7 @@ class Lark:
             else:
                 assert False, self.options.parser
 
        lexer = self.options.lexer
-        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete')
+        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)
 
        if self.options.ambiguity == 'auto':
            if self.options.parser == 'earley':
@@ -171,7 +171,7 @@ class Lark:
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
     def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -168,6 +168,17 @@ def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)
 
 class Lexer:
+    """Lexer interface
+
+    Method Signatures:
+        lex(self, stream) -> Iterator[Token]
+        set_parser_state(self, state)        # Optional
+    """
+    set_parser_state = NotImplemented
+    lex = NotImplemented
+
+
+class TraditionalLexer(Lexer):
     def __init__(self, tokens, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
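
(Note, not part of the diff: the set_parser_state hook documented above is optional. The WithLexer.parse change further down only forwards it to the parser when the attribute is not the NotImplemented sentinel. A rough sketch of a custom lexer that opts in; the class name and token handling are made up for illustration.)

    from lark.lexer import Lexer, Token

    class StatefulLexer(Lexer):
        def __init__(self, lexer_conf):
            self.parser_state = None

        def set_parser_state(self, state):
            # Called through the callback that WithLexer.parse hands to the parser,
            # so lex() can consult the current parse state while producing tokens.
            self.parser_state = state

        def lex(self, data):
            for obj in data:
                # ...pick a terminal name here, possibly depending on self.parser_state...
                yield Token('OBJ', obj)
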
@@ -206,7 +217,7 @@ class Lexer:
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
 
-class ContextualLexer:
+class ContextualLexer(Lexer):
     def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in tokens:
@@ -222,12 +233,12 @@ class ContextualLexer:
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
 
         self.set_parser_state(None) # Needs to be set on the outside
@@ -10,7 +10,7 @@ from .lexer import Token
 
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR
+from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
@@ -568,7 +568,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
-        self.parser = LALR(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
 
         self.canonize_tree = CanonizeTree()
@@ -1,8 +1,9 @@
 import re
-from .utils import get_regexp_width
+from functools import partial
+from .utils import get_regexp_width
 
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import Lexer, ContextualLexer, Token
+from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 
 from .exceptions import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
@@ -11,7 +12,7 @@ from .tree import Tree
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
 
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
@@ -29,25 +30,27 @@ class WithLexer:
         else:
             return stream
 
+    def parse(self, text):
+        token_stream = self.lex(text)
+        sps = self.lexer.set_parser_state
+        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
+
-class LALR(WithLexer):
+class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_traditional_lexer(lexer_conf)
 
-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream)
-
 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_contextual_lexer(lexer_conf, parser_conf)
 
-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream, self.lexer.set_parser_state)
+
+class LALR_CustomLexer(WithLexer):
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.parser = lalr_parser.Parser(parser_conf)
+        self.lexer_conf = lexer_conf
+        self.lexer = lexer_cls(lexer_conf)
 
 
 def get_ambiguity_resolver(options):
     if not options or options.ambiguity == 'resolve':
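
(Note, not part of the diff: the argument splat in the new WithLexer.parse is the dispatch point between the two lexer kinds. Lexers that provide set_parser_state, as the contextual lexer does, get it forwarded to the parser; lexers that keep the NotImplemented sentinel from the new base class do not. A stand-alone illustration of the idiom, with throwaway names.)

    def call_parser(parse, token_stream, sps):
        # Mirrors: self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
        return parse(token_stream, *([sps] if sps is not NotImplemented else []))

    call_parser(lambda ts: 'parse(tokens)', iter([]), NotImplemented)                # one-argument call
    call_parser(lambda ts, set_state: 'parse(tokens, set_state)', iter([]), print)   # two-argument call
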
@@ -77,10 +80,6 @@ class Earley(WithLexer):
     def match(self, term, token):
         return term.name == token.type
 
-    def parse(self, text):
-        tokens = self.lex(text)
-        return self.parser.parse(tokens)
-
 
 class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
@@ -161,9 +160,11 @@ def get_frontend(parser, lexer):
         if lexer is None:
             raise ValueError('The LALR parser requires use of a lexer')
         elif lexer == 'standard':
-            return LALR
+            return LALR_TraditionalLexer
         elif lexer == 'contextual':
             return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
         else:
             raise ValueError('Unknown lexer: %s' % lexer)
     elif parser=='earley':