@@ -0,0 +1,54 @@
+#
+# This example demonstrates using Lark with a custom lexer.
+#
+# You can use a custom lexer to tokenize text when the lexers offered by Lark
+# are too slow, or not flexible enough.
+#
+# You can also use it (as shown in this example) to tokenize streams of objects.
+#
+
+from lark import Lark, Transformer, v_args
+from lark.lexer import Lexer, Token
+
+
+class TypeLexer(Lexer):
+    def __init__(self, lexer_conf):
+        pass
+
+    def lex(self, data):
+        print(data)
+        for obj in data:
+            if isinstance(obj, int):
+                yield Token('INT', obj)
+            elif isinstance(obj, (type(''), type(u''))):
+                yield Token('STR', obj)
+            else:
+                raise TypeError(obj)
+
+
+parser = Lark("""
+        start: data_item+
+        data_item: STR INT*
+
+        %declare STR INT
+        """, parser='lalr', lexer=TypeLexer)
+
+
+class ParseToDict(Transformer):
+    @v_args(inline=True)
+    def data_item(self, name, *numbers):
+        return name.value, [n.value for n in numbers]
+
+    start = dict
+
+
+def test():
+    data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
+
+    tree = parser.parse(data)
+
+    res = ParseToDict().transform(tree)
+    print(res)  # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
+
+
+if __name__ == '__main__':
+    test()
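
Since the point of the example above is that Lark never touches strings here, it can help to look at the token stream on its own. The sketch below is editorial, not part of the patch: it drives TypeLexer by hand, passing None for lexer_conf (which the example's __init__ ignores), so you can see the Token objects the LALR parser will consume.

    # Illustrative only -- assumes TypeLexer from the example above is in scope.
    lexer = TypeLexer(None)             # lexer_conf is unused by this example
    for tok in lexer.lex(['alice', 1, 27]):
        # tok.type is 'STR' or 'INT'; tok.value is the original Python object
        print(tok.type, repr(tok.value))
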
@@ -8,7 +8,7 @@
 # the spaces (and tabs) after the newline.
 #
-from lark.lark import Lark
+from lark import Lark
 from lark.indenter import Indenter

 tree_grammar = r"""
@@ -10,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

-from .lexer import Lexer
+from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
@@ -142,7 +142,7 @@ class Lark:
         else:
             assert False, self.options.parser

         lexer = self.options.lexer
-        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete')
+        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

         if self.options.ambiguity == 'auto':
             if self.options.parser == 'earley':
@@ -171,7 +171,7 @@ class Lark:
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

     def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -168,6 +168,17 @@ def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)

 class Lexer:
+    """Lexer interface
+
+    Method Signatures:
+        lex(self, stream) -> Iterator[Token]
+        set_parser_state(self, state)        # Optional
+    """
+    set_parser_state = NotImplemented
+    lex = NotImplemented
+
+
+class TraditionalLexer(Lexer):
     def __init__(self, tokens, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
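
The docstring above is the whole contract a user-supplied lexer has to meet. As a minimal sketch (assumed, not taken from the patch): subclass Lexer, accept the lexer_conf in __init__, and yield Token objects from lex(); set_parser_state can simply be left as the inherited NotImplemented, which the parser frontends later in this diff check before passing it along.

    from lark.lexer import Lexer, Token

    class WordLexer(Lexer):
        """Toy lexer satisfying the interface above (illustrative only)."""
        def __init__(self, lexer_conf):
            pass                          # this toy lexer needs no configuration

        def lex(self, text):
            for word in text.split():
                # 'WORD' is a made-up terminal name; a grammar using this
                # lexer would have to %declare WORD.
                yield Token('WORD', word)
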
@@ -206,7 +217,7 @@ class Lexer:
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


-class ContextualLexer:
+class ContextualLexer(Lexer):
     def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in tokens:
@@ -222,12 +233,12 @@ class ContextualLexer:
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

         self.set_parser_state(None) # Needs to be set on the outside
@@ -10,7 +10,7 @@ from .lexer import Token
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR
+from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
@@ -568,7 +568,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

         parser_conf = ParserConf(rules, callback, 'start')

-        self.parser = LALR(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

         self.canonize_tree = CanonizeTree()
@@ -1,8 +1,9 @@
 import re
-from .utils import get_regexp_width
+from functools import partial
+from .utils import get_regexp_width
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import Lexer, ContextualLexer, Token
+from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 from .exceptions import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
@@ -11,7 +12,7 @@ from .tree import Tree
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)

     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
@@ -29,25 +30,27 @@ class WithLexer:
         else:
             return stream

+    def parse(self, text):
+        token_stream = self.lex(text)
+        sps = self.lexer.set_parser_state
+        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
+

-class LALR(WithLexer):
+class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_traditional_lexer(lexer_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream)

 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_contextual_lexer(lexer_conf, parser_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream, self.lexer.set_parser_state)
+
+class LALR_CustomLexer(WithLexer):
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.parser = lalr_parser.Parser(parser_conf)
+        self.lexer_conf = lexer_conf
+        self.lexer = lexer_cls(lexer_conf)

 def get_ambiguity_resolver(options):
     if not options or options.ambiguity == 'resolve':
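
One detail worth calling out in the hunk above: a single WithLexer.parse now serves the traditional, contextual, and custom-lexer frontends, because set_parser_state is a NotImplemented sentinel on the Lexer base class and only becomes a real method when a lexer defines it. A stand-alone sketch of that dispatch rule, with made-up class names and not part of the patch:

    class Base:
        set_parser_state = NotImplemented

    class Plain(Base):
        pass                                     # e.g. a traditional or custom lexer

    class Stateful(Base):
        def set_parser_state(self, state):       # e.g. the contextual lexer
            print('parser state ->', state)

    for lexer in (Plain(), Stateful()):
        sps = lexer.set_parser_state
        extra = [sps] if sps is not NotImplemented else []
        print(type(lexer).__name__, 'passes', len(extra), 'extra argument(s)')
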
@@ -77,10 +80,6 @@ class Earley(WithLexer):
     def match(self, term, token):
         return term.name == token.type

-    def parse(self, text):
-        tokens = self.lex(text)
-        return self.parser.parse(tokens)

 class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
@@ -161,9 +160,11 @@ def get_frontend(parser, lexer):
         if lexer is None:
             raise ValueError('The LALR parser requires use of a lexer')
         elif lexer == 'standard':
-            return LALR
+            return LALR_TraditionalLexer
         elif lexer == 'contextual':
             return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
         else:
             raise ValueError('Unknown lexer: %s' % lexer)
     elif parser=='earley':
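
Binding the user's class with functools.partial keeps the return value of get_frontend callable with the same (lexer_conf, parser_conf, ...) arguments as the other frontends. From the user's side, everything reduces to passing the Lexer subclass as the lexer option, exactly as the new example file does. A usage sketch, with the grammar and data borrowed from that example (as this diff stands, only the LALR path accepts a lexer class):

    from lark import Lark

    parser = Lark("""
            start: data_item+
            data_item: STR INT*

            %declare STR INT
            """, parser='lalr', lexer=TypeLexer)   # TypeLexer as defined in the example above

    print(parser.parse(['bob', 4, 'carrie']))      # a Tree of data_item nodes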