
Added option to provide a custom lexer (with example)

Erez Shinan, 6 years ago
parent commit a49df1b2e4
6 changed files with 92 additions and 26 deletions:

  1. examples/custom_lexer.py      (+54 -0)
  2. examples/indented_tree.py     (+1 -1)
  3. lark/lark.py                  (+3 -3)
  4. lark/lexer.py                 (+14 -3)
  5. lark/load_grammar.py          (+2 -2)
  6. lark/parser_frontends.py      (+18 -17)

examples/custom_lexer.py  (+54 -0)

@@ -0,0 +1,54 @@
#
# This example demonstrates using Lark with a custom lexer.
#
# You can use a custom lexer to tokenize text when the lexers offered by Lark
# are too slow, or not flexible enough.
#
# You can also use it (as shown in this example) to tokenize streams of objects.
#


from lark import Lark, Transformer, v_args
from lark.lexer import Lexer, Token

class TypeLexer(Lexer):
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        for obj in data:
            if isinstance(obj, int):
                yield Token('INT', obj)
            elif isinstance(obj, (type(''), type(u''))):    # str (and unicode, on Python 2)
                yield Token('STR', obj)
            else:
                raise TypeError(obj)

parser = Lark("""
        start: data_item+
        data_item: STR INT*

        %declare STR INT
        """, parser='lalr', lexer=TypeLexer)


class ParseToDict(Transformer):
    @v_args(inline=True)
    def data_item(self, name, *numbers):
        return name.value, [n.value for n in numbers]

    start = dict


def test():
    data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]

    tree = parser.parse(data)
    res = ParseToDict().transform(tree)

    print(res)  # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}


if __name__ == '__main__':
    test()
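
Note that lexer=TypeLexer passes the class itself, not an instance: Lark instantiates it internally with the lexer configuration (the lexer_conf argument that TypeLexer.__init__ ignores). And because TypeLexer.lex simply iterates over its argument, parse() accepts any iterable of objects, not just a list. A small sketch under those assumptions (read_records is hypothetical, not part of the commit):

def read_records():
    # Any iterable of str/int objects will do, e.g. values streamed from a file.
    yield 'eve'
    yield 5
    yield 9

tree = parser.parse(read_records())   # same result as parser.parse(['eve', 5, 9])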

examples/indented_tree.py  (+1 -1)

@@ -8,7 +8,7 @@
 # the spaces (and tabs) after the newline.
 #
 
-from lark.lark import Lark
+from lark import Lark
 from lark.indenter import Indenter
 
 tree_grammar = r"""


lark/lark.py  (+3 -3)

@@ -10,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
 
-from .lexer import Lexer
+from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 
@@ -142,7 +142,7 @@ class Lark:
         else:
             assert False, self.options.parser
         lexer = self.options.lexer
-        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete')
+        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)
 
         if self.options.ambiguity == 'auto':
             if self.options.parser == 'earley':
@@ -171,7 +171,7 @@ class Lark:
    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
     def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)

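With this change, the lexer option accepts any Lexer subclass in addition to the named lexers, and get_frontend (below) wires it into the LALR frontend; as of this commit the custom-lexer path exists only for parser='lalr'. A minimal sketch of a conforming text lexer (the class and its tokenization scheme are illustrative, not part of the commit):

from lark import Lark
from lark.lexer import Lexer, Token

class WordLexer(Lexer):
    def __init__(self, lexer_conf):
        pass
    def lex(self, stream):
        # Trivial scheme for illustration: every whitespace-separated word is a WORD.
        for word in stream.split():
            yield Token('WORD', word)

parser = Lark("""
    start: WORD+
    %declare WORD
""", parser='lalr', lexer=WordLexer)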

lark/lexer.py  (+14 -3)

@@ -168,6 +168,17 @@ def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)
 
 
 class Lexer:
+    """Lexer interface
+
+    Method Signatures:
+        lex(self, stream) -> Iterator[Token]
+
+        set_parser_state(self, state)        # Optional
+    """
+    set_parser_state = NotImplemented
+    lex = NotImplemented
+
+class TraditionalLexer(Lexer):
     def __init__(self, tokens, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
 
@@ -206,7 +217,7 @@ class Lexer:
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
 
-class ContextualLexer:
+class ContextualLexer(Lexer):
     def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in tokens:
@@ -222,12 +233,12 @@ class ContextualLexer:
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
 
         self.set_parser_state(None) # Needs to be set on the outside



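The new Lexer base class is the interface a custom lexer implements: lex() must yield Token instances, and set_parser_state() is an optional hook; leaving it as the NotImplemented sentinel tells the frontend not to forward it (see parser_frontends.py below). A hedged sketch of a lexer that uses the hook (the tokenization logic is made up for illustration):

from lark.lexer import Lexer, Token

class StatefulLexer(Lexer):
    def __init__(self, lexer_conf):
        self.parser_state = None

    def set_parser_state(self, state):
        # Called by the LALR parser as it runs, so lexing can depend on parser state.
        self.parser_state = state

    def lex(self, stream):
        for word in stream.split():
            yield Token('NUMBER' if word.isdigit() else 'NAME', word)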

lark/load_grammar.py  (+2 -2)

@@ -10,7 +10,7 @@ from .lexer import Token
 
 
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR
+from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
@@ -568,7 +568,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
-        self.parser = LALR(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
 
         self.canonize_tree = CanonizeTree()




lark/parser_frontends.py  (+18 -17)

@@ -1,8 +1,9 @@
 import re
-from .utils import get_regexp_width
+from functools import partial
 
+from .utils import get_regexp_width
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import Lexer, ContextualLexer, Token
+from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 
 from .exceptions import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
@@ -11,7 +12,7 @@ from .tree import Tree
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
 
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
@@ -29,25 +30,27 @@ class WithLexer:
         else:
             return stream
 
-class LALR(WithLexer):
+    def parse(self, text):
+        token_stream = self.lex(text)
+        sps = self.lexer.set_parser_state
+        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
+
+class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_traditional_lexer(lexer_conf)
 
-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream)
-
 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_contextual_lexer(lexer_conf, parser_conf)
 
-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream, self.lexer.set_parser_state)
+class LALR_CustomLexer(WithLexer):
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.parser = lalr_parser.Parser(parser_conf)
+        self.lexer_conf = lexer_conf
+        self.lexer = lexer_cls(lexer_conf)
 
 
 def get_ambiguity_resolver(options):
     if not options or options.ambiguity == 'resolve':
@@ -77,10 +80,6 @@ class Earley(WithLexer):
     def match(self, term, token):
         return term.name == token.type
 
-    def parse(self, text):
-        tokens = self.lex(text)
-        return self.parser.parse(tokens)
-
 
 class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
@@ -161,9 +160,11 @@ def get_frontend(parser, lexer):
         if lexer is None:
             raise ValueError('The LALR parser requires use of a lexer')
         elif lexer == 'standard':
-            return LALR
+            return LALR_TraditionalLexer
         elif lexer == 'contextual':
             return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
         else:
             raise ValueError('Unknown lexer: %s' % lexer)
     elif parser=='earley':


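Two mechanisms in the dispatch above deserve a note. First, get_frontend must return something callable with the common (lexer_conf, parser_conf, options) signature, so the user's lexer class is bound in advance with functools.partial(LALR_CustomLexer, lexer). Second, the unified WithLexer.parse forwards set_parser_state to the LALR parser only when the lexer actually implements it, by comparing against the NotImplemented sentinel on the Lexer base. A standalone sketch of that sentinel test (class names here are illustrative, not lark's):

class Base:
    set_parser_state = NotImplemented      # sentinel, as on lark.lexer.Lexer

class Plain(Base):
    pass

class Stateful(Base):
    def set_parser_state(self, state):
        pass

def extra_args(lexer):
    # Mirrors WithLexer.parse: forward the callback only when it is implemented.
    sps = lexer.set_parser_state
    return [sps] if sps is not NotImplemented else []

assert extra_args(Plain()) == []           # parser.parse(token_stream)
assert len(extra_args(Stateful())) == 1    # parser.parse(token_stream, sps)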