
Added option to provide a custom lexer (with example)

Erez Shinan committed 6 years ago (a49df1b2e4)
6 changed files with 92 additions and 26 deletions:

  1. examples/custom_lexer.py      +54   -0
  2. examples/indented_tree.py      +1   -1
  3. lark/lark.py                   +3   -3
  4. lark/lexer.py                 +14   -3
  5. lark/load_grammar.py           +2   -2
  6. lark/parser_frontends.py      +18  -17
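
The practical upshot of this commit: the lexer option of Lark() now also accepts a Lexer subclass instead of one of the built-in lexer names (LALR parser only). Below is a minimal sketch of the new usage, modelled on the example file added in this commit; the WordLexer class and its WORD terminal are illustrative only, not part of the commit:

    from lark import Lark
    from lark.lexer import Lexer, Token

    class WordLexer(Lexer):
        def __init__(self, lexer_conf):
            pass                  # the LexerConf built by Lark is ignored by this toy lexer

        def lex(self, data):
            # Must yield Token objects; here we just split on whitespace.
            for word in data.split():
                yield Token('WORD', word)

    parser = Lark("""
        start: WORD+
        %declare WORD
    """, parser='lalr', lexer=WordLexer)

    tree = parser.parse("custom lexers are pluggable now")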

examples/custom_lexer.py    +54  -0

@@ -0,0 +1,54 @@
#
# This example demonstrates using Lark with a custom lexer.
#
# You can use a custom lexer to tokenize text when the lexers offered by Lark
# are too slow, or not flexible enough.
#
# You can also use it (as shown in this example) to tokenize streams of objects.
#


from lark import Lark, Transformer, v_args
from lark.lexer import Lexer, Token

class TypeLexer(Lexer):
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        print(data)
        for obj in data:
            if isinstance(obj, int):
                yield Token('INT', obj)
            elif isinstance(obj, (type(''), type(u''))):
                yield Token('STR', obj)
            else:
                raise TypeError(obj)

parser = Lark("""
        start: data_item+
        data_item: STR INT*

        %declare STR INT
        """, parser='lalr', lexer=TypeLexer)


class ParseToDict(Transformer):
    @v_args(inline=True)
    def data_item(self, name, *numbers):
        return name.value, [n.value for n in numbers]

    start = dict


def test():
    data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]

    tree = parser.parse(data)
    res = ParseToDict().transform(tree)

    print(res)  # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}


if __name__ == '__main__':
    test()
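
Worth noting: the Token values yielded by TypeLexer are the original Python objects, not strings, which is what lets parse() accept a list instead of text. A rough illustration of the stream the parser receives (hypothetical snippet, not part of the commit):

    lexer = TypeLexer(None)                       # lexer_conf is unused by this lexer
    tokens = list(lexer.lex(['alice', 1, 27]))
    # three tokens: type 'STR' with value 'alice', then 'INT' 1 and 'INT' 27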

examples/indented_tree.py    +1  -1

@@ -8,7 +8,7 @@
# the spaces (and tabs) after the newline.
#

-from lark.lark import Lark
+from lark import Lark
from lark.indenter import Indenter

tree_grammar = r"""


lark/lark.py    +3  -3

@@ -10,7 +10,7 @@ from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf

-from .lexer import Lexer
+from .lexer import Lexer, TraditionalLexer
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend

@@ -142,7 +142,7 @@ class Lark:
            else:
                assert False, self.options.parser
        lexer = self.options.lexer
-        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete')
+        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

        if self.options.ambiguity == 'auto':
            if self.options.parser == 'earley':
@@ -171,7 +171,7 @@ class Lark:
    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

    def _build_parser(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)


lark/lexer.py    +14  -3

@@ -168,6 +168,17 @@ def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)

class Lexer:
+    """Lexer interface
+
+    Method Signatures:
+        lex(self, stream) -> Iterator[Token]
+
+        set_parser_state(self, state)        # Optional
+    """
+    set_parser_state = NotImplemented
+    lex = NotImplemented
+
+class TraditionalLexer(Lexer):
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

@@ -206,7 +217,7 @@ class Lexer:
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


-class ContextualLexer:
+class ContextualLexer(Lexer):
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
@@ -222,12 +233,12 @@ class ContextualLexer:
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

-        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None) # Needs to be set on the outside
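
The Lexer base class added above is the entire contract a custom lexer has to satisfy: a lex(stream) method that yields Token objects, plus an optional set_parser_state(state) hook. The shipped example implements only lex; the sketch below also implements the optional hook, which the LALR frontend forwards to the parser so the lexer is kept informed of the current parser state. The StateAwareLexer class is hypothetical, not part of the commit:

    from lark.lexer import Lexer, Token

    class StateAwareLexer(Lexer):
        def __init__(self, lexer_conf):
            self.parser_state = None

        # Optional hook: when defined, WithLexer.parse hands it to the LALR
        # parser, which calls it as it moves between states.
        def set_parser_state(self, state):
            self.parser_state = state

        def lex(self, stream):
            for obj in stream:
                # A real lexer could consult self.parser_state here to
                # disambiguate input, much like the built-in ContextualLexer.
                yield Token('ITEM', obj)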



lark/load_grammar.py    +2  -2

@@ -10,7 +10,7 @@ from .lexer import Token


from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR
+from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress
@@ -568,7 +568,7 @@ class GrammarLoader:
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, 'start')
-        self.parser = LALR(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

        self.canonize_tree = CanonizeTree()



lark/parser_frontends.py    +18  -17

@@ -1,8 +1,9 @@
import re
-from .utils import get_regexp_width
+from functools import partial

+from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import Lexer, ContextualLexer, Token
+from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token

from .exceptions import GrammarError
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
@@ -11,7 +12,7 @@ from .tree import Tree
class WithLexer:
    def init_traditional_lexer(self, lexer_conf):
        self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)

    def init_contextual_lexer(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
@@ -29,25 +30,27 @@ class WithLexer:
        else:
            return stream

+    def parse(self, text):
+        token_stream = self.lex(text)
+        sps = self.lexer.set_parser_state
+        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
+
-class LALR(WithLexer):
+class LALR_TraditionalLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_traditional_lexer(lexer_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream)
-

class LALR_ContextualLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = lalr_parser.Parser(parser_conf)
        self.init_contextual_lexer(lexer_conf, parser_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream, self.lexer.set_parser_state)
+class LALR_CustomLexer(WithLexer):
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.parser = lalr_parser.Parser(parser_conf)
+        self.lexer_conf = lexer_conf
+        self.lexer = lexer_cls(lexer_conf)


def get_ambiguity_resolver(options):
    if not options or options.ambiguity == 'resolve':
@@ -77,10 +80,6 @@ class Earley(WithLexer):
    def match(self, term, token):
        return term.name == token.type

-    def parse(self, text):
-        tokens = self.lex(text)
-        return self.parser.parse(tokens)


class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
@@ -161,9 +160,11 @@ def get_frontend(parser, lexer):
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        elif lexer == 'standard':
-            return LALR
+            return LALR_TraditionalLexer
        elif lexer == 'contextual':
            return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
        else:
            raise ValueError('Unknown lexer: %s' % lexer)
    elif parser=='earley':
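
With this change, passing a Lexer subclass as the lexer argument makes get_frontend return LALR_CustomLexer pre-bound to that class through functools.partial. A rough sketch of the plumbing; MyLexer is an illustrative stub, not part of the commit:

    from lark.lexer import Lexer
    from lark.parser_frontends import get_frontend

    class MyLexer(Lexer):
        def __init__(self, lexer_conf):
            pass
        def lex(self, data):
            return iter(())

    frontend = get_frontend('lalr', MyLexer)
    # Roughly equivalent to functools.partial(LALR_CustomLexer, MyLexer).
    # Lark later calls frontend(lexer_conf, parser_conf, ...), which lands in
    # LALR_CustomLexer(MyLexer, lexer_conf, parser_conf, ...) and, in turn, in
    # MyLexer(lexer_conf) when the lexer instance is created.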

