
Fix to new serializer code (Discussed in issue #349)

Erez Shinan · commit a798dec779 · 5 years ago
7 changed files with 102 additions and 52 deletions
  1. lark/common.py (+12, -1)
  2. lark/lexer.py (+1, -11)
  3. lark/parser_frontends.py (+37, -30)
  4. lark/parsers/lalr_parser.py (+2, -1)
  5. lark/tools/standalone.py (+1, -0)
  6. lark/utils.py (+13, -9)
  7. tests/test_tools.py (+36, -0)
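
For context: this commit moves serializer state off the lexer classes and onto LexerConf, which now inherits the Serialize mixin touched in the lark/utils.py hunk below. As a rough guide to that pattern, here is a minimal sketch — a simplified stand-in, not lark's actual implementation, which also threads a memo dict and a namespace of nested Serialize types:

# Simplified stand-in for lark.utils.Serialize (illustrative toy only).
class Serialize(object):
    def serialize(self):
        # Persist only the fields the class declares.
        return {f: getattr(self, f) for f in self.__serialize_fields__}

    @classmethod
    def deserialize(cls, data):
        inst = cls.__new__(cls)
        for f in cls.__serialize_fields__:
            setattr(inst, f, data[f])
        postprocess = getattr(inst, '_deserialize', None)
        if postprocess:
            postprocess()   # rebuild whatever was not serialized
        return inst

class LexerConf(Serialize):
    __serialize_fields__ = 'tokens', 'ignore'

    def __init__(self, tokens, ignore=()):
        self.tokens = tokens    # real lark stores TerminalDef objects here
        self.ignore = ignore
        self.callbacks = {}

    def _deserialize(self):
        self.callbacks = {}     # callbacks are not serializable; reset on load

conf = LexerConf(['NAME', 'NUMBER'], ignore=('WS',))
restored = LexerConf.deserialize(conf.serialize())
assert restored.tokens == conf.tokens and restored.callbacks == {}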

lark/common.py (+12, -1)

@@ -1,12 +1,23 @@
+from .utils import Serialize
 from .lexer import TerminalDef
 
-class LexerConf:
+###{standalone
+
+class LexerConf(Serialize):
+    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_namespace__ = TerminalDef,
+
     def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
 
+    def _deserialize(self):
+        self.callbacks = {} # TODO
+
+###}
 
 class ParserConf:
     def __init__(self, rules, callbacks, start):
         self.rules = rules


lark/lexer.py (+1, -11)

@@ -261,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
 
-class Lexer(Serialize):
+class Lexer(object):
     """Lexer interface
 
     Method Signatures:
@@ -274,13 +274,6 @@ class Lexer(Serialize):
 
 
 class TraditionalLexer(Lexer):
-    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
-    __serialize_namespace__ = TerminalDef,
-
-    def _deserialize(self):
-        self.user_callbacks = {} # TODO implement
-        self.build()
-
 
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
@@ -329,9 +322,6 @@ class TraditionalLexer(Lexer):
 
 
 class ContextualLexer(Lexer):
-    __serialize_fields__ = 'root_lexer', 'lexers'
-    __serialize_namespace__ = TraditionalLexer,
-
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:


lark/parser_frontends.py (+37, -30)

@@ -8,6 +8,7 @@ from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
+from .common import LexerConf
 
 ###{standalone
 
@@ -50,34 +51,24 @@ class WithLexer(Serialize):
     parser = None
     lexer_conf = None
 
-    __serialize_fields__ = 'parser', 'lexer'
-    __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer
+    __serialize_fields__ = 'parser', 'lexer_conf'
+    __serialize_namespace__ = LexerConf,
 
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        self.lexer_conf = lexer_conf
+        self.postlex = lexer_conf.postlex
+
     @classmethod
     def deserialize(cls, data, memo, callbacks, postlex):
         inst = super(WithLexer, cls).deserialize(data, memo)
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.init_lexer()
         return inst
 
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)
 
-    def init_traditional_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
-        self.postlex = lexer_conf.postlex
-
-    def init_contextual_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.postlex = lexer_conf.postlex
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, states,
-                                     ignore=lexer_conf.ignore,
-                                     always_accept=always_accept,
-                                     user_callbacks=lexer_conf.callbacks)
-
     def lex(self, text):
         stream = self.lexer.lex(text)
         return self.postlex.process(stream) if self.postlex else stream
@@ -87,26 +78,40 @@ class WithLexer(Serialize):
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
 
+    def init_traditional_lexer(self):
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
-class LALR_TraditionalLexer(WithLexer):
+class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
 
-class LALR_ContextualLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_contextual_lexer(lexer_conf)
+        self.init_lexer()
+
+    def init_lexer(self):
+        raise NotImplementedError()
+
+class LALR_TraditionalLexer(LALR_WithLexer):
+    def init_lexer(self):
+        self.init_traditional_lexer()
+
+class LALR_ContextualLexer(LALR_WithLexer):
+    def init_lexer(self):
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
+        always_accept = self.postlex.always_accept if self.postlex else ()
+        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     ignore=self.lexer_conf.ignore,
+                                     always_accept=always_accept,
+                                     user_callbacks=self.lexer_conf.callbacks)
 ###}
 
-class LALR_CustomLexer(WithLexer):
+class LALR_CustomLexer(LALR_WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = LALR_Parser(parser_conf)
-        self.lexer_conf = lexer_conf
-        self.lexer = lexer_cls(lexer_conf)
+        pass # TODO
+
+    def init_lexer(self):
+        self.lexer = lexer_cls(self.lexer_conf)
 
 def tokenize_text(text):
     line = 1
@@ -119,7 +124,8 @@ def tokenize_text(text):
 
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
@@ -172,7 +178,8 @@ class XEarley_CompleteLex(XEarley):
 class CYK(WithLexer):
 
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)
         self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
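
A note on the reshuffling above: it is a template-method refactor. WithLexer.__init__ now only records the config, and lexer construction is deferred to an init_lexer() hook that LALR_WithLexer calls from __init__ and that WithLexer.deserialize() can call again after loading. A toy sketch of the shape (stand-in classes, not the real TraditionalLexer/ContextualLexer):

class WithLexer:
    def __init__(self, lexer_conf):
        self.lexer_conf = lexer_conf   # kept: this is what gets serialized now

class LALR_WithLexer(WithLexer):
    def __init__(self, lexer_conf):
        WithLexer.__init__(self, lexer_conf)
        self.init_lexer()              # also invoked after deserialization

    def init_lexer(self):
        raise NotImplementedError()

class LALR_TraditionalLexer(LALR_WithLexer):
    def init_lexer(self):
        # stand-in for building a TraditionalLexer from self.lexer_conf
        self.lexer = ('traditional', self.lexer_conf['tokens'])

frontend = LALR_TraditionalLexer({'tokens': ['NAME', 'NUMBER']})
print(frontend.lexer)   # ('traditional', ['NAME', 'NUMBER'])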


lark/parsers/lalr_parser.py (+2, -1)

@@ -25,7 +25,8 @@ class LALR_Parser(object):
     @classmethod
     def deserialize(cls, data, memo, callbacks):
         inst = cls.__new__(cls)
-        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
+        inst._parse_table = IntParseTable.deserialize(data, memo)
+        inst.parser = _Parser(inst._parse_table, callbacks)
         return inst
 
     def serialize(self, memo):
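
This two-line change keeps the deserialized parse table reachable as parser._parse_table, which LALR_ContextualLexer.init_lexer() (in the parser_frontends.py hunk above) consults to learn which terminals each LALR state accepts. A toy illustration with made-up states:

# Made-up state table; the real IntParseTable maps state -> {token_name: action}.
class FakeTable:
    states = {0: {'NAME': 'shift', 'NUMBER': 'shift'}, 1: {'PLUS': 'shift'}}

class FakeParser:
    def __init__(self):
        self._parse_table = FakeTable()   # kept on the instance, as in the fix

parser = FakeParser()
accepts = {idx: list(t.keys()) for idx, t in parser._parse_table.states.items()}
print(accepts)   # {0: ['NAME', 'NUMBER'], 1: ['PLUS']}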


lark/tools/standalone.py (+1, -0)

@@ -65,6 +65,7 @@ EXTRACT_STANDALONE_FILES = [
     'indenter.py',
     'grammar.py',
     'lexer.py',
+    'common.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
     'parsers/lalr_analysis.py',


lark/utils.py (+13, -9)

@@ -103,7 +103,10 @@ class Serialize(object):
 
         inst = cls.__new__(cls)
         for f in fields:
-            setattr(inst, f, _deserialize(data[f], namespace, memo))
+            try:
+                setattr(inst, f, _deserialize(data[f], namespace, memo))
+            except KeyError as e:
+                raise KeyError("Cannot find key for class", cls, e)
         postprocess = getattr(inst, '_deserialize', None)
         if postprocess:
             postprocess()
@@ -164,6 +167,15 @@ def smart_decorator(f, create_decorator):
 
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))
+
+import sre_parse
+import sre_constants
+def get_regexp_width(regexp):
+    try:
+        return sre_parse.parse(regexp).getwidth()
+    except sre_constants.error:
+        raise ValueError(regexp)
 
 ###}


@@ -209,14 +221,6 @@ except NameError:
         return -1
 
 
-import sre_parse
-import sre_constants
-def get_regexp_width(regexp):
-    try:
-        return sre_parse.parse(regexp).getwidth()
-    except sre_constants.error:
-        raise ValueError(regexp)
-
 
 class Enumerator(Serialize):
     def __init__(self):
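
get_regexp_width itself is unchanged here; it only moves inside the ###{standalone} block so that generated standalone parsers carry it. For reference, it wraps the stdlib sre_parse module (an internal module, deprecated since Python 3.11), whose getwidth() reports the minimum and maximum width a pattern can match:

import sre_parse

print(sre_parse.parse(r'ab').getwidth())   # (2, 2)
print(sre_parse.parse(r'a+').getwidth())   # (1, 4294967295), i.e. unbounded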


tests/test_tools.py (+36, -0)

@@ -13,6 +13,9 @@ try:
 except ImportError:
     from io import StringIO
 
+
+
+
 class TestStandalone(TestCase):
     def setUp(self):
         pass
@@ -74,6 +77,39 @@ class TestStandalone(TestCase):
         x = l2.parse('ABAB')
         self.assertEqual(x, ['a', 'b'])
 
+    def test_postlex(self):
+        from lark.indenter import Indenter
+        class MyIndenter(Indenter):
+            NL_type = '_NEWLINE'
+            OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
+            CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
+            INDENT_type = '_INDENT'
+            DEDENT_type = '_DEDENT'
+            tab_len = 8
+
+        grammar = r"""
+        start: "(" ")" _NEWLINE
+        _NEWLINE: /\n/
+        """
+
+        # from lark import Lark
+        # l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter())
+        # x = l.parse('(\n)\n')
+        # print('@@', x)
+
+
+        context = self._create_standalone(grammar)
+        _Lark = context['Lark_StandAlone']
+
+        # l = _Lark(postlex=MyIndenter())
+        # x = l.parse('()\n')
+        # print(x)
+        l = _Lark(postlex=MyIndenter())
+        x = l.parse('(\n)\n')
+        print(x)
+
+
+
 
 if __name__ == '__main__':
     unittest.main()

