
Fix to new serializer code (Discussed in issue #349)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.7.1
Erez Shinan, 6 years ago
commit a798dec779
7 changed files with 102 additions and 52 deletions
  1. +12  -1    lark/common.py
  2. +1   -11   lark/lexer.py
  3. +37  -30   lark/parser_frontends.py
  4. +2   -1    lark/parsers/lalr_parser.py
  5. +1   -0    lark/tools/standalone.py
  6. +13  -9    lark/utils.py
  7. +36  -0    tests/test_tools.py

+12 -1    lark/common.py

@@ -1,12 +1,23 @@
+from .utils import Serialize
+from .lexer import TerminalDef
+
+
 ###{standalone
 
-class LexerConf:
+class LexerConf(Serialize):
+    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_namespace__ = TerminalDef,
+
     def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
 
+    def _deserialize(self):
+        self.callbacks = {}  # TODO
+
 ###}
 
 class ParserConf:
     def __init__(self, rules, callbacks, start):
         self.rules = rules
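
For readers unfamiliar with the pattern, here is a minimal sketch of how a declarative __serialize_fields__ mixin of this kind can work. It is an illustrative mock, not lark's actual Serialize class (the real one also threads a memo and a namespace of nested types, per __serialize_namespace__, through the calls):

    class SerializeSketch:
        # Illustrative mock of a declarative serializer mixin -- not lark's real code.
        __serialize_fields__ = ()

        def serialize(self):
            # Dump only the declared fields into a plain dict.
            return {f: getattr(self, f) for f in self.__serialize_fields__}

        @classmethod
        def deserialize(cls, data):
            inst = cls.__new__(cls)              # bypass __init__; restore fields directly
            for f in cls.__serialize_fields__:
                setattr(inst, f, data[f])
            postprocess = getattr(inst, '_deserialize', None)
            if postprocess:
                postprocess()                    # e.g. LexerConf re-creates its callbacks here
            return inst

Under this scheme LexerConf round-trips through a plain dict, while postlex and callbacks, which are not listed in __serialize_fields__, are simply re-initialized after loading.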


+1 -11    lark/lexer.py

@@ -261,7 +261,7 @@ def _regexp_has_newline(r):
""" """
return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)


class Lexer(Serialize):
class Lexer(object):
"""Lexer interface """Lexer interface


Method Signatures: Method Signatures:
@@ -274,13 +274,6 @@ class Lexer(Serialize):




class TraditionalLexer(Lexer): class TraditionalLexer(Lexer):
__serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
__serialize_namespace__ = TerminalDef,

def _deserialize(self):
self.user_callbacks = {} # TODO implement
self.build()



def __init__(self, terminals, ignore=(), user_callbacks={}): def __init__(self, terminals, ignore=(), user_callbacks={}):
assert all(isinstance(t, TerminalDef) for t in terminals), terminals assert all(isinstance(t, TerminalDef) for t in terminals), terminals
@@ -329,9 +322,6 @@ class TraditionalLexer(Lexer):




class ContextualLexer(Lexer): class ContextualLexer(Lexer):
__serialize_fields__ = 'root_lexer', 'lexers'
__serialize_namespace__ = TraditionalLexer,

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {} tokens_by_name = {}
for t in terminals: for t in terminals:


+37 -30    lark/parser_frontends.py

@@ -8,6 +8,7 @@ from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
+from .common import LexerConf
 
 ###{standalone
 
@@ -50,34 +51,24 @@ class WithLexer(Serialize):
     parser = None
     lexer_conf = None
 
-    __serialize_fields__ = 'parser', 'lexer'
-    __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer
+    __serialize_fields__ = 'parser', 'lexer_conf'
+    __serialize_namespace__ = LexerConf,
+
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        self.lexer_conf = lexer_conf
+        self.postlex = lexer_conf.postlex
 
     @classmethod
     def deserialize(cls, data, memo, callbacks, postlex):
         inst = super(WithLexer, cls).deserialize(data, memo)
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.init_lexer()
         return inst
 
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)
 
-    def init_traditional_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
-        self.postlex = lexer_conf.postlex
-
-    def init_contextual_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.postlex = lexer_conf.postlex
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, states,
-                                     ignore=lexer_conf.ignore,
-                                     always_accept=always_accept,
-                                     user_callbacks=lexer_conf.callbacks)
-
     def lex(self, text):
         stream = self.lexer.lex(text)
         return self.postlex.process(stream) if self.postlex else stream
@@ -87,26 +78,40 @@ class WithLexer(Serialize):
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
 
+    def init_traditional_lexer(self):
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+
 
-class LALR_TraditionalLexer(WithLexer):
+class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_traditional_lexer(lexer_conf)
-
-
-class LALR_ContextualLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_contextual_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+
+        self.init_lexer()
+
+    def init_lexer(self):
+        raise NotImplementedError()
+
+class LALR_TraditionalLexer(LALR_WithLexer):
+    def init_lexer(self):
+        self.init_traditional_lexer()
+
+
+class LALR_ContextualLexer(LALR_WithLexer):
+    def init_lexer(self):
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
+        always_accept = self.postlex.always_accept if self.postlex else ()
+        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     ignore=self.lexer_conf.ignore,
+                                     always_accept=always_accept,
+                                     user_callbacks=self.lexer_conf.callbacks)
 ###}
 
 
-class LALR_CustomLexer(WithLexer):
+class LALR_CustomLexer(LALR_WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = LALR_Parser(parser_conf)
-        self.lexer_conf = lexer_conf
-        self.lexer = lexer_cls(lexer_conf)
+        pass    # TODO
+
+    def init_lexer(self):
+        self.lexer = lexer_cls(self.lexer_conf)
 
 
 def tokenize_text(text):
     line = 1
@@ -119,7 +124,8 @@ def tokenize_text(text):
 
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
@@ -172,7 +178,8 @@ class XEarley_CompleteLex(XEarley):
 class CYK(WithLexer):
 
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)
         self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
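
Taken together, this file is a small template-method refactor: WithLexer now owns lexer_conf and postlex, and each frontend rebuilds its lexer in an init_lexer() override that deserialize() can call once the configuration is restored. A stripped-down sketch of the shape (illustrative names, not the real classes):

    class FrontendSketch:
        # Mirrors the shape of WithLexer/LALR_WithLexer above; illustrative only.
        def __init__(self, lexer_conf):
            self.lexer_conf = lexer_conf
            self.init_lexer()                  # subclasses decide how to build the lexer

        @classmethod
        def from_saved(cls, lexer_conf):
            inst = cls.__new__(cls)            # stands in for Serialize.deserialize()
            inst.lexer_conf = lexer_conf
            inst.init_lexer()                  # the lexer is rebuilt, never serialized itself
            return inst

        def init_lexer(self):
            raise NotImplementedError()

    class TraditionalFrontendSketch(FrontendSketch):
        def init_lexer(self):
            self.lexer = ('traditional lexer built from', self.lexer_conf)

The payoff is that only LexerConf has to survive serialization; both the normal and the deserialized construction paths end in the same init_lexer() hook.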


+2 -1    lark/parsers/lalr_parser.py

@@ -25,7 +25,8 @@ class LALR_Parser(object):
     @classmethod
     def deserialize(cls, data, memo, callbacks):
         inst = cls.__new__(cls)
-        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
+        inst._parse_table = IntParseTable.deserialize(data, memo)
+        inst.parser = _Parser(inst._parse_table, callbacks)
         return inst
 
     def serialize(self, memo):
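
Keeping the deserialized table on the instance is what lets the contextual lexer above rebuild its per-state token sets after loading; roughly, paraphrasing LALR_ContextualLexer.init_lexer from this commit rather than introducing any new API:

    def parse_table_token_sets(parser):
        # Token types acceptable in each LALR state, as the contextual lexer needs them.
        return {idx: list(transitions.keys())
                for idx, transitions in parser._parse_table.states.items()}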


+1 -0    lark/tools/standalone.py

@@ -65,6 +65,7 @@ EXTRACT_STANDALONE_FILES = [
     'indenter.py',
     'grammar.py',
     'lexer.py',
+    'common.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
     'parsers/lalr_analysis.py',


+13 -9    lark/utils.py

@@ -103,7 +103,10 @@ class Serialize(object):
 
         inst = cls.__new__(cls)
         for f in fields:
-            setattr(inst, f, _deserialize(data[f], namespace, memo))
+            try:
+                setattr(inst, f, _deserialize(data[f], namespace, memo))
+            except KeyError as e:
+                raise KeyError("Cannot find key for class", cls, e)
         postprocess = getattr(inst, '_deserialize', None)
         if postprocess:
             postprocess()
@@ -164,6 +167,15 @@ def smart_decorator(f, create_decorator):
 
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))
 
+import sre_parse
+import sre_constants
+def get_regexp_width(regexp):
+    try:
+        return sre_parse.parse(regexp).getwidth()
+    except sre_constants.error:
+        raise ValueError(regexp)
+
 ###}
 
@@ -209,14 +221,6 @@ except NameError:
         return -1
 
 
-
-import sre_parse
-import sre_constants
-def get_regexp_width(regexp):
-    try:
-        return sre_parse.parse(regexp).getwidth()
-    except sre_constants.error:
-        raise ValueError(regexp)
 
 
 class Enumerator(Serialize):
     def __init__(self):
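
get_regexp_width moves inside the ###{standalone block, presumably because the generated standalone module now rebuilds its lexer from LexerConf at load time and needs the terminals' width bounds to do so. For reference, sre_parse.parse(...).getwidth() returns a (min, max) length pair; a small usage sketch (sre_parse is a CPython-internal module, so exact output may vary by version):

    import sre_parse

    # (min, max) length bounds of the strings a pattern can match:
    print(sre_parse.parse(r'ab?').getwidth())   # (1, 2)
    print(sre_parse.parse(r'a+').getwidth())    # (1, MAXREPEAT)

    # lark's wrapper re-raises sre_constants.error as ValueError(regexp).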


+36 -0    tests/test_tools.py

@@ -13,6 +13,9 @@ try:
 except ImportError:
     from io import StringIO
 
+
+
+
 class TestStandalone(TestCase):
     def setUp(self):
         pass
@@ -74,6 +77,39 @@ class TestStandalone(TestCase):
         x = l2.parse('ABAB')
         self.assertEqual(x, ['a', 'b'])
 
+    def test_postlex(self):
+        from lark.indenter import Indenter
+        class MyIndenter(Indenter):
+            NL_type = '_NEWLINE'
+            OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
+            CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
+            INDENT_type = '_INDENT'
+            DEDENT_type = '_DEDENT'
+            tab_len = 8
+
+        grammar = r"""
+        start: "(" ")" _NEWLINE
+        _NEWLINE: /\n/
+        """
+
+        # from lark import Lark
+        # l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter())
+        # x = l.parse('(\n)\n')
+        # print('@@', x)
+
+        context = self._create_standalone(grammar)
+        _Lark = context['Lark_StandAlone']
+
+        # l = _Lark(postlex=MyIndenter())
+        # x = l.parse('()\n')
+        # print(x)
+        l = _Lark(postlex=MyIndenter())
+        x = l.parse('(\n)\n')
+        print(x)
+
 
 
 if __name__ == '__main__':
     unittest.main()
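
The new test exercises a post-lexer through the standalone round-trip. For readers who have not used the tool, the workflow it mirrors is roughly the following (command line and module names are illustrative; the test drives the generator through its _create_standalone helper instead):

    # 1. Generate a self-contained parser module from a grammar file
    #    (hypothetical file names):
    #
    #       python -m lark.tools.standalone my_grammar.lark > my_parser.py
    #
    # 2. Import the generated module and attach the post-lexer at
    #    construction time, as the test does:
    #
    #       from my_parser import Lark_StandAlone
    #       parser = Lark_StandAlone(postlex=MyIndenter())
    #       tree = parser.parse("(\n)\n")
    #
    # Judging from the diffs above, the payload embedded in the generated
    # module now serializes the LexerConf rather than a lexer object, and
    # the lexer is rebuilt via init_lexer() when the module is loaded.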

