
Using a mostly-generic serialization method

Erez Shinan, 6 years ago, commit d13ebb9c15
7 changed files with 145 additions and 163 deletions:

  1. lark/grammar.py                (+17, -35)
  2. lark/lark.py                   (+6, -11)
  3. lark/lexer.py                  (+20, -63)
  4. lark/parser_frontends.py       (+16, -22)
  5. lark/parsers/lalr_analysis.py  (+31, -1)
  6. lark/parsers/lalr_parser.py    (+7, -31)
  7. lark/utils.py                  (+48, -0)

lark/grammar.py (+17, -35)

@@ -1,4 +1,6 @@
class Symbol(object):
from .utils import Serialize

class Symbol(Serialize):
is_term = NotImplemented

def __init__(self, name):
@@ -19,16 +21,10 @@ class Symbol(object):

fullrepr = property(__repr__)

@classmethod
def deserialize(cls, data):
class_ = {
'T': Terminal,
'NT': NonTerminal,
}[data[0]]
return class_(*data[1:])


class Terminal(Symbol):
__serialize_fields__ = 'name', 'filter_out'

is_term = True

def __init__(self, name, filter_out=False):
@@ -39,23 +35,25 @@ class Terminal(Symbol):
def fullrepr(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)

def serialize(self):
return ['T', self.name, self.filter_out]


class NonTerminal(Symbol):
__serialize_fields__ = 'name',

is_term = False

def serialize(self):
return ['NT', self.name]

class Rule(object):
class Rule(Serialize):
"""
origin : a symbol
expansion : a list of symbols
order : index of this expansion amongst all rules of the same name
"""
__slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')

__serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
__serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions)

def __init__(self, origin, expansion, order=0, alias=None, options=None):
self.origin = origin
self.expansion = expansion
@@ -64,6 +62,8 @@ class Rule(object):
self.options = options
self._hash = hash((self.origin, tuple(self.expansion)))

def _deserialize(self):
self._hash = hash((self.origin, tuple(self.expansion)))

def __str__(self):
return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
@@ -79,22 +79,11 @@ class Rule(object):
return False
return self.origin == other.origin and self.expansion == other.expansion

def serialize(self):
return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.order, self.alias, self.options.serialize() if self.options else None]

@classmethod
def deserialize(cls, data):
origin, expansion, order, alias, options = data
return cls(
Symbol.deserialize(origin),
[Symbol.deserialize(s) for s in expansion],
order,
alias,
RuleOptions.deserialize(options) if options else None
)


class RuleOptions:
class RuleOptions(Serialize):
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'

def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
@@ -107,10 +96,3 @@ class RuleOptions:
self.expand1,
self.priority,
)

def serialize(self):
return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)]
@classmethod
def deserialize(cls, data):
return cls(*data)
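
A hedged round-trip sketch of what the declarative fields above buy, assuming a lark checkout at this commit (the Serialize base class is added to lark/utils.py further down in this diff); note that Rule._deserialize() restores the cached hash that __serialize_fields__ deliberately leaves out:

from lark.grammar import Rule, RuleOptions, Terminal, NonTerminal

rule = Rule(NonTerminal('start'), [Terminal('NAME'), NonTerminal('item')],
            order=0, alias=None, options=RuleOptions(keep_all_tokens=True))

data = rule.serialize()          # nested objects become plain dicts tagged with '__type__'
clone = Rule.deserialize(data)   # tags are resolved through the classes in __serialize_namespace__
assert clone == rule             # __eq__ still compares origin and expansion, as before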

lark/lark.py (+6, -11)

@@ -5,7 +5,7 @@ import time
from collections import defaultdict
from io import open

from .utils import STRING_TYPE
from .utils import STRING_TYPE, Serialize
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -13,9 +13,10 @@ from .common import LexerConf, ParserConf
from .lexer import Lexer, TraditionalLexer
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule


class LarkOptions(object):
class LarkOptions(Serialize):
"""Specifies the options for Lark

"""
@@ -132,7 +133,7 @@ class Profiler:
return wrapper


class Lark:
class Lark(Serialize):
def __init__(self, grammar, **options):
"""
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
@@ -223,6 +224,8 @@ class Lark:
if __init__.__doc__:
__init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

__serialize_fields__ = 'parser', 'rules', 'options'

def _build_lexer(self):
return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

@@ -236,16 +239,8 @@ class Lark:
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

def serialize(self):
return {
'parser': self.parser.serialize(),
'rules': [r.serialize() for r in self.rules],
'options': self.options.serialize(),
}
@classmethod
def deserialize(cls, data):
from .grammar import Rule
inst = cls.__new__(cls)
inst.options = LarkOptions.deserialize(data['options'])
inst.rules = [Rule.deserialize(r) for r in data['rules']]


lark/lexer.py (+20, -63)

@@ -2,10 +2,12 @@

import re

from .utils import Str, classify, get_regexp_width, Py36
from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError

class Pattern(object):
class Pattern(Serialize):
__serialize_fields__ = 'value', 'flags'

def __init__(self, value, flags=()):
self.value = value
self.flags = frozenset(flags)
@@ -35,15 +37,6 @@ class Pattern(object):
value = ('(?%s)' % f) + value
return value

@classmethod
def deserialize(cls, data):
class_ = {
's': PatternStr,
're': PatternRE,
}[data[0]]
value, flags = data[1:]
return class_(value, frozenset(flags))


class PatternStr(Pattern):
def to_regexp(self):
@@ -54,9 +47,6 @@ class PatternStr(Pattern):
return len(self.value)
max_width = min_width

def serialize(self):
return ['s', self.value, list(self.flags)]

class PatternRE(Pattern):
def to_regexp(self):
return self._get_flags(self.value)
@@ -68,10 +58,11 @@ class PatternRE(Pattern):
def max_width(self):
return get_regexp_width(self.to_regexp())[1]

def serialize(self):
return ['re', self.value, list(self.flags)]

class TerminalDef(object):
class TerminalDef(Serialize):
__serialize_fields__ = 'name', 'pattern', 'priority'
__serialize_namespace__ = lambda: (PatternStr, PatternRE)

def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
self.name = name
@@ -81,14 +72,6 @@ class TerminalDef(object):
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

def serialize(self):
return [self.name, self.pattern.serialize(), self.priority]

@classmethod
def deserialize(cls, data):
name, pattern, priority = data
return cls(name, Pattern.deserialize(pattern), priority)



###{standalone
@@ -278,7 +261,7 @@ def _regexp_has_newline(r):
"""
return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)

class Lexer:
class Lexer(Serialize):
"""Lexer interface

Method Signatures:
@@ -289,15 +272,16 @@ class Lexer:
set_parser_state = NotImplemented
lex = NotImplemented

@classmethod
def deserialize(cls, data):
class_ = {
'traditional': TraditionalLexer,
'contextual': ContextualLexer,
}[data['type']]
return class_.deserialize(data)

class TraditionalLexer(Lexer):
__serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
__serialize_namespace__ = lambda: (TerminalDef,)

def _deserialize(self):
self.mres = build_mres(self.terminals)
self.callback = {} # TODO implement


def __init__(self, terminals, ignore=(), user_callbacks={}):
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

@@ -339,26 +323,13 @@ class TraditionalLexer(Lexer):
def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

def serialize(self):
return {
'type': 'traditional',
'terminals': [t.serialize() for t in self.terminals],
'ignore_types': self.ignore_types,
'newline_types': self.newline_types,
}

@classmethod
def deserialize(cls, data):
inst = cls.__new__(cls)
inst.terminals = [TerminalDef.deserialize(t) for t in data['terminals']]
inst.mres = build_mres(inst.terminals)
inst.ignore_types = data['ignore_types']
inst.newline_types = data['newline_types']
inst.callback = {} # TODO implement
return inst


class ContextualLexer(Lexer):
__serialize_fields__ = 'root_lexer', 'lexers'
__serialize_namespace__ = lambda: (TraditionalLexer,)

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {}
for t in terminals:
@@ -392,17 +363,3 @@ class ContextualLexer(Lexer):
yield x
l.lexer = self.lexers[self.parser_state]
l.state = self.parser_state

def serialize(self):
return {
'type': 'contextual',
'root_lexer': self.root_lexer.serialize(),
'lexers': {state: lexer.serialize() for state, lexer in self.lexers.items()}
}

@classmethod
def deserialize(cls, data):
inst = cls.__new__(cls)
inst.lexers = {state:Lexer.deserialize(lexer) for state, lexer in data['lexers'].items()}
inst.root_lexer = TraditionalLexer.deserialize(data['root_lexer'])
return inst
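
TraditionalLexer now stores only its declared fields and rebuilds derived state (mres, callback) in the _deserialize() hook. A self-contained sketch of that pattern, assuming only the Serialize base class from this commit's lark/utils.py; CompiledPatterns is a hypothetical class, not part of lark:

import re
from lark.utils import Serialize

class CompiledPatterns(Serialize):
    __serialize_fields__ = 'patterns',

    def __init__(self, patterns):
        self.patterns = list(patterns)
        self._compiled = [re.compile(p) for p in patterns]  # derived state, never serialized

    def _deserialize(self):
        # called by Serialize.deserialize() after the declared fields are restored
        self._compiled = [re.compile(p) for p in self.patterns]

original = CompiledPatterns([r'\d+', r'\w+'])
clone = CompiledPatterns.deserialize(original.serialize())
assert [m.pattern for m in clone._compiled] == original.patterns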

lark/parser_frontends.py (+16, -22)

@@ -1,17 +1,31 @@
import re
from functools import partial

from .utils import get_regexp_width
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import lalr_parser, earley, xearley, cyk
from .grammar import Rule
from .tree import Tree

class WithLexer(object):
class WithLexer(Serialize):
lexer = None
parser = None
lexer_conf = None

__serialize_fields__ = 'parser', 'lexer'
__serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer)

@classmethod
def deserialize(cls, data, callbacks):
inst = super(WithLexer, cls).deserialize(data)
inst.postlex = None # TODO
inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks)
return inst
def _serialize(self, data):
data['parser'] = data['parser'].serialize()

def init_traditional_lexer(self, lexer_conf):
self.lexer_conf = lexer_conf
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
@@ -36,26 +50,6 @@ class WithLexer(object):
sps = self.lexer.set_parser_state
return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

def serialize(self):
return {
'type': type(self).__name__,
'parser': self.parser.serialize(),
'lexer': self.lexer.serialize(),
}
@classmethod
def deserialize(cls, data, callbacks):
class_ = {
'LALR_TraditionalLexer': LALR_TraditionalLexer,
'LALR_ContextualLexer': LALR_ContextualLexer,
}[data['type']]
parser = lalr_parser.Parser.deserialize(data['parser'], callbacks)
assert parser
inst = class_.__new__(class_)
inst.parser = parser
inst.lexer = Lexer.deserialize(data['lexer'])
inst.postlex = None # TODO
return inst


class LALR_TraditionalLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
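
WithLexer uses the other hook: _serialize(data) post-processes the generic field dict, turning the non-Serialize lalr_parser.Parser into its serialized form, while deserialize() is overridden to thread the extra callbacks argument through. A self-contained sketch of the _serialize hook, again assuming this commit's lark.utils.Serialize; Holder and Opaque are hypothetical:

from lark.utils import Serialize

class Opaque:                        # not a Serialize subclass, like lalr_parser.Parser
    def __init__(self, n):
        self.n = n
    def serialize(self):
        return {'n': self.n}

class Holder(Serialize):
    __serialize_fields__ = 'inner',

    def __init__(self, inner):
        self.inner = inner

    def _serialize(self, data):
        # mirrors WithLexer._serialize: fix up entries the generic walk left untouched
        data['inner'] = data['inner'].serialize()

print(Holder(Opaque(42)).serialize())   # {'inner': {'n': 42}, '__type__': 'Holder'}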


lark/parsers/lalr_analysis.py (+31, -1)

@@ -9,7 +9,7 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
import logging
from collections import defaultdict

from ..utils import classify, classify_bool, bfs, fzset
from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal
@@ -31,6 +31,36 @@ class ParseTable:
self.start_state = start_state
self.end_state = end_state

def serialize(self):
tokens = Enumerator()
rules = Enumerator()

states = {
state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}
for state, actions in self.states.items()
}

return {
'tokens': tokens.reversed(),
'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
'states': states,
'start_state': self.start_state,
'end_state': self.end_state,
}

@classmethod
def deserialize(cls, data):
tokens = data['tokens']
rules = data['rules']
states = {
state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
for token, (action, arg) in actions.items()}
for state, actions in data['states'].items()
}
return cls(states, data['start_state'], data['end_state'])


class IntParseTable(ParseTable):

@classmethod
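
ParseTable.serialize() interns tokens and rules through Enumerator so the states table holds small integers, and packs each action as (0, shift_state) or (1, rule_id); deserialize() reverses both steps. A self-contained sketch of the encoding, using lark.utils.Enumerator (present before this commit) and local stand-ins for the Shift/Reduce markers; it assumes Enumerator hands out sequential integer ids, as its use above implies:

from lark.utils import Enumerator

Shift, Reduce = object(), object()   # stand-ins for lark's action markers

actions_by_state = {
    0: {'NAME': (Shift, 1)},
    1: {'$END': (Reduce, 'start -> NAME')},   # the real table stores Rule objects here
}

tokens, rules = Enumerator(), Enumerator()
encoded = {
    state: {tokens.get(tok): ((1, rules.get(arg)) if action is Reduce else (0, arg))
            for tok, (action, arg) in actions.items()}
    for state, actions in actions_by_state.items()
}

print(encoded)            # {0: {0: (0, 1)}, 1: {1: (1, 0)}}
print(tokens.reversed())  # {0: 'NAME', 1: '$END'} -- shipped alongside, so ids map back to tokens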


lark/parsers/lalr_parser.py (+7, -31)

@@ -5,12 +5,12 @@
from ..exceptions import UnexpectedToken
from ..lexer import Token
from ..grammar import Rule
from ..utils import Enumerator
from ..utils import Enumerator, Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable


class Parser(object):
class Parser:
def __init__(self, parser_conf, debug=False):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -21,42 +21,18 @@ class Parser(object):
self._parse_table = analysis.parse_table
self.parser_conf = parser_conf
self.parser = _Parser(analysis.parse_table, callbacks)
self.parse = self.parser.parse

def serialize(self):
tokens = Enumerator()
rules = Enumerator()

states = {
state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}
for state, actions in self._parse_table.states.items()
}

return {
'tokens': tokens.reversed(),
'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
'states': states,
'start_state': self._parse_table.start_state,
'end_state': self._parse_table.end_state,
}
@classmethod
def deserialize(cls, data, callbacks):
tokens = data['tokens']
rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()}
states = {
state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
for token, (action, arg) in actions.items()}
for state, actions in data['states'].items()
}
parse_table = IntParseTable(states, data['start_state'], data['end_state'])
inst = cls.__new__(cls)
inst.parser = _Parser(parse_table, callbacks)
inst.parse = inst.parser.parse
inst.parser = _Parser(IntParseTable.deserialize(data), callbacks)
return inst

def serialize(self):
return self._parse_table.serialize()

def parse(self, *args):
return self.parser.parse(*args)

###{standalone



lark/utils.py (+48, -0)

@@ -44,6 +44,54 @@ def bfs(initial, expand):



def _serialize(value):
if isinstance(value, Serialize):
return value.serialize()
elif isinstance(value, list):
return [_serialize(elem) for elem in value]
elif isinstance(value, frozenset):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem) for key, elem in value.items()}
return value

def _deserialize(data, namespace):
if isinstance(data, dict):
if '__type__' in data: # Object
class_ = namespace[data['__type__']]
return class_.deserialize(data)
return {key:_deserialize(value, namespace) for key, value in data.items()}
elif isinstance(data, list):
return [_deserialize(value, namespace) for value in data]
return data


class Serialize(object):
def serialize(self):
fields = getattr(self, '__serialize_fields__')
res = {f: _serialize(getattr(self, f)) for f in fields}
res['__type__'] = type(self).__name__
postprocess = getattr(self, '_serialize', None)
if postprocess:
postprocess(res)
return res

@classmethod
def deserialize(cls, data):
namespace = getattr(cls, '__serialize_namespace__', dict)
namespace = {c.__name__:c for c in namespace()}

fields = getattr(cls, '__serialize_fields__')

inst = cls.__new__(cls)
for f in fields:
setattr(inst, f, _deserialize(data[f], namespace))
postprocess = getattr(inst, '_deserialize', None)
if postprocess:
postprocess()
return inst



###{standalone
try:
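
The whole mechanism in one self-contained round trip, assuming this commit's lark.utils is importable; Leaf and Node are hypothetical classes, not part of lark. serialize() walks __serialize_fields__ and tags every nested Serialize object with '__type__'; deserialize() resolves those tags through the classes returned by __serialize_namespace__ and then calls the optional _deserialize() hook:

from lark.utils import Serialize

class Leaf(Serialize):
    __serialize_fields__ = 'name',

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return 'Leaf(%r)' % self.name

class Node(Serialize):
    __serialize_fields__ = 'children',
    __serialize_namespace__ = lambda: (Leaf,)   # classes that may appear inside serialized fields

    def __init__(self, children):
        self.children = children

node = Node([Leaf('a'), Leaf('b')])
data = node.serialize()
# {'children': [{'name': 'a', '__type__': 'Leaf'}, {'name': 'b', '__type__': 'Leaf'}], '__type__': 'Node'}
clone = Node.deserialize(data)   # '__type__' entries dispatch through __serialize_namespace__
print(clone.children)            # [Leaf('a'), Leaf('b')]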

