Using a mostly-generic serialization method

Erez Shinan, 6 years ago · parent commit d13ebb9c15
7 changed files with 145 additions and 163 deletions:

  1. lark/grammar.py                +17  -35
  2. lark/lark.py                    +6  -11
  3. lark/lexer.py                  +20  -63
  4. lark/parser_frontends.py       +16  -22
  5. lark/parsers/lalr_analysis.py  +31   -1
  6. lark/parsers/lalr_parser.py     +7  -31
  7. lark/utils.py                  +48   -0

lark/grammar.py  (+17 -35)

@@ -1,4 +1,6 @@
-class Symbol(object):
+from .utils import Serialize
+
+class Symbol(Serialize):
     is_term = NotImplemented

     def __init__(self, name):
@@ -19,16 +21,10 @@ class Symbol(object):

     fullrepr = property(__repr__)

-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            'T': Terminal,
-            'NT': NonTerminal,
-        }[data[0]]
-        return class_(*data[1:])
-

 class Terminal(Symbol):
+    __serialize_fields__ = 'name', 'filter_out'
+
     is_term = True

     def __init__(self, name, filter_out=False):
@@ -39,23 +35,25 @@ class Terminal(Symbol):
     def fullrepr(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)

-    def serialize(self):
-        return ['T', self.name, self.filter_out]
-

 class NonTerminal(Symbol):
+    __serialize_fields__ = 'name',
+
     is_term = False

-    def serialize(self):
-        return ['NT', self.name]
-

-class Rule(object):
+class Rule(Serialize):
     """
         origin : a symbol
         expansion : a list of symbols
         order : index of this expansion amongst all rules of the same name
     """
     __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')

+    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
+    __serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions)
+
     def __init__(self, origin, expansion, order=0, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
@@ -64,6 +62,8 @@ class Rule(object):
         self.options = options
         self._hash = hash((self.origin, tuple(self.expansion)))

+    def _deserialize(self):
+        self._hash = hash((self.origin, tuple(self.expansion)))
+
     def __str__(self):
         return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
@@ -79,22 +79,11 @@ class Rule(object):
             return False
         return self.origin == other.origin and self.expansion == other.expansion

-    def serialize(self):
-        return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.order, self.alias, self.options.serialize() if self.options else None]
-
-    @classmethod
-    def deserialize(cls, data):
-        origin, expansion, order, alias, options = data
-        return cls(
-            Symbol.deserialize(origin),
-            [Symbol.deserialize(s) for s in expansion],
-            order,
-            alias,
-            RuleOptions.deserialize(options) if options else None
-        )
-

-class RuleOptions:
+class RuleOptions(Serialize):
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+
     def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
@@ -107,10 +96,3 @@ class RuleOptions:
             self.expand1,
             self.priority,
         )
-
-    def serialize(self):
-        return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)]
-    @classmethod
-    def deserialize(cls, data):
-        return cls(*data)
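With the mixin in place, serialization becomes declarative: each class lists its fields, nested symbols are tagged with '__type__', and deserialize resolves those tags through __serialize_namespace__. A quick round-trip sketch against this commit's tree:

    from lark.grammar import Rule, Terminal, NonTerminal

    r = Rule(NonTerminal('start'), [Terminal('NUMBER')], order=0)
    data = r.serialize()
    # data['origin']       == {'name': 'start', '__type__': 'NonTerminal'}
    # data['expansion'][0] == {'name': 'NUMBER', 'filter_out': False, '__type__': 'Terminal'}

    r2 = Rule.deserialize(data)   # the _deserialize hook recomputes _hash
    assert r2 == r                # Rule.__eq__ compares origin and expansion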

lark/lark.py  (+6 -11)

@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from io import open

-from .utils import STRING_TYPE
+from .utils import STRING_TYPE, Serialize
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -13,9 +13,10 @@ from .common import LexerConf, ParserConf
 from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
+from .grammar import Rule


-class LarkOptions(object):
+class LarkOptions(Serialize):
     """Specifies the options for Lark

     """
@@ -132,7 +133,7 @@ class Profiler:
         return wrapper


-class Lark:
+class Lark(Serialize):
     def __init__(self, grammar, **options):
         """
             grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
@@ -223,6 +224,8 @@ class Lark:
     if __init__.__doc__:
         __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

+    __serialize_fields__ = 'parser', 'rules', 'options'
+
     def _build_lexer(self):
         return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

@@ -236,16 +239,8 @@ class Lark:
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
         return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

-    def serialize(self):
-        return {
-            'parser': self.parser.serialize(),
-            'rules': [r.serialize() for r in self.rules],
-            'options': self.options.serialize(),
-        }
     @classmethod
     def deserialize(cls, data):
-        from .grammar import Rule
         inst = cls.__new__(cls)
         inst.options = LarkOptions.deserialize(data['options'])
         inst.rules = [Rule.deserialize(r) for r in data['rules']]
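Lark.serialize is now inherited from the mixin, so the top-level payload shape follows directly from the declared fields. Roughly (a sketch; lark_instance stands for any constructed Lark parser, and each nested object contributes its own tagged dict):

    payload = lark_instance.serialize()
    # {
    #     '__type__': 'Lark',
    #     'options':  {...},          # LarkOptions fields
    #     'rules':    [{...}, ...],   # one tagged dict per Rule
    #     'parser':   {...},          # WithLexer payload (see parser_frontends.py below)
    # }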


lark/lexer.py  (+20 -63)

@@ -2,10 +2,12 @@

 import re

-from .utils import Str, classify, get_regexp_width, Py36
+from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError

-class Pattern(object):
+class Pattern(Serialize):
+    __serialize_fields__ = 'value', 'flags'
+
     def __init__(self, value, flags=()):
         self.value = value
         self.flags = frozenset(flags)
@@ -35,15 +37,6 @@ class Pattern(object):
             value = ('(?%s)' % f) + value
         return value

-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            's': PatternStr,
-            're': PatternRE,
-        }[data[0]]
-        value, flags = data[1:]
-        return class_(value, frozenset(flags))
-

 class PatternStr(Pattern):
     def to_regexp(self):
@@ -54,9 +47,6 @@ class PatternStr(Pattern):
         return len(self.value)
     max_width = min_width

-    def serialize(self):
-        return ['s', self.value, list(self.flags)]
-
 class PatternRE(Pattern):
     def to_regexp(self):
         return self._get_flags(self.value)
@@ -68,10 +58,11 @@ class PatternRE(Pattern):
     def max_width(self):
         return get_regexp_width(self.to_regexp())[1]

-    def serialize(self):
-        return ['re', self.value, list(self.flags)]
-

-class TerminalDef(object):
+class TerminalDef(Serialize):
+    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_namespace__ = lambda: (PatternStr, PatternRE)
+
     def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
@@ -81,14 +72,6 @@ class TerminalDef(object):
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

-    def serialize(self):
-        return [self.name, self.pattern.serialize(), self.priority]
-
-    @classmethod
-    def deserialize(cls, data):
-        name, pattern, priority = data
-        return cls(name, Pattern.deserialize(pattern), priority)
-

 ###{standalone
@@ -278,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)

-class Lexer:
+class Lexer(Serialize):
     """Lexer interface

     Method Signatures:
@@ -289,15 +272,16 @@ class Lexer:
     set_parser_state = NotImplemented
     lex = NotImplemented

-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            'traditional': TraditionalLexer,
-            'contextual': ContextualLexer,
-        }[data['type']]
-        return class_.deserialize(data)
-

 class TraditionalLexer(Lexer):
+    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
+    __serialize_namespace__ = lambda: (TerminalDef,)
+
+    def _deserialize(self):
+        self.mres = build_mres(self.terminals)
+        self.callback = {}  # TODO implement
+
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

@@ -339,26 +323,13 @@ class TraditionalLexer(Lexer):
     def lex(self, stream):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

-    def serialize(self):
-        return {
-            'type': 'traditional',
-            'terminals': [t.serialize() for t in self.terminals],
-            'ignore_types': self.ignore_types,
-            'newline_types': self.newline_types,
-        }
-
-    @classmethod
-    def deserialize(cls, data):
-        inst = cls.__new__(cls)
-        inst.terminals = [TerminalDef.deserialize(t) for t in data['terminals']]
-        inst.mres = build_mres(inst.terminals)
-        inst.ignore_types = data['ignore_types']
-        inst.newline_types = data['newline_types']
-        inst.callback = {}  # TODO implement
-        return inst
-

 class ContextualLexer(Lexer):
+    __serialize_fields__ = 'root_lexer', 'lexers'
+    __serialize_namespace__ = lambda: (TraditionalLexer,)
+
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:
@@ -392,17 +363,3 @@ class ContextualLexer(Lexer):
                 yield x
             l.lexer = self.lexers[self.parser_state]
             l.state = self.parser_state
-
-    def serialize(self):
-        return {
-            'type': 'contextual',
-            'root_lexer': self.root_lexer.serialize(),
-            'lexers': {state: lexer.serialize() for state, lexer in self.lexers.items()}
-        }
-
-    @classmethod
-    def deserialize(cls, data):
-        inst = cls.__new__(cls)
-        inst.lexers = {state:Lexer.deserialize(lexer) for state, lexer in data['lexers'].items()}
-        inst.root_lexer = TraditionalLexer.deserialize(data['root_lexer'])
-        return inst

lark/parser_frontends.py  (+16 -22)

@@ -1,17 +1,31 @@
 import re
 from functools import partial

-from .utils import get_regexp_width
+from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 from .parsers import lalr_parser, earley, xearley, cyk
+from .grammar import Rule
 from .tree import Tree

-class WithLexer(object):
+class WithLexer(Serialize):
     lexer = None
     parser = None
     lexer_conf = None

+    __serialize_fields__ = 'parser', 'lexer'
+    __serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer)
+
+    @classmethod
+    def deserialize(cls, data, callbacks):
+        inst = super(WithLexer, cls).deserialize(data)
+        inst.postlex = None  # TODO
+        inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks)
+        return inst
+
+    def _serialize(self, data):
+        data['parser'] = data['parser'].serialize()
+
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
         self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
@@ -36,26 +50,6 @@ class WithLexer(object):
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

-    def serialize(self):
-        return {
-            'type': type(self).__name__,
-            'parser': self.parser.serialize(),
-            'lexer': self.lexer.serialize(),
-        }
-    @classmethod
-    def deserialize(cls, data, callbacks):
-        class_ = {
-            'LALR_TraditionalLexer': LALR_TraditionalLexer,
-            'LALR_ContextualLexer': LALR_ContextualLexer,
-        }[data['type']]
-        parser = lalr_parser.Parser.deserialize(data['parser'], callbacks)
-        assert parser
-        inst = class_.__new__(class_)
-        inst.parser = parser
-        inst.lexer = Lexer.deserialize(data['lexer'])
-        inst.postlex = None  # TODO
-        return inst
-

 class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):


lark/parsers/lalr_analysis.py  (+31 -1)

@@ -9,7 +9,7 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
 import logging
 from collections import defaultdict

-from ..utils import classify, classify_bool, bfs, fzset
+from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal
@@ -31,6 +31,36 @@ class ParseTable:
         self.start_state = start_state
         self.end_state = end_state

+    def serialize(self):
+        tokens = Enumerator()
+        rules = Enumerator()
+
+        states = {
+            state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in self.states.items()
+        }
+
+        return {
+            'tokens': tokens.reversed(),
+            'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
+            'states': states,
+            'start_state': self.start_state,
+            'end_state': self.end_state,
+        }
+
+    @classmethod
+    def deserialize(cls, data):
+        tokens = data['tokens']
+        rules = data['rules']
+        states = {
+            state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in data['states'].items()
+        }
+        return cls(states, data['start_state'], data['end_state'])
+

 class IntParseTable(ParseTable):

     @classmethod
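Actions are packed as (0, next_state) for Shift and (1, rule_index) for Reduce, and tokens and rules are interned through Enumerator so each distinct object is written once and referenced by index. Enumerator lives in lark/utils.py and is not part of this diff; a minimal stand-in consistent with how it is used here:

    class Enumerator:
        def __init__(self):
            self.enums = {}                  # object -> index

        def get(self, item):
            # First sighting assigns the next free index; later calls reuse it.
            if item not in self.enums:
                self.enums[item] = len(self.enums)
            return self.enums[item]

        def reversed(self):
            # Invert to index -> object for the serialized payload.
            return {idx: item for item, idx in self.enums.items()}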


lark/parsers/lalr_parser.py  (+7 -31)

@@ -5,12 +5,12 @@
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..grammar import Rule
-from ..utils import Enumerator
+from ..utils import Enumerator, Serialize

 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable


-class Parser(object):
+class Parser:
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -21,42 +21,18 @@ class Parser(object):
         self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
-        self.parse = self.parser.parse

-    def serialize(self):
-        tokens = Enumerator()
-        rules = Enumerator()
-
-        states = {
-            state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
-                    for token, (action, arg) in actions.items()}
-            for state, actions in self._parse_table.states.items()
-        }
-
-        return {
-            'tokens': tokens.reversed(),
-            'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
-            'states': states,
-            'start_state': self._parse_table.start_state,
-            'end_state': self._parse_table.end_state,
-        }
     @classmethod
     def deserialize(cls, data, callbacks):
-        tokens = data['tokens']
-        rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()}
-        states = {
-            state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
-                    for token, (action, arg) in actions.items()}
-            for state, actions in data['states'].items()
-        }
-        parse_table = IntParseTable(states, data['start_state'], data['end_state'])
         inst = cls.__new__(cls)
-        inst.parser = _Parser(parse_table, callbacks)
-        inst.parse = inst.parser.parse
+        inst.parser = _Parser(IntParseTable.deserialize(data), callbacks)
         return inst

+    def serialize(self):
+        return self._parse_table.serialize()
+
+    def parse(self, *args):
+        return self.parser.parse(*args)
+

 ###{standalone




lark/utils.py  (+48 -0)

@@ -44,6 +44,54 @@ def bfs(initial, expand):



+def _serialize(value):
+    if isinstance(value, Serialize):
+        return value.serialize()
+    elif isinstance(value, list):
+        return [_serialize(elem) for elem in value]
+    elif isinstance(value, frozenset):
+        return list(value)  # TODO reversible?
+    elif isinstance(value, dict):
+        return {key:_serialize(elem) for key, elem in value.items()}
+    return value
+
+def _deserialize(data, namespace):
+    if isinstance(data, dict):
+        if '__type__' in data:  # Object
+            class_ = namespace[data['__type__']]
+            return class_.deserialize(data)
+        return {key:_deserialize(value, namespace) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [_deserialize(value, namespace) for value in data]
+    return data
+
+
+class Serialize(object):
+    def serialize(self):
+        fields = getattr(self, '__serialize_fields__')
+        res = {f: _serialize(getattr(self, f)) for f in fields}
+        res['__type__'] = type(self).__name__
+        postprocess = getattr(self, '_serialize', None)
+        if postprocess:
+            postprocess(res)
+        return res
+
+    @classmethod
+    def deserialize(cls, data):
+        namespace = getattr(cls, '__serialize_namespace__', dict)
+        namespace = {c.__name__:c for c in namespace()}
+
+        fields = getattr(cls, '__serialize_fields__')
+
+        inst = cls.__new__(cls)
+        for f in fields:
+            setattr(inst, f, _deserialize(data[f], namespace))
+        postprocess = getattr(inst, '_deserialize', None)
+        if postprocess:
+            postprocess()
+        return inst
+
+
 ###{standalone
 try:
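Everything above hangs off this contract: a class lists its fields in __serialize_fields__, inherits serialize/deserialize, and reaches for the optional _serialize/_deserialize hooks only when the generic walk is not enough (hence "mostly-generic"). A round-trip with a made-up class (hypothetical, for illustration only; runnable against this commit's tree):

    from lark.utils import Serialize

    class Point(Serialize):
        __serialize_fields__ = 'x', 'y'

        def __init__(self, x, y):
            self.x = x
            self.y = y

    data = Point(3, 4).serialize()
    # {'x': 3, 'y': 4, '__type__': 'Point'}

    p = Point.deserialize(data)   # built via __new__; fields restored by setattr
    assert (p.x, p.y) == (3, 4)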

