
Merge branch 'serialize'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.7.1
Erez Shinan 6 years ago
commit 5137e990ba
13 changed files with 1850 additions and 630 deletions
  1. +1    -1    examples/standalone/create_standalone.sh
  2. +1443 -384  examples/standalone/json_parser.py
  3. +38   -14   lark/grammar.py
  4. +71   -34   lark/lark.py
  5. +28   -7    lark/lexer.py
  6. +2    -2    lark/parse_tree_builder.py
  7. +64   -44   lark/parser_frontends.py
  8. +33   -3    lark/parsers/lalr_analysis.py
  9. +17   -4    lark/parsers/lalr_parser.py
 10. +23   -134  lark/tools/standalone.py
 11. +107  -3    lark/utils.py
 12. +19   -0    tests/test_parser.py
 13. +4    -0    tests/test_tools.py

+1 -1   examples/standalone/create_standalone.sh

@@ -1 +1 @@
-python -m lark.tools.standalone json.lark > json_parser.py
+PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py

+1443 -384   examples/standalone/json_parser.py

File diff suppressed because it is too large


+38 -14   lark/grammar.py

@@ -1,4 +1,8 @@
-class Symbol(object):
+from .utils import Serialize
+
+###{standalone
+
+class Symbol(Serialize):
     is_term = NotImplemented

     def __init__(self, name):
@@ -19,7 +23,10 @@ class Symbol(object):

     fullrepr = property(__repr__)


 class Terminal(Symbol):
+    __serialize_fields__ = 'name', 'filter_out'
+
     is_term = True

     def __init__(self, name, filter_out=False):
@@ -31,16 +38,42 @@ class Terminal(Symbol):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)


 class NonTerminal(Symbol):
+    __serialize_fields__ = 'name',
+
     is_term = False


-class Rule(object):
+class RuleOptions(Serialize):
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+
+    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
+        self.keep_all_tokens = keep_all_tokens
+        self.expand1 = expand1
+        self.priority = priority
+        self.empty_indices = empty_indices
+
+    def __repr__(self):
+        return 'RuleOptions(%r, %r, %r)' % (
+            self.keep_all_tokens,
+            self.expand1,
+            self.priority,
+        )
+
+
+class Rule(Serialize):
     """
         origin : a symbol
         expansion : a list of symbols
         order : index of this expansion amongst all rules of the same name
     """
     __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')

+    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
+    __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
+
     def __init__(self, origin, expansion, order=0, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
@@ -49,6 +82,8 @@ class Rule(object):
         self.options = options
         self._hash = hash((self.origin, tuple(self.expansion)))

+    def _deserialize(self):
+        self._hash = hash((self.origin, tuple(self.expansion)))
+
     def __str__(self):
         return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
@@ -65,16 +100,5 @@ class Rule(object):
         return self.origin == other.origin and self.expansion == other.expansion


-class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
-        self.keep_all_tokens = keep_all_tokens
-        self.expand1 = expand1
-        self.priority = priority
-        self.empty_indices = ()
-
-    def __repr__(self):
-        return 'RuleOptions(%r, %r, %r)' % (
-            self.keep_all_tokens,
-            self.expand1,
-            self.priority,
-        )
+###}
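
How the new machinery reads: a class that subclasses Serialize (added in lark/utils.py further down) lists its persistent attributes in __serialize_fields__ and any nested Serialize types in __serialize_namespace__; serialize() then emits a plain dict tagged with '__type__', and deserialize() rebuilds the instance without running __init__. A minimal round-trip sketch, assuming the lark package from this commit is importable:

    from lark.grammar import Rule, RuleOptions, Terminal, NonTerminal

    rule = Rule(NonTerminal('start'), [Terminal('WORD')], order=0,
                options=RuleOptions(keep_all_tokens=True))

    data = rule.serialize(None)         # plain dict; no memoizer in this sketch
    assert data['__type__'] == 'Rule'   # type tag used when deserializing

    # Nested Terminal/NonTerminal/RuleOptions dicts are resolved through
    # __serialize_namespace__, and _deserialize() restores Rule._hash.
    rule2 = Rule.deserialize(data, {})
    assert rule2.origin.name == 'start' and rule2.options.keep_all_tokens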

+71 -34   lark/lark.py

@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from io import open

-from .utils import STRING_TYPE
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -13,9 +13,11 @@ from .common import LexerConf, ParserConf
 from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
+from .grammar import Rule
+
+###{standalone

-class LarkOptions(object):
+class LarkOptions(Serialize):
     """Specifies the options for Lark

     """
@@ -51,24 +53,39 @@ class LarkOptions(object):
     if __doc__:
         __doc__ += OPTIONS_DOC

+    _defaults = {
+        'debug': False,
+        'keep_all_tokens': False,
+        'tree_class': None,
+        'cache_grammar': False,
+        'postlex': None,
+        'parser': 'earley',
+        'lexer': 'auto',
+        'transformer': None,
+        'start': 'start',
+        'profile': False,
+        'priority': 'auto',
+        'ambiguity': 'auto',
+        'propagate_positions': False,
+        'lexer_callbacks': {},
+        'maybe_placeholders': False,
+    }
+
     def __init__(self, options_dict):
         o = dict(options_dict)

-        self.debug = bool(o.pop('debug', False))
-        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
-        self.tree_class = o.pop('tree_class', Tree)
-        self.cache_grammar = o.pop('cache_grammar', False)
-        self.postlex = o.pop('postlex', None)
-        self.parser = o.pop('parser', 'earley')
-        self.lexer = o.pop('lexer', 'auto')
-        self.transformer = o.pop('transformer', None)
-        self.start = o.pop('start', 'start')
-        self.profile = o.pop('profile', False)
-        self.priority = o.pop('priority', 'auto')
-        self.ambiguity = o.pop('ambiguity', 'auto')
-        self.propagate_positions = o.pop('propagate_positions', False)
-        self.lexer_callbacks = o.pop('lexer_callbacks', {})
-        self.maybe_placeholders = o.pop('maybe_placeholders', False)
+        options = {}
+        for name, default in self._defaults.items():
+            if name in o:
+                value = o.pop(name)
+                if isinstance(default, bool):
+                    value = bool(value)
+            else:
+                value = default
+
+            options[name] = value
+
+        self.__dict__['options'] = options

         assert self.parser in ('earley', 'lalr', 'cyk', None)

@@ -79,6 +96,19 @@ class LarkOptions(object):
         if o:
             raise ValueError("Unknown options: %s" % o.keys())

+    def __getattr__(self, name):
+        return self.options[name]
+
+    def __setattr__(self, name, value):
+        assert name in self.options
+        self.options[name] = value
+
+    def serialize(self, memo):
+        return self.options
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        return cls(data)
+

 class Profiler:
     def __init__(self):
@@ -104,7 +134,7 @@ class Profiler:
         return wrapper


-class Lark:
+class Lark(Serialize):
     def __init__(self, grammar, **options):
         """
             grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
@@ -195,18 +225,35 @@ class Lark:
     if __init__.__doc__:
         __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

+    __serialize_fields__ = 'parser', 'rules', 'options'
+
     def _build_lexer(self):
         return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

-    def _build_parser(self):
+    def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
-        callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
+        self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)

-        parser_conf = ParserConf(self.rules, callbacks, self.options.start)
-
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+    def _build_parser(self):
+        self._prepare_callbacks()
+        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
+        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+
+    @classmethod
+    def deserialize(cls, data, namespace, memo, transformer=None, postlex=None):
+        if memo:
+            memo = SerializeMemoizer.deserialize(memo, namespace, {})
+        inst = cls.__new__(cls)
+        options = dict(data['options'])
+        options['transformer'] = transformer
+        options['postlex'] = postlex
+        inst.options = LarkOptions.deserialize(options, memo)
+        inst.rules = [Rule.deserialize(r, memo) for r in data['rules']]
+        inst._prepare_callbacks()
+        inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex)
+        return inst

     @classmethod
     def open(cls, grammar_filename, rel_to=None, **options):
@@ -243,14 +290,4 @@ class Lark:
         "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
         return self.parser.parse(text)

-        # if self.profiler:
-        #     self.profiler.enter_section('lex')
-        #     l = list(self.lex(text))
-        #     self.profiler.enter_section('parse')
-        #     try:
-        #         return self.parser.parse(l)
-        #     finally:
-        #         self.profiler.enter_section('outside_lark')
-        # else:
-        #     l = list(self.lex(text))
-        #     return self.parser.parse(l)
+###}
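
A round-trip sketch at the top level, modeled on the test_serialize test added below: serialize with memoization for Rule and TerminalDef, then rebuild the parser without re-reading the grammar. Callables (transformer, postlex) are never serialized; they are passed back in at deserialize time:

    from lark import Lark
    from lark.grammar import Rule
    from lark.lexer import TerminalDef

    parser = Lark('start: "A" b "C"\nb: "B"', parser='lalr')

    namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
    data, memo = parser.memo_serialize(namespace.values())

    parser2 = Lark.deserialize(data, namespace, memo)
    assert parser2.parse('ABC') == parser.parse('ABC')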

+28 -7   lark/lexer.py

@@ -2,10 +2,14 @@

 import re

-from .utils import Str, classify, get_regexp_width, Py36
+from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError

-class Pattern(object):
+###{standalone
+
+class Pattern(Serialize):
+    __serialize_fields__ = 'value', 'flags'
+
     def __init__(self, value, flags=()):
         self.value = value
         self.flags = frozenset(flags)
@@ -35,6 +39,7 @@ class Pattern(object):
             value = ('(?%s)' % f) + value
         return value

+
 class PatternStr(Pattern):
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
@@ -55,7 +60,11 @@ class PatternRE(Pattern):
     def max_width(self):
         return get_regexp_width(self.to_regexp())[1]

-class TerminalDef(object):
+
+class TerminalDef(Serialize):
+    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_namespace__ = PatternStr, PatternRE
+
     def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
@@ -67,7 +76,6 @@ class TerminalDef(object):



-###{standalone
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

@@ -198,7 +206,6 @@ class CallChain:
         return self.callback2(t) if self.cond(t2) else t2


-###}




@@ -254,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)

-class Lexer:
+class Lexer(Serialize):
     """Lexer interface

     Method Signatures:
@@ -265,7 +272,16 @@ class Lexer:
     set_parser_state = NotImplemented
     lex = NotImplemented


 class TraditionalLexer(Lexer):
+    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
+    __serialize_namespace__ = TerminalDef,
+
+    def _deserialize(self):
+        self.mres = build_mres(self.terminals)
+        self.callback = {}  # TODO implement
+
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

@@ -308,7 +324,12 @@ class TraditionalLexer(Lexer):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)



 class ContextualLexer(Lexer):
+    __serialize_fields__ = 'root_lexer', 'lexers'
+    __serialize_namespace__ = TraditionalLexer,
+
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:
@@ -343,4 +364,4 @@ class ContextualLexer(Lexer):
             l.lexer = self.lexers[self.parser_state]
             l.state = self.parser_state

-
+###}
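
With Pattern and TerminalDef now Serialize subclasses, a terminal flattens to nested dicts. A small sketch of the wire shape (key order illustrative; note the frozenset of flags comes back as a list):

    from lark.lexer import TerminalDef, PatternStr

    t = TerminalDef('COMMA', PatternStr(','))
    data = t.serialize(None)
    # data == {'name': 'COMMA', 'priority': 1,
    #          'pattern': {'value': ',', 'flags': [], '__type__': 'PatternStr'},
    #          '__type__': 'TerminalDef'}

    t2 = TerminalDef.deserialize(data, {})   # PatternStr resolved via __serialize_namespace__
    assert t2.name == 'COMMA' and t2.pattern.value == ','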

+2 -2   lark/parse_tree_builder.py

@@ -209,12 +209,12 @@ class ParseTreeBuilder:
             keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
             expand_single_child = options.expand1 if options else False

-            wrapper_chain = filter(None, [
+            wrapper_chain = list(filter(None, [
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
                 maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None),
                 self.propagate_positions and PropagatePositions,
                 self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
-            ])
+            ]))

             yield rule, wrapper_chain
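
Presumably the point of the list(...) wrapper: in Python 3, filter() returns a one-shot iterator, so a wrapper chain that must be walked more than once (e.g. when callbacks are rebuilt after deserialization) would silently come up empty on the second pass. A small illustration of the pitfall, with invented callables:

    callbacks = [str.upper, None, str.strip]      # None entries are disabled wrappers

    chain = filter(None, callbacks)               # Python 3: lazy, single-pass
    assert list(chain) == [str.upper, str.strip]
    assert list(chain) == []                      # exhausted on the second pass!

    chain = list(filter(None, callbacks))         # what the commit switches to
    assert list(chain) == list(chain)             # safely re-iterable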




+64 -44   lark/parser_frontends.py

@@ -1,25 +1,78 @@
 import re
 from functools import partial

-from .utils import get_regexp_width
+from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
-from .parsers import lalr_parser, earley, xearley, cyk
+from .parsers import earley, xearley, cyk
+from .parsers.lalr_parser import LALR_Parser
+from .grammar import Rule
 from .tree import Tree

-class WithLexer:
+###{standalone
+
+def get_frontend(parser, lexer):
+    if parser=='lalr':
+        if lexer is None:
+            raise ValueError('The LALR parser requires use of a lexer')
+        elif lexer == 'standard':
+            return LALR_TraditionalLexer
+        elif lexer == 'contextual':
+            return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser=='earley':
+        if lexer=='standard':
+            return Earley
+        elif lexer=='dynamic':
+            return XEarley
+        elif lexer=='dynamic_complete':
+            return XEarley_CompleteLex
+        elif lexer=='contextual':
+            raise ValueError('The Earley parser does not support the contextual parser')
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser == 'cyk':
+        if lexer == 'standard':
+            return CYK
+        else:
+            raise ValueError('CYK parser requires using standard parser.')
+    else:
+        raise ValueError('Unknown parser: %s' % parser)
+
+
+class WithLexer(Serialize):
     lexer = None
     parser = None
     lexer_conf = None

+    __serialize_fields__ = 'parser', 'lexer'
+    __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer
+
+    @classmethod
+    def deserialize(cls, data, memo, callbacks, postlex):
+        inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.postlex = postlex
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        return inst
+
+    def _serialize(self, data, memo):
+        data['parser'] = data['parser'].serialize(memo)
+
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
         self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.postlex = lexer_conf.postlex

     def init_contextual_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
+        self.postlex = lexer_conf.postlex
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
+        always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(lexer_conf.tokens, states,
                                      ignore=lexer_conf.ignore,
                                      always_accept=always_accept,
@@ -27,30 +80,31 @@ class WithLexer:

     def lex(self, text):
         stream = self.lexer.lex(text)
-        if self.lexer_conf.postlex:
-            return self.lexer_conf.postlex.process(stream)
-        return stream
+        return self.postlex.process(stream) if self.postlex else stream

     def parse(self, text):
         token_stream = self.lex(text)
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])


 class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_traditional_lexer(lexer_conf)

 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_contextual_lexer(lexer_conf)

+###}
+
 class LALR_CustomLexer(WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = lalr_parser.Parser(parser_conf)
+        self.parser = LALR_Parser(parser_conf)
         self.lexer_conf = lexer_conf
         self.lexer = lexer_cls(lexer_conf)

@@ -141,37 +195,3 @@ class CYK(WithLexer):
     def _apply_callback(self, tree):
         return self.callbacks[tree.rule](tree.children)

-
-def get_frontend(parser, lexer):
-    if parser=='lalr':
-        if lexer is None:
-            raise ValueError('The LALR parser requires use of a lexer')
-        elif lexer == 'standard':
-            return LALR_TraditionalLexer
-        elif lexer == 'contextual':
-            return LALR_ContextualLexer
-        elif issubclass(lexer, Lexer):
-            return partial(LALR_CustomLexer, lexer)
-        else:
-            raise ValueError('Unknown lexer: %s' % lexer)
-    elif parser=='earley':
-        if lexer=='standard':
-            return Earley
-        elif lexer=='dynamic':
-            return XEarley
-        elif lexer=='dynamic_complete':
-            return XEarley_CompleteLex
-        elif lexer=='contextual':
-            raise ValueError('The Earley parser does not support the contextual parser')
-        else:
-            raise ValueError('Unknown lexer: %s' % lexer)
-    elif parser == 'cyk':
-        if lexer == 'standard':
-            return CYK
-        else:
-            raise ValueError('CYK parser requires using standard parser.')
-    else:
-        raise ValueError('Unknown parser: %s' % parser)
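
get_frontend itself is unchanged; it only moved into the ###{standalone section so the generated module can resolve frontends at deserialize time. A short usage sketch:

    from lark.parser_frontends import get_frontend

    frontend = get_frontend('lalr', 'contextual')
    print(frontend.__name__)     # LALR_ContextualLexer
    # frontend(lexer_conf, parser_conf, options=...) builds lexer + parser;
    # get_frontend('lalr', None) raises ValueError (LALR requires a lexer)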




+33 -3   lark/parsers/lalr_analysis.py

@@ -9,10 +9,13 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
 import logging
 from collections import defaultdict

-from ..utils import classify, classify_bool, bfs, fzset
+from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal
+from ..grammar import Rule
+
+###{standalone

 class Action:
     def __init__(self, name):
@@ -31,6 +34,34 @@ class ParseTable:
         self.start_state = start_state
         self.end_state = end_state

+    def serialize(self, memo):
+        tokens = Enumerator()
+        rules = Enumerator()
+
+        states = {
+            state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in self.states.items()
+        }
+
+        return {
+            'tokens': tokens.reversed(),
+            'states': states,
+            'start_state': self.start_state,
+            'end_state': self.end_state,
+        }
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        tokens = data['tokens']
+        states = {
+            state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in data['states'].items()
+        }
+        return cls(states, data['start_state'], data['end_state'])
+

 class IntParseTable(ParseTable):

     @classmethod
@@ -49,8 +80,7 @@ class IntParseTable(ParseTable):
         end_state = state_to_idx[parse_table.end_state]
         return cls(int_states, start_state, end_state)

-
+###}


 class LALR_Analyzer(GrammarAnalyzer):
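
The resulting wire format is compact: actions are packed as (0, next_state) for Shift and (1, serialized_rule) for Reduce, and token names are interned through an Enumerator so each name is stored only once. A hand-written, hypothetical example of the shape (state numbers invented):

    memo_ref = {'@': 0}   # how a memoized Rule appears in the stream
    data = {
        'tokens': {0: 'A', 1: '$END'},     # Enumerator().reversed()
        'states': {
            0: {0: (0, 1)},                # state 0, on 'A': Shift to state 1
            1: {1: (1, memo_ref)},         # state 1, on '$END': Reduce by that rule
        },
        'start_state': 0,
        'end_state': 2,
    }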




+17 -4   lark/parsers/lalr_parser.py

@@ -4,10 +4,13 @@
 # Email : erezshin@gmail.com
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
+from ..utils import Enumerator, Serialize

-from .lalr_analysis import LALR_Analyzer, Shift
+from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable

-class Parser:
+
+###{standalone
+class LALR_Parser(object):
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -18,9 +21,19 @@ class Parser:
         self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
-        self.parse = self.parser.parse

-###{standalone
+    @classmethod
+    def deserialize(cls, data, memo, callbacks):
+        inst = cls.__new__(cls)
+        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
+        return inst
+
+    def serialize(self, memo):
+        return self._parse_table.serialize(memo)
+
+    def parse(self, *args):
+        return self.parser.parse(*args)
+

 class _Parser:
     def __init__(self, parse_table, callbacks):
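
One detail worth noting: __init__ used to bind self.parse = self.parser.parse, but deserialize() creates the instance with cls.__new__ and never runs __init__, so that instance attribute would be missing on restored parsers; a plain method delegating via *args works for both construction paths. A tiny, hypothetical repro of the pitfall:

    class Broken:
        def __init__(self):
            self.parse = lambda text: text.upper()

    b = Broken.__new__(Broken)   # what deserialize() does
    # b.parse('x')               # would raise AttributeError: 'parse' was never bound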


+23 -134   lark/tools/standalone.py

@@ -36,6 +36,7 @@
 #
 ###}

+import pprint
 import codecs
 import sys
 import os
@@ -47,6 +48,10 @@ import lark
 from lark import Lark
 from lark.parsers.lalr_analysis import Reduce

+
+from lark.grammar import RuleOptions, Rule
+from lark.lexer import TerminalDef
+
 _dir = path.dirname(__file__)
 _larkdir = path.join(_dir, path.pardir)

@@ -58,12 +63,15 @@ EXTRACT_STANDALONE_FILES = [
     'tree.py',
     'visitors.py',
     'indenter.py',
+    'grammar.py',
     'lexer.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
+    'parsers/lalr_analysis.py',
+    'parser_frontends.py',
+    'lark.py',
 ]

 def extract_sections(lines):
     section = None
     text = []
@@ -83,152 +91,33 @@ def extract_sections(lines):

     return {name:''.join(text) for name, text in sections.items()}

-def _prepare_mres(mres):
-    return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres]
-
-class TraditionalLexerAtoms:
-    def __init__(self, lexer):
-        self.mres = _prepare_mres(lexer.mres)
-        self.newline_types = lexer.newline_types
-        self.ignore_types = lexer.ignore_types
-        self.callback = {name:_prepare_mres(c.mres)
-                         for name, c in lexer.callback.items()}
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.newline_types)
-        print('IGNORE_TYPES = %s' % self.ignore_types)
-        self._print_python('lexer')
-
-    def _print_python(self, var_name):
-        print('MRES = (')
-        pprint(self.mres)
-        print(')')
-        print('LEXER_CALLBACK = (')
-        pprint(self.callback)
-        print(')')
-        print('lexer_regexps = LexerRegexps()')
-        print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
-        print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
-        print('                          for n, mres in LEXER_CALLBACK.items()}')
-        print('%s = (lexer_regexps)' % var_name)
-
-
-class ContextualLexerAtoms:
-    def __init__(self, lexer):
-        self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()}
-        self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer)
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types)
-        print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types)
-
-        print('LEXERS = {}')
-        for state, lexer_atoms in self.lexer_atoms.items():
-            lexer_atoms._print_python('LEXERS[%d]' % state)
-
-        print('class ContextualLexer:')
-        print('    def __init__(self):')
-        print('        self.lexers = LEXERS')
-        print('        self.set_parser_state(None)')
-        print('    def set_parser_state(self, state):')
-        print('        self.parser_state = state')
-        print('    def lex(self, stream):')
-        print('        newline_types = NEWLINE_TYPES')
-        print('        ignore_types = IGNORE_TYPES')
-        print('        lexers = LEXERS')
-        print('        l = _Lex(lexers[self.parser_state], self.parser_state)')
-        print('        for x in l.lex(stream, newline_types, ignore_types):')
-        print('            yield x')
-        print('        l.lexer = lexers[self.parser_state]')
-        print('        l.state = self.parser_state')
-
-        print('CON_LEXER = ContextualLexer()')
-        print('def lex(stream):')
-        print('    return CON_LEXER.lex(stream)')
-
-class GetRule:
-    def __init__(self, rule_id):
-        self.rule_id = rule_id
-
-    def __repr__(self):
-        return 'RULES[%d]' % self.rule_id
-
-rule_ids = {}
-token_types = {}
-
-def _get_token_type(token_type):
-    if token_type not in token_types:
-        token_types[token_type] = len(token_types)
-    return token_types[token_type]
-
-class ParserAtoms:
-    def __init__(self, parser):
-        self.parse_table = parser._parse_table
-
-    def print_python(self):
-        print('class ParseTable: pass')
-        print('parse_table = ParseTable()')
-        print('STATES = {')
-        for state, actions in self.parse_table.states.items():
-            print('  %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
-                                         for token, (action, arg) in actions.items()}))
-        print('}')
-        print('TOKEN_TYPES = (')
-        pprint({v:k for k, v in token_types.items()})
-        print(')')
-        print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
-        print('                      for s, acts in STATES.items()}')
-        print('parse_table.start_state = %s' % self.parse_table.start_state)
-        print('parse_table.end_state = %s' % self.parse_table.end_state)
-        print('class Lark_StandAlone:')
-        print('  def __init__(self, transformer=None, postlex=None):')
-        print('     callbacks = parse_tree_builder.create_callback(transformer=transformer)')
-        print('     self.parser = _Parser(parse_table, callbacks)')
-        print('     self.postlex = postlex')
-        print('  def parse(self, stream):')
-        print('     tokens = lex(stream)')
-        print('     sps = CON_LEXER.set_parser_state')
-        print('     if self.postlex: tokens = self.postlex.process(tokens)')
-        print('     return self.parser.parse(tokens, sps)')
-
-class TreeBuilderAtoms:
-    def __init__(self, lark):
-        self.rules = lark.rules
-
-    def print_python(self):
-        # print('class InlineTransformer: pass')
-        print('RULES = {')
-        for i, r in enumerate(self.rules):
-            rule_ids[r] = i
-            print('  %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), r.alias, r.options ))
-        print('}')
-        print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')
-
 def main(fobj, start):
     lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)

-    lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer)
-    parser_atoms = ParserAtoms(lark_inst.parser.parser)
-    tree_builder_atoms = TreeBuilderAtoms(lark_inst)
-
     print('# The file was automatically generated by Lark v%s' % lark.__version__)

     for pyfile in EXTRACT_STANDALONE_FILES:
         with open(os.path.join(_larkdir, pyfile)) as f:
             print (extract_sections(f)['standalone'])

-    with open(os.path.join(_larkdir, 'grammar.py')) as grammar_py:
-        print(grammar_py.read())
+    data, m = lark_inst.memo_serialize([TerminalDef, Rule])
+    print( 'DATA = (' )
+    # pprint(data, width=160)
+    print(data)
+    print(')')
+    print( 'MEMO = (')
+    print(m)
+    print(')')

     print('Shift = 0')
     print('Reduce = 1')
-    lexer_atoms.print_python()
-    tree_builder_atoms.print_python()
-    parser_atoms.print_python()
+    print("def Lark_StandAlone(transformer=None, postlex=None):")
+    print("  namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}")
+    print("  return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)")


 if __name__ == '__main__':
     if len(sys.argv) < 2:
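
End to end, the generator now prints the shared ###{standalone} sections followed by the serialized parser, so the emitted module is used like this (a sketch; the actual DATA/MEMO payload is the large json_parser.py diff suppressed above):

    # json_parser.py (generated) ends with, roughly:
    #   DATA = ( ... )    # serialized Lark instance
    #   MEMO = ( ... )    # interned Rule / TerminalDef objects
    #   def Lark_StandAlone(transformer=None, postlex=None):
    #       namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
    #       return Lark.deserialize(DATA, namespace, MEMO,
    #                               transformer=transformer, postlex=postlex)

    from json_parser import Lark_StandAlone

    parser = Lark_StandAlone()
    tree = parser.parse('{"key": ["item0", 3.14, true]}')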


+107 -3   lark/utils.py

@@ -1,8 +1,6 @@
 import sys
 from collections import deque

-Py36 = (sys.version_info[:2] >= (3, 6))
-
 class fzset(frozenset):
     def __repr__(self):
         return '{%s}' % ', '.join(map(repr, self))
@@ -45,7 +43,89 @@ def bfs(initial, expand):



+def _serialize(value, memo):
+    # if memo and memo.in_types(value):
+    #     return {'__memo__': memo.memoized.get(value)}
+
+    if isinstance(value, Serialize):
+        return value.serialize(memo)
+    elif isinstance(value, list):
+        return [_serialize(elem, memo) for elem in value]
+    elif isinstance(value, frozenset):
+        return list(value)  # TODO reversible?
+    elif isinstance(value, dict):
+        return {key:_serialize(elem, memo) for key, elem in value.items()}
+    return value
+
 ###{standalone
+def _deserialize(data, namespace, memo):
+    if isinstance(data, dict):
+        if '__type__' in data:  # Object
+            class_ = namespace[data['__type__']]
+            return class_.deserialize(data, memo)
+        elif '@' in data:
+            return memo[data['@']]
+        return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [_deserialize(value, namespace, memo) for value in data]
+    return data
+
+
+class Serialize(object):
+    def memo_serialize(self, types_to_memoize):
+        memo = SerializeMemoizer(types_to_memoize)
+        return self.serialize(memo), memo.serialize()
+
+    def serialize(self, memo=None):
+        if memo and memo.in_types(self):
+            return {'@': memo.memoized.get(self)}
+
+        fields = getattr(self, '__serialize_fields__')
+        res = {f: _serialize(getattr(self, f), memo) for f in fields}
+        res['__type__'] = type(self).__name__
+        postprocess = getattr(self, '_serialize', None)
+        if postprocess:
+            postprocess(res, memo)
+        return res
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        namespace = getattr(cls, '__serialize_namespace__', {})
+        namespace = {c.__name__:c for c in namespace}
+
+        fields = getattr(cls, '__serialize_fields__')
+
+        if '@' in data:
+            return memo[data['@']]
+
+        inst = cls.__new__(cls)
+        for f in fields:
+            setattr(inst, f, _deserialize(data[f], namespace, memo))
+        postprocess = getattr(inst, '_deserialize', None)
+        if postprocess:
+            postprocess()
+        return inst
+
+
+class SerializeMemoizer(Serialize):
+    __serialize_fields__ = 'memoized',
+
+    def __init__(self, types_to_memoize):
+        self.types_to_memoize = tuple(types_to_memoize)
+        self.memoized = Enumerator()
+
+    def in_types(self, value):
+        return isinstance(value, self.types_to_memoize)
+
+    def serialize(self):
+        return _serialize(self.memoized.reversed(), None)
+
+    @classmethod
+    def deserialize(cls, data, namespace, memo):
+        return _deserialize(data, namespace, memo)
+
+
 try:
     STRING_TYPE = basestring
 except NameError:   # Python 3
@@ -79,6 +159,11 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)

+import sys, re
+Py36 = (sys.version_info[:2] >= (3, 6))
+###}
+

 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
     preserving the original order of the list. Assumes that
@@ -86,7 +171,7 @@ def dedup_list(l):
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]

-###}
+


 try:
@@ -128,3 +213,22 @@ def get_regexp_width(regexp):
         return sre_parse.parse(regexp).getwidth()
     except sre_constants.error:
         raise ValueError(regexp)
+
+
+class Enumerator(Serialize):
+    def __init__(self):
+        self.enums = {}
+
+    def get(self, item):
+        if item not in self.enums:
+            self.enums[item] = len(self.enums)
+        return self.enums[item]
+
+    def __len__(self):
+        return len(self.enums)
+
+    def reversed(self):
+        r = {v: k for k, v in self.enums.items()}
+        assert len(r) == len(self.enums)
+        return r
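
Enumerator is the interning primitive behind both the memoizer and the parse-table token map: the first occurrence of an item is assigned the next integer id, repeats reuse it, and reversed() yields the id-to-item table that actually gets serialized. For example:

    from lark.utils import Enumerator

    e = Enumerator()
    assert e.get('NUMBER') == 0     # first occurrence assigns the next id
    assert e.get('STRING') == 1
    assert e.get('NUMBER') == 0     # repeated items reuse their id
    assert e.reversed() == {0: 'NUMBER', 1: 'STRING'}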


+19 -0   tests/test_parser.py

@@ -21,6 +21,8 @@ from lark.lark import Lark
 from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
 from lark.tree import Tree
 from lark.visitors import Transformer
+from lark.grammar import Rule
+from lark.lexer import TerminalDef

 __path__ = os.path.dirname(__file__)
 def _read(n, *args):
@@ -1429,6 +1431,23 @@ def _make_parser_test(LEXER, PARSER):

             parser.parse(r'"That" "And a \"b"')

+        @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
+        def test_serialize(self):
+            grammar = """
+                start: "A" b "C"
+                b: "B"
+            """
+            parser = _Lark(grammar)
+            d = parser.serialize()
+            parser2 = Lark.deserialize(d, {}, {})
+            self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) )
+
+            namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
+            d, m = parser.memo_serialize(namespace.values())
+            parser3 = Lark.deserialize(d, namespace, m)
+            self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )
+

     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME


+4 -0   tests/test_tools.py

@@ -70,6 +70,10 @@ class TestStandalone(TestCase):
         x = T().transform(x)
         self.assertEqual(x, ['a', 'b'])

+        l2 = _Lark(transformer=T())
+        x = l2.parse('ABAB')
+        self.assertEqual(x, ['a', 'b'])
+

 if __name__ == '__main__':
     unittest.main()

