--- a/examples/standalone/create_standalone.sh
+++ b/examples/standalone/create_standalone.sh
@@ -1 +1 @@
-python -m lark.tools.standalone json.lark > json_parser.py
+PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,4 +1,8 @@
-class Symbol(object):
+from .utils import Serialize
+
+###{standalone
+
+class Symbol(Serialize):
     is_term = NotImplemented
 
     def __init__(self, name):
@@ -19,7 +23,10 @@ class Symbol(object):
     fullrepr = property(__repr__)
 
 
 class Terminal(Symbol):
+    __serialize_fields__ = 'name', 'filter_out'
+
     is_term = True
 
     def __init__(self, name, filter_out=False):
@@ -31,16 +38,42 @@ class Terminal(Symbol):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
 
 
 class NonTerminal(Symbol):
+    __serialize_fields__ = 'name',
+
     is_term = False
 
 
-class Rule(object):
+class RuleOptions(Serialize):
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+
+    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
+        self.keep_all_tokens = keep_all_tokens
+        self.expand1 = expand1
+        self.priority = priority
+        self.empty_indices = empty_indices
+
+    def __repr__(self):
+        return 'RuleOptions(%r, %r, %r)' % (
+            self.keep_all_tokens,
+            self.expand1,
+            self.priority,
+        )
+
+
+class Rule(Serialize):
     """
         origin : a symbol
         expansion : a list of symbols
         order : index of this expansion amongst all rules of the same name
     """
     __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
+
+    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
+    __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
 
     def __init__(self, origin, expansion, order=0, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
@@ -49,6 +82,8 @@ class Rule(object):
         self.options = options
         self._hash = hash((self.origin, tuple(self.expansion)))
 
+    def _deserialize(self):
+        self._hash = hash((self.origin, tuple(self.expansion)))
+
     def __str__(self):
         return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
 
@@ -65,16 +100,5 @@ class Rule(object):
         return self.origin == other.origin and self.expansion == other.expansion
 
-
-class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
-        self.keep_all_tokens = keep_all_tokens
-        self.expand1 = expand1
-        self.priority = priority
-
-        self.empty_indices = ()
-
-    def __repr__(self):
-        return 'RuleOptions(%r, %r, %r)' % (
-            self.keep_all_tokens,
-            self.expand1,
-            self.priority,
-        )
+
+###}
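
The Serialize base class (added to utils.py further down in this diff) turns `__serialize_fields__` into a declarative schema: serialize() emits one dict entry per listed field plus a `__type__` tag, and deserialize() rebuilds the instance from that dict. A minimal round-trip sketch using the Terminal class above (the output shape is inferred from the new utils code, not taken verbatim from the patch):

    from lark.grammar import Terminal

    t = Terminal('NAME', filter_out=True)
    data = t.serialize()
    # data == {'name': 'NAME', 'filter_out': True, '__type__': 'Terminal'}

    t2 = Terminal.deserialize(data, {})   # empty memo
    assert t2.name == 'NAME' and t2.filter_out is True
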
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from io import open
 
-from .utils import STRING_TYPE
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -13,9 +13,11 @@ from .common import LexerConf, ParserConf
 from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
+from .grammar import Rule
 
+###{standalone
 
-class LarkOptions(object):
+class LarkOptions(Serialize):
     """Specifies the options for Lark
 
     """
@@ -51,24 +53,39 @@ class LarkOptions(object):
     if __doc__:
         __doc__ += OPTIONS_DOC
 
+    _defaults = {
+        'debug': False,
+        'keep_all_tokens': False,
+        'tree_class': None,
+        'cache_grammar': False,
+        'postlex': None,
+        'parser': 'earley',
+        'lexer': 'auto',
+        'transformer': None,
+        'start': 'start',
+        'profile': False,
+        'priority': 'auto',
+        'ambiguity': 'auto',
+        'propagate_positions': False,
+        'lexer_callbacks': {},
+        'maybe_placeholders': False,
+    }
+
     def __init__(self, options_dict):
         o = dict(options_dict)
 
-        self.debug = bool(o.pop('debug', False))
-        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
-        self.tree_class = o.pop('tree_class', Tree)
-        self.cache_grammar = o.pop('cache_grammar', False)
-        self.postlex = o.pop('postlex', None)
-        self.parser = o.pop('parser', 'earley')
-        self.lexer = o.pop('lexer', 'auto')
-        self.transformer = o.pop('transformer', None)
-        self.start = o.pop('start', 'start')
-        self.profile = o.pop('profile', False)
-        self.priority = o.pop('priority', 'auto')
-        self.ambiguity = o.pop('ambiguity', 'auto')
-        self.propagate_positions = o.pop('propagate_positions', False)
-        self.lexer_callbacks = o.pop('lexer_callbacks', {})
-        self.maybe_placeholders = o.pop('maybe_placeholders', False)
+        options = {}
+        for name, default in self._defaults.items():
+            if name in o:
+                value = o.pop(name)
+                if isinstance(default, bool):
+                    value = bool(value)
+            else:
+                value = default
+
+            options[name] = value
+
+        self.__dict__['options'] = options
 
         assert self.parser in ('earley', 'lalr', 'cyk', None)
 
@@ -79,6 +96,19 @@ class LarkOptions(object):
         if o:
             raise ValueError("Unknown options: %s" % o.keys())
 
+    def __getattr__(self, name):
+        return self.options[name]
+
+    def __setattr__(self, name, value):
+        assert name in self.options
+        self.options[name] = value
+
+    def serialize(self, memo):
+        return self.options
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        return cls(data)
+
 
 class Profiler:
     def __init__(self):
@@ -104,7 +134,7 @@ class Profiler:
         return wrapper
 
 
-class Lark:
+class Lark(Serialize):
     def __init__(self, grammar, **options):
         """
             grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
@@ -195,18 +225,35 @@ class Lark:
     if __init__.__doc__:
         __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
+    __serialize_fields__ = 'parser', 'rules', 'options'
+
     def _build_lexer(self):
         return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
-    def _build_parser(self):
+    def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
-        callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
-
-        parser_conf = ParserConf(self.rules, callbacks, self.options.start)
-
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
+        self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
+
+    def _build_parser(self):
+        self._prepare_callbacks()
+        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
+        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+
+    @classmethod
+    def deserialize(cls, data, namespace, memo, transformer=None, postlex=None):
+        if memo:
+            memo = SerializeMemoizer.deserialize(memo, namespace, {})
+        inst = cls.__new__(cls)
+        options = dict(data['options'])
+        options['transformer'] = transformer
+        options['postlex'] = postlex
+        inst.options = LarkOptions.deserialize(options, memo)
+        inst.rules = [Rule.deserialize(r, memo) for r in data['rules']]
+        inst._prepare_callbacks()
+        inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex)
+        return inst
 
     @classmethod
     def open(cls, grammar_filename, rel_to=None, **options):
@@ -243,14 +290,4 @@ class Lark:
         "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
         return self.parser.parse(text)
 
-        # if self.profiler:
-        #     self.profiler.enter_section('lex')
-        #     l = list(self.lex(text))
-        #     self.profiler.enter_section('parse')
-        #     try:
-        #         return self.parser.parse(l)
-        #     finally:
-        #         self.profiler.enter_section('outside_lark')
-        # else:
-        #     l = list(self.lex(text))
-        #     return self.parser.parse(l)
+###}
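
With Lark itself now a Serialize subclass, a built parser can be flattened to plain dicts and revived without re-running grammar analysis. A usage sketch mirroring the new test_serialize test at the bottom of this patch:

    from lark import Lark
    from lark.grammar import Rule
    from lark.lexer import TerminalDef

    parser = Lark('start: "a"', parser='lalr')

    # Plain round-trip: everything is inlined into one dict.
    parser2 = Lark.deserialize(parser.serialize(), {}, {})

    # Memoized round-trip: shared Rule/TerminalDef objects are stored once.
    namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
    data, memo = parser.memo_serialize(namespace.values())
    parser3 = Lark.deserialize(data, namespace, memo)
    assert parser3.parse('a') is not None
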
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -2,10 +2,14 @@
 
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36
+from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError
 
-class Pattern(object):
+###{standalone
+
+class Pattern(Serialize):
+    __serialize_fields__ = 'value', 'flags'
+
     def __init__(self, value, flags=()):
         self.value = value
         self.flags = frozenset(flags)
@@ -35,6 +39,7 @@ class Pattern(object):
         value = ('(?%s)' % f) + value
         return value
 
+
 class PatternStr(Pattern):
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
@@ -55,7 +60,11 @@ class PatternRE(Pattern):
     def max_width(self):
         return get_regexp_width(self.to_regexp())[1]
 
-class TerminalDef(object):
+
+class TerminalDef(Serialize):
+    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_namespace__ = PatternStr, PatternRE
+
     def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
@@ -67,7 +76,6 @@ class TerminalDef(object):
 
-###{standalone
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
@@ -198,7 +206,6 @@ class CallChain:
         return self.callback2(t) if self.cond(t2) else t2
 
-###}
 
@@ -254,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
 
-class Lexer:
+class Lexer(Serialize):
     """Lexer interface
 
         Method Signatures:
@@ -265,7 +272,16 @@ class Lexer:
     set_parser_state = NotImplemented
     lex = NotImplemented
 
+
 class TraditionalLexer(Lexer):
+    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
+    __serialize_namespace__ = TerminalDef,
+
+    def _deserialize(self):
+        self.mres = build_mres(self.terminals)
+        self.callback = {}  # TODO implement
+
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
@@ -308,7 +324,12 @@ class TraditionalLexer(Lexer):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
+
 class ContextualLexer(Lexer):
+    __serialize_fields__ = 'root_lexer', 'lexers'
+    __serialize_namespace__ = TraditionalLexer,
+
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:
@@ -343,4 +364,4 @@ class ContextualLexer(Lexer):
             l.lexer = self.lexers[self.parser_state]
             l.state = self.parser_state
 
-
+###}
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -209,12 +209,12 @@ class ParseTreeBuilder:
             keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
             expand_single_child = options.expand1 if options else False
 
-            wrapper_chain = filter(None, [
+            wrapper_chain = list(filter(None, [
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
                 maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None),
                 self.propagate_positions and PropagatePositions,
                 self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
-            ])
+            ]))
 
             yield rule, wrapper_chain
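
The `list(...)` wrapper is a Python 3 correctness fix, not a style change: there, `filter()` returns a one-shot iterator, so a wrapper chain that gets iterated more than once would silently come up empty on the second pass. For example:

    chain = filter(None, [abs, None, len])
    assert list(chain) == [abs, len]
    assert list(chain) == []   # already exhausted: the bug being fixed
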
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,25 +1,78 @@
 import re
 from functools import partial
 
-from .utils import get_regexp_width
+from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
-from .parsers import lalr_parser, earley, xearley, cyk
+from .parsers import earley, xearley, cyk
+from .parsers.lalr_parser import LALR_Parser
+from .grammar import Rule
 from .tree import Tree
 
-class WithLexer:
+###{standalone
+
+def get_frontend(parser, lexer):
+    if parser=='lalr':
+        if lexer is None:
+            raise ValueError('The LALR parser requires use of a lexer')
+        elif lexer == 'standard':
+            return LALR_TraditionalLexer
+        elif lexer == 'contextual':
+            return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser=='earley':
+        if lexer=='standard':
+            return Earley
+        elif lexer=='dynamic':
+            return XEarley
+        elif lexer=='dynamic_complete':
+            return XEarley_CompleteLex
+        elif lexer=='contextual':
+            raise ValueError('The Earley parser does not support the contextual lexer')
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser == 'cyk':
+        if lexer == 'standard':
+            return CYK
+        else:
+            raise ValueError('The CYK parser requires using the standard lexer.')
+    else:
+        raise ValueError('Unknown parser: %s' % parser)
+
+
+class WithLexer(Serialize):
     lexer = None
     parser = None
     lexer_conf = None
 
+    __serialize_fields__ = 'parser', 'lexer'
+    __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer
+
+    @classmethod
+    def deserialize(cls, data, memo, callbacks, postlex):
+        inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.postlex = postlex
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        return inst
+
+    def _serialize(self, data, memo):
+        data['parser'] = data['parser'].serialize(memo)
+
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
         self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.postlex = lexer_conf.postlex
 
     def init_contextual_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
+        self.postlex = lexer_conf.postlex
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
+        always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(lexer_conf.tokens, states,
                                      ignore=lexer_conf.ignore,
                                      always_accept=always_accept,
@@ -27,30 +80,31 @@ class WithLexer:
     def lex(self, text):
         stream = self.lexer.lex(text)
-        if self.lexer_conf.postlex:
-            return self.lexer_conf.postlex.process(stream)
-        return stream
+        return self.postlex.process(stream) if self.postlex else stream
 
     def parse(self, text):
         token_stream = self.lex(text)
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
 
+
 class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_traditional_lexer(lexer_conf)
 
 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_contextual_lexer(lexer_conf)
 
+###}
+
 class LALR_CustomLexer(WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = lalr_parser.Parser(parser_conf)
+        self.parser = LALR_Parser(parser_conf)
         self.lexer_conf = lexer_conf
         self.lexer = lexer_cls(lexer_conf)
 
@@ -141,37 +195,3 @@ class CYK(WithLexer):
 
     def _apply_callback(self, tree):
         return self.callbacks[tree.rule](tree.children)
-
-
-def get_frontend(parser, lexer):
-    if parser=='lalr':
-        if lexer is None:
-            raise ValueError('The LALR parser requires use of a lexer')
-        elif lexer == 'standard':
-            return LALR_TraditionalLexer
-        elif lexer == 'contextual':
-            return LALR_ContextualLexer
-        elif issubclass(lexer, Lexer):
-            return partial(LALR_CustomLexer, lexer)
-        else:
-            raise ValueError('Unknown lexer: %s' % lexer)
-    elif parser=='earley':
-        if lexer=='standard':
-            return Earley
-        elif lexer=='dynamic':
-            return XEarley
-        elif lexer=='dynamic_complete':
-            return XEarley_CompleteLex
-        elif lexer=='contextual':
-            raise ValueError('The Earley parser does not support the contextual parser')
-        else:
-            raise ValueError('Unknown lexer: %s' % lexer)
-    elif parser == 'cyk':
-        if lexer == 'standard':
-            return CYK
-        else:
-            raise ValueError('CYK parser requires using standard parser.')
-    else:
-        raise ValueError('Unknown parser: %s' % parser)
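
Moving get_frontend() into the `###{standalone` section means the generated module carries the option-to-frontend dispatch with it, so Lark.deserialize can resolve `parser_class` on its own. For reference, a sketch of the mapping it implements (note that only the LALR frontends actually land in the standalone build):

    from lark.parser_frontends import get_frontend

    get_frontend('lalr', 'standard')     # -> LALR_TraditionalLexer
    get_frontend('lalr', 'contextual')   # -> LALR_ContextualLexer
    get_frontend('earley', 'dynamic')    # -> XEarley
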
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -9,10 +9,13 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
 import logging
 from collections import defaultdict
 
-from ..utils import classify, classify_bool, bfs, fzset
+from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal
+from ..grammar import Rule
+
+###{standalone
 
 class Action:
     def __init__(self, name):
@@ -31,6 +34,34 @@ class ParseTable:
         self.start_state = start_state
         self.end_state = end_state
 
+    def serialize(self, memo):
+        tokens = Enumerator()
+        rules = Enumerator()
+
+        states = {
+            state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in self.states.items()
+        }
+
+        return {
+            'tokens': tokens.reversed(),
+            'states': states,
+            'start_state': self.start_state,
+            'end_state': self.end_state,
+        }
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        tokens = data['tokens']
+        states = {
+            state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in data['states'].items()
+        }
+        return cls(states, data['start_state'], data['end_state'])
+
 
 class IntParseTable(ParseTable):
 
     @classmethod
@@ -49,8 +80,7 @@ class IntParseTable(ParseTable):
         end_state = state_to_idx[parse_table.end_state]
         return cls(int_states, start_state, end_state)
 
-
-
+###}
 
 class LALR_Analyzer(GrammarAnalyzer):
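
ParseTable.serialize replaces the old print-based table dump with a data structure: token names are enumerated to small integers, shift actions become `(0, next_state)` and reductions `(1, serialized_rule)`. Roughly this shape (an illustration only; the actual ids vary per grammar):

    # {
    #     'tokens': {0: 'A', 1: '$END'},             # token id -> token name
    #     'states': {
    #         0: {0: (0, 2)},                        # Shift: (0, next state)
    #         2: {1: (1, {...serialized Rule...})},  # Reduce: (1, rule)
    #     },
    #     'start_state': 0,
    #     'end_state': 1,
    # }
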
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -4,10 +4,13 @@
 # Email : erezshin@gmail.com
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
+from ..utils import Enumerator, Serialize
 
-from .lalr_analysis import LALR_Analyzer, Shift
+from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
 
-class Parser:
+###{standalone
+
+class LALR_Parser(object):
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -18,9 +21,19 @@ class Parser:
         self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
-        self.parse = self.parser.parse
 
-###{standalone
+    @classmethod
+    def deserialize(cls, data, memo, callbacks):
+        inst = cls.__new__(cls)
+        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
+        return inst
+
+    def serialize(self, memo):
+        return self._parse_table.serialize(memo)
+
+    def parse(self, *args):
+        return self.parser.parse(*args)
+
 
 class _Parser:
     def __init__(self, parse_table, callbacks):
--- a/lark/tools/standalone.py
+++ b/lark/tools/standalone.py
@@ -36,6 +36,7 @@
 #
 ###}
 
+import pprint
 import codecs
 import sys
 import os
@@ -47,6 +48,10 @@ import lark
 from lark import Lark
 from lark.parsers.lalr_analysis import Reduce
 
+from lark.grammar import RuleOptions, Rule
+from lark.lexer import TerminalDef
+
 _dir = path.dirname(__file__)
 _larkdir = path.join(_dir, path.pardir)
@@ -58,12 +63,15 @@ EXTRACT_STANDALONE_FILES = [
     'tree.py',
     'visitors.py',
     'indenter.py',
+    'grammar.py',
     'lexer.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
+    'parsers/lalr_analysis.py',
+    'parser_frontends.py',
+    'lark.py',
 ]
 
 def extract_sections(lines):
     section = None
     text = []
@@ -83,152 +91,33 @@ def extract_sections(lines):
     return {name:''.join(text) for name, text in sections.items()}
 
-def _prepare_mres(mres):
-    return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres]
-
-class TraditionalLexerAtoms:
-    def __init__(self, lexer):
-        self.mres = _prepare_mres(lexer.mres)
-        self.newline_types = lexer.newline_types
-        self.ignore_types = lexer.ignore_types
-        self.callback = {name:_prepare_mres(c.mres)
-                         for name, c in lexer.callback.items()}
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.newline_types)
-        print('IGNORE_TYPES = %s' % self.ignore_types)
-        self._print_python('lexer')
-
-    def _print_python(self, var_name):
-        print('MRES = (')
-        pprint(self.mres)
-        print(')')
-        print('LEXER_CALLBACK = (')
-        pprint(self.callback)
-        print(')')
-        print('lexer_regexps = LexerRegexps()')
-        print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
-        print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
-        print('                          for n, mres in LEXER_CALLBACK.items()}')
-        print('%s = (lexer_regexps)' % var_name)
-
-class ContextualLexerAtoms:
-    def __init__(self, lexer):
-        self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()}
-        self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer)
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types)
-        print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types)
-        print('LEXERS = {}')
-        for state, lexer_atoms in self.lexer_atoms.items():
-            lexer_atoms._print_python('LEXERS[%d]' % state)
-        print('class ContextualLexer:')
-        print('    def __init__(self):')
-        print('        self.lexers = LEXERS')
-        print('        self.set_parser_state(None)')
-        print('    def set_parser_state(self, state):')
-        print('        self.parser_state = state')
-        print('    def lex(self, stream):')
-        print('        newline_types = NEWLINE_TYPES')
-        print('        ignore_types = IGNORE_TYPES')
-        print('        lexers = LEXERS')
-        print('        l = _Lex(lexers[self.parser_state], self.parser_state)')
-        print('        for x in l.lex(stream, newline_types, ignore_types):')
-        print('            yield x')
-        print('            l.lexer = lexers[self.parser_state]')
-        print('            l.state = self.parser_state')
-        print('CON_LEXER = ContextualLexer()')
-        print('def lex(stream):')
-        print('    return CON_LEXER.lex(stream)')
-
-class GetRule:
-    def __init__(self, rule_id):
-        self.rule_id = rule_id
-
-    def __repr__(self):
-        return 'RULES[%d]' % self.rule_id
-
-rule_ids = {}
-token_types = {}
-
-def _get_token_type(token_type):
-    if token_type not in token_types:
-        token_types[token_type] = len(token_types)
-    return token_types[token_type]
-
-class ParserAtoms:
-    def __init__(self, parser):
-        self.parse_table = parser._parse_table
-
-    def print_python(self):
-        print('class ParseTable: pass')
-        print('parse_table = ParseTable()')
-        print('STATES = {')
-        for state, actions in self.parse_table.states.items():
-            print('  %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
-                                         for token, (action, arg) in actions.items()}))
-        print('}')
-        print('TOKEN_TYPES = (')
-        pprint({v:k for k, v in token_types.items()})
-        print(')')
-        print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
-        print('                      for s, acts in STATES.items()}')
-        print('parse_table.start_state = %s' % self.parse_table.start_state)
-        print('parse_table.end_state = %s' % self.parse_table.end_state)
-        print('class Lark_StandAlone:')
-        print('  def __init__(self, transformer=None, postlex=None):')
-        print('     callbacks = parse_tree_builder.create_callback(transformer=transformer)')
-        print('     self.parser = _Parser(parse_table, callbacks)')
-        print('     self.postlex = postlex')
-        print('  def parse(self, stream):')
-        print('     tokens = lex(stream)')
-        print('     sps = CON_LEXER.set_parser_state')
-        print('     if self.postlex: tokens = self.postlex.process(tokens)')
-        print('     return self.parser.parse(tokens, sps)')
-
-class TreeBuilderAtoms:
-    def __init__(self, lark):
-        self.rules = lark.rules
-
-    def print_python(self):
-        # print('class InlineTransformer: pass')
-        print('RULES = {')
-        for i, r in enumerate(self.rules):
-            rule_ids[r] = i
-            print('  %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), r.alias, r.options ))
-        print('}')
-        print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')
-
 def main(fobj, start):
     lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)
 
-    lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer)
-    parser_atoms = ParserAtoms(lark_inst.parser.parser)
-    tree_builder_atoms = TreeBuilderAtoms(lark_inst)
-
     print('# The file was automatically generated by Lark v%s' % lark.__version__)
 
     for pyfile in EXTRACT_STANDALONE_FILES:
         with open(os.path.join(_larkdir, pyfile)) as f:
             print (extract_sections(f)['standalone'])
 
-    with open(os.path.join(_larkdir, 'grammar.py')) as grammar_py:
-        print(grammar_py.read())
+    data, m = lark_inst.memo_serialize([TerminalDef, Rule])
+    print( 'DATA = (' )
+    # pprint(data, width=160)
+    print(data)
+    print(')')
+    print( 'MEMO = (')
+    print(m)
+    print(')')
 
     print('Shift = 0')
     print('Reduce = 1')
-    lexer_atoms.print_python()
-    tree_builder_atoms.print_python()
-    parser_atoms.print_python()
+    print("def Lark_StandAlone(transformer=None, postlex=None):")
+    print("  namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}")
+    print("  return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)")
 
 if __name__ == '__main__':
     if len(sys.argv) < 2:
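
The generated module now ends with DATA/MEMO blobs and a small Lark_StandAlone factory instead of hand-printed lexer and parser "atoms". Usage of the output stays essentially the same; a sketch assuming the json example from the one-liner at the top of this diff (the input string is grammar-specific, of course):

    # json_parser.py was produced by: python -m lark.tools.standalone json.lark
    from json_parser import Lark_StandAlone

    parser = Lark_StandAlone()   # optionally: transformer=..., postlex=...
    tree = parser.parse('{"key": ["item", 3.14]}')
    print(tree.pretty())
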
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -1,8 +1,6 @@
 import sys
 from collections import deque
 
-Py36 = (sys.version_info[:2] >= (3, 6))
-
 class fzset(frozenset):
     def __repr__(self):
         return '{%s}' % ', '.join(map(repr, self))
@@ -45,7 +43,89 @@ def bfs(initial, expand):
 
+def _serialize(value, memo):
+    # if memo and memo.in_types(value):
+    #     return {'__memo__': memo.memoized.get(value)}
+    if isinstance(value, Serialize):
+        return value.serialize(memo)
+    elif isinstance(value, list):
+        return [_serialize(elem, memo) for elem in value]
+    elif isinstance(value, frozenset):
+        return list(value)  # TODO reversible?
+    elif isinstance(value, dict):
+        return {key:_serialize(elem, memo) for key, elem in value.items()}
+    return value
+
 ###{standalone
+def _deserialize(data, namespace, memo):
+    if isinstance(data, dict):
+        if '__type__' in data:  # Object
+            class_ = namespace[data['__type__']]
+            return class_.deserialize(data, memo)
+        elif '@' in data:
+            return memo[data['@']]
+        return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [_deserialize(value, namespace, memo) for value in data]
+    return data
+
+
+class Serialize(object):
+    def memo_serialize(self, types_to_memoize):
+        memo = SerializeMemoizer(types_to_memoize)
+        return self.serialize(memo), memo.serialize()
+
+    def serialize(self, memo=None):
+        if memo and memo.in_types(self):
+            return {'@': memo.memoized.get(self)}
+
+        fields = getattr(self, '__serialize_fields__')
+        res = {f: _serialize(getattr(self, f), memo) for f in fields}
+        res['__type__'] = type(self).__name__
+        postprocess = getattr(self, '_serialize', None)
+        if postprocess:
+            postprocess(res, memo)
+        return res
+
+    @classmethod
+    def deserialize(cls, data, memo):
+        namespace = getattr(cls, '__serialize_namespace__', {})
+        namespace = {c.__name__:c for c in namespace}
+
+        fields = getattr(cls, '__serialize_fields__')
+
+        if '@' in data:
+            return memo[data['@']]
+
+        inst = cls.__new__(cls)
+        for f in fields:
+            setattr(inst, f, _deserialize(data[f], namespace, memo))
+        postprocess = getattr(inst, '_deserialize', None)
+        if postprocess:
+            postprocess()
+        return inst
+
+
+class SerializeMemoizer(Serialize):
+    __serialize_fields__ = 'memoized',
+
+    def __init__(self, types_to_memoize):
+        self.types_to_memoize = tuple(types_to_memoize)
+        self.memoized = Enumerator()
+
+    def in_types(self, value):
+        return isinstance(value, self.types_to_memoize)
+
+    def serialize(self):
+        return _serialize(self.memoized.reversed(), None)
+
+    @classmethod
+    def deserialize(cls, data, namespace, memo):
+        return _deserialize(data, namespace, memo)
+
+
 try:
     STRING_TYPE = basestring
 except NameError:   # Python 3
@@ -79,6 +159,11 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)
 
+import sys, re
+Py36 = (sys.version_info[:2] >= (3, 6))
+
+###}
+
 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
        preserving the original order of the list. Assumes that
@@ -86,7 +171,7 @@ def dedup_list(l):
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]
 
-###}
+
 
 try:
@@ -128,3 +213,22 @@ def get_regexp_width(regexp):
         return sre_parse.parse(regexp).getwidth()
     except sre_constants.error:
         raise ValueError(regexp)
+
+
+class Enumerator(Serialize):
+    def __init__(self):
+        self.enums = {}
+
+    def get(self, item):
+        if item not in self.enums:
+            self.enums[item] = len(self.enums)
+        return self.enums[item]
+
+    def __len__(self):
+        return len(self.enums)
+
+    def reversed(self):
+        r = {v: k for k, v in self.enums.items()}
+        assert len(r) == len(self.enums)
+        return r
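
SerializeMemoizer is what makes the DATA/MEMO split in the standalone output work: objects of the memoized types serialize to a `{'@': n}` back-reference, while the real payload is emitted once, keyed by an Enumerator id. A self-contained sketch using a hypothetical Thing class:

    from lark.utils import Serialize, SerializeMemoizer

    class Thing(Serialize):            # hypothetical example class
        __serialize_fields__ = 'x',
        def __init__(self, x):
            self.x = x

    memo = SerializeMemoizer([Thing])
    ref = Thing(42).serialize(memo)    # {'@': 0} -- a back-reference
    table = memo.serialize()           # {0: {'x': 42, '__type__': 'Thing'}}
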
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -21,6 +21,8 @@ from lark.lark import Lark
 from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
 from lark.tree import Tree
 from lark.visitors import Transformer
+from lark.grammar import Rule
+from lark.lexer import TerminalDef
 
 __path__ = os.path.dirname(__file__)
 def _read(n, *args):
@@ -1429,6 +1431,23 @@ def _make_parser_test(LEXER, PARSER):
             parser.parse(r'"That" "And a \"b"')
 
+        @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
+        def test_serialize(self):
+            grammar = """
+                start: "A" b "C"
+                b: "B"
+            """
+            parser = _Lark(grammar)
+
+            d = parser.serialize()
+            parser2 = Lark.deserialize(d, {}, {})
+            self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) )
+
+            namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
+            d, m = parser.memo_serialize(namespace.values())
+            parser3 = Lark.deserialize(d, namespace, m)
+            self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )
+
         _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
         _TestParser.__name__ = _NAME
--- a/tests/test_standalone.py
+++ b/tests/test_standalone.py
@@ -70,6 +70,10 @@ class TestStandalone(TestCase):
         x = T().transform(x)
         self.assertEqual(x, ['a', 'b'])
 
+        l2 = _Lark(transformer=T())
+        x = l2.parse('ABAB')
+        self.assertEqual(x, ['a', 'b'])
+
 
 if __name__ == '__main__':
     unittest.main()