
Mid work. Almost stable

Erez Shinan, 6 years ago
commit 94e15fb6f7
8 changed files with 134 additions and 207 deletions:

  1. lark/grammar.py (+22, -16)
  2. lark/lark.py (+8, -17)
  3. lark/lexer.py (+7, -5)
  4. lark/parser_frontends.py (+14, -9)
  5. lark/parsers/lalr_analysis.py (+4, -3)
  6. lark/parsers/lalr_parser.py (+4, -4)
  7. lark/tools/standalone.py (+24, -134)
  8. lark/utils.py (+51, -19)

lark/grammar.py (+22, -16)

@@ -1,5 +1,7 @@
from .utils import Serialize

###{standalone

class Symbol(Serialize):
is_term = NotImplemented

@@ -43,6 +45,24 @@ class NonTerminal(Symbol):
is_term = False



class RuleOptions(Serialize):
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'

def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority
self.empty_indices = empty_indices

def __repr__(self):
return 'RuleOptions(%r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
)


class Rule(Serialize):
"""
origin : a symbol
@@ -52,7 +72,7 @@ class Rule(Serialize):
__slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')

__serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
__serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions)
__serialize_namespace__ = Terminal, NonTerminal, RuleOptions

def __init__(self, origin, expansion, order=0, alias=None, options=None):
self.origin = origin
@@ -81,18 +101,4 @@ class Rule(Serialize):



class RuleOptions(Serialize):
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'

def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority
self.empty_indices = empty_indices

def __repr__(self):
return 'RuleOptions(%r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
)
###}
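
Note on this hunk: __serialize_namespace__ changes from a lambda (which deferred name lookup) to a plain tuple of classes, and Serialize.deserialize in lark/utils.py (further down in this commit) now builds its name-to-class map from that tuple directly instead of calling it. Moving RuleOptions above Rule is presumably what makes the eager tuple safe. A minimal sketch of what that lookup amounts to, assuming this revision of lark is importable; nothing below is part of the commit itself:

    from lark.grammar import Rule, Terminal, NonTerminal, RuleOptions

    # Mirror of the load-side lookup in Serialize.deserialize (lark/utils.py below)
    namespace = getattr(Rule, '__serialize_namespace__', {})   # (Terminal, NonTerminal, RuleOptions)
    namespace = {c.__name__: c for c in namespace}             # no call needed any more
    assert namespace == {'Terminal': Terminal, 'NonTerminal': NonTerminal, 'RuleOptions': RuleOptions}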

lark/lark.py (+8, -17)

@@ -15,6 +15,7 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule

###{standalone

class LarkOptions(Serialize):
"""Specifies the options for Lark
@@ -101,11 +102,11 @@ class LarkOptions(Serialize):
assert name in self.options
self.options[name] = value

def serialize(self):
def serialize(self, memo):
return self.options

@classmethod
def deserialize(cls, data):
def deserialize(cls, data, memo):
return cls(data)


@@ -240,12 +241,12 @@ class Lark(Serialize):
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

@classmethod
def deserialize(cls, data):
def deserialize(cls, data, memo):
inst = cls.__new__(cls)
inst.options = LarkOptions.deserialize(data['options'])
inst.rules = [Rule.deserialize(r) for r in data['rules']]
inst.options = LarkOptions.deserialize(data['options'], memo)
inst.rules = [Rule.deserialize(r, memo) for r in data['rules']]
inst._prepare_callbacks()
inst.parser = inst.parser_class.deserialize(data['parser'], inst._callbacks)
inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks)
return inst


@@ -284,14 +285,4 @@ class Lark(Serialize):
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
return self.parser.parse(text)

# if self.profiler:
# self.profiler.enter_section('lex')
# l = list(self.lex(text))
# self.profiler.enter_section('parse')
# try:
# return self.parser.parse(l)
# finally:
# self.profiler.enter_section('outside_lark')
# else:
# l = list(self.lex(text))
# return self.parser.parse(l)
###}
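
Note on this hunk: LarkOptions and Lark now thread a memo argument through serialize and deserialize. The dump side is exercised by lark/tools/standalone.py (below), which calls memo_serialize([TerminalDef]). A dump-side sketch only; the grammar filename is a placeholder, and how the MEMO blob is turned back into the memo dict that Lark.deserialize(data, memo) expects is not finalised in this commit:

    from lark import Lark
    from lark.lexer import TerminalDef

    parser = Lark(open('my_grammar.lark'), parser='lalr', lexer='contextual')
    data, memo = parser.memo_serialize([TerminalDef])   # same call the standalone tool makes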

lark/lexer.py (+7, -5)

@@ -5,6 +5,8 @@ import re
from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError

###{standalone

class Pattern(Serialize):
__serialize_fields__ = 'value', 'flags'

@@ -61,7 +63,7 @@ class PatternRE(Pattern):

class TerminalDef(Serialize):
__serialize_fields__ = 'name', 'pattern', 'priority'
__serialize_namespace__ = lambda: (PatternStr, PatternRE)
__serialize_namespace__ = PatternStr, PatternRE

def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
@@ -74,7 +76,6 @@ class TerminalDef(Serialize):



###{standalone
class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

@@ -205,7 +206,6 @@ class CallChain:
return self.callback2(t) if self.cond(t2) else t2


###}



@@ -275,7 +275,7 @@ class Lexer(Serialize):

class TraditionalLexer(Lexer):
__serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
__serialize_namespace__ = lambda: (TerminalDef,)
__serialize_namespace__ = TerminalDef,

def _deserialize(self):
self.mres = build_mres(self.terminals)
@@ -328,7 +328,7 @@ class TraditionalLexer(Lexer):

class ContextualLexer(Lexer):
__serialize_fields__ = 'root_lexer', 'lexers'
__serialize_namespace__ = lambda: (TraditionalLexer,)
__serialize_namespace__ = TraditionalLexer,

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {}
@@ -363,3 +363,5 @@ class ContextualLexer(Lexer):
yield x
l.lexer = self.lexers[self.parser_state]
l.state = self.parser_state

###}
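
Note on this hunk: the whole lexer module now sits inside the ###{standalone block, and the lexer classes switch to plain tuples for __serialize_namespace__. The contextual lexer keeps one TraditionalLexer per parser state, and those per-state lexers share TerminalDef objects, which is what memoizing TerminalDef (see lark/utils.py below) is aimed at. A toy sketch of that sharing; Box is a hypothetical container standing in for the per-state lexers, not a lark class:

    from lark.lexer import TerminalDef, PatternStr
    from lark.utils import Serialize, SerializeMemoizer

    class Box(Serialize):
        __serialize_fields__ = 'term',
        __serialize_namespace__ = TerminalDef,
        def __init__(self, term):
            self.term = term

    shared = TerminalDef('NAME', PatternStr('x'))
    memo = SerializeMemoizer([TerminalDef])
    a = Box(shared).serialize(memo)
    b = Box(shared).serialize(memo)
    assert a['term'] == b['term'] == {'__memo__': 0}   # stored once, referenced twice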

lark/parser_frontends.py (+14, -9)

@@ -4,26 +4,29 @@ from functools import partial
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import lalr_parser, earley, xearley, cyk
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree

###{standalone

class WithLexer(Serialize):
lexer = None
parser = None
lexer_conf = None

__serialize_fields__ = 'parser', 'lexer'
__serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer)
__serialize_namespace__ = Rule, ContextualLexer

@classmethod
def deserialize(cls, data, callbacks):
inst = super(WithLexer, cls).deserialize(data)
def deserialize(cls, data, memo, callbacks):
inst = super(WithLexer, cls).deserialize(data, memo)
inst.postlex = None # TODO
inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks)
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
return inst
def _serialize(self, data):
def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize()

def init_traditional_lexer(self, lexer_conf):
@@ -54,18 +57,18 @@ class WithLexer(Serialize):
class LALR_TraditionalLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.parser = lalr_parser.Parser(parser_conf, debug=debug)
self.parser = LALR_Parser(parser_conf, debug=debug)
self.init_traditional_lexer(lexer_conf)

class LALR_ContextualLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.parser = lalr_parser.Parser(parser_conf, debug=debug)
self.parser = LALR_Parser(parser_conf, debug=debug)
self.init_contextual_lexer(lexer_conf)

class LALR_CustomLexer(WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.parser = lalr_parser.Parser(parser_conf)
self.parser = LALR_Parser(parser_conf)
self.lexer_conf = lexer_conf
self.lexer = lexer_cls(lexer_conf)

@@ -190,3 +193,5 @@ def get_frontend(parser, lexer):




###}
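
Note on this hunk: callbacks are not serialized; Lark.deserialize (above) rebuilds them with _prepare_callbacks and passes them down, which is why WithLexer.deserialize and LALR_Parser.deserialize both grow a callbacks parameter alongside memo. A rough sketch of that hand-off; data, memo and callbacks are assumed to already exist and are not built here:

    from lark.parser_frontends import get_frontend

    frontend_cls = get_frontend('lalr', 'contextual')                     # LALR_ContextualLexer
    frontend = frontend_cls.deserialize(data['parser'], memo, callbacks)  # callbacks injected at load time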

lark/parsers/lalr_analysis.py (+4, -3)

@@ -14,6 +14,8 @@ from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal

###{standalone

class Action:
def __init__(self, name):
self.name = name
@@ -50,7 +52,7 @@ class ParseTable:
}

@classmethod
def deserialize(cls, data):
def deserialize(cls, data, memo):
tokens = data['tokens']
rules = data['rules']
states = {
@@ -79,8 +81,7 @@ class IntParseTable(ParseTable):
end_state = state_to_idx[parse_table.end_state]
return cls(int_states, start_state, end_state)



###}

class LALR_Analyzer(GrammarAnalyzer):



lark/parsers/lalr_parser.py (+4, -4)

@@ -9,7 +9,8 @@ from ..utils import Enumerator, Serialize
from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable


class Parser:
###{standalone
class LALR_Parser(object):
def __init__(self, parser_conf, debug=False):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -22,9 +23,9 @@ class Parser:
self.parser = _Parser(analysis.parse_table, callbacks)

@classmethod
def deserialize(cls, data, callbacks):
def deserialize(cls, data, memo, callbacks):
inst = cls.__new__(cls)
inst.parser = _Parser(IntParseTable.deserialize(data), callbacks)
inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
return inst

def serialize(self):
@@ -33,7 +34,6 @@ class Parser:
def parse(self, *args):
return self.parser.parse(*args)

###{standalone

class _Parser:
def __init__(self, parse_table, callbacks):


lark/tools/standalone.py (+24, -134)

@@ -36,6 +36,7 @@
#
###}

import pprint
import codecs
import sys
import os
@@ -47,6 +48,10 @@ import lark
from lark import Lark
from lark.parsers.lalr_analysis import Reduce


from lark.grammar import RuleOptions
from lark.lexer import TerminalDef

_dir = path.dirname(__file__)
_larkdir = path.join(_dir, path.pardir)

@@ -61,9 +66,12 @@ EXTRACT_STANDALONE_FILES = [
'lexer.py',
'parse_tree_builder.py',
'parsers/lalr_parser.py',
'parsers/lalr_analysis.py',
'parser_frontends.py',
'lark.py',
'grammar.py',
]


def extract_sections(lines):
section = None
text = []
@@ -83,152 +91,34 @@ def extract_sections(lines):

return {name:''.join(text) for name, text in sections.items()}

def _prepare_mres(mres):
return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres]

class TraditionalLexerAtoms:
def __init__(self, lexer):
self.mres = _prepare_mres(lexer.mres)
self.newline_types = lexer.newline_types
self.ignore_types = lexer.ignore_types
self.callback = {name:_prepare_mres(c.mres)
for name, c in lexer.callback.items()}

def print_python(self):
print('import re')
print('class LexerRegexps: pass')
print('NEWLINE_TYPES = %s' % self.newline_types)
print('IGNORE_TYPES = %s' % self.ignore_types)
self._print_python('lexer')

def _print_python(self, var_name):
print('MRES = (')
pprint(self.mres)
print(')')
print('LEXER_CALLBACK = (')
pprint(self.callback)
print(')')
print('lexer_regexps = LexerRegexps()')
print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
print(' for n, mres in LEXER_CALLBACK.items()}')
print('%s = (lexer_regexps)' % var_name)


class ContextualLexerAtoms:
def __init__(self, lexer):
self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()}
self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer)

def print_python(self):
print('import re')
print('class LexerRegexps: pass')
print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types)
print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types)

print('LEXERS = {}')
for state, lexer_atoms in self.lexer_atoms.items():
lexer_atoms._print_python('LEXERS[%d]' % state)

print('class ContextualLexer:')
print(' def __init__(self):')
print(' self.lexers = LEXERS')
print(' self.set_parser_state(None)')
print(' def set_parser_state(self, state):')
print(' self.parser_state = state')
print(' def lex(self, stream):')
print(' newline_types = NEWLINE_TYPES')
print(' ignore_types = IGNORE_TYPES')
print(' lexers = LEXERS')
print(' l = _Lex(lexers[self.parser_state], self.parser_state)')
print(' for x in l.lex(stream, newline_types, ignore_types):')
print(' yield x')
print(' l.lexer = lexers[self.parser_state]')
print(' l.state = self.parser_state')

print('CON_LEXER = ContextualLexer()')
print('def lex(stream):')
print(' return CON_LEXER.lex(stream)')

class GetRule:
def __init__(self, rule_id):
self.rule_id = rule_id

def __repr__(self):
return 'RULES[%d]' % self.rule_id

rule_ids = {}
token_types = {}

def _get_token_type(token_type):
if token_type not in token_types:
token_types[token_type] = len(token_types)
return token_types[token_type]

class ParserAtoms:
def __init__(self, parser):
self.parse_table = parser._parse_table

def print_python(self):
print('class ParseTable: pass')
print('parse_table = ParseTable()')
print('STATES = {')
for state, actions in self.parse_table.states.items():
print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}))
print('}')
print('TOKEN_TYPES = (')
pprint({v:k for k, v in token_types.items()})
print(')')
print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
print(' for s, acts in STATES.items()}')
print('parse_table.start_state = %s' % self.parse_table.start_state)
print('parse_table.end_state = %s' % self.parse_table.end_state)
print('class Lark_StandAlone:')
print(' def __init__(self, transformer=None, postlex=None):')
print(' callbacks = parse_tree_builder.create_callback(transformer=transformer)')
print(' self.parser = _Parser(parse_table, callbacks)')
print(' self.postlex = postlex')
print(' def parse(self, stream):')
print(' tokens = lex(stream)')
print(' sps = CON_LEXER.set_parser_state')
print(' if self.postlex: tokens = self.postlex.process(tokens)')
print(' return self.parser.parse(tokens, sps)')

class TreeBuilderAtoms:
def __init__(self, lark):
self.rules = lark.rules

def print_python(self):
# print('class InlineTransformer: pass')
print('RULES = {')
for i, r in enumerate(self.rules):
rule_ids[r] = i
print(' %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), r.alias, r.options ))
print('}')
print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')

def main(fobj, start):
lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)

lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer)
parser_atoms = ParserAtoms(lark_inst.parser.parser)
tree_builder_atoms = TreeBuilderAtoms(lark_inst)

print('# The file was automatically generated by Lark v%s' % lark.__version__)

for pyfile in EXTRACT_STANDALONE_FILES:
with open(os.path.join(_larkdir, pyfile)) as f:
print (extract_sections(f)['standalone'])

with open(os.path.join(_larkdir, 'grammar.py')) as grammar_py:
print(grammar_py.read())
data, m = lark_inst.memo_serialize([TerminalDef])
print( 'DATA = (' )
# pprint(data, width=160)
print(data)
print(')')
print( 'MEMO = (')
print(m)
print(')')


print('Shift = 0')
print('Reduce = 1')
lexer_atoms.print_python()
tree_builder_atoms.print_python()
parser_atoms.print_python()
print("def load_parser():")
print(" return Lark.deserialize(DATA)")





if __name__ == '__main__':
if len(sys.argv) < 2:
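
Note on this section: the hand-written *Atoms printers are dropped; the tool now concatenates the ###{standalone sections of the listed modules and dumps the parser as DATA and MEMO via memo_serialize([TerminalDef]). Intended usage of a generated module would be something like the sketch below (module and grammar names are placeholders); note that at this commit the generated load_parser() still calls Lark.deserialize(DATA) without the MEMO blob, so this is the direction of travel rather than something that runs yet:

    # python -m lark.tools.standalone my_grammar.lark > my_parser.py

    import my_parser

    parser = my_parser.load_parser()
    tree = parser.parse('some input')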


lark/utils.py (+51, -19)

@@ -1,8 +1,6 @@
import sys
from collections import deque

Py36 = (sys.version_info[:2] >= (3, 6))

class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))
@@ -44,56 +42,90 @@ def bfs(initial, expand):



def _serialize(value):
###{standalone
import sys, re

Py36 = (sys.version_info[:2] >= (3, 6))



def _serialize(value, memo):
if isinstance(value, Serialize):
return value.serialize()
return value.serialize(memo)
elif isinstance(value, list):
return [_serialize(elem) for elem in value]
return [_serialize(elem, memo) for elem in value]
elif isinstance(value, frozenset):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem) for key, elem in value.items()}
return {key:_serialize(elem, memo) for key, elem in value.items()}
return value

def _deserialize(data, namespace):
def _deserialize(data, namespace, memo):
if isinstance(data, dict):
if '__type__' in data: # Object
class_ = namespace[data['__type__']]
return class_.deserialize(data)
return {key:_deserialize(value, namespace) for key, value in data.items()}
return class_.deserialize(data, memo)
return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
elif isinstance(data, list):
return [_deserialize(value, namespace) for value in data]
return [_deserialize(value, namespace, memo) for value in data]
return data


class Serialize(object):
def serialize(self):
def memo_serialize(self, types_to_memoize):
memo = SerializeMemoizer(types_to_memoize)
return self.serialize(memo), memo.serialize()

def serialize(self, memo=None):
if memo and memo.in_types(self):
return {'__memo__': memo.memoized.get(self)}

fields = getattr(self, '__serialize_fields__')
res = {f: _serialize(getattr(self, f)) for f in fields}
res = {f: _serialize(getattr(self, f), memo) for f in fields}
res['__type__'] = type(self).__name__
postprocess = getattr(self, '_serialize', None)
if postprocess:
postprocess(res)
postprocess(res, memo)
return res

@classmethod
def deserialize(cls, data):
namespace = getattr(cls, '__serialize_namespace__', dict)
namespace = {c.__name__:c for c in namespace()}
def deserialize(cls, data, memo):
namespace = getattr(cls, '__serialize_namespace__', {})
namespace = {c.__name__:c for c in namespace}

fields = getattr(cls, '__serialize_fields__')

if '__memo__' in data:
return memo[data['__memo__']]

inst = cls.__new__(cls)
for f in fields:
setattr(inst, f, _deserialize(data[f], namespace))
setattr(inst, f, _deserialize(data[f], namespace, memo))
postprocess = getattr(inst, '_deserialize', None)
if postprocess:
postprocess()
return inst


class SerializeMemoizer(Serialize):
__serialize_fields__ = 'memoized',

def __init__(self, types_to_memoize):
self.types_to_memoize = tuple(types_to_memoize)
self.memoized = Enumerator()

def in_types(self, value):
return isinstance(value, self.types_to_memoize)

def serialize(self):
return _serialize(self.memoized.reversed(), None)

@classmethod
def deserialize(cls, data, namespace, memo):
return _deserialize(data, namespace, memo)



###{standalone
try:
STRING_TYPE = basestring
except NameError: # Python 3
@@ -178,7 +210,7 @@ def get_regexp_width(regexp):
raise ValueError(regexp)


class Enumerator:
class Enumerator(Serialize):
def __init__(self):
self.enums = {}
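
Note on this hunk: this is the heart of the change. Serialize.serialize/deserialize gain a memo parameter, and SerializeMemoizer uses the pre-existing Enumerator to assign a stable index to instances of the chosen types, so a shared object is emitted once and referenced by a '__memo__' stub everywhere else. A toy dump-side sketch; Thing and Holder are hypothetical classes, not part of lark, and resolving the '__memo__' stubs on the load side appears unfinished here (e.g. _deserialize does not yet look for them), consistent with the "mid work" commit message:

    from lark.utils import Serialize

    class Thing(Serialize):
        __serialize_fields__ = 'name',
        def __init__(self, name):
            self.name = name

    class Holder(Serialize):
        __serialize_fields__ = 'thing',
        __serialize_namespace__ = Thing,
        def __init__(self, thing):
            self.thing = thing

    data, memo_data = Holder(Thing('x')).memo_serialize([Thing])
    # data      == {'thing': {'__memo__': 0}, '__type__': 'Holder'}
    # memo_data == {0: {'name': 'x', '__type__': 'Thing'}}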


