
Refactored parser_frontends. Now significantly simpler

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh 3 years ago
parent
commit
284dfe7fd3
5 changed files with 156 additions and 167 deletions
  1. +9 -3 lark/common.py
  2. +4 -3 lark/load_grammar.py
  3. +140 -160 lark/parser_frontends.py
  4. +2 -1 lark/parsers/lalr_parser.py
  5. +1 -0 lark/utils.py

+9 -3  lark/common.py

@@ -5,7 +5,7 @@ from .lexer import TerminalDef


class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'name'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
self.skip_validation = skip_validation
self.use_bytes = use_bytes

###}
self.name = None


class ParserConf(Serialize):
__serialize_fields__ = 'rules', 'start', 'name'

class ParserConf:
def __init__(self, rules, callbacks, start):
assert isinstance(start, list)
self.rules = rules
self.callbacks = callbacks
self.start = start

self.name = None

###}
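
A side note on the change above (a sketch, not taken from the diff itself): both config objects now carry a name attribute, and the new ParsingFrontend reads those names to decide which lexer and parser to build. A minimal illustration, using empty placeholder terminal/rule lists purely for demonstration:

    import re
    from lark.common import LexerConf, ParserConf

    lexer_conf = LexerConf([], re)               # tokens=[], re_module=re
    lexer_conf.name = 'standard'                 # or 'contextual', 'dynamic', 'dynamic_complete'

    parser_conf = ParserConf([], {}, ['start'])  # rules, callbacks, start symbols
    parser_conf.name = 'lalr'                    # or 'earley', 'cyk'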

+4 -3  lark/load_grammar.py

@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
lexer_conf.name = 'standard'
parser_conf.name = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})

self.canonize_tree = CanonizeTree()
self.global_keep_all_tokens = global_keep_all_tokens
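
For context (a sketch, not part of the diff): hand-setting lexer_conf.name and parser_conf.name as GrammarLoader does above is what the new MakeParsingFrontend helper automates, so the same frontend could equivalently be built with:

    from lark.parser_frontends import MakeParsingFrontend

    # lexer_conf / parser_conf are the objects constructed just above
    self.parser = MakeParsingFrontend('lalr', 'standard')(lexer_conf, parser_conf, {})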


+140 -160  lark/parser_frontends.py

@@ -4,9 +4,8 @@ from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree
from .common import LexerConf
from .common import LexerConf, ParserConf
try:
import regex
except ImportError:
@@ -27,56 +26,112 @@ def _wrap_lexer(lexer_class):
return self.lexer.lex(lexer_state.text)
return CustomLexerWrapper


class MakeParsingFrontend:
def __init__(self, parser, lexer):
self.parser = parser
self.lexer = lexer

def __call__(self, lexer_conf, parser_conf, options):
assert isinstance(lexer_conf, LexerConf)
assert isinstance(parser_conf, ParserConf)
parser_conf.name = self.parser
lexer_conf.name = self.lexer
return ParsingFrontend(lexer_conf, parser_conf, options)

@classmethod
def deserialize(cls, data, memo, callbacks, options):
lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
parser_conf.callbacks = callbacks

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]

lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
lexer_conf.re_module = regex if options.regex else re
lexer_conf.use_bytes = options.use_bytes
lexer_conf.g_regex_flags = options.g_regex_flags
lexer_conf.skip_validation = True
lexer_conf.postlex = options.postlex

return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)




class ParsingFrontend(Serialize):
__serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'

def __init__(self, lexer_conf, parser_conf, options, parser=None):
self.parser_conf = parser_conf
self.lexer_conf = lexer_conf
self.options = options

# Set-up parser
if parser: # From cache
self.parser = parser
else:
create_parser = {
'lalr': create_lalr_parser,
'earley': make_early,
'cyk': CYK_FrontEnd,
}[parser_conf.name]
self.parser = create_parser(lexer_conf, parser_conf, options)

# Set-up lexer
self.skip_lexer = False
if lexer_conf.name in ('dynamic', 'dynamic_complete'):
self.skip_lexer = True
return

try:
create_lexer = {
'standard': create_traditional_lexer,
'contextual': create_contextual_lexer,
}[lexer_conf.name]
except KeyError:
assert issubclass(lexer_conf.name, Lexer), lexer_conf.name
self.lexer = _wrap_lexer(lexer_conf.name)(lexer_conf)
else:
self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)


def _parse(self, start, input, *args):
if start is None:
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)

def parse(self, text, start=None):
if self.skip_lexer:
return self._parse(start, text)

lexer = LexerThread(self.lexer, text)
return self._parse(start, lexer)


def get_frontend(parser, lexer):
if parser=='lalr':
if lexer is None:
raise ConfigurationError('The LALR parser requires use of a lexer')
elif lexer == 'standard':
return LALR_TraditionalLexer
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class LALR_CustomLexerWrapper(LALR_WithLexer):
def init_lexer(self):
self.lexer = wrapped(self.lexer_conf)
return LALR_CustomLexerWrapper
else:
if lexer not in ('standard' ,'contextual') and not issubclass(lexer, Lexer):
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser=='earley':
if lexer=='standard':
return Earley_Traditional
elif lexer=='dynamic':
return XEarley
elif lexer=='dynamic_complete':
return XEarley_CompleteLex
elif lexer=='contextual':
if lexer=='contextual':
raise ConfigurationError('The Earley parser does not support the contextual parser')
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class Earley_CustomLexerWrapper(Earley_WithLexer):
def init_lexer(self, **kw):
self.lexer = wrapped(self.lexer_conf)
return Earley_CustomLexerWrapper
else:
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser == 'cyk':
if lexer == 'standard':
return CYK
else:
if lexer != 'standard':
raise ConfigurationError('CYK parser requires using standard parser.')
else:
raise ConfigurationError('Unknown parser: %s' % parser)


class _ParserFrontend(Serialize):
def _parse(self, start, input, *args):
if start is None:
start = self.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)
return MakeParsingFrontend(parser, lexer)
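
A short usage sketch of the new flow (the grammar below is just an example): the public Lark API is unchanged, but internally get_frontend() now returns a MakeParsingFrontend callable that stamps the requested names onto the configs and builds a single ParsingFrontend, rather than returning one of the per-combination classes this commit removes.

    from lark import Lark

    lalr_parser = Lark(r'''
        start: WORD+
        WORD: /\w+/
        %ignore " "
    ''', parser='lalr', lexer='contextual')

    print(lalr_parser.parse("hello world"))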


def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +155,26 @@ class PostLexConnector:
return self.postlexer.process(i)


class WithLexer(_ParserFrontend):
lexer = None
parser = None
lexer_conf = None
start = None

__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex

@classmethod
def deserialize(cls, data, memo, callbacks, options):
inst = super(WithLexer, cls).deserialize(data, memo)

inst.postlex = options.postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
inst.lexer_conf.re_module = regex if options.regex else re
inst.lexer_conf.use_bytes = options.use_bytes
inst.lexer_conf.g_regex_flags = options.g_regex_flags
inst.lexer_conf.skip_validation = True
inst.init_lexer()

return inst

def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def make_lexer(self, text):
lexer = self.lexer
if self.postlex:
lexer = PostLexConnector(self.lexer, self.postlex)
return LexerThread(lexer, text)

def parse(self, text, start=None):
return self._parse(start, self.make_lexer(text))

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
def create_traditional_lexer(lexer_conf, parser, postlex):
return TraditionalLexer(lexer_conf)

self.init_lexer()
def create_contextual_lexer(lexer_conf, parser, postlex):
states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
always_accept = postlex.always_accept if postlex else ()
return ContextualLexer(lexer_conf, states, always_accept=always_accept)

def init_lexer(self, **kw):
raise NotImplementedError()
def create_lalr_parser(lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
return LALR_Parser(parser_conf, debug=debug)

class LALR_TraditionalLexer(LALR_WithLexer):
def init_lexer(self):
self.init_traditional_lexer()

class LALR_ContextualLexer(LALR_WithLexer):
def init_lexer(self):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)

make_early = NotImplemented
CYK_FrontEnd = NotImplemented
###}


class Earley_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

def match(self, term, token):
return term.name == token.type

def init_lexer(self, **kw):
raise NotImplementedError()

class Earley_Traditional(Earley_WithLexer):
def init_lexer(self, **kw):
self.init_traditional_lexer()


class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = xearley.Parser(parser_conf,
self.match,
ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity,
debug=debug,
tree_class=tree_class,
**kw
)

def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

def _prepare_match(self, lexer_conf):
class EarleyRegexpMatcher:
def __init__(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
if t.priority != 1:
@@ -230,31 +192,49 @@ class XEarley(_ParserFrontend):

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(start, text)
def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
XEarley.__init__(self, *args, complete_lex=True, **kw)

def make_xearley(lexer_conf, parser_conf, options=None, **kw):
earley_matcher = EarleyRegexpMatcher(lexer_conf)
return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)

def _match_earley_basic(term, token):
return term.name == token.type

class CYK(WithLexer):
def make_early_basic(lexer_conf, parser_conf, options, **kw):
return earley.Parser(parser_conf, _match_earley_basic, **kw)

def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_traditional_lexer()
def make_early(lexer_conf, parser_conf, options):
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None

extra = {}
if lexer_conf.name == 'dynamic':
f = make_xearley
elif lexer_conf.name == 'dynamic_complete':
extra['complete_lex'] =True
f = make_xearley
else:
f = make_early_basic

return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
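
On the Earley side the choice of factory is likewise driven purely by lexer_conf.name ('dynamic' / 'dynamic_complete' select the xearley variant, anything else the basic matcher), so nothing changes at the API level. A small illustrative example with a throwaway grammar:

    from lark import Lark

    earley_parser = Lark(r'''
        start: "a"+ "b"
    ''', parser='earley', lexer='dynamic')

    print(earley_parser.parse("aaab"))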



class CYK_FrontEnd:
def __init__(self, lexer_conf, parser_conf, options=None):
self._analysis = GrammarAnalyzer(parser_conf)
self.parser = cyk.Parser(parser_conf.rules)

self.callbacks = parser_conf.callbacks

def parse(self, text, start):
tokens = list(self.make_lexer(text).lex(None))
parse = self._parse(start, tokens)
parse = self._transform(parse)
return parse
def parse(self, lexer, start):
tokens = list(lexer.lex(None))
tree = self.parser.parse(tokens, start)
return self._transform(tree)

def _transform(self, tree):
subtrees = list(tree.iter_subtrees())


+2 -1  lark/parsers/lalr_parser.py

@@ -5,13 +5,14 @@
from copy import deepcopy, copy
from ..exceptions import UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet

###{standalone

class LALR_Parser(object):
class LALR_Parser(Serialize):
def __init__(self, parser_conf, debug=False):
analysis = LALR_Analyzer(parser_conf, debug=debug)
analysis.compute_lalr()


+1 -0  lark/utils.py

@@ -302,4 +302,5 @@ def _serialize(value, memo):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value
