
Merge pull request #781 from lark-parser/refactor_frontends

Refactored parser_frontends. Now significantly simpler
Erez Shinan authored 4 years ago, committed by GitHub (commit 692a950488)
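In short, the per-combination frontend classes (LALR_TraditionalLexer, LALR_ContextualLexer, Earley_Traditional, XEarley, CYK, ...) are replaced by a single ParsingFrontend that dispatches on parser_conf.parser_type and lexer_conf.lexer_type. A minimal sketch of the new construction flow, based on the diff below (internal API, not Lark's public interface; the lexer_conf/parser_conf/options values are placeholders):

    from lark.parser_frontends import get_frontend

    # get_frontend() now only validates the (parser, lexer) combination and
    # returns a MakeParsingFrontend factory.
    make_frontend = get_frontend('lalr', 'contextual')

    # Calling the factory stamps the two types onto the configs and builds the
    # single ParsingFrontend, which wires up both lexer and parser internally:
    # frontend = make_frontend(lexer_conf, parser_conf, options)
    # tree = frontend.parse("...")   # uses a LexerThread unless the lexer is dynamic
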
8 changed files with 164 additions and 177 deletions
  1. lark/common.py (+9, -3)
  2. lark/exceptions.py (+7, -0)
  3. lark/lark.py (+1, -5)
  4. lark/load_grammar.py (+4, -3)
  5. lark/parser_frontends.py (+139, -165)
  6. lark/parsers/lalr_parser.py (+2, -1)
  7. lark/utils.py (+1, -0)
  8. tests/test_parser.py (+1, -0)

lark/common.py (+9, -3)

@@ -5,7 +5,7 @@ from .lexer import TerminalDef




 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
     __serialize_namespace__ = TerminalDef,
 
     def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
 
-###}
+        self.lexer_type = None
+
 
-class ParserConf:
+class ParserConf(Serialize):
+    __serialize_fields__ = 'rules', 'start', 'parser_type'
+
     def __init__(self, rules, callbacks, start):
         assert isinstance(start, list)
         self.rules = rules
         self.callbacks = callbacks
         self.start = start
+
+        self.parser_type = None
+
+###}

lark/exceptions.py (+7, -0)

@@ -11,6 +11,11 @@ class ConfigurationError(LarkError, ValueError):
     pass
 
 
+def assert_config(value, options, msg='Got %r, expected one of %s'):
+    if value not in options:
+        raise ConfigurationError(msg % (value, options))
+
+
 class GrammarError(LarkError):
     pass


@@ -198,4 +203,6 @@ class VisitError(LarkError):


         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)
+
+
 ###}
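For reference, assert_config (moved here from lark.py) just raises ConfigurationError for an unrecognized option value; a small illustrative call with a made-up value, mirroring how get_frontend uses it further down:

    from lark.exceptions import ConfigurationError, assert_config

    try:
        assert_config('sly', ('lalr', 'earley', 'cyk'))
    except ConfigurationError as err:
        print(err)   # Got 'sly', expected one of ('lalr', 'earley', 'cyk')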

lark/lark.py (+1, -5)

@@ -1,5 +1,5 @@
 from __future__ import absolute_import
-from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config
 
 import sys, os, pickle, hashlib
 from io import open
@@ -24,10 +24,6 @@ except ImportError:


 ###{standalone
 
-def assert_config(value, options, msg='Got %r, expected one of %s'):
-    if value not in options:
-        raise ConfigurationError(msg % (value, options))
-
 
 class LarkOptions(Serialize):
     """Specifies the options for Lark

lark/load_grammar.py (+4, -3)

@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR_TraditionalLexer
+from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
         callback = ParseTreeBuilder(rules, ST).create_callback()
         import re
         lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
-
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        lexer_conf.lexer_type = 'standard'
+        parser_conf.parser_type = 'lalr'
+        self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
 
         self.canonize_tree = CanonizeTree()
         self.global_keep_all_tokens = global_keep_all_tokens

lark/parser_frontends.py (+139, -165)

@@ -1,12 +1,11 @@
-from .exceptions import ConfigurationError, GrammarError
+from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
-from .grammar import Rule
 from .tree import Tree
-from .common import LexerConf
+from .common import LexerConf, ParserConf
 try:
     import regex
 except ImportError:
@@ -27,56 +26,106 @@ def _wrap_lexer(lexer_class):
         return self.lexer.lex(lexer_state.text)
     return CustomLexerWrapper
 
 
-def get_frontend(parser, lexer):
-    if parser=='lalr':
-        if lexer is None:
-            raise ConfigurationError('The LALR parser requires use of a lexer')
-        elif lexer == 'standard':
-            return LALR_TraditionalLexer
-        elif lexer == 'contextual':
-            return LALR_ContextualLexer
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class LALR_CustomLexerWrapper(LALR_WithLexer):
-                def init_lexer(self):
-                    self.lexer = wrapped(self.lexer_conf)
-            return LALR_CustomLexerWrapper
-        else:
-            raise ConfigurationError('Unknown lexer: %s' % lexer)
-    elif parser=='earley':
-        if lexer=='standard':
-            return Earley_Traditional
-        elif lexer=='dynamic':
-            return XEarley
-        elif lexer=='dynamic_complete':
-            return XEarley_CompleteLex
-        elif lexer=='contextual':
-            raise ConfigurationError('The Earley parser does not support the contextual parser')
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class Earley_CustomLexerWrapper(Earley_WithLexer):
-                def init_lexer(self, **kw):
-                    self.lexer = wrapped(self.lexer_conf)
-            return Earley_CustomLexerWrapper
+
+class MakeParsingFrontend:
+    def __init__(self, parser_type, lexer_type):
+        self.parser_type = parser_type
+        self.lexer_type = lexer_type
+
+    def __call__(self, lexer_conf, parser_conf, options):
+        assert isinstance(lexer_conf, LexerConf)
+        assert isinstance(parser_conf, ParserConf)
+        parser_conf.parser_type = self.parser_type
+        lexer_conf.lexer_type = self.lexer_type
+        return ParsingFrontend(lexer_conf, parser_conf, options)
+
+    @classmethod
+    def deserialize(cls, data, memo, callbacks, options):
+        lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
+        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
+        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
+        parser_conf.callbacks = callbacks
+
+        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
+
+        lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        lexer_conf.re_module = regex if options.regex else re
+        lexer_conf.use_bytes = options.use_bytes
+        lexer_conf.g_regex_flags = options.g_regex_flags
+        lexer_conf.skip_validation = True
+        lexer_conf.postlex = options.postlex
+
+        return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
+
+
+class ParsingFrontend(Serialize):
+    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'
+
+    def __init__(self, lexer_conf, parser_conf, options, parser=None):
+        self.parser_conf = parser_conf
+        self.lexer_conf = lexer_conf
+        self.options = options
+
+        # Set-up parser
+        if parser:  # From cache
+            self.parser = parser
         else:
-            raise ConfigurationError('Unknown lexer: %s' % lexer)
-    elif parser == 'cyk':
-        if lexer == 'standard':
-            return CYK
+            create_parser = {
+                'lalr': create_lalr_parser,
+                'earley': create_earley_parser,
+                'cyk': CYK_FrontEnd,
+            }[parser_conf.parser_type]
+            self.parser = create_parser(lexer_conf, parser_conf, options)
+
+        # Set-up lexer
+        lexer_type = lexer_conf.lexer_type
+        self.skip_lexer = False
+        if lexer_type in ('dynamic', 'dynamic_complete'):
+            self.skip_lexer = True
+            return
+
+        try:
+            create_lexer = {
+                'standard': create_traditional_lexer,
+                'contextual': create_contextual_lexer,
+            }[lexer_type]
+        except KeyError:
+            assert issubclass(lexer_type, Lexer), lexer_type
+            self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
         else:
-            raise ConfigurationError('CYK parser requires using standard parser.')
-    else:
-        raise ConfigurationError('Unknown parser: %s' % parser)
+            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)
+
+        if lexer_conf.postlex:
+            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
 
-class _ParserFrontend(Serialize):
-    def _parse(self, start, input, *args):
+    def parse(self, text, start=None):
         if start is None:
-            start = self.start
+            start = self.parser_conf.start
             if len(start) > 1:
                 raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
             start ,= start
-        return self.parser.parse(input, start, *args)
+
+        if self.skip_lexer:
+            return self.parser.parse(text, start)
+
+        lexer_thread = LexerThread(self.lexer, text)
+        return self.parser.parse(lexer_thread, start)
+
+
+def get_frontend(parser, lexer):
+    assert_config(parser, ('lalr', 'earley', 'cyk'))
+    if not isinstance(lexer, type):     # not custom lexer?
+        expected = {
+            'lalr': ('standard', 'contextual'),
+            'earley': ('standard', 'dynamic', 'dynamic_complete'),
+            'cyk': ('standard', ),
+        }[parser]
+        assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
+
+    return MakeParsingFrontend(parser, lexer)
 
 
 def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +149,26 @@ class PostLexConnector:
         return self.postlexer.process(i)
 
 
-class WithLexer(_ParserFrontend):
-    lexer = None
-    parser = None
-    lexer_conf = None
-    start = None
-
-    __serialize_fields__ = 'parser', 'lexer_conf', 'start'
-    __serialize_namespace__ = LexerConf,
-
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        self.lexer_conf = lexer_conf
-        self.start = parser_conf.start
-        self.postlex = lexer_conf.postlex
-
-    @classmethod
-    def deserialize(cls, data, memo, callbacks, options):
-        inst = super(WithLexer, cls).deserialize(data, memo)
-
-        inst.postlex = options.postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
-
-        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
-        inst.lexer_conf.re_module = regex if options.regex else re
-        inst.lexer_conf.use_bytes = options.use_bytes
-        inst.lexer_conf.g_regex_flags = options.g_regex_flags
-        inst.lexer_conf.skip_validation = True
-        inst.init_lexer()
-
-        return inst
-
-    def _serialize(self, data, memo):
-        data['parser'] = data['parser'].serialize(memo)
-
-    def make_lexer(self, text):
-        lexer = self.lexer
-        if self.postlex:
-            lexer = PostLexConnector(self.lexer, self.postlex)
-        return LexerThread(lexer, text)
-
-    def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
-
-    def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf)
-
-class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-
-        self.init_lexer()
-
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
+def create_traditional_lexer(lexer_conf, parser, postlex):
+    return TraditionalLexer(lexer_conf)
 
 
-class LALR_TraditionalLexer(LALR_WithLexer):
-    def init_lexer(self):
-        self.init_traditional_lexer()
+def create_contextual_lexer(lexer_conf, parser, postlex):
+    states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
+    always_accept = postlex.always_accept if postlex else ()
+    return ContextualLexer(lexer_conf, states, always_accept=always_accept)
 
 
-class LALR_ContextualLexer(LALR_WithLexer):
-    def init_lexer(self):
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
+def create_lalr_parser(lexer_conf, parser_conf, options=None):
+    debug = options.debug if options else False
+    return LALR_Parser(parser_conf, debug=debug)
 
+
+create_earley_parser = NotImplemented
+CYK_FrontEnd = NotImplemented
 ###}
 
 
-class Earley_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_lexer()
-
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
-
-    def match(self, term, token):
-        return term.name == token.type
-
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
-
-class Earley_Traditional(Earley_WithLexer):
-    def init_lexer(self, **kw):
-        self.init_traditional_lexer()
-
-
-class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-        self.start = parser_conf.start
-
-        self._prepare_match(lexer_conf)
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = xearley.Parser(parser_conf,
-                                     self.match,
-                                     ignore=lexer_conf.ignore,
-                                     resolve_ambiguity=resolve_ambiguity,
-                                     debug=debug,
-                                     tree_class=tree_class,
-                                     **kw
-                                     )
-
-    def match(self, term, text, index=0):
-        return self.regexps[term.name].match(text, index)
-
-    def _prepare_match(self, lexer_conf):
+class EarleyRegexpMatcher:
+    def __init__(self, lexer_conf):
         self.regexps = {}
         for t in lexer_conf.tokens:
             if t.priority != 1:
@@ -230,31 +186,49 @@ class XEarley(_ParserFrontend):


             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
-    def parse(self, text, start):
-        return self._parse(start, text)
+    def match(self, term, text, index=0):
+        return self.regexps[term.name].match(text, index)
 
 
-class XEarley_CompleteLex(XEarley):
-    def __init__(self, *args, **kw):
-        XEarley.__init__(self, *args, complete_lex=True, **kw)
-
-
-class CYK(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_traditional_lexer()
+def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
+    earley_matcher = EarleyRegexpMatcher(lexer_conf)
+    return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)
+
+def _match_earley_basic(term, token):
+    return term.name == token.type
+
+def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
+    return earley.Parser(parser_conf, _match_earley_basic, **kw)
+
+def create_earley_parser(lexer_conf, parser_conf, options):
+    resolve_ambiguity = options.ambiguity == 'resolve'
+    debug = options.debug if options else False
+    tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
+
+    extra = {}
+    if lexer_conf.lexer_type == 'dynamic':
+        f = create_earley_parser__dynamic
+    elif lexer_conf.lexer_type == 'dynamic_complete':
+        extra['complete_lex'] = True
+        f = create_earley_parser__dynamic
+    else:
+        f = create_earley_parser__basic
+
+    return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
+
+
+
+class CYK_FrontEnd:
+    def __init__(self, lexer_conf, parser_conf, options=None):
         self._analysis = GrammarAnalyzer(parser_conf)
         self.parser = cyk.Parser(parser_conf.rules)
 
         self.callbacks = parser_conf.callbacks
 
-    def parse(self, text, start):
-        tokens = list(self.make_lexer(text).lex(None))
-        parse = self._parse(start, tokens)
-        parse = self._transform(parse)
-        return parse
+    def parse(self, lexer_thread, start):
+        tokens = list(lexer_thread.lex(None))
+        tree = self.parser.parse(tokens, start)
+        return self._transform(tree)
 
     def _transform(self, tree):
         subtrees = list(tree.iter_subtrees())


lark/parsers/lalr_parser.py (+2, -1)

@@ -5,13 +5,14 @@
 from copy import deepcopy, copy
 from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token
+from ..utils import Serialize
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
 
 ###{standalone
 
-class LALR_Parser(object):
+class LALR_Parser(Serialize):
     def __init__(self, parser_conf, debug=False):
         analysis = LALR_Analyzer(parser_conf, debug=debug)
         analysis.compute_lalr()


lark/utils.py (+1, -0)

@@ -302,4 +302,5 @@ def _serialize(value, memo):
         return list(value)  # TODO reversible?
     elif isinstance(value, dict):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
+    # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value

tests/test_parser.py (+1, -0)

@@ -2471,6 +2471,7 @@ _TO_TEST = [
         ('contextual', 'lalr'),
 
         ('custom_new', 'lalr'),
+        ('custom_new', 'cyk'),
         ('custom_old', 'earley'),
 ]
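The new test row pairs a custom lexer class with the CYK parser; in the refactored frontend a custom lexer class is wrapped by _wrap_lexer and then treated like any built-in lexer. A hedged end-user sketch of that path (hypothetical WordLexer/WORD names; the shape follows Lark's documented custom-lexer example, shown here with the LALR parser):

    from lark import Lark
    from lark.lexer import Lexer, Token

    class WordLexer(Lexer):
        # Old-style custom lexer interface: lex(text) yields Token objects.
        def __init__(self, lexer_conf):
            pass

        def lex(self, data):
            for word in data.split():
                yield Token('WORD', word)

    grammar = """
    start: WORD+
    %declare WORD
    """

    parser = Lark(grammar, parser='lalr', lexer=WordLexer)
    print(parser.parse("hello world").children)   # two WORD tokens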



