
Merge pull request #781 from lark-parser/refactor_frontends

Refactored parser_frontends. Now significantly simpler
Erez Shinan, 4 years ago (committed by GitHub)
commit 692a950488
8 changed files with 164 additions and 177 deletions
  1. lark/common.py (+9 -3)
  2. lark/exceptions.py (+7 -0)
  3. lark/lark.py (+1 -5)
  4. lark/load_grammar.py (+4 -3)
  5. lark/parser_frontends.py (+139 -165)
  6. lark/parsers/lalr_parser.py (+2 -1)
  7. lark/utils.py (+1 -0)
  8. tests/test_parser.py (+1 -0)

lark/common.py (+9 -3)

@@ -5,7 +5,7 @@ from .lexer import TerminalDef


class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
self.skip_validation = skip_validation
self.use_bytes = use_bytes

###}
self.lexer_type = None


class ParserConf(Serialize):
__serialize_fields__ = 'rules', 'start', 'parser_type'

class ParserConf:
def __init__(self, rules, callbacks, start):
assert isinstance(start, list)
self.rules = rules
self.callbacks = callbacks
self.start = start

self.parser_type = None

###}
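
Below, a minimal sketch (not part of the diff) of how the new type tags are meant to be used: the constructors keep their existing signatures, and the frontend machinery assigns `lexer_type` / `parser_type` after construction. The empty arguments are placeholders for illustration.

```python
import re
from lark.common import LexerConf, ParserConf

# Placeholder arguments; real usage passes the grammar's terminals, rules and callbacks.
lexer_conf = LexerConf([], re)               # tokens=[], re_module=re
lexer_conf.lexer_type = 'standard'           # assigned later by MakeParsingFrontend

parser_conf = ParserConf([], {}, ['start'])  # rules=[], callbacks={}, start=['start']
parser_conf.parser_type = 'lalr'             # likewise assigned by the frontend factory
```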

lark/exceptions.py (+7 -0)

@@ -11,6 +11,11 @@ class ConfigurationError(LarkError, ValueError):
pass


def assert_config(value, options, msg='Got %r, expected one of %s'):
if value not in options:
raise ConfigurationError(msg % (value, options))


class GrammarError(LarkError):
pass

@@ -198,4 +203,6 @@ class VisitError(LarkError):

message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)


###}
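
A small usage sketch of the relocated `assert_config` helper (the options tuple and the try/except are illustrative, not from the diff):

```python
from lark.exceptions import assert_config, ConfigurationError

assert_config('lalr', ('lalr', 'earley', 'cyk'))      # valid value: passes silently

try:
    assert_config('peg', ('lalr', 'earley', 'cyk'))   # invalid value
except ConfigurationError as e:
    print(e)  # Got 'peg', expected one of ('lalr', 'earley', 'cyk')
```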

lark/lark.py (+1 -5)

@@ -1,5 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config

import sys, os, pickle, hashlib
from io import open
@@ -24,10 +24,6 @@ except ImportError:

###{standalone

def assert_config(value, options, msg='Got %r, expected one of %s'):
if value not in options:
raise ConfigurationError(msg % (value, options))


class LarkOptions(Serialize):
"""Specifies the options for Lark


lark/load_grammar.py (+4 -3)

@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})

self.canonize_tree = CanonizeTree()
self.global_keep_all_tokens = global_keep_all_tokens


lark/parser_frontends.py (+139 -165)

@@ -1,12 +1,11 @@
from .exceptions import ConfigurationError, GrammarError
from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree
from .common import LexerConf
from .common import LexerConf, ParserConf
try:
import regex
except ImportError:
@@ -27,56 +26,106 @@ def _wrap_lexer(lexer_class):
return self.lexer.lex(lexer_state.text)
return CustomLexerWrapper

def get_frontend(parser, lexer):
if parser=='lalr':
if lexer is None:
raise ConfigurationError('The LALR parser requires use of a lexer')
elif lexer == 'standard':
return LALR_TraditionalLexer
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class LALR_CustomLexerWrapper(LALR_WithLexer):
def init_lexer(self):
self.lexer = wrapped(self.lexer_conf)
return LALR_CustomLexerWrapper
else:
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser=='earley':
if lexer=='standard':
return Earley_Traditional
elif lexer=='dynamic':
return XEarley
elif lexer=='dynamic_complete':
return XEarley_CompleteLex
elif lexer=='contextual':
raise ConfigurationError('The Earley parser does not support the contextual parser')
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class Earley_CustomLexerWrapper(Earley_WithLexer):
def init_lexer(self, **kw):
self.lexer = wrapped(self.lexer_conf)
return Earley_CustomLexerWrapper

class MakeParsingFrontend:
def __init__(self, parser_type, lexer_type):
self.parser_type = parser_type
self.lexer_type = lexer_type

def __call__(self, lexer_conf, parser_conf, options):
assert isinstance(lexer_conf, LexerConf)
assert isinstance(parser_conf, ParserConf)
parser_conf.parser_type = self.parser_type
lexer_conf.lexer_type = self.lexer_type
return ParsingFrontend(lexer_conf, parser_conf, options)

@classmethod
def deserialize(cls, data, memo, callbacks, options):
lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
parser_conf.callbacks = callbacks

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]

lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
lexer_conf.re_module = regex if options.regex else re
lexer_conf.use_bytes = options.use_bytes
lexer_conf.g_regex_flags = options.g_regex_flags
lexer_conf.skip_validation = True
lexer_conf.postlex = options.postlex

return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)




class ParsingFrontend(Serialize):
__serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'

def __init__(self, lexer_conf, parser_conf, options, parser=None):
self.parser_conf = parser_conf
self.lexer_conf = lexer_conf
self.options = options

# Set-up parser
if parser: # From cache
self.parser = parser
else:
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser == 'cyk':
if lexer == 'standard':
return CYK
create_parser = {
'lalr': create_lalr_parser,
'earley': create_earley_parser,
'cyk': CYK_FrontEnd,
}[parser_conf.parser_type]
self.parser = create_parser(lexer_conf, parser_conf, options)

# Set-up lexer
lexer_type = lexer_conf.lexer_type
self.skip_lexer = False
if lexer_type in ('dynamic', 'dynamic_complete'):
self.skip_lexer = True
return

try:
create_lexer = {
'standard': create_traditional_lexer,
'contextual': create_contextual_lexer,
}[lexer_type]
except KeyError:
assert issubclass(lexer_type, Lexer), lexer_type
self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
else:
raise ConfigurationError('CYK parser requires using standard parser.')
else:
raise ConfigurationError('Unknown parser: %s' % parser)
self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)

class _ParserFrontend(Serialize):
def _parse(self, start, input, *args):
def parse(self, text, start=None):
if start is None:
start = self.start
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)

if self.skip_lexer:
return self.parser.parse(text, start)

lexer_thread = LexerThread(self.lexer, text)
return self.parser.parse(lexer_thread, start)


def get_frontend(parser, lexer):
assert_config(parser, ('lalr', 'earley', 'cyk'))
if not isinstance(lexer, type): # not custom lexer?
expected = {
'lalr': ('standard', 'contextual'),
'earley': ('standard', 'dynamic', 'dynamic_complete'),
'cyk': ('standard', ),
}[parser]
assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)

return MakeParsingFrontend(parser, lexer)


def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +149,26 @@ class PostLexConnector:
return self.postlexer.process(i)


class WithLexer(_ParserFrontend):
lexer = None
parser = None
lexer_conf = None
start = None

__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex

@classmethod
def deserialize(cls, data, memo, callbacks, options):
inst = super(WithLexer, cls).deserialize(data, memo)

inst.postlex = options.postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
inst.lexer_conf.re_module = regex if options.regex else re
inst.lexer_conf.use_bytes = options.use_bytes
inst.lexer_conf.g_regex_flags = options.g_regex_flags
inst.lexer_conf.skip_validation = True
inst.init_lexer()

return inst

def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def make_lexer(self, text):
lexer = self.lexer
if self.postlex:
lexer = PostLexConnector(self.lexer, self.postlex)
return LexerThread(lexer, text)

def parse(self, text, start=None):
return self._parse(start, self.make_lexer(text))

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)

self.init_lexer()
def create_traditional_lexer(lexer_conf, parser, postlex):
return TraditionalLexer(lexer_conf)

def init_lexer(self, **kw):
raise NotImplementedError()
def create_contextual_lexer(lexer_conf, parser, postlex):
states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
always_accept = postlex.always_accept if postlex else ()
return ContextualLexer(lexer_conf, states, always_accept=always_accept)

class LALR_TraditionalLexer(LALR_WithLexer):
def init_lexer(self):
self.init_traditional_lexer()
def create_lalr_parser(lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
return LALR_Parser(parser_conf, debug=debug)

class LALR_ContextualLexer(LALR_WithLexer):
def init_lexer(self):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)

create_earley_parser = NotImplemented
CYK_FrontEnd = NotImplemented
###}


class Earley_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

def match(self, term, token):
return term.name == token.type

def init_lexer(self, **kw):
raise NotImplementedError()

class Earley_Traditional(Earley_WithLexer):
def init_lexer(self, **kw):
self.init_traditional_lexer()


class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = xearley.Parser(parser_conf,
self.match,
ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity,
debug=debug,
tree_class=tree_class,
**kw
)

def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

def _prepare_match(self, lexer_conf):
class EarleyRegexpMatcher:
def __init__(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
if t.priority != 1:
@@ -230,31 +186,49 @@ class XEarley(_ParserFrontend):

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(start, text)
def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
XEarley.__init__(self, *args, complete_lex=True, **kw)

def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
earley_matcher = EarleyRegexpMatcher(lexer_conf)
return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)

def _match_earley_basic(term, token):
return term.name == token.type

class CYK(WithLexer):
def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
return earley.Parser(parser_conf, _match_earley_basic, **kw)

def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_traditional_lexer()
def create_earley_parser(lexer_conf, parser_conf, options):
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None

extra = {}
if lexer_conf.lexer_type == 'dynamic':
f = create_earley_parser__dynamic
elif lexer_conf.lexer_type == 'dynamic_complete':
extra['complete_lex'] =True
f = create_earley_parser__dynamic
else:
f = create_earley_parser__basic

return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)



class CYK_FrontEnd:
def __init__(self, lexer_conf, parser_conf, options=None):
self._analysis = GrammarAnalyzer(parser_conf)
self.parser = cyk.Parser(parser_conf.rules)

self.callbacks = parser_conf.callbacks

def parse(self, text, start):
tokens = list(self.make_lexer(text).lex(None))
parse = self._parse(start, tokens)
parse = self._transform(parse)
return parse
def parse(self, lexer_thread, start):
tokens = list(lexer_thread.lex(None))
tree = self.parser.parse(tokens, start)
return self._transform(tree)

def _transform(self, tree):
subtrees = list(tree.iter_subtrees())
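
A hedged sketch of the new construction path introduced by this refactor: `get_frontend()` validates the (parser, lexer) pair and returns a `MakeParsingFrontend` factory, which stamps the type tags onto the configs and builds a single `ParsingFrontend`. The commented-out lines assume `lexer_conf`, `parser_conf` and `options` from Lark's usual setup and are not constructed here.

```python
from lark.exceptions import ConfigurationError
from lark.parser_frontends import get_frontend

# Valid combination: returns a MakeParsingFrontend factory instead of a per-combination class.
make_frontend = get_frontend('lalr', 'contextual')

# Unsupported pairing: rejected up front by assert_config.
try:
    get_frontend('earley', 'contextual')
except ConfigurationError as e:
    print(e)

# Hypothetical usage, assuming configs/options built elsewhere by Lark:
# frontend = make_frontend(lexer_conf, parser_conf, options)
# tree = frontend.parse("some input", start='start')
```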


lark/parsers/lalr_parser.py (+2 -1)

@@ -5,13 +5,14 @@
from copy import deepcopy, copy
from ..exceptions import UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet

###{standalone

class LALR_Parser(object):
class LALR_Parser(Serialize):
def __init__(self, parser_conf, debug=False):
analysis = LALR_Analyzer(parser_conf, debug=debug)
analysis.compute_lalr()
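
Deriving `LALR_Parser` from `Serialize` is what allows the new `ParsingFrontend` to list `parser` among its `__serialize_fields__`. A round-trip sketch using Lark's public save/load API (the grammar is a throwaway example; the tie-in to `MakeParsingFrontend.deserialize` is an assumption based on the diff above):

```python
from lark import Lark

parser = Lark('start: "a"+', parser='lalr')

with open('parser.bin', 'wb') as f:
    parser.save(f)             # serializes the frontend, including the LALR tables

with open('parser.bin', 'rb') as f:
    restored = Lark.load(f)    # presumably restored via the deserialize path above

print(restored.parse('aaa'))
```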


lark/utils.py (+1 -0)

@@ -302,4 +302,5 @@ def _serialize(value, memo):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value

tests/test_parser.py (+1 -0)

@@ -2471,6 +2471,7 @@ _TO_TEST = [
('contextual', 'lalr'),

('custom_new', 'lalr'),
('custom_new', 'cyk'),
('custom_old', 'earley'),
]


