
Basics for GrammarBuilder

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
MegaIng1 3 years ago
commit 9e545f8825
2 changed files with 216 additions and 79 deletions
  1. +44 -39 lark/lark.py
  2. +172 -40 lark/load_grammar.py

+44 -39 lark/lark.py

@@ -7,7 +7,7 @@ import tempfile
 from warnings import warn
 
 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
-from .load_grammar import load_grammar, FromPackageLoader
+from .load_grammar import load_grammar, FromPackageLoader, Grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -234,42 +234,50 @@ class Lark(Serialize):
         else:
             grammar = read()
 
-        assert isinstance(grammar, STRING_TYPE)
-        self.source_grammar = grammar
-        if self.options.use_bytes:
-            if not isascii(grammar):
-                raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
-            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
-                raise ConfigurationError("`use_bytes=True` may have issues on python2."
-                                         "Use `use_bytes='force'` to use it at your own risk.")
-
         cache_fn = None
-        if self.options.cache:
-            if self.options.parser != 'lalr':
-                raise ConfigurationError("cache only works with parser='lalr' for now")
-            if isinstance(self.options.cache, STRING_TYPE):
-                cache_fn = self.options.cache
-            else:
-                if self.options.cache is not True:
-                    raise ConfigurationError("cache argument must be bool or str")
-                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
-                from . import __version__
-                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
-                s = grammar + options_str + __version__
-                md5 = hashlib.md5(s.encode()).hexdigest()
-                cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
-
-            if FS.exists(cache_fn):
-                logger.debug('Loading grammar from cache: %s', cache_fn)
-                # Remove options that aren't relevant for loading from cache
-                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
-                    del options[name]
-                with FS.open(cache_fn, 'rb') as f:
-                    try:
-                        self._load(f, **options)
-                    except Exception:
-                        raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
-                return
+        if isinstance(grammar, STRING_TYPE):
+            self.source_grammar = grammar
+            if self.options.use_bytes:
+                if not isascii(grammar):
+                    raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
+                if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                    raise ConfigurationError("`use_bytes=True` may have issues on python2."
+                                             "Use `use_bytes='force'` to use it at your own risk.")
+            if self.options.cache:
+                if self.options.parser != 'lalr':
+                    raise ConfigurationError("cache only works with parser='lalr' for now")
+                if isinstance(self.options.cache, STRING_TYPE):
+                    cache_fn = self.options.cache
+                else:
+                    if self.options.cache is not True:
+                        raise ConfigurationError("cache argument must be bool or str")
+                    unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
+                    from . import __version__
+                    options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
+                    s = grammar + options_str + __version__
+                    md5 = hashlib.md5(s.encode()).hexdigest()
+                    cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
+                if FS.exists(cache_fn):
+                    logger.debug('Loading grammar from cache: %s', cache_fn)
+                    # Remove options that aren't relevant for loading from cache
+                    for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
+                        del options[name]
+                    with FS.open(cache_fn, 'rb') as f:
+                        try:
+                            self._load(f, **options)
+                        except Exception:
+                            raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
+                    return
+
+            # Parse the grammar file and compose the grammars
+            self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
+        else:
+            assert isinstance(grammar, Grammar)
+            self.grammar = grammar
 
         if self.options.lexer == 'auto':
             if self.options.parser == 'lalr':
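
For reference, the cache key in the hunk above is the md5 of the grammar text, the stringified hashable options, and the lark version, so changing any of the three produces a fresh cache file. A small illustrative sketch (values made up):

    import hashlib
    import tempfile

    grammar = 'start: "a"\n'      # the grammar text
    options_str = 'parserlalr'    # ''.join(k+str(v)) over the hashable options
    version = '0.11.2'            # lark.__version__
    md5 = hashlib.md5((grammar + options_str + version).encode()).hexdigest()
    cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5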
@@ -301,9 +309,6 @@ class Lark(Serialize):
         if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
             raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
 
-        # Parse the grammar file and compose the grammars
-        self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
-
         if self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
         else:
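
The net effect of the lark.py changes: Lark.__init__ now accepts an already-composed Grammar object in addition to grammar text, and the string-only steps (ascii check, caching, load_grammar) run only for text input. A minimal sketch of both entry points; the GrammarBuilder usage refers to the class added in load_grammar.py below and is an assumption about its intended use:

    from lark import Lark
    from lark.load_grammar import GrammarBuilder

    # Usual path: Lark parses the grammar text itself.
    parser_from_text = Lark('start: NAME\nNAME: /[a-z]+/\n', parser='lalr')

    # New path (sketch): compose a Grammar object first, then hand it to Lark.
    builder = GrammarBuilder()
    builder.load_grammar('start: NAME\nNAME: /[a-z]+/\n')
    parser_from_object = Lark(builder.build(), parser='lalr')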


+172 -40 lark/load_grammar.py

@@ -302,15 +302,6 @@ class RuleTreeToText(Transformer):
         return expansion, alias.value
 
 
-@inline_args
-class CanonizeTree(Transformer_InPlace):
-    def tokenmods(self, *args):
-        if len(args) == 1:
-            return list(args)
-        tokenmods, value = args
-        return tokenmods + [value]
-
-
 class PrepareAnonTerminals(Transformer_InPlace):
     """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""

@@ -871,8 +862,26 @@ def extend_expansions(tree, new):
     tree.children.insert(0, new)
 
 
-class GrammarLoader:
-    ERRORS = [
+def _grammar_parser():
+    try:
+        return _grammar_parser.cache
+    except AttributeError:
+        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
+        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
+        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
+                 for r, _p, xs, o in rules for i, x in enumerate(xs)]
+        callback = ParseTreeBuilder(rules, ST).create_callback()
+        import re
+        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
+        parser_conf = ParserConf(rules, callback, ['start'])
+        lexer_conf.lexer_type = 'standard'
+        parser_conf.parser_type = 'lalr'
+        _grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
+        return _grammar_parser.cache
+
+_GRAMMAR_ERRORS = [
     ('Unclosed parenthesis', ['a: (\n']),
     ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
     ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
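
_grammar_parser memoizes the bootstrap parser on the function object itself, a Python-2-friendly stand-in for functools.lru_cache: the first call builds the ParsingFrontend and stores it as an attribute, later calls return it unchanged. The pattern in isolation (illustrative names):

    def get_resource():
        try:
            return get_resource.cache                     # later calls: return the stored object
        except AttributeError:
            get_resource.cache = {'expensive': 'object'}  # first call: build once, store on the function
            return get_resource.cache

    assert get_resource() is get_resource()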
@@ -886,21 +895,39 @@ class GrammarLoader:
     ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, global_keep_all_tokens):
-        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
+def _parse_grammar(text, name, start='start'):
+    try:
+        return _grammar_parser().parse(text + '\n', start)
+    except UnexpectedCharacters as e:
+        context = e.get_context(text)
+        raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
+                           (e.line, e.column, name, context))
+    except UnexpectedToken as e:
+        context = e.get_context(text)
+        error = e.match_examples(_grammar_parser().parse, _GRAMMAR_ERRORS, use_accepts=True)
+        if error:
+            raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
+        elif 'STRING' in e.expected:
+            raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
+        raise
 
-        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
-        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
-                 for r, _p, xs, o in rules for i, x in enumerate(xs)]
-        callback = ParseTreeBuilder(rules, ST).create_callback()
-        import re
-        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
-        parser_conf = ParserConf(rules, callback, ['start'])
-        lexer_conf.lexer_type = 'standard'
-        parser_conf.parser_type = 'lalr'
-        self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
-
-        self.canonize_tree = CanonizeTree()
+class GrammarLoader:
+    ERRORS = [
+        ('Unclosed parenthesis', ['a: (\n']),
+        ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
+        ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
+        ('Illegal name for rules or terminals', ['Aa:\n']),
+        ('Alias expects lowercase name', ['a: -> "a"\n']),
+        ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']),
+        ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']),
+        ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']),
+        ('Terminal names cannot contain dots', ['A.B\n']),
+        ('%import expects a name', ['%import "a"\n']),
+        ('%ignore expects a value', ['%ignore %import\n']),
+    ]
+
+    def __init__(self, global_keep_all_tokens=False):
         self.global_keep_all_tokens = global_keep_all_tokens
 
     def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
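
_parse_grammar converts raw parser exceptions into GrammarError; on UnexpectedToken it calls match_examples, which replays the example inputs from _GRAMMAR_ERRORS and returns the message whose failure state matches the actual error. A hedged sketch of the resulting behavior, assuming it runs where _parse_grammar is in scope (message text approximate):

    try:
        _parse_grammar('a: (\n', '<demo>')
    except GrammarError as e:
        print(e)   # expected to resemble: "Unclosed parenthesis, at line 1 column ..."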
@@ -931,21 +958,7 @@ class GrammarLoader:
     def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
         """Parse grammar_text, verify, and create Grammar object. Display nice messages on error."""
 
-        try:
-            tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
-        except UnexpectedCharacters as e:
-            context = e.get_context(grammar_text)
-            raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
-                               (e.line, e.column, grammar_name, context))
-        except UnexpectedToken as e:
-            context = e.get_context(grammar_text)
-            error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
-            if error:
-                raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
-            elif 'STRING' in e.expected:
-                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
-            raise
-
+        tree = _parse_grammar(grammar_text+'\n', grammar_name)
         tree = PrepareGrammar().transform(tree)
 
         # Extract grammar items
@@ -1061,7 +1074,7 @@ class GrammarLoader:
raise GrammarError("Cannot override a nonexisting terminal: %s" % name)
term_defs.append(t)
# Extend the definition of rules
# Extend the definition of rules by adding new entries to the `expansions` node

for r in extend_rules:
name = r[0]
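
The reworded comment spells out the mechanics of %extend: the new alternatives are spliced into the existing rule's `expansions` node rather than replacing the definition. A small illustration of the directive, as I understand the feature at this commit:

    from lark import Lark

    parser = Lark('''
    start: command
    command: "start"
    %extend command: "stop"
    ''', parser='lalr')

    print(parser.parse('stop'))   # `command` now matches "start" | "stop"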
@@ -1162,5 +1175,124 @@ class GrammarLoader:
         return Grammar(rule_defs, term_defs, ignore_names)
 
 
+class GrammarBuilder:
+    def __init__(self, global_keep_all_tokens=False, import_paths=None):
+        self.global_keep_all_tokens = global_keep_all_tokens
+        self.import_paths = import_paths or []
+
+        self._term_defs = {}
+        self._rule_defs = {}
+        self._ignore_names = []
+
+    def define_term(self, name, exp, priority=1, override=False):
+        if (name in self._term_defs) ^ override:
+            if override:
+                raise GrammarError("Cannot override a nonexisting terminal: %s" % name)
+            else:
+                raise GrammarError("Terminal '%s' defined more than once" % name)
+        if name.startswith('__'):
+            raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+        self._term_defs[name] = (exp, priority)
+
+    def define_rule(self, name, params, exp, options, override=False):
+        if (name in self._rule_defs) ^ override:
+            if override:
+                raise GrammarError("Cannot override a nonexisting rule: %s" % name)
+            else:
+                raise GrammarError("Rule '%s' defined more than once" % name)
+        if name.startswith('__'):
+            raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+        self._rule_defs[name] = (params, exp, options)
+
+    def extend_term(self, name, exp, priority=1):
+        if name not in self._term_defs:
+            raise GrammarError("Can't extend terminal %s as it wasn't defined before" % name)
+        old_expansions = self._term_defs[name][0]
+        extend_expansions(old_expansions, exp)
+
+    def extend_rule(self, name, params, exp, options):
+        if name not in self._rule_defs:
+            raise GrammarError("Can't extend rule %s as it wasn't defined before" % name)
+        if params != self._rule_defs[name][0]:
+            raise GrammarError("Cannot extend templates with different parameters: %s" % name)
+        # TODO: think about what to do with RuleOptions
+        old_expansions = self._rule_defs[name][1]
+        extend_expansions(old_expansions, exp)
+
+    def ignore(self, exp_or_name):
+        if isinstance(exp_or_name, str):
+            self._ignore_names.append(exp_or_name)
+        else:
+            assert isinstance(exp_or_name, Tree)
+            t = exp_or_name
+            if t.data == 'expansions' and len(t.children) == 1:
+                t2 ,= t.children
+                if t2.data == 'expansion' and len(t2.children) == 1:
+                    item ,= t2.children
+                    if item.data == 'value':
+                        item ,= item.children
+                        if isinstance(item, Token) and item.type == 'TERMINAL':
+                            self._ignore_names.append(item.value)
+                            return
+
+            name = '__IGNORE_%d' % len(self._ignore_names)
+            self._ignore_names.append(name)
+            self._term_defs[name] = (t, 1)
+
+    def declare(self, *names):
+        for name in names:
+            self.define_term(name, None, None)
+
+    def _unpack_term_def(self, tree):
+        name = tree.children[0].value
+        exp = tree.children[-1]
+        p = int(tree.children[1]) if len(tree.children) == 3 else 1
+        return name, exp, p
+
+    def _unpack_rule_def(self, tree):
+        # FIXME: A little pointless at the moment, but I want to rework this (e.g. move the work from `options_from_rule` to here)
+        r = options_from_rule(*tree.children)
+        return r
+
+    def load_grammar(self, grammar_text, grammar_source="<?>"):
+        tree = _parse_grammar(grammar_text, grammar_source)
+        for stmt in tree.children:
+            if stmt.data == 'term':
+                self.define_term(*self._unpack_term_def(stmt))
+                continue
+            elif stmt.data == 'rule':
+                self.define_rule(*self._unpack_rule_def(stmt))
+                continue
+            assert stmt.data == 'statement', stmt.data
+            stmt ,= stmt.children
+            if stmt.data == 'ignore':
+                self.ignore(*stmt.children)
+            elif stmt.data == 'declare':
+                self.declare(*(t.value for t in stmt.children))
+            elif stmt.data == 'override':
+                r ,= stmt.children
+                if r.data == 'rule':
+                    self.define_rule(*self._unpack_rule_def(r), override=True)
+                else:
+                    assert r.data == 'term'
+                    self.define_term(*self._unpack_term_def(r), override=True)
+            elif stmt.data == 'extend':
+                r ,= stmt.children
+                if r.data == 'rule':
+                    self.extend_rule(*self._unpack_rule_def(r))
+                else:
+                    assert r.data == 'term'
+                    self.extend_term(*self._unpack_term_def(r))
+            else:
+                assert False, stmt
+
+    def check(self):
+        pass
+
+    def build(self) -> Grammar:
+        return Grammar([(n, *r) for n, r in self._rule_defs.items()],
+                       [(n, t) for n, t in self._term_defs.items()],
+                       self._ignore_names)
+
 
 def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
     return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
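
Taken together, GrammarBuilder offers a programmatic route to a Grammar: feed it grammar text (or individual definitions), then call build(). A minimal end-to-end sketch; the %override line assumes that directive is already supported by the meta-grammar at this commit:

    from lark.load_grammar import GrammarBuilder

    builder = GrammarBuilder()
    builder.load_grammar('start: NAME\nNAME: /[a-z]+/\n%ignore /\s+/\n')
    # Redefining NAME outright would raise "Terminal 'NAME' defined more than once";
    # an explicit %override replaces the earlier definition instead.
    builder.load_grammar('%override NAME: /[a-z_]+/\n')
    grammar = builder.build()   # -> Grammar(rule_defs, term_defs, ignore_names)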
