
Refactoring around terminals / tokens

Erez Shinan, 6 years ago
commit f048bfb870
5 changed files with 173 additions and 180 deletions
  1. lark/common.py        +0  -68
  2. lark/lexer.py         +90 -28
  3. lark/load_grammar.py  +78 -82
  4. lark/reconstruct.py   +2  -2
  5. lark/utils.py         +3  -0
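For illustration only (not part of the commit): based on the moves visible in the diffs below, the pattern classes and TokenDef leave lark.common and reappear in lark.lexer, with TokenDef renamed to TerminalDef. A minimal sketch of what downstream code would import, assuming a lark version that includes this refactor:

    # Before this commit: from lark.common import PatternStr, PatternRE, TokenDef
    # After this commit:  the same classes live in lark.lexer, TokenDef -> TerminalDef
    from lark.lexer import PatternStr, PatternRE, TerminalDef

    NUMBER = TerminalDef('NUMBER', PatternRE(r'\d+'), priority=1)
    PLUS = TerminalDef('PLUS', PatternStr('+'))

    print(NUMBER)                    # TerminalDef('NUMBER', '\\d+')
    print(PLUS.pattern.to_regexp())  # \+  (PatternStr escapes its literal value)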

lark/common.py  (+0, -68)

@@ -1,10 +1,3 @@
-import re
-import sys
-
-from .utils import get_regexp_width
-
-Py36 = (sys.version_info[:2] >= (3, 6))
-

 ###{standalone
 ###}
@@ -25,64 +18,3 @@ class ParserConf:
         self.start = start


-
-class Pattern(object):
-    def __init__(self, value, flags=()):
-        self.value = value
-        self.flags = frozenset(flags)
-
-    def __repr__(self):
-        return repr(self.to_regexp())
-
-    # Pattern Hashing assumes all subclasses have a different priority!
-    def __hash__(self):
-        return hash((type(self), self.value, self.flags))
-    def __eq__(self, other):
-        return type(self) == type(other) and self.value == other.value and self.flags == other.flags
-
-    def to_regexp(self):
-        raise NotImplementedError()
-
-    if Py36:
-        # Python 3.6 changed syntax for flags in regular expression
-        def _get_flags(self, value):
-            for f in self.flags:
-                value = ('(?%s:%s)' % (f, value))
-            return value
-
-    else:
-        def _get_flags(self, value):
-            for f in self.flags:
-                value = ('(?%s)' % f) + value
-            return value
-
-class PatternStr(Pattern):
-    def to_regexp(self):
-        return self._get_flags(re.escape(self.value))
-
-    @property
-    def min_width(self):
-        return len(self.value)
-    max_width = min_width
-
-class PatternRE(Pattern):
-    def to_regexp(self):
-        return self._get_flags(self.value)
-
-    @property
-    def min_width(self):
-        return get_regexp_width(self.to_regexp())[0]
-    @property
-    def max_width(self):
-        return get_regexp_width(self.to_regexp())[1]
-
-class TokenDef(object):
-    def __init__(self, name, pattern, priority=1):
-        assert isinstance(pattern, Pattern), pattern
-        self.name = name
-        self.pattern = pattern
-        self.priority = priority
-
-    def __repr__(self):
-        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
-

lark/lexer.py  (+90, -28)

@@ -2,10 +2,71 @@

 import re

-from .utils import Str, classify
-from .common import PatternStr, PatternRE, TokenDef
+from .utils import Str, classify, get_regexp_width, Py36
 from .exceptions import UnexpectedCharacters, LexError

+
+class Pattern(object):
+    def __init__(self, value, flags=()):
+        self.value = value
+        self.flags = frozenset(flags)
+
+    def __repr__(self):
+        return repr(self.to_regexp())
+
+    # Pattern Hashing assumes all subclasses have a different priority!
+    def __hash__(self):
+        return hash((type(self), self.value, self.flags))
+    def __eq__(self, other):
+        return type(self) == type(other) and self.value == other.value and self.flags == other.flags
+
+    def to_regexp(self):
+        raise NotImplementedError()
+
+    if Py36:
+        # Python 3.6 changed syntax for flags in regular expression
+        def _get_flags(self, value):
+            for f in self.flags:
+                value = ('(?%s:%s)' % (f, value))
+            return value
+
+    else:
+        def _get_flags(self, value):
+            for f in self.flags:
+                value = ('(?%s)' % f) + value
+            return value
+
+class PatternStr(Pattern):
+    def to_regexp(self):
+        return self._get_flags(re.escape(self.value))
+
+    @property
+    def min_width(self):
+        return len(self.value)
+    max_width = min_width
+
+class PatternRE(Pattern):
+    def to_regexp(self):
+        return self._get_flags(self.value)
+
+    @property
+    def min_width(self):
+        return get_regexp_width(self.to_regexp())[0]
+    @property
+    def max_width(self):
+        return get_regexp_width(self.to_regexp())[1]
+
+class TerminalDef(object):
+    def __init__(self, name, pattern, priority=1):
+        assert isinstance(pattern, Pattern), pattern
+        self.name = name
+        self.pattern = pattern
+        self.priority = priority
+
+    def __repr__(self):
+        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
+
+
 ###{standalone
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
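For illustration only (not part of the commit): the Py36 branch in Pattern._get_flags above exists because Python 3.6 added scoped inline flags. A minimal sketch of the two regexp forms it produces, assuming a single 'i' flag:

    import re
    import sys

    value = 'null'
    if sys.version_info[:2] >= (3, 6):
        regexp = '(?%s:%s)' % ('i', value)   # '(?i:null)' - flag scoped to the group (3.6+)
    else:
        regexp = '(?%s)' % 'i' + value       # '(?i)null'  - flag applies to the whole pattern
    assert re.match(regexp, 'NULL')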
@@ -125,8 +186,8 @@ class UnlessCallback:



-def _create_unless(tokens):
-    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
+def _create_unless(terminals):
+    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
     callback = {}
@@ -144,33 +205,34 @@ def _create_unless(tokens):
         if unless:
             callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

-    tokens = [t for t in tokens if t not in embedded_strs]
-    return tokens, callback
+    terminals = [t for t in terminals if t not in embedded_strs]
+    return terminals, callback


-def _build_mres(tokens, max_size, match_whole):
+def _build_mres(terminals, max_size, match_whole):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
-    while tokens:
+    while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
+            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(tokens, max_size//2, match_whole)
+            return _build_mres(terminals, max_size//2, match_whole)

+        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
-        tokens = tokens[max_size:]
+        terminals = terminals[max_size:]
     return mres

-def build_mres(tokens, match_whole=False):
-    return _build_mres(tokens, len(tokens), match_whole)
+def build_mres(terminals, match_whole=False):
+    return _build_mres(terminals, len(terminals), match_whole)

 def _regexp_has_newline(r):
     """Expressions that may indicate newlines in a regexp:
         - newlines (\n)
-        - escaped newline (\n)
+        - escaped newline (\\n)
         - anything but ([^...])
         - any-char (.) when the flag (?s) exists
     """
@@ -188,48 +250,48 @@ class Lexer:
     lex = NotImplemented

 class TraditionalLexer(Lexer):
-    def __init__(self, tokens, ignore=(), user_callbacks={}):
-        assert all(isinstance(t, TokenDef) for t in tokens), tokens
+    def __init__(self, terminals, ignore=(), user_callbacks={}):
+        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

-        tokens = list(tokens)
+        terminals = list(terminals)

         # Sanitization
-        for t in tokens:
+        for t in terminals:
             try:
                 re.compile(t.pattern.to_regexp())
             except:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

             if t.pattern.min_width == 0:
-                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
+                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

-        assert set(ignore) <= {t.name for t in tokens}
+        assert set(ignore) <= {t.name for t in terminals}

         # Init
-        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
+        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = list(ignore)

-        tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
+        terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

-        tokens, self.callback = _create_unless(tokens)
+        terminals, self.callback = _create_unless(terminals)
         assert all(self.callback.values())

         for type_, f in user_callbacks.items():
             assert type_ not in self.callback
             self.callback[type_] = f

-        self.tokens = tokens
+        self.terminals = terminals

-        self.mres = build_mres(tokens)
+        self.mres = build_mres(terminals)

     def lex(self, stream):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


 class ContextualLexer(Lexer):
-    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
+    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
-        for t in tokens:
+        for t in terminals:
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

@@ -247,7 +309,7 @@ class ContextualLexer(Lexer):

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

         self.set_parser_state(None) # Needs to be set on the outside
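For illustration only (not part of the commit): a minimal sketch of driving the renamed API directly, assuming a lark version that includes this refactor. TraditionalLexer is an internal class, so exact behaviour may vary between releases:

    from lark.lexer import TraditionalLexer, TerminalDef, PatternRE, PatternStr

    terminals = [
        TerminalDef('NUMBER', PatternRE(r'\d+')),
        TerminalDef('PLUS', PatternStr('+')),
        TerminalDef('WS', PatternRE(r'\s+')),
    ]
    lexer = TraditionalLexer(terminals, ignore=['WS'])
    for tok in lexer.lex('1 + 23'):
        print(tok.type, repr(str(tok)))   # NUMBER '1' / PLUS '+' / NUMBER '23'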




lark/load_grammar.py  (+78, -82)

@@ -2,17 +2,15 @@

 import os.path
 import sys
-from itertools import chain
-import re
 from ast import literal_eval
 from copy import deepcopy

-from .lexer import Token
+from .lexer import Token, TerminalDef, PatternStr, PatternRE


 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
-from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
+from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
@@ -99,7 +97,7 @@ TERMINALS = {
 RULES = {
     'start': ['_list'],
     '_list': ['_item', '_list _item'],
-    '_item': ['rule', 'token', 'statement', '_NL'],
+    '_item': ['rule', 'term', 'statement', '_NL'],

     'rule': ['RULE _COLON expansions _NL',
              'RULE _DOT NUMBER _COLON expansions _NL'],
@@ -135,7 +133,7 @@ RULES = {
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],

-    'token': ['TERMINAL _COLON expansions _NL',
+    'term': ['TERMINAL _COLON expansions _NL',
              'TERMINAL _DOT NUMBER _COLON expansions _NL'],
     'statement': ['ignore', 'import', 'declare'],
     'ignore': ['_IGNORE expansions _NL'],
@@ -275,58 +273,58 @@ class CanonizeTree(Transformer_InPlace):
         return tokenmods + [value]

 class PrepareAnonTerminals(Transformer_InPlace):
-    "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"
+    "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"

-    def __init__(self, tokens):
-        self.tokens = tokens
-        self.token_set = {td.name for td in self.tokens}
-        self.token_reverse = {td.pattern: td for td in tokens}
+    def __init__(self, terminals):
+        self.terminals = terminals
+        self.term_set = {td.name for td in self.terminals}
+        self.term_reverse = {td.pattern: td for td in terminals}
         self.i = 0


     @inline_args
     def pattern(self, p):
         value = p.value
-        if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
+        if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
             raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

-        token_name = None
+        term_name = None

         if isinstance(p, PatternStr):
             try:
-                # If already defined, use the user-defined token name
-                token_name = self.token_reverse[p].name
+                # If already defined, use the user-defined terminal name
+                term_name = self.term_reverse[p].name
             except KeyError:
-                # Try to assign an indicative anon-token name
+                # Try to assign an indicative anon-terminal name
                 try:
-                    token_name = _TERMINAL_NAMES[value]
+                    term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set:
+                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
                         with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii') # Make sure we don't have unicode in our token names
-                            token_name = value.upper()
+                            value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
+                            term_name = value.upper()

-                if token_name in self.token_set:
-                    token_name = None
+                if term_name in self.term_set:
+                    term_name = None

         elif isinstance(p, PatternRE):
-            if p in self.token_reverse: # Kind of a wierd placement.name
-                token_name = self.token_reverse[p].name
+            if p in self.term_reverse: # Kind of a wierd placement.name
+                term_name = self.term_reverse[p].name
         else:
             assert False, p

-        if token_name is None:
-            token_name = '__ANON_%d' % self.i
+        if term_name is None:
+            term_name = '__ANON_%d' % self.i
             self.i += 1

-        if token_name not in self.token_set:
-            assert p not in self.token_reverse
-            self.token_set.add(token_name)
-            tokendef = TokenDef(token_name, p)
-            self.token_reverse[p] = tokendef
-            self.tokens.append(tokendef)
+        if term_name not in self.term_set:
+            assert p not in self.term_reverse
+            self.term_set.add(term_name)
+            termdef = TerminalDef(term_name, p)
+            self.term_reverse[p] = termdef
+            self.terminals.append(termdef)

-        return Terminal(token_name, filter_out=isinstance(p, PatternStr))
+        return Terminal(term_name, filter_out=isinstance(p, PatternStr))


 def _rfind(s, choices):
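For illustration only (not part of the commit): PrepareAnonTerminals is what turns string and regexp literals written directly inside rules into named terminals, using _TERMINAL_NAMES, an upper-cased keyword, or an __ANON_<n> fallback. A minimal sketch of the user-visible effect through the public Lark API:

    from lark import Lark

    parser = Lark(r"""
        start: "(" NAME ")"
        NAME: /[a-z]+/
    """)
    # "(" and ")" become anonymous terminals behind the scenes (LPAR / RPAR here),
    # and, being plain strings, they are filtered out of the resulting tree.
    print(parser.parse("(hello)"))   # Tree(start, [Token(NAME, 'hello')])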
@@ -391,7 +389,7 @@ class PrepareLiterals(Transformer_InPlace):
         return ST('pattern', [PatternRE(regexp)])


-class TokenTreeToPattern(Transformer):
+class TerminalTreeToPattern(Transformer):
     def pattern(self, ps):
         p ,= ps
         return p
@@ -401,14 +399,14 @@ class TokenTreeToPattern(Transformer):
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:
-            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
+            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
         return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())

     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
         if len({i.flags for i in exps}) > 1:
-            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
+            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
         return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)

     def expr(self, args):
@@ -446,39 +444,39 @@ def _choice_of_rules(rules):
     return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])

 class Grammar:
-    def __init__(self, rule_defs, token_defs, ignore):
-        self.token_defs = token_defs
+    def __init__(self, rule_defs, term_defs, ignore):
+        self.term_defs = term_defs
         self.rule_defs = rule_defs
         self.ignore = ignore

     def compile(self):
         # We change the trees in-place (to support huge grammars)
         # So deepcopy allows calling compile more than once.
-        token_defs = deepcopy(list(self.token_defs))
+        term_defs = deepcopy(list(self.term_defs))
         rule_defs = deepcopy(self.rule_defs)

-        # =================
-        # Compile Tokens
-        # =================
+        # ===================
+        # Compile Terminals
+        # ===================

-        # Convert token-trees to strings/regexps
-        transformer = PrepareLiterals() * TokenTreeToPattern()
-        for name, (token_tree, priority) in token_defs:
-            if token_tree is None: # Terminal added through %declare
+        # Convert terminal-trees to strings/regexps
+        transformer = PrepareLiterals() * TerminalTreeToPattern()
+        for name, (term_tree, priority) in term_defs:
+            if term_tree is None: # Terminal added through %declare
                 continue
-            expansions = list(token_tree.find_data('expansion'))
+            expansions = list(term_tree.find_data('expansion'))
             if len(expansions) == 1 and not expansions[0].children:
                 raise GrammarError("Terminals cannot be empty (%s)" % name)

-        tokens = [TokenDef(name, transformer.transform(token_tree), priority)
-                  for name, (token_tree, priority) in token_defs if token_tree]
+        terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
+                     for name, (term_tree, priority) in term_defs if term_tree]

         # =================
         # Compile Rules
         # =================

         # 1. Pre-process terminals
-        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens)   # Adds to tokens
+        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals)   # Adds to terminals

         # 2. Convert EBNF to BNF (and apply step 1)
         ebnf_to_bnf = EBNF_to_BNF()
@@ -509,7 +507,7 @@ class Grammar:
             rule = Rule(NonTerminal(name), expansion, alias, options)
             compiled_rules.append(rule)

-        return tokens, compiled_rules, self.ignore
+        return terminals, compiled_rules, self.ignore



@@ -531,16 +529,16 @@ def import_grammar(grammar_path, base_paths=[]):
     return _imported_grammars[grammar_path]


-def resolve_token_references(token_defs):
+def resolve_term_references(term_defs):
     # TODO Cycles detection
     # TODO Solve with transitive closure (maybe)

-    token_dict = {k:t for k, (t,_p) in token_defs}
-    assert len(token_dict) == len(token_defs), "Same name defined twice?"
+    token_dict = {k:t for k, (t,_p) in term_defs}
+    assert len(token_dict) == len(term_defs), "Same name defined twice?"

     while True:
         changed = False
-        for name, (token_tree, _p) in token_defs:
+        for name, (token_tree, _p) in term_defs:
             if token_tree is None: # Terminal added through %declare
                 continue
             for exp in token_tree.find_data('value'):
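For illustration only (not part of the commit): resolve_term_references, renamed above from resolve_token_references, is what lets one terminal refer to another by name; the referenced terminal's tree is substituted in before terminals are compiled to regexps. A minimal sketch of the grammar-level feature through the public Lark API:

    from lark import Lark

    parser = Lark(r"""
        start: SIGNED_NUMBER
        NUMBER: /\d+/
        SIGNED_NUMBER: ["+"|"-"] NUMBER    // refers to the NUMBER terminal by name
        %ignore " "
    """)
    print(parser.parse("-42"))   # Tree(start, [Token(SIGNED_NUMBER, '-42')])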
@@ -583,12 +581,12 @@ class PrepareGrammar(Transformer_InPlace):

 class GrammarLoader:
     def __init__(self):
-        tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
+        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs]
         callback = ParseTreeBuilder(rules, ST).create_callback()
-        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
+        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
@@ -609,11 +607,11 @@ class GrammarLoader:
             error = e.match_examples(self.parser.parse, {
                 'Unclosed parenthesis': ['a: (\n'],
                 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
-                'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
+                'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
                 'Alias expects lowercase name': ['a: -> "a"\n'],
                 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
                 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
-                'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
+                'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'],
                 '%import expects a name': ['%import "a"\n'],
                 '%ignore expects a value': ['%ignore %import\n'],
             })
@@ -627,17 +625,16 @@ class GrammarLoader:

         # Extract grammar items
         defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
-        token_defs = defs.pop('token', [])
+        term_defs = defs.pop('term', [])
         rule_defs = defs.pop('rule', [])
         statements = defs.pop('statement', [])
         assert not defs

-        token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
-        token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
+        term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
+        term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]

         # Execute statements
         ignore = []
-        declared = []
         for (stmt,) in statements:
             if stmt.data == 'ignore':
                 t ,= stmt.children
@@ -672,25 +669,25 @@ class GrammarLoader:
                 g = import_grammar(grammar_path, base_paths=[base_path])

                 for name, alias in zip(names, aliases):
-                    token_options = dict(g.token_defs)[name]
-                    assert isinstance(token_options, tuple) and len(token_options)==2
-                    token_defs.append([alias.value, token_options])
+                    term_options = dict(g.term_defs)[name]
+                    assert isinstance(term_options, tuple) and len(term_options)==2
+                    term_defs.append([alias.value, term_options])

             elif stmt.data == 'declare':
                 for t in stmt.children:
-                    token_defs.append([t.value, (None, None)])
+                    term_defs.append([t.value, (None, None)])
             else:
                 assert False, stmt


         # Verify correctness 1
-        for name, _ in token_defs:
+        for name, _ in term_defs:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

         # Handle ignore tokens
         # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
-        #     inability to handle duplicate tokens (two names, one value)
+        #     inability to handle duplicate terminals (two names, one value)
         ignore_names = []
         for t in ignore:
             if t.data=='expansions' and len(t.children) == 1:
@@ -705,20 +702,19 @@ class GrammarLoader:

                 name = '__IGNORE_%d'% len(ignore_names)
                 ignore_names.append(name)
-                token_defs.append((name, (t, 0)))
+                term_defs.append((name, (t, 0)))

         # Verify correctness 2
-        token_names = set()
-        for name, _ in token_defs:
-            if name in token_names:
-                raise GrammarError("Token '%s' defined more than once" % name)
-            token_names.add(name)
+        terminal_names = set()
+        for name, _ in term_defs:
+            if name in terminal_names:
+                raise GrammarError("Terminal '%s' defined more than once" % name)
+            terminal_names.add(name)

-        if set(ignore_names) > token_names:
-            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+        if set(ignore_names) > terminal_names:
+            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

-        # Resolve token references
-        resolve_token_references(token_defs)
+        resolve_term_references(term_defs)

         rules = [options_from_rule(*x) for x in rule_defs]

@@ -735,15 +731,15 @@ class GrammarLoader:
                             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
             for sym in used_symbols:
                 if is_terminal(sym):
-                    if sym not in token_names:
+                    if sym not in terminal_names:
                         raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                 else:
                     if sym not in rule_names:
                         raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

-        # TODO don't include unused tokens, they can only cause trouble!
+        # TODO don't include unused terminals, they can only cause trouble!

-        return Grammar(rules, token_defs, ignore_names)
+        return Grammar(rules, term_defs, ignore_names)








lark/reconstruct.py  (+2, -2)

@@ -2,8 +2,8 @@ from collections import defaultdict

 from .tree import Tree
 from .visitors import Transformer_InPlace
-from .common import ParserConf, PatternStr
-from .lexer import Token
+from .common import ParserConf
+from .lexer import Token, PatternStr
 from .parsers import earley, resolve_ambig
 from .grammar import Rule, Terminal, NonTerminal




lark/utils.py  (+3, -0)

@@ -1,5 +1,8 @@
+import sys
 from collections import deque

+Py36 = (sys.version_info[:2] >= (3, 6))
+
 class fzset(frozenset):
     def __repr__(self):
         return '{%s}' % ', '.join(map(repr, self))

