
Refactoring around terminals / tokens

Erez Shinan, 6 years ago
parent commit f048bfb870
5 changed files with 173 additions and 180 deletions:
  1. lark/common.py        +0   -68
  2. lark/lexer.py         +90  -28
  3. lark/load_grammar.py  +78  -82
  4. lark/reconstruct.py   +2   -2
  5. lark/utils.py         +3   -0

lark/common.py  (+0, -68)

@@ -1,10 +1,3 @@
import re
import sys

from .utils import get_regexp_width

Py36 = (sys.version_info[:2] >= (3, 6))


###{standalone
###}
@@ -25,64 +18,3 @@ class ParserConf:
self.start = start



class Pattern(object):
def __init__(self, value, flags=()):
self.value = value
self.flags = frozenset(flags)

def __repr__(self):
return repr(self.to_regexp())

# Pattern Hashing assumes all subclasses have a different priority!
def __hash__(self):
return hash((type(self), self.value, self.flags))
def __eq__(self, other):
return type(self) == type(other) and self.value == other.value and self.flags == other.flags

def to_regexp(self):
raise NotImplementedError()

if Py36:
# Python 3.6 changed syntax for flags in regular expression
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s:%s)' % (f, value))
return value

else:
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s)' % f) + value
return value

class PatternStr(Pattern):
def to_regexp(self):
return self._get_flags(re.escape(self.value))

@property
def min_width(self):
return len(self.value)
max_width = min_width

class PatternRE(Pattern):
def to_regexp(self):
return self._get_flags(self.value)

@property
def min_width(self):
return get_regexp_width(self.to_regexp())[0]
@property
def max_width(self):
return get_regexp_width(self.to_regexp())[1]

class TokenDef(object):
def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority

def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)


lark/lexer.py  (+90, -28)

@@ -2,10 +2,71 @@

import re

from .utils import Str, classify
from .common import PatternStr, PatternRE, TokenDef
from .utils import Str, classify, get_regexp_width, Py36
from .exceptions import UnexpectedCharacters, LexError

class Pattern(object):
def __init__(self, value, flags=()):
self.value = value
self.flags = frozenset(flags)

def __repr__(self):
return repr(self.to_regexp())

# Pattern Hashing assumes all subclasses have a different priority!
def __hash__(self):
return hash((type(self), self.value, self.flags))
def __eq__(self, other):
return type(self) == type(other) and self.value == other.value and self.flags == other.flags

def to_regexp(self):
raise NotImplementedError()

if Py36:
# Python 3.6 changed syntax for flags in regular expression
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s:%s)' % (f, value))
return value

else:
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s)' % f) + value
return value
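
(Aside: a minimal sketch of the two inline-flag syntaxes the branches above emit, using the case-insensitive flag as an example.)

    import re

    # Python 3.6+ accepts scoped inline flags, applied only to their group:
    scoped = '(?i:select)'          # what the Py36 branch of _get_flags builds
    # Older versions only accept a global inline flag, prepended to the pattern:
    prefixed = '(?i)' + 'select'    # what the pre-3.6 branch builds

    assert re.match(scoped, 'SELECT')
    assert re.match(prefixed, 'SeLeCt')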

class PatternStr(Pattern):
def to_regexp(self):
return self._get_flags(re.escape(self.value))

@property
def min_width(self):
return len(self.value)
max_width = min_width

class PatternRE(Pattern):
def to_regexp(self):
return self._get_flags(self.value)

@property
def min_width(self):
return get_regexp_width(self.to_regexp())[0]
@property
def max_width(self):
return get_regexp_width(self.to_regexp())[1]

class TerminalDef(object):
def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority

def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
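
(Aside: a rough usage sketch of the pattern classes that just moved here, assuming the definitions above; the annotated values are what those definitions imply.)

    kw = PatternStr('if')                  # literal pattern, regexp-escaped on demand
    num = PatternRE(r'[0-9]+')             # already a regexp

    kw.to_regexp()                         # 'if'
    kw.min_width, kw.max_width             # (2, 2) -- a literal has a fixed width
    num.to_regexp()                        # '[0-9]+'

    IF = TerminalDef('IF', kw, priority=2)
    repr(IF)                               # "TerminalDef('IF', 'if')"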



###{standalone
class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
@@ -125,8 +186,8 @@ class UnlessCallback:



def _create_unless(tokens):
tokens_by_type = classify(tokens, lambda t: type(t.pattern))
def _create_unless(terminals):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
callback = {}
@@ -144,33 +205,34 @@ def _create_unless(tokens):
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

tokens = [t for t in tokens if t not in embedded_strs]
return tokens, callback
terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(tokens, max_size, match_whole):
def _build_mres(terminals, max_size, match_whole):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while tokens:
while terminals:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(tokens, max_size//2, match_whole)
return _build_mres(terminals, max_size//2, match_whole)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
tokens = tokens[max_size:]
terminals = terminals[max_size:]
return mres

def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)
def build_mres(terminals, match_whole=False):
return _build_mres(terminals, len(terminals), match_whole)
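
(Aside: the comment in _build_mres refers to CPython's cap on named groups per pattern, roughly 100 at the time, which only surfaced as an AssertionError. A small standalone sketch of the symptom and the fallback:)

    import re

    many = '|'.join('(?P<T%d>x%d)' % (i, i) for i in range(200))
    try:
        re.compile(many)
        print('compiled fine -- newer Pythons lifted the limit')
    except (AssertionError, re.error):
        # This is the case _build_mres handles: retry with half as many
        # terminals per compiled regexp (max_size // 2).
        print('group limit hit')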

def _regexp_has_newline(r):
"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
- escaped newline (\n)
- escaped newline (\\n)
- anything but ([^...])
- any-char (.) when the flag (?s) exists
"""
@@ -188,48 +250,48 @@ class Lexer:
lex = NotImplemented

class TraditionalLexer(Lexer):
def __init__(self, tokens, ignore=(), user_callbacks={}):
assert all(isinstance(t, TokenDef) for t in tokens), tokens
def __init__(self, terminals, ignore=(), user_callbacks={}):
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

tokens = list(tokens)
terminals = list(terminals)

# Sanitization
for t in tokens:
for t in terminals:
try:
re.compile(t.pattern.to_regexp())
except:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

assert set(ignore) <= {t.name for t in tokens}
assert set(ignore) <= {t.name for t in terminals}

# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = list(ignore)

tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

tokens, self.callback = _create_unless(tokens)
terminals, self.callback = _create_unless(terminals)
assert all(self.callback.values())

for type_, f in user_callbacks.items():
assert type_ not in self.callback
self.callback[type_] = f

self.tokens = tokens
self.terminals = terminals

self.mres = build_mres(tokens)
self.mres = build_mres(terminals)

def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


class ContextualLexer(Lexer):
def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {}
for t in tokens:
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

@@ -247,7 +309,7 @@ class ContextualLexer(Lexer):

self.lexers[state] = lexer

self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

self.set_parser_state(None) # Needs to be set on the outside



lark/load_grammar.py  (+78, -82)

@@ -2,17 +2,15 @@

import os.path
import sys
from itertools import chain
import re
from ast import literal_eval
from copy import deepcopy

from .lexer import Token
from .lexer import Token, TerminalDef, PatternStr, PatternRE


from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
@@ -99,7 +97,7 @@ TERMINALS = {
RULES = {
'start': ['_list'],
'_list': ['_item', '_list _item'],
'_item': ['rule', 'token', 'statement', '_NL'],
'_item': ['rule', 'term', 'statement', '_NL'],

'rule': ['RULE _COLON expansions _NL',
'RULE _DOT NUMBER _COLON expansions _NL'],
@@ -135,7 +133,7 @@ RULES = {
'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],

'token': ['TERMINAL _COLON expansions _NL',
'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare'],
'ignore': ['_IGNORE expansions _NL'],
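
(Aside: in grammar text, the renamed 'term' rule matches terminal definitions such as these -- an illustrative grammar, shown as the Python string it would normally be loaded from:)

    grammar = r"""
    start: NUMBER NAME

    NAME: /[a-z]+/         // plain definition:  TERMINAL _COLON expansions _NL
    NUMBER.2: /[0-9]+/     // with a priority:   TERMINAL _DOT NUMBER _COLON expansions _NL
    """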
@@ -275,58 +273,58 @@ class CanonizeTree(Transformer_InPlace):
return tokenmods + [value]

class PrepareAnonTerminals(Transformer_InPlace):
"Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"
"Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"

def __init__(self, tokens):
self.tokens = tokens
self.token_set = {td.name for td in self.tokens}
self.token_reverse = {td.pattern: td for td in tokens}
def __init__(self, terminals):
self.terminals = terminals
self.term_set = {td.name for td in self.terminals}
self.term_reverse = {td.pattern: td for td in terminals}
self.i = 0


@inline_args
def pattern(self, p):
value = p.value
if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

token_name = None
term_name = None

if isinstance(p, PatternStr):
try:
# If already defined, use the user-defined token name
token_name = self.token_reverse[p].name
# If already defined, use the user-defined terminal name
term_name = self.term_reverse[p].name
except KeyError:
# Try to assign an indicative anon-token name
# Try to assign an indicative anon-terminal name
try:
token_name = _TERMINAL_NAMES[value]
term_name = _TERMINAL_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
with suppress(UnicodeEncodeError):
value.upper().encode('ascii') # Make sure we don't have unicode in our token names
token_name = value.upper()
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
term_name = value.upper()

if token_name in self.token_set:
token_name = None
if term_name in self.term_set:
term_name = None

elif isinstance(p, PatternRE):
if p in self.token_reverse: # Kind of a wierd placement.name
token_name = self.token_reverse[p].name
if p in self.term_reverse: # Kind of a wierd placement.name
term_name = self.term_reverse[p].name
else:
assert False, p

if token_name is None:
token_name = '__ANON_%d' % self.i
if term_name is None:
term_name = '__ANON_%d' % self.i
self.i += 1

if token_name not in self.token_set:
assert p not in self.token_reverse
self.token_set.add(token_name)
tokendef = TokenDef(token_name, p)
self.token_reverse[p] = tokendef
self.tokens.append(tokendef)
if term_name not in self.term_set:
assert p not in self.term_reverse
self.term_set.add(term_name)
termdef = TerminalDef(term_name, p)
self.term_reverse[p] = termdef
self.terminals.append(termdef)

return Terminal(token_name, filter_out=isinstance(p, PatternStr))
return Terminal(term_name, filter_out=isinstance(p, PatternStr))
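
(Aside: roughly how the anonymous terminals that appear inline in rules end up being named -- illustrative values; the exact names depend on _TERMINAL_NAMES and on what the user already defined:)

    anon_naming_examples = {
        '"if"':     'IF',         # alphanumeric literal -> its uppercase form
        '"("':      'LPAR',       # assuming _TERMINAL_NAMES maps "(" to that name
        '/[0-9]+/': '__ANON_0',   # bare regexp with no matching user terminal
        '"then"':   '__ANON_1',   # literal whose uppercase name is already taken
    }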


def _rfind(s, choices):
@@ -391,7 +389,7 @@ class PrepareLiterals(Transformer_InPlace):
return ST('pattern', [PatternRE(regexp)])


class TokenTreeToPattern(Transformer):
class TerminalTreeToPattern(Transformer):
def pattern(self, ps):
p ,= ps
return p
@@ -401,14 +399,14 @@ class TokenTreeToPattern(Transformer):
if len(items) == 1:
return items[0]
if len({i.flags for i in items}) > 1:
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())

def expansions(self, exps):
if len(exps) == 1:
return exps[0]
if len({i.flags for i in exps}) > 1:
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)
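
(Aside: a sketch of what this transformer produces for the two common terminal shapes, assuming INT was defined as /[0-9]+/ and already inlined:)

    # FLOAT: INT "." INT       -> expansion(): patterns concatenated
    float_regexp = r'[0-9]+\.[0-9]+'
    # BOOL: "true" | "false"   -> expansions(): alternatives wrapped in a group
    bool_regexp = r'(?:true|false)'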

def expr(self, args):
@@ -446,39 +444,39 @@ def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])

class Grammar:
def __init__(self, rule_defs, token_defs, ignore):
self.token_defs = token_defs
def __init__(self, rule_defs, term_defs, ignore):
self.term_defs = term_defs
self.rule_defs = rule_defs
self.ignore = ignore

def compile(self):
# We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once.
token_defs = deepcopy(list(self.token_defs))
term_defs = deepcopy(list(self.term_defs))
rule_defs = deepcopy(self.rule_defs)

# =================
# Compile Tokens
# =================
# ===================
# Compile Terminals
# ===================

# Convert token-trees to strings/regexps
transformer = PrepareLiterals() * TokenTreeToPattern()
for name, (token_tree, priority) in token_defs:
if token_tree is None: # Terminal added through %declare
# Convert terminal-trees to strings/regexps
transformer = PrepareLiterals() * TerminalTreeToPattern()
for name, (term_tree, priority) in term_defs:
if term_tree is None: # Terminal added through %declare
continue
expansions = list(token_tree.find_data('expansion'))
expansions = list(term_tree.find_data('expansion'))
if len(expansions) == 1 and not expansions[0].children:
raise GrammarError("Terminals cannot be empty (%s)" % name)

tokens = [TokenDef(name, transformer.transform(token_tree), priority)
for name, (token_tree, priority) in token_defs if token_tree]
terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
for name, (term_tree, priority) in term_defs if term_tree]

# =================
# Compile Rules
# =================

# 1. Pre-process terminals
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals

# 2. Convert EBNF to BNF (and apply step 1)
ebnf_to_bnf = EBNF_to_BNF()
@@ -509,7 +507,7 @@ class Grammar:
rule = Rule(NonTerminal(name), expansion, alias, options)
compiled_rules.append(rule)

return tokens, compiled_rules, self.ignore
return terminals, compiled_rules, self.ignore



@@ -531,16 +529,16 @@ def import_grammar(grammar_path, base_paths=[]):
return _imported_grammars[grammar_path]


def resolve_token_references(token_defs):
def resolve_term_references(term_defs):
# TODO Cycles detection
# TODO Solve with transitive closure (maybe)

token_dict = {k:t for k, (t,_p) in token_defs}
assert len(token_dict) == len(token_defs), "Same name defined twice?"
token_dict = {k:t for k, (t,_p) in term_defs}
assert len(token_dict) == len(term_defs), "Same name defined twice?"

while True:
changed = False
for name, (token_tree, _p) in token_defs:
for name, (token_tree, _p) in term_defs:
if token_tree is None: # Terminal added through %declare
continue
for exp in token_tree.find_data('value'):
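
(Aside: the kind of indirection this function flattens -- an illustrative grammar in which one terminal refers to others by name:)

    grammar = r"""
    DIGIT: "0".."9"
    INT: DIGIT+
    FLOAT: INT "." INT
    """
    # After resolution, FLOAT's tree no longer names INT or DIGIT; the referenced
    # trees are substituted in, so the terminal can be compiled to a single regexp.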
@@ -583,12 +581,12 @@ class PrepareGrammar(Transformer_InPlace):

class GrammarLoader:
def __init__(self):
tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
@@ -609,11 +607,11 @@ class GrammarLoader:
error = e.match_examples(self.parser.parse, {
'Unclosed parenthesis': ['a: (\n'],
'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
'Alias expects lowercase name': ['a: -> "a"\n'],
'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'],
'%import expects a name': ['%import "a"\n'],
'%ignore expects a value': ['%ignore %import\n'],
})
@@ -627,17 +625,16 @@ class GrammarLoader:

# Extract grammar items
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
token_defs = defs.pop('token', [])
term_defs = defs.pop('term', [])
rule_defs = defs.pop('rule', [])
statements = defs.pop('statement', [])
assert not defs

token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]

# Execute statements
ignore = []
declared = []
for (stmt,) in statements:
if stmt.data == 'ignore':
t ,= stmt.children
@@ -672,25 +669,25 @@ class GrammarLoader:
g = import_grammar(grammar_path, base_paths=[base_path])

for name, alias in zip(names, aliases):
token_options = dict(g.token_defs)[name]
assert isinstance(token_options, tuple) and len(token_options)==2
token_defs.append([alias.value, token_options])
term_options = dict(g.term_defs)[name]
assert isinstance(term_options, tuple) and len(term_options)==2
term_defs.append([alias.value, term_options])

elif stmt.data == 'declare':
for t in stmt.children:
token_defs.append([t.value, (None, None)])
term_defs.append([t.value, (None, None)])
else:
assert False, stmt


# Verify correctness 1
for name, _ in token_defs:
for name, _ in term_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

# Handle ignore tokens
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
# inability to handle duplicate tokens (two names, one value)
# inability to handle duplicate terminals (two names, one value)
ignore_names = []
for t in ignore:
if t.data=='expansions' and len(t.children) == 1:
@@ -705,20 +702,19 @@ class GrammarLoader:

name = '__IGNORE_%d'% len(ignore_names)
ignore_names.append(name)
token_defs.append((name, (t, 0)))
term_defs.append((name, (t, 0)))
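
(Aside: the two %ignore shapes this block distinguishes -- an illustrative grammar:)

    grammar = r"""
    start: "a"+
    WS: / +/
    %ignore WS        // a named terminal: its name is used directly
    %ignore /,+/      // an inline pattern: wrapped in a synthetic __IGNORE_n terminal
    """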

# Verify correctness 2
token_names = set()
for name, _ in token_defs:
if name in token_names:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)
terminal_names = set()
for name, _ in term_defs:
if name in terminal_names:
raise GrammarError("Terminal '%s' defined more than once" % name)
terminal_names.add(name)

if set(ignore_names) > token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
if set(ignore_names) > terminal_names:
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

# Resolve token references
resolve_token_references(token_defs)
resolve_term_references(term_defs)

rules = [options_from_rule(*x) for x in rule_defs]

@@ -735,15 +731,15 @@ class GrammarLoader:
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
for sym in used_symbols:
if is_terminal(sym):
if sym not in token_names:
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

# TODO don't include unused tokens, they can only cause trouble!
# TODO don't include unused terminals, they can only cause trouble!

return Grammar(rules, token_defs, ignore_names)
return Grammar(rules, term_defs, ignore_names)





lark/reconstruct.py  (+2, -2)

@@ -2,8 +2,8 @@ from collections import defaultdict

from .tree import Tree
from .visitors import Transformer_InPlace
from .common import ParserConf, PatternStr
from .lexer import Token
from .common import ParserConf
from .lexer import Token, PatternStr
from .parsers import earley, resolve_ambig
from .grammar import Rule, Terminal, NonTerminal



lark/utils.py  (+3, -0)

@@ -1,5 +1,8 @@
import sys
from collections import deque

Py36 = (sys.version_info[:2] >= (3, 6))

class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))

