
Moved unless handling to lexer and improved code

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Parent
revision 387fa07818
4 changed files with 97 additions and 75 deletions
  1. +7  -18  lark/lark.py
  2. +71 -19  lark/lexer.py
  3. +12 -31  lark/load_grammar.py
  4. +7  -7   lark/parsers/lalr_parser.py

+7 -18  lark/lark.py

@@ -112,39 +112,30 @@ class Lark:

self.tokens, self.rules = load_grammar(grammar)

self.lexer = self._build_lexer()
if not self.options.only_lex:
self.parser_engine = ENGINE_DICT[self.options.parser]()
self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
self.parser = self._build_parser()

if self.profiler: self.profiler.enter_section('outside_lark')
self.lexer = self._build_lexer()

if self.profiler: self.profiler.enter_section('outside_lark')

def _create_unless_callback(self, strs):
def f(t):
if t in strs:
t.type = strs[t]
return t
return f
__init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

def _build_lexer(self):
ignore_tokens = []
tokens = []
callbacks = {}
for name, value, flags in self.tokens:
for tokendef, flags in self.tokens:
for flag in flags:
if flag == 'ignore':
ignore_tokens.append(name)
elif isinstance(flag, tuple) and flag[0] == 'unless':
_, strs = flag
callbacks[name] = self._create_unless_callback(strs)
ignore_tokens.append(tokendef.name)
else:
raise GrammarError("No such flag: %s" % flag)

tokens.append((name, value))
return Lexer(tokens, callbacks, ignore=ignore_tokens)
tokens.append(tokendef)

return Lexer(tokens, ignore=ignore_tokens)

def _build_parser(self):
rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
@@ -155,8 +146,6 @@ class Lark:
return self.parser_engine.build_parser(rules, callback, self.options.start)


__init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

def lex(self, text):
stream = self.lexer.lex(text)
if self.options.postlex:
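
With the 'unless' logic gone from lark.py, the only flag that _build_lexer still interprets is 'ignore', and it hands whole TokenDef objects to the Lexer. A minimal standalone illustration of that data shape (FakeTokenDef and token_defs are made-up stand-ins, not lark classes; the real loop also raises GrammarError on unknown flags):

class FakeTokenDef:
    # Stand-in for lark's TokenDef, only to show the (tokendef, flags) shape.
    def __init__(self, name, value):
        self.name, self.value = name, value

token_defs = [(FakeTokenDef('NAME', r'[a-z]+'), []),
              (FakeTokenDef('WS', r'\s+'), ['ignore'])]

ignore_tokens = [td.name for td, flags in token_defs if 'ignore' in flags]
tokens = [td for td, _flags in token_defs]
print(ignore_tokens)             # ['WS']
print([t.name for t in tokens])  # ['NAME', 'WS']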


+71 -19  lark/lexer.py

@@ -2,11 +2,32 @@

import re

from .utils import Str
from .utils import Str, classify

class LexError(Exception):
pass

class TokenDef(object):
def __init__(self, name, value):
self.name = name
self.value = value

def __repr__(self):
return ('%s(%r, %r)' % (type(self).__name__, self.name, self.value))

class TokenDef__Str(TokenDef):
def to_regexp(self):
return re.escape(self.value)

priority = 0

class TokenDef__Regexp(TokenDef):
def to_regexp(self):
return self.value

priority = 1


class UnexpectedInput(LexError):
def __init__(self, seq, lex_pos, line, column):
context = seq[lex_pos:lex_pos+5]
@@ -41,31 +62,63 @@ class Regex:
self.pattern = pattern
self.flags = flags

def _regexp_has_newline(r):
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

def _create_unless_callback(strs):
def f(t):
if t in strs:
t.type = strs[t]
return t
return f

def _create_unless(tokens):
tokens_by_type = classify(tokens, type)
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
callback = {}
for retok in tokens_by_type.get(TokenDef__Regexp, []):
unless = {}
for strtok in tokens_by_type.get(TokenDef__Str, []):
m = re.match(retok.value, strtok.value)
if m and m.group(0) == strtok.value:
embedded_strs.add(strtok.name)
unless[strtok.value] = strtok.name
if unless:
callback[retok.name] = _create_unless_callback(unless)

tokens = [t for t in tokens if t.name not in embedded_strs]
return tokens, callback


class Lexer(object):
def __init__(self, tokens, callbacks, ignore=()):
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens)

self.ignore = ignore
self.newline_char = '\n'
tokens = list(tokens)

# Sanitization
token_names = {t[0] for t in tokens}
for t in tokens:
try:
re.compile(t[1])
re.compile(t.to_regexp())
except:
raise LexError("Cannot compile token: %s: %s" % t)

token_names = {t.name for t in tokens}
assert all(t in token_names for t in ignore)

# Init
self.tokens = tokens
self.callbacks = callbacks
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
self.ignore_types = [t for t in ignore]

self.token_types = list(token_names)
self.type_index = {name:i for i,name in enumerate(self.token_types)}
tokens, self.callback = _create_unless(tokens)
assert all(self.callback.values())

self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
self.ignore_types = [self.type_index[t] for t in ignore]
tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)

self.tokens = tokens

self.mres = self._build_mres(tokens, len(tokens))

@@ -77,11 +130,11 @@ class Lexer(object):
mres = []
while tokens:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
except AssertionError: # Yes, this is what Python provides us.. :/
return self._build_mres(tokens, max_size//2)

mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
tokens = tokens[max_size:]
return mres

@@ -96,16 +149,16 @@ class Lexer(object):
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_num = type_from_index[m.lastindex]
if type_num not in ignore_types:
t = Token(self.token_types[type_num], value, lex_pos)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
if t.type in self.callbacks:
t = self.callbacks[t.type](t)
if t.type in self.callback:
t = self.callback[t.type](t)
yield t

if type_num in newline_types:
if type_ in newline_types:
newlines = value.count(self.newline_char)
if newlines:
line += newlines
@@ -117,4 +170,3 @@ class Lexer(object):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break
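
The heart of the move is _create_unless / _create_unless_callback in lexer.py: a string token whose text is fully matched by a regexp token is dropped from the token list, and a callback re-types matching tokens after lexing. Below is a standalone sketch of that idea with simplified stand-ins (plain (type, value) tuples instead of lark's Token objects, and NAME_RE / keywords as made-up inputs):

import re

def create_unless_callback(strs):
    # strs maps matched text to the token name it should become, e.g. {'if': 'IF'}
    def callback(type_, value):
        return (strs.get(value, type_), value)
    return callback

NAME_RE = r'[A-Za-z_]\w*'
keywords = {'if': 'IF', 'else': 'ELSE'}

# Each keyword is fully matched by NAME_RE, so it counts as "embedded"
# and would be removed from the token list by _create_unless:
assert all(re.match(NAME_RE, kw).group(0) == kw for kw in keywords)

retype = create_unless_callback(keywords)
tokens = [retype('NAME', m.group(0)) for m in re.finditer(NAME_RE, 'if foo else bar')]
print(tokens)  # [('IF', 'if'), ('NAME', 'foo'), ('ELSE', 'else'), ('NAME', 'bar')]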



+12 -31  lark/load_grammar.py

@@ -1,7 +1,7 @@
import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput
from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp

from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
@@ -278,7 +278,8 @@ class ExtractAnonTokens(InlineTransformer):

class GrammarLoader:
def __init__(self):
self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
self.lexer = Lexer(tokens, ignore=['WS', 'COMMENT'])

d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -312,47 +313,27 @@ class GrammarLoader:
extract_anon = ExtractAnonTokens(tokens, token_set)
tree = extract_anon.transform(tree) # Adds to tokens

tokens2 = []
token_ref = {}
tokendefs = []
for name, token, flags in tokens:
value = token.value[1:-1]
if r'\u' in value:
# XXX for now, you can't mix unicode escaping and unicode characters at the same token
value = unicode_escape(value)[0]
tokens2.append((name, token.type, value, flags))

token_ref = {}
re_tokens = []
str_tokens = []
for name, type_, value, flags in tokens2:
if type_ == 'STRING':
str_tokens.append((name, value, flags))
else:
assert type_ == 'REGEXP'
if token.type == 'REGEXP':
sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
if sp:
value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
for x in sp)

re_tokens.append((name, value, flags))
token_ref[name] = value
tokendef = TokenDef__Regexp(name, value)
else:
assert token.type == 'STRING'
tokendef = TokenDef__Str(name, value)

embedded_strs = set()
for re_name, re_value, re_flags in re_tokens:
unless = {}
for str_name, str_value, _sf in str_tokens:
m = re.match(re_value, str_value)
if m and m.group(0) == str_value:
assert not _sf, "You just broke Lark! Please email me with your grammar"
embedded_strs.add(str_name)
unless[str_value] = str_name
if unless:
re_flags.append(('unless', unless))

str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
tokens = str_tokens + re_tokens # Order is important!
tokendefs.append((tokendef, flags))

# =================
# Process Rules
@@ -391,7 +372,7 @@ class GrammarLoader:
if sym not in rule_set:
raise GrammarError("Rule '%s' used but not defined" % sym)

return tokens, rules
return tokendefs, rules

load_grammar = GrammarLoader().load_grammar
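
On the grammar side, the loader now only classifies each token definition as TokenDef__Str or TokenDef__Regexp and defers escaping, priorities and 'unless' handling to the Lexer. A rough sketch of that classification (the TokenDef classes mirror the ones added in lark/lexer.py above; make_tokendef is an illustrative helper, not part of the diff, and the real loader also expands ${TOKEN} references and unicode escapes):

import re

class TokenDef(object):
    def __init__(self, name, value):
        self.name, self.value = name, value

class TokenDef__Str(TokenDef):
    priority = 0
    def to_regexp(self):
        return re.escape(self.value)

class TokenDef__Regexp(TokenDef):
    priority = 1
    def to_regexp(self):
        return self.value

def make_tokendef(name, type_, value):
    # type_ is the grammar token's type, assumed to be 'STRING' or 'REGEXP'
    cls = TokenDef__Str if type_ == 'STRING' else TokenDef__Regexp
    return cls(name, value)

t1 = make_tokendef('PLUS', 'STRING', '+')
t2 = make_tokendef('NUMBER', 'REGEXP', r'\d+')
print(t1.to_regexp(), t2.to_regexp())  # prints: \+ \d+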



+7 -7  lark/parsers/lalr_parser.py

@@ -3,15 +3,15 @@ from ..common import ParseError, UnexpectedToken


class Parser(object):
def __init__(self, ga, callback):
self.ga = ga
def __init__(self, analysis, callback):
self.analysis = analysis
self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
for rule in ga.rules}
for rule in analysis.rules}

def parse(self, seq):
states_idx = self.ga.states_idx
states_idx = self.analysis.states_idx

state_stack = [self.ga.init_state_idx]
state_stack = [self.analysis.init_state_idx]
value_stack = []
i = 0

@@ -39,7 +39,7 @@ class Parser(object):

res = self.callbacks[rule](s)

if len(state_stack) == 1 and rule.origin == self.ga.start_symbol:
if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
return res

_action, new_state = get_action(rule.origin)
@@ -63,7 +63,7 @@ class Parser(object):
assert _action == 'reduce'
res = reduce(*rule)
if res:
assert state_stack == [self.ga.init_state_idx] and not value_stack, len(state_stack)
assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack)
return res



