From 5236e4a32af449f02a6b96f346e620e79f5e1e83 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Thu, 23 Feb 2017 23:50:52 +0200
Subject: [PATCH] Refactored TokenDef to store Pattern(Str/RE)

---
 lark/common.py       | 32 ++++++++++++++++
 lark/lexer.py        | 47 +++++++----------------
 lark/load_grammar.py | 90 ++++++++++++++++++++------------------------
 tests/test_parser.py |  9 +++++
 4 files changed, 94 insertions(+), 84 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 122c7e5..3306d68 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -1,3 +1,4 @@
+import re
 class GrammarError(Exception):
     pass
 
@@ -43,3 +44,34 @@ class ParserConf:
         self.rules = rules
         self.callback = callback
         self.start = start
+
+
+
+class Pattern(object):
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return repr(self.value)
+
+class PatternStr(Pattern):
+    def to_regexp(self):
+        return re.escape(self.value)
+
+    priority = 0
+
+class PatternRE(Pattern):
+    def to_regexp(self):
+        return self.value
+
+    priority = 1
+
+class TokenDef(object):
+    def __init__(self, name, pattern):
+        assert isinstance(pattern, Pattern), pattern
+        self.name = name
+        self.pattern = pattern
+
+    def __repr__(self):
+        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
+
diff --git a/lark/lexer.py b/lark/lexer.py
index 3c38833..799597b 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -3,33 +3,11 @@
 import re
 
 from .utils import Str, classify, STRING_TYPE
-from .common import is_terminal
+from .common import is_terminal, PatternStr, PatternRE, TokenDef
 
 class LexError(Exception):
     pass
 
-class TokenDef(object):
-    def __init__(self, name, value):
-        assert isinstance(value, STRING_TYPE), value
-        self.name = name
-        self.value = value
-
-    def __repr__(self):
-        return '%s(%r, %r)' % (type(self).__name__, self.name, self.value)
-
-class TokenDef__Str(TokenDef):
-    def to_regexp(self):
-        return re.escape(self.value)
-
-    priority = 0
-
-class TokenDef__Regexp(TokenDef):
-    def to_regexp(self):
-        return self.value
-
-    priority = 1
-
-
 class UnexpectedInput(LexError):
     def __init__(self, seq, lex_pos, line, column):
         context = seq[lex_pos:lex_pos+5]
@@ -75,17 +53,18 @@ def _create_unless_callback(strs):
     return unless_callback
 
 def _create_unless(tokens):
-    tokens_by_type = classify(tokens, type)
+    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
     callback = {}
-    for retok in tokens_by_type.get(TokenDef__Regexp, []):
+    for retok in tokens_by_type.get(PatternRE, []):
         unless = {}
-        for strtok in tokens_by_type.get(TokenDef__Str, []):
-            m = re.match(retok.value, strtok.value)
-            if m and m.group(0) == strtok.value:
+        for strtok in tokens_by_type.get(PatternStr, []):
+            s = strtok.pattern.value
+            m = re.match(retok.pattern.value, s)
+            if m and m.group(0) == s:
                 embedded_strs.add(strtok.name)
-                unless[strtok.value] = strtok.name
+                unless[s] = strtok.name
         if unless:
             callback[retok.name] = _create_unless_callback(unless)
 
@@ -104,21 +83,21 @@ class Lexer(object):
         # Sanitization
         for t in tokens:
             try:
-                re.compile(t.to_regexp())
+                re.compile(t.pattern.to_regexp())
             except:
-                raise LexError("Cannot compile token: %s: %s" % t)
+                raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))
 
         token_names = {t.name for t in tokens}
         assert all(t in token_names for t in ignore)
 
         # Init
-        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
+        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
 
-        tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)
+        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
 
         self.tokens = tokens
 
@@ -132,7 +111,7 @@ class Lexer(object):
         mres = []
         while tokens:
             try:
-                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()) for t in tokens[:max_size]))
             except AssertionError: # Yes, this is what Python provides us.. :/
                 return self._build_mres(tokens, max_size//2)
 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index bbe512e..f210e3d 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -3,12 +3,12 @@ from itertools import chain
 import re
 import codecs
 
-from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
+from .lexer import Lexer, Token, UnexpectedInput
 
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError, LexerConf, ParserConf
+from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .tree import Tree as T, Transformer, InlineTransformer, Visitor
 
 
@@ -232,18 +232,19 @@ class SimplifyTree(InlineTransformer):
         return tokenmods + [value]
 
 class ExtractAnonTokens(InlineTransformer):
-    def __init__(self, tokens, token_set):
+    def __init__(self, tokens):
         self.tokens = tokens
-        self.token_set = token_set
-        self.token_reverse = {td.value: td.name for td in tokens}
+        self.token_set = {td.name for td in self.tokens}
+        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
+        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0
 
     def tokenvalue(self, token):
+        value = token.value[1:-1]
         if token.type == 'STRING':
-            value = token.value[1:-1]
             try:
                 # If already defined, use the user-defined token name
-                token_name = self.token_reverse[value]
+                token_name = self.str_reverse[value]
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -257,40 +258,32 @@ class ExtractAnonTokens(InlineTransformer):
                 token_name = '__' + token_name
 
         elif token.type == 'REGEXP':
-            token_name = 'ANONRE_%d' % self.i
-            value = token.value
-            self.i += 1
+            if value in self.re_reverse: # Kind of a wierd placement
+                token_name = self.re_reverse[value]
+            else:
+                token_name = 'ANONRE_%d' % self.i
+                self.i += 1
         else:
             assert False, token
 
 
-        if value in self.token_reverse: # Kind of a wierd placement
-            token_name = self.token_reverse[value]
-
         if token_name not in self.token_set:
             self.token_set.add(token_name)
+
             if token.type == 'STRING':
-                self.tokens.append(TokenDef__Str(token_name, token[1:-1]))
+                pattern = PatternStr(value)
+                assert value not in self.str_reverse
+                self.str_reverse[value] = token_name
             else:
-                self.tokens.append(TokenDef__Regexp(token_name, token[1:-1]))
-            assert value not in self.token_reverse, value
-            self.token_reverse[value] = token_name
-
-        return Token('TOKEN', token_name, -1)
+                pattern = PatternRE(value)
+                assert value not in self.re_reverse
+                self.re_reverse[value] = token_name
+            self.tokens.append(TokenDef(token_name, pattern))
 
-class TokenValue(object):
-    def __init__(self, value):
-        self.value = value
-
-class TokenValue__Str(TokenValue):
-    def to_regexp(self):
-        return re.escape(self.value)
+        return Token('TOKEN', token_name, -1)
 
-class TokenValue__Regexp(TokenValue):
-    def to_regexp(self):
-        return self.value
 
-class TokenTreeToRegexp(Transformer):
+class TokenTreeToPattern(Transformer):
     def tokenvalue(self, tv):
         tv ,= tv
         value = tv.value[1:-1]
@@ -300,30 +293,30 @@ class TokenTreeToRegexp(Transformer):
             value = unicode_escape(value)[0]
 
         if tv.type == 'REGEXP':
-            return TokenValue__Regexp(value)
+            return PatternRE(value)
         elif tv.type == 'STRING':
-            return TokenValue__Str(value)
+            return PatternStr(value)
         assert False
 
     def expansion(self, items):
         if len(items) == 1:
             return items[0]
-        return TokenValue__Regexp(''.join(i.to_regexp() for i in items))
+        return PatternRE(''.join(i.to_regexp() for i in items))
 
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
-        return TokenValue__Regexp('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
+        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
 
     def range(self, items):
         assert all(i.type=='STRING' for i in items)
         items = [i[1:-1] for i in items]
         start, end = items
         assert len(start) == len(end) == 1, (start, end)
-        return TokenValue__Regexp('[%s-%s]' % (start, end))
+        return PatternRE('[%s-%s]' % (start, end))
     def expr(self, args):
         inner, op = args
-        return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))
+        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, extra):
@@ -339,32 +332,28 @@ class Grammar:
         # =================
         # Compile Tokens
         # =================
-        token_to_regexp = TokenTreeToRegexp()
+        token_tree_to_pattern = TokenTreeToPattern()
 
         # Convert tokens to strings/regexps
         tokens = []
         for name, token_tree in tokendefs:
-            regexp = token_to_regexp.transform(token_tree)
-            if isinstance(regexp, TokenValue__Str):
-                tokendef = TokenDef__Str(name, regexp.value)
-            else:
-                tokendef = TokenDef__Regexp(name, regexp.to_regexp())
-            tokens.append(tokendef)
+            pattern = token_tree_to_pattern.transform(token_tree)
+            tokens.append(TokenDef(name, pattern) )
 
         # Resolve regexp assignments of the form /..${X}../
        
         # XXX This is deprecated, since you can express most regexps with EBNF
         # XXX Also, since this happens after import, it can be a source of bugs
-        token_dict = {td.name: td.to_regexp() for td in tokens}
+        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
 
         while True:
             changed = False
             for t in tokens:
-                if isinstance(t, TokenDef__Regexp):
-                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.value)
+                if isinstance(t.pattern, PatternRE):
+                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
                     if sp:
                         value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x for x in sp)
-                        if value != t.value:
-                            t.value = value
+                        if value != t.pattern.value:
+                            t.pattern.value = value
                             changed = True
             if not changed:
                 break
@@ -372,7 +361,7 @@ class Grammar:
         # =================
         # Compile Rules
         # =================
-        extract_anon = ExtractAnonTokens(tokens, set(token_dict))
+        extract_anon = ExtractAnonTokens(tokens)
         ebnf_to_bnf = EBNF_to_BNF()
         simplify_rule = SimplifyRule_Visitor()
         rule_tree_to_text = RuleTreeToText()
@@ -439,7 +428,7 @@ def resolve_token_references(token_defs):
 
 class GrammarLoader:
     def __init__(self):
-        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
+        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
 
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -493,6 +482,7 @@ class GrammarLoader:
         for name, _ in token_defs:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+
         # Handle ignore tokens
         ignore_names = []
         for i, t in enumerate(ignore):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index f4f0dc5..083577a 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -317,15 +317,24 @@ def _make_parser_test(PARSER):
 
            That means that "a" is not filtered out, despite being an 'immediate string'.
            Whether or not this is the intuitive behavior, I'm not sure yet.
+           Perhaps the right thing to do is report a collision (if such is relevant)
+           -Erez
 
            """
 
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
+           self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
            self.assertEqual(x.children[0].type, "A")
 
+           g = _Lark("""start: /a/
+                        A: /a/ """)
+           x = g.parse('a')
+           self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
+           self.assertEqual(x.children[0].type, "A")
+
 
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
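
Note (reviewer sketch, not part of the patch): a minimal usage example of the refactored classes added to lark/common.py above, assuming the patch is applied as-is. The token names ('IF', 'NUMBER') are made up for illustration.

    from lark.common import PatternStr, PatternRE, TokenDef

    # A literal token: PatternStr escapes its value when a regexp is needed.
    if_tok = TokenDef('IF', PatternStr('if'))
    assert if_tok.pattern.to_regexp() == 'if'        # re.escape('if') == 'if'

    # A regexp token: PatternRE returns its value unchanged.
    num_tok = TokenDef('NUMBER', PatternRE('[0-9]+'))
    assert num_tok.pattern.to_regexp() == '[0-9]+'

    # Code that used to dispatch on TokenDef__Str / TokenDef__Regexp now
    # dispatches on the type (and priority) of the .pattern attribute.
    assert isinstance(if_tok.pattern, PatternStr) and if_tok.pattern.priority == 0
    assert isinstance(num_tok.pattern, PatternRE) and num_tok.pattern.priority == 1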