
Fixed possible token collisions (strings are now always tested before regexps)
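To illustrate the collision this fixes (a minimal standalone sketch, not lark's actual lexer; the token names and patterns are made up): when a regexp token is tried before a literal string token, the literal can never match on its own.

    import re

    # Hypothetical token definitions, in the order a lexer would try them.
    tokens_regexp_first = [('NAME', r'\w+'), ('IF', re.escape('if'))]
    tokens_string_first = [('IF', re.escape('if')), ('NAME', r'\w+')]

    def first_match(tokens, text):
        # Return the first token whose pattern matches at the start of the text.
        for name, pattern in tokens:
            m = re.match(pattern, text)
            if m:
                return name, m.group()

    print(first_match(tokens_regexp_first, 'if x'))   # ('NAME', 'if')  -- the keyword is swallowed
    print(first_match(tokens_string_first, 'if x'))   # ('IF', 'if')    -- strings first avoids the collision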

Erez Shinan, 8 years ago
parent commit faf66c93f0
4 changed files with 40 additions and 23 deletions
  1. examples/indented_tree.py (+1 / -1)
  2. lark/lark.py (+4 / -4)
  3. lark/lexer.py (+1 / -1)
  4. lark/load_grammar.py (+34 / -17)

examples/indented_tree.py (+1 / -1)

@@ -24,7 +24,7 @@ class TreeIndenter(Indenter):
     CLOSE_PAREN_types = []
     INDENT_type = '_INDENT'
     DEDENT_type = '_DEDENT'
-    tab_len = 0
+    tab_len = 8
 
 
 parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())




lark/lark.py (+4 / -4)

@@ -152,12 +152,12 @@ class Lark:
 
     def _build_lexer(self):
         ignore_tokens = []
-        tokens = {}
-        for name, (value, flags) in self.tokens.items():
+        tokens = []
+        for name, (value, flags) in self.tokens:
             if 'ignore' in flags:
                 ignore_tokens.append(name)
-            tokens[name] = value
-        return Lexer(tokens.items(), {}, ignore=ignore_tokens)
+            tokens.append((name, value))
+        return Lexer(tokens, {}, ignore=ignore_tokens)
 
 
     def _build_parser(self):
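A likely reason for switching from a dict to a list of pairs (my reading of the change, not stated in the commit message): on the Python versions of that era a plain dict does not preserve insertion order, so the string-before-regexp ordering computed in load_grammar would be lost; a list of (name, (value, flags)) pairs keeps it. A tiny sketch with hypothetical values:

    # Hypothetical ordered token list, as load_grammar now returns it.
    tokens = [('IF', ('if', [])), ('NAME', (r'\w+', []))]
    for name, (value, flags) in tokens:
        print(name, value)   # IF before NAME, exactly the order the loader decided on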


lark/lexer.py (+1 / -1)

@@ -48,7 +48,7 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks
 
-        self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
 
         self.mres = []
         self.name_from_index = []
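For context on the commented-out sort (again a sketch with made-up patterns): ordering by raw pattern length cannot tell a short literal from a broad regexp, so a keyword like 'if' would still be tried after a longer character-class pattern. That ordering decision now lives in load_grammar instead.

    patterns = [('IF', 'if'), ('NAME', '[a-z]+')]
    patterns.sort(key=lambda x: len(x[1]), reverse=True)
    print(patterns)   # [('NAME', '[a-z]+'), ('IF', 'if')] -- the regexp still comes first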


lark/load_grammar.py (+34 / -17)

@@ -1,10 +1,14 @@
 import re
+import codecs
+
 from lexer import Lexer, Token
 from grammar_analysis import GrammarAnalyzer
 from parser import Parser
 
 from tree import Tree as T, Transformer, InlineTransformer, Visitor
 
+unicode_escape = codecs.getdecoder('unicode_escape')
+
 _TOKEN_NAMES = {
     ':' : 'COLON',
     ',' : 'COMMA',
@@ -143,26 +147,18 @@ class SaveDefinitions(object):
             raise ValueError("Token '%s' defined more than once" % name)
 
         if len(x) == 4:
-            self.tokens[name] = x[2][1], []
+            self.tokens[name] = x[2], []
         else:
-            self.tokens[name] = x[3][1], x[1].children
+            self.tokens[name] = x[3], x[1].children
 
     def tokenvalue(self, tokenvalue):
-        value = tokenvalue.value[1:-1]
-        import codecs
-        decoder = codecs.getdecoder('unicode_escape')
-        if '\u' in value:
-            # XXX for now, you can't mix unicode escaping and unicode characters at the same token
-            value = decoder(value)[0]
-
-        if tokenvalue.type == 'STRING':
-            value = re.escape(value)
-        return tokenvalue, value
-
-    def anontoken(self, (token, value)):
+        return tokenvalue
+
+    def anontoken(self, token):
         if token.type == 'STRING':
+            value = token.value[1:-1]
             try:
-                token_name = _TOKEN_NAMES[token.value[1:-1]]
+                token_name = _TOKEN_NAMES[value]
             except KeyError:
                 if value.isalnum() and value[0].isalpha():
                     token_name = value.upper()
@@ -178,7 +174,7 @@ class SaveDefinitions(object):
             assert False, x
 
         if token_name not in self.tokens:
-            self.tokens[token_name] = value, []
+            self.tokens[token_name] = token, []
 
         return Token('TOKEN', token_name, -1)


@@ -312,6 +308,27 @@ class GrammarLoader:
         p = Parser(self.ga, c)
         p.parse( list(self.lexer.lex(grammar_text+"\n")) )
 
+        # Tokens
+        re_tokens = []
+        str_tokens = []
+        for name, (token, flags) in sd.tokens.items():
+            value = token.value[1:-1]
+            if '\u' in value:
+                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
+                value = unicode_escape(value)[0]
+
+            if token.type == 'STRING':
+                value = re.escape(value)
+                str_tokens.append((name, (value, flags)))
+            else:
+                assert token.type == 'REGEXP'
+                re_tokens.append((name, (value, flags)))
+
+        str_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        re_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        tokens = str_tokens + re_tokens   # Order is important!
+
+        # Rules
         ebnf_to_bnf = EBNF_to_BNF()
 
         rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
@@ -320,7 +337,7 @@ class GrammarLoader:
         for r in rules.values():
             self.simplify_rule.visit(r)
 
-        return sd.tokens, rules
+        return tokens, rules
 
 
 load_grammar = GrammarLoader().load_grammar
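Putting the new ordering logic together on a toy token set (a standalone reproduction with hypothetical definitions, not the GrammarLoader API itself): string tokens are escaped and sorted longest-first, regexp tokens follow, and the concatenated list is what the lexer receives.

    import re

    # Hypothetical token definitions: name -> (pattern text, kind)
    defs = {'NAME': ('\\w+', 'REGEXP'), 'IF': ('if', 'STRING'), 'ELSE': ('else', 'STRING')}

    str_tokens, re_tokens = [], []
    for name, (value, kind) in defs.items():
        if kind == 'STRING':
            str_tokens.append((name, (re.escape(value), []))) # literals become escaped regexps
        else:
            re_tokens.append((name, (value, [])))

    str_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)
    re_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)
    tokens = str_tokens + re_tokens    # strings before regexps, longest first within each group
    print([name for name, _ in tokens])   # ['ELSE', 'IF', 'NAME']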



