
Fixed possible token collisions (strings are now always tested before regexps)

Erez Shinan committed 7 years ago
commit faf66c93f0
4 files changed, 40 insertions(+), 23 deletions(-)
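For context on the collision being fixed: the lexer compiles all token patterns into one big alternation regex (the mres built in lark/lexer.py), and Python's re module tries alternatives left to right, so a broad regexp token placed before a literal keyword can silently swallow it. A minimal sketch of the effect (toy token names, not lark's actual lexer code):

    import re

    # Two overlapping tokens: NAME is a regexp, IF is the literal keyword "if".
    # With the regexp first in the alternation, "if" is matched as NAME.
    tokens = [('NAME', '[a-z]+'), ('IF', re.escape('if'))]
    mre = re.compile('|'.join('(?P<%s>%s)' % t for t in tokens))
    print(mre.match('if').lastgroup)   # -> 'NAME'  (the keyword is shadowed)

    # Testing string tokens before regexps, as this commit arranges, fixes it:
    tokens = [('IF', re.escape('if')), ('NAME', '[a-z]+')]
    mre = re.compile('|'.join('(?P<%s>%s)' % t for t in tokens))
    print(mre.match('if').lastgroup)   # -> 'IF'

The commit enforces the second ordering globally by partitioning tokens into string and regexp groups in load_grammar, as the diffs below show.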
1. examples/indented_tree.py (+1, -1)
2. lark/lark.py (+4, -4)
3. lark/lexer.py (+1, -1)
4. lark/load_grammar.py (+34, -17)

examples/indented_tree.py (+1, -1)

@@ -24,7 +24,7 @@ class TreeIndenter(Indenter):
     CLOSE_PAREN_types = []
     INDENT_type = '_INDENT'
     DEDENT_type = '_DEDENT'
-    tab_len = 0
+    tab_len = 8
 
 parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())



lark/lark.py (+4, -4)

@@ -152,12 +152,12 @@ class Lark:
 
     def _build_lexer(self):
         ignore_tokens = []
-        tokens = {}
-        for name, (value, flags) in self.tokens.items():
+        tokens = []
+        for name, (value, flags) in self.tokens:
             if 'ignore' in flags:
                 ignore_tokens.append(name)
-            tokens[name] = value
-        return Lexer(tokens.items(), {}, ignore=ignore_tokens)
+            tokens.append((name, value))
+        return Lexer(tokens, {}, ignore=ignore_tokens)
 
     def _build_parser(self):
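A side note on this hunk (my inference, not stated in the commit): this is Python 2-era code, where a plain dict does not preserve insertion order, so collecting the tokens into a dict and passing tokens.items() to the Lexer would scramble whatever priority load_grammar established; the list of (name, value) pairs keeps the order intact.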


lark/lexer.py (+1, -1)

@@ -48,7 +48,7 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks
 
-        self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
 
         self.mres = []
         self.name_from_index = []
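The length sort is commented out rather than deleted, presumably so the old behaviour stays easy to restore. The ordering responsibility now lives in load_grammar (strings before regexps, each group longest-first), and re-sorting the combined list by raw pattern length here would undo that priority.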


lark/load_grammar.py (+34, -17)

@@ -1,10 +1,14 @@
 import re
+import codecs
 
 from lexer import Lexer, Token
 from grammar_analysis import GrammarAnalyzer
 from parser import Parser
 
 from tree import Tree as T, Transformer, InlineTransformer, Visitor
 
+unicode_escape = codecs.getdecoder('unicode_escape')
+
 _TOKEN_NAMES = {
     ':' : 'COLON',
     ',' : 'COMMA',
@@ -143,26 +147,18 @@ class SaveDefinitions(object):
             raise ValueError("Token '%s' defined more than once" % name)
 
         if len(x) == 4:
-            self.tokens[name] = x[2][1], []
+            self.tokens[name] = x[2], []
         else:
-            self.tokens[name] = x[3][1], x[1].children
+            self.tokens[name] = x[3], x[1].children
 
     def tokenvalue(self, tokenvalue):
-        value = tokenvalue.value[1:-1]
-        import codecs
-        decoder = codecs.getdecoder('unicode_escape')
-        if '\u' in value:
-            # XXX for now, you can't mix unicode escaping and unicode characters at the same token
-            value = decoder(value)[0]
-
-        if tokenvalue.type == 'STRING':
-            value = re.escape(value)
-        return tokenvalue, value
+        return tokenvalue
 
-    def anontoken(self, (token, value)):
+    def anontoken(self, token):
         if token.type == 'STRING':
+            value = token.value[1:-1]
             try:
-                token_name = _TOKEN_NAMES[token.value[1:-1]]
+                token_name = _TOKEN_NAMES[value]
             except KeyError:
                 if value.isalnum() and value[0].isalpha():
                     token_name = value.upper()
@@ -178,7 +174,7 @@ class SaveDefinitions(object):
             assert False, x
 
         if token_name not in self.tokens:
-            self.tokens[token_name] = value, []
+            self.tokens[token_name] = token, []
 
         return Token('TOKEN', token_name, -1)

@@ -312,6 +308,27 @@ class GrammarLoader:
         p = Parser(self.ga, c)
         p.parse( list(self.lexer.lex(grammar_text+"\n")) )
 
+        # Tokens
+        re_tokens = []
+        str_tokens = []
+        for name, (token, flags) in sd.tokens.items():
+            value = token.value[1:-1]
+            if '\u' in value:
+                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
+                value = unicode_escape(value)[0]
+
+            if token.type == 'STRING':
+                value = re.escape(value)
+                str_tokens.append((name, (value, flags)))
+            else:
+                assert token.type == 'REGEXP'
+                re_tokens.append((name, (value, flags)))
+
+        str_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        re_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        tokens = str_tokens + re_tokens   # Order is important!
+
         # Rules
         ebnf_to_bnf = EBNF_to_BNF()
 
         rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
@@ -320,7 +337,7 @@ class GrammarLoader:
         for r in rules.values():
             self.simplify_rule.visit(r)
 
-        return sd.tokens, rules
+        return tokens, rules
 
 load_grammar = GrammarLoader().load_grammar
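To make the resulting order concrete, here is a toy walk-through of the sort above, with invented token definitions mirroring the (name, (pattern, flags)) shape: literal strings come first (longest first, so '==' is tried before '='), and regexp tokens follow.

    # Invented tokens; pattern strings are already re.escape'd literals.
    str_tokens = [('IF', ('if', [])), ('EQ', ('\\=', [])), ('ISEQ', ('\\=\\=', []))]
    re_tokens = [('NAME', ('[a-z]+', [])), ('NUMBER', ('[0-9]+', []))]

    str_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)   # longest literal first
    re_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)
    tokens = str_tokens + re_tokens   # strings are always tested before regexps

    print([name for name, _ in tokens])
    # -> ['ISEQ', 'IF', 'EQ', 'NAME', 'NUMBER']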


