
Added regex embedding for tokens

Erez Shinan 8 years ago
parent commit aecb4f5aa7
2 changed files with 29 additions and 15 deletions
  1. lark/lark.py (+2, -3)
  2. lark/load_grammar.py (+27, -12)

lark/lark.py (+2, -3)

@@ -1,7 +1,6 @@
from __future__ import absolute_import

import functools
import types
import os

from .utils import STRING_TYPE, inline_args
from .load_grammar import load_grammar
@@ -153,7 +152,7 @@ class Lark:
def _build_lexer(self):
ignore_tokens = []
tokens = []
- for name, (value, flags) in self.tokens:
+ for name, value, flags in self.tokens:
if 'ignore' in flags:
ignore_tokens.append(name)
tokens.append((name, value))


lark/load_grammar.py (+27, -12)

@@ -58,7 +58,7 @@ TOKENS = {
'REGEXP': r"/(.|\n)*?[^\\]/",
'NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
- 'COMMENT': r'#[^\n]*\n',
+ 'COMMENT': r'//[^\n]*\n',
'TO': '->'
}
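The COMMENT pattern above switches grammar comments from #-style to //-style. A quick standalone check of the new pattern (the sample grammar line is illustrative, not taken from the repository):

import re

COMMENT = r'//[^\n]*\n'                        # the new pattern from the table above
sample = 'NUMBER: /[0-9]+/  // digits only\n'  # hypothetical grammar line
match = re.search(COMMENT, sample)
assert match.group() == '// digits only\n'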

@@ -106,7 +106,8 @@ RULES = [
class SaveDefinitions(object):
def __init__(self):
self.rules = {}
- self.tokens = {}
+ self.token_set = set()
+ self.tokens = []
self.i = 0


@@ -143,13 +144,14 @@ class SaveDefinitions(object):

def token(self, *x):
name = x[0].value
- if name in self.tokens:
+ if name in self.token_set:
raise ValueError("Token '%s' defined more than once" % name)
+ self.token_set.add(name)

if len(x) == 4:
- self.tokens[name] = x[2], []
+ self.tokens.append((name, x[2], []))
else:
- self.tokens[name] = x[3], x[1].children
+ self.tokens.append((name, x[3], x[1].children))

def tokenvalue(self, tokenvalue):
return tokenvalue
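The definitions store changes from a dict to a list of (name, value, flags) triples plus a companion set: the list keeps tokens in definition order, which the reference-expansion step further down depends on, while the set still gives a cheap duplicate check. A minimal sketch of the same pattern, with illustrative names:

token_set = set()   # fast membership test for duplicate detection
tokens = []         # (name, value, flags) triples, kept in definition order

def define_token(name, value, flags=()):
    if name in token_set:
        raise ValueError("Token '%s' defined more than once" % name)
    token_set.add(name)
    tokens.append((name, value, list(flags)))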
@@ -173,8 +175,9 @@ class SaveDefinitions(object):
else:
assert False, x

- if token_name not in self.tokens:
- self.tokens[token_name] = token, []
+ if token_name not in self.token_set:
+ self.token_set.add(token_name)
+ self.tokens.append((token_name, token, []))

return Token('TOKEN', token_name, -1)

@@ -191,14 +194,19 @@ class SaveDefinitions(object):
class EBNF_to_BNF(InlineTransformer):
def __init__(self):
self.new_rules = {}
+ self.rules_by_expr = {}
self.prefix = 'anon'
self.i = 0

def _add_recurse_rule(self, type_, expr):
+ if expr in self.rules_by_expr:
+ return self.rules_by_expr[expr]

new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
self.i += 1
t = Token('RULE', new_name, -1)
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+ self.rules_by_expr[expr] = t
return t

def expr(self, rule, op):
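The new rules_by_expr cache makes _add_recurse_rule idempotent per expression: repeated uses of the same repeated sub-expression now reuse one generated helper rule instead of minting a fresh anonymous rule each time. A rough standalone sketch of the memoization idea (names and the returned value are simplified; the real code builds a T('expansions', ...) tree as shown above):

class RecurseRuleCache:
    def __init__(self, prefix='anon'):
        self.rules_by_expr = {}   # expr -> name of the rule already generated for it
        self.prefix = prefix
        self.i = 0

    def add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:        # already generated for this expr: reuse it
            return self.rules_by_expr[expr]
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        self.rules_by_expr[expr] = new_name
        return new_name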
@@ -309,9 +317,10 @@ class GrammarLoader:
p.parse( list(self.lexer.lex(grammar_text+"\n")) )

# Tokens
+ token_ref = {}
re_tokens = []
str_tokens = []
- for name, (token, flags) in sd.tokens.items():
+ for name, token, flags in sd.tokens:
value = token.value[1:-1]
if '\u' in value:
# XXX for now, you can't mix unicode escaping and unicode characters at the same token
@@ -319,13 +328,19 @@ class GrammarLoader:

if token.type == 'STRING':
value = re.escape(value)
- str_tokens.append((name, (value, flags)))
+ str_tokens.append((name, value, flags))
else:
assert token.type == 'REGEXP'
- re_tokens.append((name, (value, flags)))
+ sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
+ if sp:
+ value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
+ for x in sp)
+
+ re_tokens.append((name, value, flags))
+ token_ref[name] = value

- str_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
- re_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+ str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
+ re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
tokens = str_tokens + re_tokens # Order is important!

# Rules
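This hunk is the commit's main feature: inside a REGEXP token definition, a ${NAME} reference to a previously defined token is expanded into that token's already-expanded pattern, using re.split on the reference syntax; token_ref accumulates values in definition order, which is why sd.tokens had to become an ordered list. A self-contained sketch of the expansion step (TOKEN_NAME here is a stand-in for TOKENS['TOKEN'], whose exact value is defined elsewhere in load_grammar.py):

import re

TOKEN_NAME = r'[A-Z_][A-Z_0-9]*'   # stand-in for TOKENS['TOKEN']

def expand_refs(value, token_ref):
    # Split on ${NAME} references while keeping them (the capturing group),
    # then substitute each reference with the referenced token's pattern.
    parts = re.split(r'(\$\{%s})' % TOKEN_NAME, value)
    return ''.join(token_ref[p[2:-1]] if p.startswith('${') and p.endswith('}') else p
                   for p in parts)

token_ref = {'DIGIT': r'[0-9]'}                           # hypothetical earlier-defined token
print(expand_refs(r'${DIGIT}+\.${DIGIT}+', token_ref))    # -> [0-9]+\.[0-9]+

Because token_ref is filled as the loop walks the definitions in order, a token can only embed tokens defined before it.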

