Browse Source

Better error message for bad regexps (Issue #62)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.3
Erez Shinan 6 years ago
parent
commit
37c1c0f65f
3 changed files with 17 additions and 6 deletions
  1. +4
    -3
      lark/common.py
  2. +6
    -3
      lark/parser_frontends.py
  3. +7
    -0
      lark/utils.py

+ 4
- 3
lark/common.py View File

@@ -1,7 +1,8 @@
import re import re
import sre_parse
import sys import sys


from .utils import get_regexp_width

Py36 = (sys.version_info[:2] >= (3, 6)) Py36 = (sys.version_info[:2] >= (3, 6))




@@ -95,10 +96,10 @@ class PatternRE(Pattern):


@property @property
def min_width(self): def min_width(self):
return sre_parse.parse(self.to_regexp()).getwidth()[0]
return get_regexp_width(self.to_regexp())[0]
@property @property
def max_width(self): def max_width(self):
return sre_parse.parse(self.to_regexp()).getwidth()[1]
return get_regexp_width(self.to_regexp())[1]


class TokenDef(object): class TokenDef(object):
def __init__(self, name, pattern, priority=1): def __init__(self, name, pattern, priority=1):


+ 6
- 3
lark/parser_frontends.py View File

@@ -1,5 +1,5 @@
import re import re
import sre_parse
from .utils import get_regexp_width


from .lexer import Lexer, ContextualLexer, Token from .lexer import Lexer, ContextualLexer, Token


@@ -77,7 +77,7 @@ class Earley_NoLex:
self.regexps = {} self.regexps = {}
for t in lexer_conf.tokens: for t in lexer_conf.tokens:
regexp = t.pattern.to_regexp() regexp = t.pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
width = get_regexp_width(regexp)
if width != (1,1): if width != (1,1):
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
self.regexps[t.name] = re.compile(regexp) self.regexps[t.name] = re.compile(regexp)
@@ -121,7 +121,10 @@ class XEarley:
self.regexps = {} self.regexps = {}
for t in lexer_conf.tokens: for t in lexer_conf.tokens:
regexp = t.pattern.to_regexp() regexp = t.pattern.to_regexp()
assert sre_parse.parse(regexp).getwidth()
try:
assert get_regexp_width(regexp)
except ValueError:
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
self.regexps[t.name] = re.compile(regexp) self.regexps[t.name] = re.compile(regexp)


def parse(self, text): def parse(self, text):


+ 7
- 0
lark/utils.py View File

@@ -112,3 +112,10 @@ except NameError:
return -1 return -1




import sre_parse
import sre_constants
def get_regexp_width(regexp):
    """Return the width bounds that ``sre_parse`` computes for *regexp*.

    The result is whatever ``getwidth()`` yields for the parsed pattern;
    callers index it as ``[0]`` (minimum width) and ``[1]`` (maximum width).

    Raises:
        ValueError: if the pattern cannot be parsed. The offending regexp
            string is the exception's only argument, so callers can re-raise
            with extra context (e.g. the token name).
    """
    try:
        pattern_tree = sre_parse.parse(regexp)
        width = pattern_tree.getwidth()
    except sre_constants.error:
        # Translate the regex engine's own error type into a plain
        # ValueError so callers need not import sre_constants themselves.
        raise ValueError(regexp)
    return width

Loading…
Cancel
Save