Browse Source

Fixed contextual lexer error that was confusing users (Issue #194)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.7.7
Erez Sh 5 years ago
parent
commit
a7e7b568ff
1 changed file with 47 additions and 34 deletions
  1. +47
    -34
      lark/lexer.py

+ 47
- 34
lark/lexer.py View File

@@ -3,7 +3,7 @@
import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone

@@ -43,7 +43,7 @@ class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags'

type = "str"
def to_regexp(self):
    """Return this literal pattern as a regexp string.

    The stored value is regex-escaped (it is a plain string, not a
    pattern), then any inline flags are applied around it.
    """
    escaped = re.escape(self.value)
    return self._get_flags(escaped)

@@ -166,37 +166,32 @@ class _Lex:

while line_ctr.char_pos < len(stream):
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if not m:
continue

t = None
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
last_token = t
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)

line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column

break
else:
res = lexer.match(stream, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

value, type_ = res

t = None
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
last_token = t
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)

line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column


class UnlessCallback:
def __init__(self, mres):
@@ -330,6 +325,11 @@ class TraditionalLexer(Lexer):

self.mres = build_mres(terminals)

def match(self, stream, pos):
    """Try each compiled terminal regexp at *pos* in *stream*.

    Returns a ``(matched_text, token_type)`` pair for the first
    pattern that matches, or ``None`` when nothing matches.
    """
    for pattern, type_by_group in self.mres:
        found = pattern.match(stream, pos)
        if found is None:
            continue
        return found.group(0), type_by_group[found.lastindex]

def lex(self, stream):
    """Tokenize *stream*, yielding Token instances.

    Delegates to a _Lex driver configured with this lexer's
    newline and ignore terminal sets.
    """
    driver = _Lex(self)
    return driver.lex(stream, self.newline_types, self.ignore_types)
@@ -367,9 +367,22 @@ class ContextualLexer(Lexer):

def lex(self, stream):
    """Tokenize *stream*, switching sub-lexers to follow the parser state.

    After each yielded token, re-read ``self.parser_state`` (updated by
    the parser between tokens) and swap in the matching contextual
    sub-lexer before matching the next token.

    If no terminal matches in the current context, the input is retried
    against the root (global) lexer: a match there means the terminal
    exists but is simply not expected in this state, so an
    ``UnexpectedToken`` is raised instead of the confusing
    ``UnexpectedCharacters`` (issue #194).
    """
    l = _Lex(self.lexers[self.parser_state], self.parser_state)
    try:
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            # Parser state may have advanced; track it so the next
            # token is matched in the right context.
            l.lexer = self.lexers[self.parser_state]
            l.state = self.parser_state
    except UnexpectedCharacters as e:
        # In the contextual lexer, UnexpectedCharacters can mean that the
        # terminal is defined, but not in the current context.
        # This tests the input against the global context, to provide a
        # nicer error.
        root_match = self.root_lexer.match(stream, e.pos_in_stream)
        if not root_match:
            raise

        value, type_ = root_match
        t = Token(type_, value, e.pos_in_stream, e.line, e.column)
        # Terminals the current context would have accepted.
        expected = {v for m, tfi in l.lexer.mres for v in tfi.values()}
        raise UnexpectedToken(t, expected)

###}

Loading…
Cancel
Save