
Merge branch 'MegaIng-bytes-support'

Erez Sh committed 4 years ago, commit 9ee8428f3f (tag 0.10.0)
9 changed files with 168 additions and 29 deletions
  1. docs/classes.md (+1, -0)
  2. lark-stubs/lark.pyi (+4, -1)
  3. lark/common.py (+3, -2)
  4. lark/exceptions.py (+13, -4)
  5. lark/lark.py (+14, -5)
  6. lark/lexer.py (+18, -13)
  7. lark/parser_frontends.py (+2, -0)
  8. lark/utils.py (+14, -1)
  9. tests/test_parser.py (+99, -3)

docs/classes.md (+1, -0)

@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
- **edit_terminals** - A callback
- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`.
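
A minimal usage sketch of the new option (my own example, not part of this change): the grammar is still written as a `str`, while the input handed to `parse` may be `bytes`.

```python
from lark import Lark

# Hypothetical grammar for illustration; the grammar text itself stays `str`.
parser = Lark(r"""
    start: WORD ("," WORD)*
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
""", parser='lalr', use_bytes=True)

tree = parser.parse(b"hello, world")   # input is bytes, not str
assert len(tree.children) == 2         # two WORD tokens
```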


#### Using Unicode character classes with `regex`


lark-stubs/lark.pyi (+4, -1)

@@ -31,10 +31,12 @@ class LarkOptions:
lexer_callbacks: Dict[str, Callable[[Token], Token]]
cache: Union[bool, str]
g_regex_flags: int
use_bytes: bool


class Lark:
source: str
grammar_source: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]
@@ -56,7 +58,8 @@ class Lark:
maybe_placeholders: bool = False,
lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
cache: Union[bool, str] = False,
g_regex_flags: int = ...
g_regex_flags: int = ...,
use_bytes: bool = False,
):
...



lark/common.py (+3, -2)

@@ -4,10 +4,10 @@ from .lexer import TerminalDef
###{standalone

class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.tokens = tokens # TODO should be terminals
self.ignore = ignore
self.postlex = postlex
@@ -15,6 +15,7 @@ class LexerConf(Serialize):
self.g_regex_flags = g_regex_flags
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes

def _deserialize(self):
self.callbacks = {} # TODO


lark/exceptions.py (+13, -4)

@@ -28,9 +28,14 @@ class UnexpectedInput(LarkError):
pos = self.pos_in_stream
start = max(pos - span, 0)
end = pos + span
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before) + '^\n'
if not isinstance(text, bytes):
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before) + '^\n'
else:
before = text[start:pos].rsplit(b'\n', 1)[-1]
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
""" Given a parser instance and a dictionary mapping some label with
@@ -67,7 +72,11 @@ class UnexpectedInput(LarkError):

class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
if isinstance(seq, bytes):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
else:
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

self.line = line
self.column = column
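
A short sketch (mine, not part of the commit) of how the bytes-aware error reporting can be exercised; the grammar and input below are made up for illustration:

```python
from lark import Lark
from lark.exceptions import UnexpectedCharacters

parser = Lark('start: DIGITS\nDIGITS: /[0-9]+/', parser='lalr', use_bytes=True)

data = b"123abc"
try:
    parser.parse(data)
except UnexpectedCharacters as e:
    # get_context() now accepts the original bytes input and decodes the
    # excerpt (backslashreplace) before adding the caret marker.
    print(e.get_context(data))
```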


lark/lark.py (+14, -5)

@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
from io import open


from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
invert (Default: auto)
lexer_callbacks - Dictionary of callbacks for the lexer. May alter
tokens during lexing. Use with caution.
use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
edit_terminals - A callback
"""
if __doc__:
@@ -105,6 +106,7 @@ class LarkOptions(Serialize):
'maybe_placeholders': False,
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
}

def __init__(self, options_dict):
@@ -114,7 +116,7 @@ class LarkOptions(Serialize):
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool) and name != 'cache':
if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
value = bool(value)
else:
value = default
@@ -187,6 +189,13 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise NotImplementedError("`use_bytes=True` may have issues on python2. "
"Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
@@ -196,7 +205,7 @@ class Lark(Serialize):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ValueError("cache must be bool or str")
raise ValueError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
@@ -252,7 +261,7 @@ class Lark(Serialize):
for t in self.terminals:
self.options.edit_terminals(t)

self._terminals_dict = {t.name:t for t in self.terminals}
self._terminals_dict = {t.name: t for t in self.terminals}

# If the user asked to invert the priorities, negate them all here.
# This replaces the old 'resolve__antiscore_sum' option.
@@ -276,7 +285,7 @@ class Lark(Serialize):
if hasattr(t, term.name):
lexer_callbacks[term.name] = getattr(t, term.name)

self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

if self.options.parser:
self.parser = self._build_parser()
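
A sketch of the new construction-time check (the example grammar is mine): a non-ASCII grammar is rejected up front when `use_bytes=True`, instead of failing later inside the lexer.

```python
from lark import Lark

try:
    Lark(u'start: "é"', use_bytes=True)
except ValueError as e:
    print(e)   # Grammar must be ascii only, when use_bytes=True
```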


lark/lexer.py (+18, -13)

@@ -139,8 +139,8 @@ class Token(Str):


class LineCounter:
def __init__(self):
self.newline_char = '\n'
def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
self.line = 1
self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter()
line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
last_token = None

while line_ctr.char_pos < len(stream):
@@ -230,7 +230,7 @@ class CallChain:



def _create_unless(terminals, g_regex_flags, re_):
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
@@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres

def build_mres(terminals, g_regex_flags, re_, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)

def _regexp_has_newline(r):
r"""Expressions that may indicate newlines in a regexp:
@@ -321,12 +324,13 @@ class TraditionalLexer(Lexer):
self.terminals = terminals
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags
self.use_bytes = conf.use_bytes

self._mres = None
# self.build(g_regex_flags)

def _build(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
assert all(self.callback.values())

for type_, f in self.user_callbacks.items():
@@ -336,7 +340,7 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self._mres = build_mres(terminals, self.g_regex_flags, self.re)
self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)

@property
def mres(self):
@@ -365,7 +369,8 @@ class ContextualLexer(Lexer):
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
trad_conf = copy(conf)
trad_conf.tokens = terminals

lexer_by_tokens = {}
self.lexers = {}
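
The core trick in `_build_mres` is to join all terminal patterns into one named-group regexp and, in bytes mode, encode that pattern with latin-1 so it compiles to a bytes regex. A standalone illustration with plain `re` (the terminal names below are made up; this is not lark's internal API):

```python
import re

terminals = [("WORD", r"[a-z]+"), ("NUMBER", r"[0-9]+")]
pattern = u'|'.join(u'(?P<%s>%s)' % (name, regexp) for name, regexp in terminals)

mre = re.compile(pattern.encode('latin-1'))   # bytes pattern matches bytes input
m = mre.match(b"hello123")
print(m.lastgroup, m.group())                 # -> WORD b'hello'
```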


lark/parser_frontends.py (+2, -0)

@@ -189,6 +189,8 @@ class XEarley(_ParserFrontend):
else:
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
if lexer_conf.use_bytes:
regexp = regexp.encode('utf-8')

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)



lark/utils.py (+14, -1)

@@ -305,4 +305,17 @@ def combine_alternatives(lists):

class FS:
open = open
exists = os.path.exists
exists = os.path.exists



def isascii(s):
""" str.isascii only exists in python3.7+ """
try:
return s.isascii()
except AttributeError:
try:
s.encode('ascii')
return True
except (UnicodeDecodeError, UnicodeEncodeError):
return False
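
A quick check of the compatibility helper (a usage example, not part of the diff); the import path matches the one used in tests/test_parser.py below:

```python
from lark.utils import isascii

assert isascii("start: WORD")        # plain ASCII grammar text
assert not isascii(u"start: \"é\"")  # non-ASCII, rejected when use_bytes=True
```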

tests/test_parser.py (+99, -3)

@@ -8,7 +8,9 @@ import os
import sys
from copy import copy, deepcopy

from lark.utils import Py36
from lark.utils import Py36, isascii

from lark import Token

try:
from cStringIO import StringIO as cStringIO
@@ -561,12 +563,84 @@ class CustomLexer(Lexer):
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

def _tree_structure_check(a, b):
"""
Checks that both Tree objects have the same structure, without checking their values.
"""
assert a.data == b.data and len(a.children) == len(b.children)
for ca,cb in zip(a.children, b.children):
assert type(ca) == type(cb)
if isinstance(ca, Tree):
_tree_structure_check(ca, cb)
elif isinstance(ca, Token):
assert ca.type == cb.type
else:
assert ca == cb

class DualBytesLark:
"""
A helper class that wraps both a normal parser, and a parser for bytes.
It automatically forwards `.parse` calls to both lexers, returning the value from the text lexer
It always checks that both produce the same output/error

NOTE: Not currently used, but left here for future debugging.
"""

def __init__(self, g, *args, **kwargs):
self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
g = self.text_lexer.grammar_source.lower()
if '\\u' in g or not isascii(g):
# Bytes re can't deal with unicode escapes
self.bytes_lark = None
else:
# Everything here should work, so use `use_bytes='force'`
self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)

def parse(self, text, start=None):
# TODO: Easy workaround, more complex checks would be beneficial
if not isascii(text) or self.bytes_lark is None:
return self.text_lexer.parse(text, start)
try:
rv = self.text_lexer.parse(text, start)
except Exception as e:
try:
self.bytes_lark.parse(text.encode(), start)
except Exception as be:
assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
raise e
assert False, "Parser without `use_bytes` raises exception, with doesn't"
try:
bv = self.bytes_lark.parse(text.encode(), start)
except Exception as be:
assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
_tree_structure_check(rv, bv)
return rv

@classmethod
def open(cls, grammar_filename, rel_to=None, **options):
if rel_to:
basepath = os.path.dirname(rel_to)
grammar_filename = os.path.join(basepath, grammar_filename)
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)

def save(self,f):
self.text_lexer.save(f)
if self.bytes_lark is not None:
self.bytes_lark.save(f)

def load(self,f):
self.text_lexer = self.text_lexer.load(f)
if self.bytes_lark is not None:
self.bytes_lark.load(f)

def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -647,6 +721,28 @@ def _make_parser_test(LEXER, PARSER):
""")
g.parse('\x01\x02\x03')

@unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
def test_bytes_utf8(self):
g = r"""
start: BOM? char+
BOM: "\xef\xbb\xbf"
char: CHAR1 | CHAR2 | CHAR3 | CHAR4
CONTINUATION_BYTE: "\x80" .. "\xbf"
CHAR1: "\x00" .. "\x7f"
CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
"""
g = _Lark(g, use_bytes=True)
s = u"🔣 地? gurīn".encode('utf-8')
self.assertEqual(len(g.parse(s).children), 10)

for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
("sjis", u"売春婦"),
("euc-jp", u"乂鵬鵠")]:
s = j.encode(enc)
self.assertRaises(UnexpectedCharacters, g.parse, s)

@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):
"""Verify that stack depth isn't an issue for EBNF grammars"""
@@ -1065,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER):
g = _Lark(g)
self.assertEqual( g.parse('"hello"').children, ['"hello"'])
self.assertEqual( g.parse("'hello'").children, ["'hello'"])
@unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
def test_join_regex_flags(self):
g = r"""
@@ -1078,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(g.parse(" ").children,[" "])
self.assertEqual(g.parse("\n ").children,["\n "])
self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")
g = r"""
start: A
A: B | C

