@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
+- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. The grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`.
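A minimal usage sketch of the option above, using the public `Lark` constructor (the grammar here is illustrative):

```python
from lark import Lark

# With use_bytes=True the grammar stays a str (ascii only), but .parse()
# accepts bytes; terminals are matched against their latin-1 encoding.
parser = Lark(r'''
    start: WORD+
    WORD: /[a-z]+/
    %ignore " "
''', parser='lalr', use_bytes=True)

print(parser.parse(b'hello world').children)  # two WORD tokens
```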
 #### Using Unicode character classes with `regex`
@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
             invert (Default: auto)
     lexer_callbacks - Dictionary of callbacks for the lexer. May alter
             tokens during lexing. Use with caution.
+    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
     edit_terminals - A callback
     """
     if __doc__:
@@ -190,11 +191,11 @@ class Lark(Serialize):
         assert isinstance(grammar, STRING_TYPE)
         self.grammar_source = grammar
         if self.options.use_bytes:
-            assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
+            if not isascii(grammar):
+                raise ValueError("Grammar must be ascii only when use_bytes=True")
             if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
-                raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. "
-                                          "It might have weird behaviour. Use `use_bytes='force'` "
-                                          "to still use it")
+                raise NotImplementedError("`use_bytes=True` may have issues on python2. "
+                                          "Use `use_bytes='force'` to use it at your own risk.")

         cache_fn = None
         if self.options.cache:
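The `isascii` helper called by the new check is not part of this diff; `str.isascii` only exists on Python 3.7+, so a version-portable fallback might look like this (a sketch, not necessarily the project's exact helper):

```python
def isascii(s):
    # str.isascii is only available on Python 3.7+
    try:
        return s.isascii()
    except AttributeError:
        try:
            s.encode('ascii')
            return True
        except (UnicodeEncodeError, UnicodeDecodeError):
            return False
```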
@@ -204,7 +205,7 @@ class Lark(Serialize):
                 cache_fn = self.options.cache
             else:
                 if self.options.cache is not True:
-                    raise ValueError("cache must be bool or str")
+                    raise ValueError("cache argument must be bool or str")
                 unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                 from . import __version__
                 options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
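For context on the values this hunk validates: `cache=True` makes Lark derive a cache filename from the grammar plus the hashable options (hence `unhashable` being excluded from `options_str`), while a `str` names the cache file explicitly. A caller-side sketch:

```python
from lark import Lark

grammar = r'start: "a"+'

# True: the cache filename is derived from the grammar and options.
# str: the cache file is named explicitly. Anything else raises ValueError.
auto_cached = Lark(grammar, parser='lalr', cache=True)
file_cached = Lark(grammar, parser='lalr', cache='my_grammar.cache')
```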
@@ -139,8 +139,8 @@ class Token(Str):

 class LineCounter:
-    def __init__(self, use_bytes=False):
-        self.newline_char = '\n' if not use_bytes else b'\n'
+    def __init__(self, newline_char):
+        self.newline_char = newline_char
         self.char_pos = 0
         self.line = 1
         self.column = 1
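The refactor makes the caller choose the newline character, which must have the same type as the stream being counted: `'abc'.count(b'\n')` raises a TypeError, so a bytes stream needs `b'\n'`. A rough sketch of how such a counter consumes tokens; the `feed` method is inferred from the fields above, not quoted from this diff:

```python
class LineCounter:
    def __init__(self, newline_char):
        self.newline_char = newline_char  # '\n' for str streams, b'\n' for bytes
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token):
        # token and newline_char must be the same type (both str or both bytes)
        newlines = token.count(self.newline_char)
        if newlines:
            self.line += newlines
            self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
```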
@@ -169,7 +169,7 @@ class _Lex:
     def lex(self, stream, newline_types, ignore_types):
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter(self.lexer.use_bytes)
+        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
         last_token = None
         while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
     while terminals:
         pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
         if use_bytes:
-            pattern = pattern.encode('utf-8')
+            pattern = pattern.encode('latin-1')
         try:
             mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
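The switch from `utf-8` to `latin-1` is the crux of this hunk: latin-1 maps code points 0-255 one-to-one onto bytes, so every `\xNN` escape in the combined pattern survives encoding unchanged, whereas utf-8 expands anything above `\x7f` into two bytes and silently changes what the compiled bytes regex matches. A standalone demonstration:

```python
import re

pattern = u'(?P<CHAR3>[\xe0-\xef])'

# latin-1 keeps each code point as the same single byte...
assert pattern.encode('latin-1') == b'(?P<CHAR3>[\xe0-\xef])'
# ...while utf-8 turns \xe0 into the two bytes \xc3\xa0, corrupting the class
assert pattern.encode('utf-8') == b'(?P<CHAR3>[\xc3\xa0-\xc3\xaf])'

mre = re.compile(pattern.encode('latin-1'))
assert mre.match(b'\xe5').lastgroup == 'CHAR3'
```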
@@ -577,11 +577,13 @@ def _tree_structure_check(a, b):
     else:
         assert ca == cb

-class DualLark:
+class DualBytesLark:
     """
     A helper class that wraps both a normal parser, and a parser for bytes.
     It automatically transforms `.parse` calls for both lexers, returning the value from the text lexer.
     It always checks that both produce the same output/error.
+
+    NOTE: Not currently used, but left here for future debugging.
     """
     def __init__(self, g, *args, **kwargs):
@@ -613,7 +615,7 @@ class DualLark:
                 assert False, "Parser without `use_bytes` doesn't raise an exception, but the one with it does"
         _tree_structure_check(rv, bv)
         return rv

     @classmethod
     def open(cls, grammar_filename, rel_to=None, **options):
         if rel_to:
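Hypothetical usage of the helper, inferred from its docstring; the exact `.parse` behaviour is an assumption:

```python
# Sketch only: DualBytesLark builds one str parser and one bytes parser,
# runs .parse() through both, cross-checks the trees, and returns the
# result from the text parser.
dual = DualBytesLark(r'start: "a"+', parser='lalr')
tree = dual.parse('aaa')  # also parses b'aaa' behind the scenes and compares
```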
@@ -635,9 +637,9 @@ class DualLark:
 def _make_parser_test(LEXER, PARSER):
     lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
-        return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
-        return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

     class _TestParser(unittest.TestCase):
         def test_basic1(self):
@@ -718,7 +720,7 @@ def _make_parser_test(LEXER, PARSER):
                 A: "\x01".."\x03"
                 """)
             g.parse('\x01\x02\x03')

+        @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
         def test_bytes_utf8(self):
             g = r"""
@@ -731,15 +733,15 @@ def _make_parser_test(LEXER, PARSER):
                 CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
                 CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
                 """
-            g = _Lark(g)
+            g = _Lark(g, use_bytes=True)
             s = u"🔣 地? gurīn".encode('utf-8')
-            self.assertEqual(len(g.bytes_lark.parse(s).children), 10)
+            self.assertEqual(len(g.parse(s).children), 10)
             for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
                            ("sjis", u"売春婦"),
                            ("euc-jp", u"乂鵬鵠")]:
                 s = j.encode(enc)
-                self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)
+                self.assertRaises(UnexpectedCharacters, g.parse, s)

         @unittest.skipIf(PARSER == 'cyk', "Takes forever")
         def test_stack_for_ebnf(self):
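The byte ranges in this grammar mirror UTF-8's framing: lead bytes `\xc0-\xdf`, `\xe0-\xef` and `\xf0-\xf7` open 2-, 3- and 4-byte sequences, each followed by continuation bytes in `\x80-\xbf`. Shift-JIS and EUC-JP output violates that framing, which is why those inputs must raise `UnexpectedCharacters`. A quick check of the framing itself, independent of Lark:

```python
s = u"🔣 地".encode('utf-8')

assert 0xf0 <= s[0] <= 0xf7                    # 4-byte lead byte for the emoji
assert all(0x80 <= b <= 0xbf for b in s[1:4])  # its three continuation bytes
assert s[4] == 0x20                            # the ascii space matches CHAR1

# Shift-JIS / EUC-JP encode the same text with byte sequences that do not
# follow this lead/continuation pattern, so the bytes lexer rejects them.
```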
@@ -1159,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER):
             g = _Lark(g)
             self.assertEqual(g.parse('"hello"').children, ['"hello"'])
             self.assertEqual(g.parse("'hello'").children, ["'hello'"])

+        @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
         def test_join_regex_flags(self):
             g = r"""
@@ -1172,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(g.parse(" ").children, [" "])
             self.assertEqual(g.parse("\n ").children, ["\n "])
             self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")

             g = r"""
             start: A
             A: B | C