Browse Source

Fixed issues with the use_bytes PR, and added documentation

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 4 years ago
parent
commit
7c6e94bf73
4 changed files with 23 additions and 19 deletions
  1. +1
    -0
      docs/classes.md
  2. +6
    -5
      lark/lark.py
  3. +4
    -4
      lark/lexer.py
  4. +12
    -10
      tests/test_parser.py

+ 1
- 0
docs/classes.md View File

@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
- **edit_terminals** - A callback
- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. The grammar should still be specified as `str`, and terminal values are assumed to be encoded as `latin-1`.


#### Using Unicode character classes with `regex`


+ 6
- 5
lark/lark.py View File

@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
invert (Default: auto)
lexer_callbacks - Dictionary of callbacks for the lexer. May alter
tokens during lexing. Use with caution.
use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
edit_terminals - A callback
"""
if __doc__:
@@ -190,11 +191,11 @@ class Lark(Serialize):
assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
if self.options.use_bytes:
assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. "
"It might have weird behaviour. Use `use_bytes='force'` "
"to still use it")
raise NotImplementedError("`use_bytes=True` may have issues on python2."
"Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
@@ -204,7 +205,7 @@ class Lark(Serialize):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ValueError("cache must be bool or str")
raise ValueError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)


+ 4
- 4
lark/lexer.py View File

@@ -139,8 +139,8 @@ class Token(Str):


class LineCounter:
def __init__(self, use_bytes=False):
self.newline_char = '\n' if not use_bytes else b'\n'
def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
self.line = 1
self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter(self.lexer.use_bytes)
line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
last_token = None

while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode('utf-8')
pattern = pattern.encode('latin-1')
try:
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/


+ 12
- 10
tests/test_parser.py View File

@@ -577,11 +577,13 @@ def _tree_structure_check(a, b):
else:
assert ca == cb

class DualLark:
class DualBytesLark:
"""
A helper class that wraps both a normal parser, and a parser for bytes.
It automatically transforms `.parse` calls for both lexers, returning the value from the text lexer.
It always checks that both produce the same output/error.

NOTE: Not currently used, but left here for future debugging.
"""

def __init__(self, g, *args, **kwargs):
@@ -613,7 +615,7 @@ class DualLark:
assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
_tree_structure_check(rv, bv)
return rv
@classmethod
def open(cls, grammar_filename, rel_to=None, **options):
if rel_to:
@@ -635,9 +637,9 @@ class DualLark:
def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

class _TestParser(unittest.TestCase):
def test_basic1(self):
@@ -718,7 +720,7 @@ def _make_parser_test(LEXER, PARSER):
A: "\x01".."\x03"
""")
g.parse('\x01\x02\x03')
@unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
def test_bytes_utf8(self):
g = r"""
@@ -731,15 +733,15 @@ def _make_parser_test(LEXER, PARSER):
CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
"""
g = _Lark(g)
g = _Lark(g, use_bytes=True)
s = u"🔣 地? gurīn".encode('utf-8')
self.assertEqual(len(g.bytes_lark.parse(s).children), 10)
self.assertEqual(len(g.parse(s).children), 10)

for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
("sjis", u"売春婦"),
("euc-jp", u"乂鵬鵠")]:
s = j.encode(enc)
self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)
self.assertRaises(UnexpectedCharacters, g.parse, s)

@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):
@@ -1159,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER):
g = _Lark(g)
self.assertEqual( g.parse('"hello"').children, ['"hello"'])
self.assertEqual( g.parse("'hello'").children, ["'hello'"])
@unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
def test_join_regex_flags(self):
g = r"""
@@ -1172,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(g.parse(" ").children,[" "])
self.assertEqual(g.parse("\n ").children,["\n "])
self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")
g = r"""
start: A
A: B | C


Loading…
Cancel
Save