Fixed issues with the use_bytes PR, and added documentation

5 years ago · 7c6e94bf73
--- a/docs/classes.md
+++ b/docs/classes.md
@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
 - **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. The grammar should still be specified as `str`, and terminal values are assumed to be encoded as `latin-1`.


 #### Using Unicode character classes with `regex`
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
                invert (Default: auto)
    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                        tokens during lexing. Use with caution.
    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
    edit_terminals - A callback
    """
    if __doc__:
@@ -190,11 +191,11 @@ class Lark(Serialize):
        assert isinstance(grammar, STRING_TYPE)
        self.grammar_source = grammar
        if self.options.use_bytes:
            assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
            if not isascii(grammar):
                raise ValueError("Grammar must be ascii only, when use_bytes=True")
            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
                raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. "
                                          "It might have weird behaviour. Use `use_bytes='force'` "
                                          "to still use it")
                raise NotImplementedError("`use_bytes=True` may have issues on python2."
                                          "Use `use_bytes='force'` to use it at your own risk.")

        cache_fn = None
        if self.options.cache:
@@ -204,7 +205,7 @@ class Lark(Serialize):
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ValueError("cache must be bool or str")
                    raise ValueError("cache argument must be bool or str")
                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                from . import __version__
                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -139,8 +139,8 @@ class Token(Str):


 class LineCounter:
    def __init__(self, use_bytes=False):
        self.newline_char = '\n' if not use_bytes else b'\n'
    def __init__(self, newline_char):
        self.newline_char = newline_char
        self.char_pos = 0
        self.line = 1
        self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
    def lex(self, stream, newline_types, ignore_types):
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter(self.lexer.use_bytes)
        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
        last_token = None

        while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
    while terminals:
        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
        if use_bytes:
            pattern = pattern.encode('utf-8')
            pattern = pattern.encode('latin-1')
        try:
            mre = re_.compile(pattern, g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -577,11 +577,13 @@ def _tree_structure_check(a, b):
        else:
            assert ca == cb

 class DualLark:
 class DualBytesLark:
    """
    A helper class that wraps both a normal parser, and a parser for bytes.
    It automatically transforms `.parse` calls for both lexers, returning the value from the text lexer.
    It always checks that both produce the same output/error.

    NOTE: Not currently used, but left here for future debugging.
    """

    def __init__(self, g, *args, **kwargs):
@@ -613,7 +615,7 @@ class DualLark:
            assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
        _tree_structure_check(rv, bv)
        return rv
    

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        if rel_to:
@@ -635,9 +637,9 @@ class DualLark:
 def _make_parser_test(LEXER, PARSER):
    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
    def _Lark(grammar, **kwargs):
        return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
    def _Lark_open(gfilename, **kwargs):
        return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
@@ -718,7 +720,7 @@ def _make_parser_test(LEXER, PARSER):
                          A: "\x01".."\x03"
                          """)
            g.parse('\x01\x02\x03')
        

        @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
        def test_bytes_utf8(self):
            g = r"""
@@ -731,15 +733,15 @@ def _make_parser_test(LEXER, PARSER):
            CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
            CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
            """
            g = _Lark(g)
            g = _Lark(g, use_bytes=True)
            s = u"🔣 地? gurīn".encode('utf-8')
            self.assertEqual(len(g.bytes_lark.parse(s).children), 10)
            self.assertEqual(len(g.parse(s).children), 10)

            for enc, j in [("sjis", u"地球の絵はグリーンでグッド?  Chikyuu no e wa guriin de guddo"),
                           ("sjis", u"売春婦"),
                           ("euc-jp", u"乂鵬鵠")]:
                s = j.encode(enc)
                self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)
                self.assertRaises(UnexpectedCharacters, g.parse, s)

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
@@ -1159,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER):
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])
        

        @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
        def test_join_regex_flags(self):
            g = r"""
@@ -1172,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER):
            self.assertEqual(g.parse("  ").children,["  "])
            self.assertEqual(g.parse("\n ").children,["\n "])
            self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")
            

            g = r"""
                start: A
                A: B | C