Browse Source

Tests for bytes parser (credit to @ctrlcctrlv)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
MegaIng1 4 years ago
parent
commit
c93106f143
6 changed files with 127 additions and 10 deletions
  1. +1
    -0
      lark-stubs/lark.pyi
  2. +9
    -2
      lark/lark.py
  3. +4
    -4
      lark/lexer.py
  4. +2
    -0
      lark/parser_frontends.py
  5. +14
    -1
      lark/utils.py
  6. +97
    -3
      tests/test_parser.py

+ 1
- 0
lark-stubs/lark.pyi View File

@@ -36,6 +36,7 @@ class LarkOptions:

class Lark:
source: str
grammar_source: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]


+ 9
- 2
lark/lark.py View File

@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
from io import open


from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -115,7 +115,7 @@ class LarkOptions(Serialize):
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool) and name != 'cache':
if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
value = bool(value)
else:
value = default
@@ -188,6 +188,13 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
if self.options.use_bytes:
assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. "
"It might have weird behaviour. Use `use_bytes='force'` "
"to still use it")

cache_fn = None
if self.options.cache:


+ 4
- 4
lark/lexer.py View File

@@ -139,8 +139,8 @@ class Token(Str):


class LineCounter:
def __init__(self):
self.newline_char = '\n'
def __init__(self, use_bytes=False):
self.newline_char = '\n' if not use_bytes else b'\n'
self.char_pos = 0
self.line = 1
self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter()
line_ctr = LineCounter(self.lexer.use_bytes)
last_token = None

while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode()
pattern = pattern.encode('utf-8')
try:
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/


+ 2
- 0
lark/parser_frontends.py View File

@@ -184,6 +184,8 @@ class XEarley(_ParserFrontend):
else:
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
if lexer_conf.use_bytes:
regexp = regexp.encode('utf-8')

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)



+ 14
- 1
lark/utils.py View File

@@ -305,4 +305,17 @@ def combine_alternatives(lists):

class FS:
open = open
exists = os.path.exists
exists = os.path.exists



def isascii(s):
    """ str.isascii only exists in python3.7+ """
    native = getattr(s, 'isascii', None)
    if native is not None:
        # Python 3.7+: delegate to the built-in C implementation.
        return native()
    # Older Pythons: a string is ASCII iff it round-trips through the
    # 'ascii' codec without error.
    try:
        s.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return False
    return True

+ 97
- 3
tests/test_parser.py View File

@@ -8,7 +8,9 @@ import os
import sys
from copy import copy, deepcopy

from lark.utils import Py36
from lark.utils import Py36, isascii

from lark import Token

try:
from cStringIO import StringIO as cStringIO
@@ -561,12 +563,82 @@ class CustomLexer(Lexer):
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

def _tree_structure_check(a, b):
    """
    Checks that both Tree objects have the same structure, without checking their values.
    """
    assert a.data == b.data and len(a.children) == len(b.children)
    for left, right in zip(a.children, b.children):
        # Children must pair up type-for-type before comparing contents.
        assert type(left) == type(right)
        if isinstance(left, Tree):
            # Recurse into subtrees, still ignoring leaf values.
            _tree_structure_check(left, right)
        elif isinstance(left, Token):
            # For tokens only the type matters, not the matched text.
            assert left.type == right.type
        else:
            assert left == right

class DualLark:
    """
    A helper class that wraps both a normal parser, and a parser for bytes.
    It automatically transforms `.parse` calls for both lexers, returning the value from the text lexer.
    It always checks that both produce the same output/error.
    """

    def __init__(self, g, *args, **kwargs):
        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
        g = self.text_lexer.grammar_source.lower()
        if '\\u' in g or not isascii(g):
            # Bytes re can't deal with unicode escapes
            self.bytes_lark = None
        else:
            # Everything here should work, so use `use_bytes='force'`
            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)

    def parse(self, text, start=None):
        # TODO: Easy workaround, more complex checks would be beneficial
        if not isascii(text) or self.bytes_lark is None:
            return self.text_lexer.parse(text, start)
        try:
            rv = self.text_lexer.parse(text, start)
        except Exception as e:
            # Text parser failed: the bytes parser must fail with the same
            # exception type, otherwise the two implementations diverge.
            try:
                self.bytes_lark.parse(text.encode(), start)
            except Exception as be:
                assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
                raise e
            assert False, "Parser without `use_bytes` raises exception, with doesn't"
        try:
            bv = self.bytes_lark.parse(text.encode(), start)
        except Exception as be:
            assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
        _tree_structure_check(rv, bv)
        return rv

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        # Mirrors Lark.open: resolve the grammar path relative to `rel_to`.
        if rel_to:
            basepath = os.path.dirname(rel_to)
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)

    def save(self, f):
        self.text_lexer.save(f)
        if self.bytes_lark is not None:
            self.bytes_lark.save(f)

    def load(self, f):
        # Lark.load returns a new instance, so the result must be rebound.
        # BUG FIX: the original discarded the result of the bytes parser's
        # load, leaving `self.bytes_lark` as the stale pre-load instance.
        self.text_lexer = self.text_lexer.load(f)
        if self.bytes_lark is not None:
            self.bytes_lark = self.bytes_lark.load(f)

def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -646,6 +718,28 @@ def _make_parser_test(LEXER, PARSER):
A: "\x01".."\x03"
""")
g.parse('\x01\x02\x03')
# NOTE(review): exercises the bytes parser on raw UTF-8 byte streams by
# modelling the UTF-8 byte-sequence structure directly in the grammar
# (1- to 4-byte sequences plus an optional BOM).
@unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
def test_bytes_utf8(self):
g = r"""
start: BOM? char+
BOM: "\xef\xbb\xbf"
char: CHAR1 | CHAR2 | CHAR3 | CHAR4
CONTINUATION_BYTE: "\x80" .. "\xbf"
CHAR1: "\x00" .. "\x7f"
CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
"""
g = _Lark(g)
# 10 UTF-8 characters -> 10 `char` children in the parse tree.
s = u"🔣 地? gurīn".encode('utf-8')
self.assertEqual(len(g.bytes_lark.parse(s).children), 10)

# Non-UTF-8 encodings (Shift-JIS / EUC-JP) produce byte sequences that
# are invalid UTF-8, so the grammar must reject them.
for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
("sjis", u"売春婦"),
("euc-jp", u"乂鵬鵠")]:
s = j.encode(enc)
self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)

@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):


Loading…
Cancel
Save