diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 4f5f57e..131bbe0 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -36,6 +36,7 @@ class LarkOptions:
 
 class Lark:
     source: str
+    grammar_source: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]
diff --git a/lark/lark.py b/lark/lark.py
index 36e92b1..a1ed414 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
 from io import open
 
 
-from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -115,7 +115,7 @@ class LarkOptions(Serialize):
         for name, default in self._defaults.items():
             if name in o:
                 value = o.pop(name)
-                if isinstance(default, bool) and name != 'cache':
+                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                     value = bool(value)
             else:
                 value = default
@@ -188,6 +188,13 @@ class Lark(Serialize):
                 grammar = read()
 
         assert isinstance(grammar, STRING_TYPE)
+        self.grammar_source = grammar
+        if self.options.use_bytes:
+            assert isascii(grammar), "If creating a parser for bytes, the grammar must be ASCII-only"
+            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                raise NotImplementedError("`use_bytes=True` is not fully supported on Python 2.7 "
+                                          "and might behave unexpectedly. "
+                                          "Use `use_bytes='force'` to use it anyway")
 
         cache_fn = None
         if self.options.cache:
diff --git a/lark/lexer.py b/lark/lexer.py
index 6039c54..fdc7429 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -139,8 +139,8 @@ class Token(Str):
 
 
 class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
+    def __init__(self, use_bytes=False):
+        self.newline_char = '\n' if not use_bytes else b'\n'
         self.char_pos = 0
         self.line = 1
         self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
     def lex(self, stream, newline_types, ignore_types):
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter()
+        line_ctr = LineCounter(self.lexer.use_bytes)
         last_token = None
 
         while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
     while terminals:
         pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
         if use_bytes:
-            pattern = pattern.encode()
+            pattern = pattern.encode('utf-8')
         try:
             mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 08f4756..0b2d5f2 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -184,6 +184,8 @@ class XEarley(_ParserFrontend):
             else:
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
+            if lexer_conf.use_bytes:
+                regexp = regexp.encode('utf-8')
 
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
diff --git a/lark/utils.py b/lark/utils.py
index 36f50d1..c70b947 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -305,4 +305,17 @@ def combine_alternatives(lists):
 
 class FS:
     open = open
-    exists = os.path.exists
\ No newline at end of file
+    exists = os.path.exists
+
+
+
+def isascii(s):
+    """ str.isascii only exists in python3.7+ """
+    try:
+        return s.isascii()
+    except AttributeError:
+        try:
+            s.encode('ascii')
+            return True
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            return False
\ No newline at end of file
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1249211..4fd6cea 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -8,7 +8,9 @@ import os
 import sys
 from copy import copy, deepcopy
 
-from lark.utils import Py36
+from lark.utils import Py36, isascii
+
+from lark import Token
 
 try:
     from cStringIO import StringIO as cStringIO
@@ -561,12 +563,82 @@ class CustomLexer(Lexer):
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
 
+def _tree_structure_check(a, b):
+    """
+    Checks that both Tree objects have the same structure, without checking their values.
+    """
+    assert a.data == b.data and len(a.children) == len(b.children)
+    for ca, cb in zip(a.children, b.children):
+        assert type(ca) == type(cb)
+        if isinstance(ca, Tree):
+            _tree_structure_check(ca, cb)
+        elif isinstance(ca, Token):
+            assert ca.type == cb.type
+        else:
+            assert ca == cb
+
+class DualLark:
+    """
+    A helper class that wraps both a normal parser and a parser for bytes.
+    It automatically forwards `.parse` calls to both lexers, returning the value from the text lexer.
+    It always checks that both produce the same output/error.
+    """
+
+    def __init__(self, g, *args, **kwargs):
+        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
+        g = self.text_lexer.grammar_source.lower()
+        if '\\u' in g or not isascii(g):
+            # The bytes re module can't deal with unicode escapes
+            self.bytes_lark = None
+        else:
+            # Everything here should work, so use `use_bytes='force'`
+            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)
+
+    def parse(self, text, start=None):
+        # TODO: This is an easy workaround; more thorough checks would be beneficial
+        if not isascii(text) or self.bytes_lark is None:
+            return self.text_lexer.parse(text, start)
+        try:
+            rv = self.text_lexer.parse(text, start)
+        except Exception as e:
+            try:
+                self.bytes_lark.parse(text.encode(), start)
+            except Exception as be:
+                assert type(e) == type(be), "Parsers with and without `use_bytes` raise different exceptions"
+                raise e
+            assert False, "Parser without `use_bytes` raises an exception, but the parser with it doesn't"
+        try:
+            bv = self.bytes_lark.parse(text.encode(), start)
+        except Exception as be:
+            assert False, "Parser without `use_bytes` doesn't raise an exception, but the parser with it does"
+        _tree_structure_check(rv, bv)
+        return rv
+
+    @classmethod
+    def open(cls, grammar_filename, rel_to=None, **options):
+        if rel_to:
+            basepath = os.path.dirname(rel_to)
+            grammar_filename = os.path.join(basepath, grammar_filename)
+        with open(grammar_filename, encoding='utf8') as f:
+            return cls(f, **options)
+
+    def save(self, f):
+        self.text_lexer.save(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.save(f)
+
+    def load(self, f):
+        self.text_lexer = self.text_lexer.load(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.load(f)
+
 def _make_parser_test(LEXER, PARSER):
     lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
-        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+        return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
-        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+        return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+
 class _TestParser(unittest.TestCase):
     def test_basic1(self):
         g = _Lark("""start: a+ b a* "b" a*
@@ -646,6 +718,28 @@ def _make_parser_test(LEXER, PARSER):
             A: "\x01".."\x03"
             """)
         g.parse('\x01\x02\x03')
+
+    @unittest.skipIf(sys.version_info[:2] == (2, 7), "the bytes parser isn't perfect in Python 2.7; exceptions don't work correctly")
+    def test_bytes_utf8(self):
+        g = r"""
+        start: BOM? char+
+        BOM: "\xef\xbb\xbf"
+        char: CHAR1 | CHAR2 | CHAR3 | CHAR4
+        CONTINUATION_BYTE: "\x80" .. "\xbf"
+        CHAR1: "\x00" .. "\x7f"
+        CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
+        CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
+        CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
+        """
+        g = _Lark(g)
+        s = u"🔣 地? gurīn".encode('utf-8')
+        self.assertEqual(len(g.bytes_lark.parse(s).children), 10)
+
+        for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
+                       ("sjis", u"売春婦"),
+                       ("euc-jp", u"乂鵬鵠")]:
+            s = j.encode(enc)
+            self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)
 
     @unittest.skipIf(PARSER == 'cyk', "Takes forever")
     def test_stack_for_ebnf(self):
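
A minimal usage sketch of the new `use_bytes` option, with this patch applied on Python 3. The grammar and input below are illustrative, not part of the patch; note that `use_bytes` also accepts the string 'force' (hence its exemption from bool coercion in LarkOptions), and that the grammar text itself must be ASCII-only per the new assertion in Lark.__init__:

    from lark import Lark

    # Illustrative ASCII-only grammar; use_bytes=True requires this,
    # per the isascii() assertion added in Lark.__init__.
    parser = Lark(r"""
        start: WORD ","? WORD "!"
        WORD: /[a-zA-Z]+/
        %ignore " "
    """, parser='lalr', use_bytes=True)

    # With use_bytes=True, parse() takes bytes instead of str.
    tree = parser.parse(b"Hello, World!")
    print(tree.data)           # 'start'
    print(len(tree.children))  # 2 -- the two WORD tokens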
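Background for the `.encode('utf-8')` calls added in `_build_mres` and `XEarley`: on Python 3, the stdlib `re` module refuses to match a str pattern against bytes input, so each pattern must be compiled as bytes before a byte string can be lexed. A standalone demonstration, separate from the patch (the pattern and input are made up for illustration):

    import re

    # A str pattern cannot be applied to bytes input: re raises TypeError.
    str_pattern = re.compile(u'(?P<WORD>[a-z]+)')
    try:
        str_pattern.match(b"hello")
    except TypeError as e:
        print(e)  # cannot use a string pattern on a bytes-like object

    # Encoding the pattern first (as _build_mres and XEarley now do) yields
    # a bytes pattern, which matches bytes and returns bytes groups.
    bytes_pattern = re.compile(u'(?P<WORD>[a-z]+)'.encode('utf-8'))
    print(bytes_pattern.match(b"hello").group('WORD'))  # b'hello'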