Browse Source

Tests for bytes parser (credit to @ctrlcctrlv)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
MegaIng1 4 years ago
parent
commit
c93106f143
6 changed files with 127 additions and 10 deletions
  1. +1
    -0
      lark-stubs/lark.pyi
  2. +9
    -2
      lark/lark.py
  3. +4
    -4
      lark/lexer.py
  4. +2
    -0
      lark/parser_frontends.py
  5. +14
    -1
      lark/utils.py
  6. +97
    -3
      tests/test_parser.py

+ 1
- 0
lark-stubs/lark.pyi View File

@@ -36,6 +36,7 @@ class LarkOptions:

class Lark:
source: str
grammar_source: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]


+ 9
- 2
lark/lark.py View File

@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
from io import open


from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -115,7 +115,7 @@ class LarkOptions(Serialize):
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool) and name != 'cache':
if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
value = bool(value)
else:
value = default
@@ -188,6 +188,13 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
if self.options.use_bytes:
assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. "
"It might have weird behaviour. Use `use_bytes='force'` "
"to still use it")

cache_fn = None
if self.options.cache:


+ 4
- 4
lark/lexer.py View File

@@ -139,8 +139,8 @@ class Token(Str):


class LineCounter:
def __init__(self):
self.newline_char = '\n'
def __init__(self, use_bytes=False):
self.newline_char = '\n' if not use_bytes else b'\n'
self.char_pos = 0
self.line = 1
self.column = 1
@@ -169,7 +169,7 @@ class _Lex:
def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter()
line_ctr = LineCounter(self.lexer.use_bytes)
last_token = None

while line_ctr.char_pos < len(stream):
@@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode()
pattern = pattern.encode('utf-8')
try:
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/


+ 2
- 0
lark/parser_frontends.py View File

@@ -184,6 +184,8 @@ class XEarley(_ParserFrontend):
else:
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
if lexer_conf.use_bytes:
regexp = regexp.encode('utf-8')

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)



+ 14
- 1
lark/utils.py View File

@@ -305,4 +305,17 @@ def combine_alternatives(lists):

class FS:
open = open
exists = os.path.exists
exists = os.path.exists



def isascii(s):
    """ str.isascii only exists in python3.7+ """
    native = getattr(s, 'isascii', None)
    if native is not None:
        # Python 3.7+: delegate to the built-in C implementation.
        return native()
    # Older Pythons: a string is ASCII iff it round-trips through the
    # 'ascii' codec without error.
    try:
        s.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return False
    return True

+ 97
- 3
tests/test_parser.py View File

@@ -8,7 +8,9 @@ import os
import sys
from copy import copy, deepcopy

from lark.utils import Py36
from lark.utils import Py36, isascii

from lark import Token

try:
from cStringIO import StringIO as cStringIO
@@ -561,12 +563,82 @@ class CustomLexer(Lexer):
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

def _tree_structure_check(a, b):
    """
    Checks that both Tree objects have the same structure, without checking their values.
    """
    assert a.data == b.data and len(a.children) == len(b.children)
    for left, right in zip(a.children, b.children):
        # Children must pair up type-for-type before comparing contents.
        assert type(left) == type(right)
        if isinstance(left, Tree):
            # Recurse into subtrees, still ignoring leaf values.
            _tree_structure_check(left, right)
        elif isinstance(left, Token):
            # For tokens only the type matters, not the matched text.
            assert left.type == right.type
        else:
            assert left == right

class DualLark:
    """
    A helper class that wraps both a normal parser, and a parser for bytes.
    It automatically transforms `.parse` calls for both lexers, returning the value from the text lexer.
    It always checks that both produce the same output/error.
    """

    def __init__(self, g, *args, **kwargs):
        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
        g = self.text_lexer.grammar_source.lower()
        if '\\u' in g or not isascii(g):
            # Bytes re can't deal with unicode escapes
            self.bytes_lark = None
        else:
            # Everything here should work, so use `use_bytes='force'`
            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)

    def parse(self, text, start=None):
        # TODO: Easy workaround, more complex checks would be beneficial
        if not isascii(text) or self.bytes_lark is None:
            return self.text_lexer.parse(text, start)
        try:
            rv = self.text_lexer.parse(text, start)
        except Exception as e:
            # Text parser failed: the bytes parser must fail with the same
            # exception type, otherwise the two implementations diverge.
            try:
                self.bytes_lark.parse(text.encode(), start)
            except Exception as be:
                assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
                raise e
            assert False, "Parser without `use_bytes` raises exception, with doesn't"
        try:
            bv = self.bytes_lark.parse(text.encode(), start)
        except Exception as be:
            assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
        _tree_structure_check(rv, bv)
        return rv

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        # Mirrors Lark.open: resolve the grammar path relative to `rel_to`.
        if rel_to:
            basepath = os.path.dirname(rel_to)
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)

    def save(self, f):
        self.text_lexer.save(f)
        if self.bytes_lark is not None:
            self.bytes_lark.save(f)

    def load(self, f):
        # Lark.load returns a new instance, so the result must be rebound.
        # BUG FIX: the original discarded the result of the bytes parser's
        # load, leaving `self.bytes_lark` as the stale pre-load instance.
        self.text_lexer = self.text_lexer.load(f)
        if self.bytes_lark is not None:
            self.bytes_lark = self.bytes_lark.load(f)

def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -646,6 +718,28 @@ def _make_parser_test(LEXER, PARSER):
A: "\x01".."\x03"
""")
g.parse('\x01\x02\x03')
# NOTE(review): exercises the bytes parser on raw UTF-8 byte streams by
# modelling the UTF-8 byte-sequence structure directly in the grammar
# (1- to 4-byte sequences plus an optional BOM).
@unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
def test_bytes_utf8(self):
g = r"""
start: BOM? char+
BOM: "\xef\xbb\xbf"
char: CHAR1 | CHAR2 | CHAR3 | CHAR4
CONTINUATION_BYTE: "\x80" .. "\xbf"
CHAR1: "\x00" .. "\x7f"
CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
"""
g = _Lark(g)
# 10 UTF-8 characters -> 10 `char` children in the parse tree.
s = u"🔣 地? gurīn".encode('utf-8')
self.assertEqual(len(g.bytes_lark.parse(s).children), 10)

# Non-UTF-8 encodings (Shift-JIS / EUC-JP) produce byte sequences that
# are invalid UTF-8, so the grammar must reject them.
for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
("sjis", u"売春婦"),
("euc-jp", u"乂鵬鵠")]:
s = j.encode(enc)
self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s)

@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):


Loading…
Cancel
Save