From 0c89189b5fed475fe265ca29b63e68857f2f4c7e Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Thu, 23 Jul 2020 19:11:53 +0200
Subject: [PATCH] Support for bytes parser

---
 lark-stubs/lark.pyi |  4 +++-
 lark/common.py      |  5 +++--
 lark/exceptions.py  | 17 +++++++++++++----
 lark/lark.py        |  5 +++--
 lark/lexer.py       | 25 +++++++++++++++----------
 5 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 511e0ad..4f5f57e 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -31,6 +31,7 @@ class LarkOptions:
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]
     g_regex_flags: int
+    use_bytes: bool
 
 
 class Lark:
@@ -55,7 +56,8 @@ class Lark:
         propagate_positions: bool = False,
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
-        g_regex_flags: int = ...
+        g_regex_flags: int = ...,
+        use_bytes: bool = False,
     ):
         ...
 
diff --git a/lark/common.py b/lark/common.py
index 5c55b8c..cc8c73c 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -4,10 +4,10 @@ from .lexer import TerminalDef
 ###{standalone
 
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,
 
-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
         self.tokens = tokens  # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex
@@ -15,6 +15,7 @@ class LexerConf(Serialize):
         self.g_regex_flags = g_regex_flags
         self.re_module = re_module
         self.skip_validation = skip_validation
+        self.use_bytes = use_bytes
 
     def _deserialize(self):
         self.callbacks = {} # TODO
diff --git a/lark/exceptions.py b/lark/exceptions.py
index 1c5e533..033275c 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -28,9 +28,14 @@ class UnexpectedInput(LarkError):
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span
-        before = text[start:pos].rsplit('\n', 1)[-1]
-        after = text[pos:end].split('\n', 1)[0]
-        return before + after + '\n' + ' ' * len(before) + '^\n'
+        if not isinstance(text, bytes):
+            before = text[start:pos].rsplit('\n', 1)[-1]
+            after = text[pos:end].split('\n', 1)[0]
+            return before + after + '\n' + ' ' * len(before) + '^\n'
+        else:
+            before = text[start:pos].rsplit(b'\n', 1)[-1]
+            after = text[pos:end].split(b'\n', 1)[0]
+            return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")
 
     def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
         """ Given a parser instance and a dictionary mapping some label with
@@ -67,7 +72,11 @@ class UnexpectedInput(LarkError):
 
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
+
+        if isinstance(seq, bytes):
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
+        else:
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
 
         self.line = line
         self.column = column
diff --git a/lark/lark.py b/lark/lark.py
index 232dbb7..36e92b1 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -105,6 +105,7 @@ class LarkOptions(Serialize):
         'maybe_placeholders': False,
         'edit_terminals': None,
         'g_regex_flags': 0,
+        'use_bytes': False,
     }
 
     def __init__(self, options_dict):
@@ -252,7 +253,7 @@ class Lark(Serialize):
             for t in self.terminals:
                 self.options.edit_terminals(t)
 
-        self._terminals_dict = {t.name:t for t in self.terminals}
+        self._terminals_dict = {t.name: t for t in self.terminals}
 
         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.
@@ -276,7 +277,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)
 
-        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)
 
         if self.options.parser:
             self.parser = self._build_parser()
diff --git a/lark/lexer.py b/lark/lexer.py
index 4979500..6039c54 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -230,7 +230,7 @@ class CallChain:
 
 
 
-def _create_unless(terminals, g_regex_flags, re_):
+def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_):
                     if strtok.pattern.flags <= retok.pattern.flags:
                         embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
 
-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
     while terminals:
+        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+        if use_bytes:
+            pattern = pattern.encode()
         try:
-            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
 
         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, g_regex_flags, re_, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
+def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -321,12 +324,13 @@ class TraditionalLexer(Lexer):
         self.terminals = terminals
         self.user_callbacks = conf.callbacks
         self.g_regex_flags = conf.g_regex_flags
+        self.use_bytes = conf.use_bytes
 
         self._mres = None
         # self.build(g_regex_flags)
 
     def _build(self):
-        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():
@@ -336,7 +340,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f
 
-        self._mres = build_mres(terminals, self.g_regex_flags, self.re)
+        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
     def mres(self):
@@ -365,7 +369,8 @@ class ContextualLexer(Lexer):
                 assert t.name not in tokens_by_name, t
                 tokens_by_name[t.name] = t
 
-        trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
+        trad_conf = copy(conf)
+        trad_conf.tokens = terminals
 
         lexer_by_tokens = {}
         self.lexers = {}
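
Usage note (not part of the patch): a minimal sketch of how the new use_bytes option would presumably be driven from user code once this lands. The diff forwards the flag from Lark(...) through LarkOptions and LexerConf into the lexer, which then compiles its terminal patterns as bytes regexps. The grammar, parser choice, and input below are illustrative assumptions, not taken from the patch itself.

    # Illustrative sketch only -- assumes the use_bytes flag added in this patch
    # is exposed through the public Lark constructor, as the lark-stubs change suggests.
    from lark import Lark

    parser = Lark(r"""
        start: WORD ";"
        WORD: /[a-z]+/
        %ignore " "
    """, parser='lalr', use_bytes=True)

    # With use_bytes=True the parser is fed a bytes object rather than str;
    # the lexer matches it against the byte-encoded terminal patterns.
    tree = parser.parse(b"hello ;")
    print(tree.children)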