From 8a0028470031e0542d343da0f01c4db890c86c6e Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 27 Jun 2021 23:31:46 +0300 Subject: [PATCH 01/34] Removed some of the Python2 compatibility code --- lark/exceptions.py | 4 +-- lark/lark.py | 16 +++++------ lark/lexer.py | 9 ++++--- lark/load_grammar.py | 7 ++--- lark/tools/standalone.py | 1 + lark/utils.py | 57 +++++----------------------------------- 6 files changed, 25 insertions(+), 69 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 26ffce3..9099aa3 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,6 +1,6 @@ from warnings import warn -from .utils import STRING_TYPE, logger, NO_VALUE +from .utils import logger, NO_VALUE ###{standalone @@ -90,7 +90,7 @@ class UnexpectedInput(LarkError): candidate = (None, False) for i, (label, example) in enumerate(examples): - assert not isinstance(example, STRING_TYPE) + assert not isinstance(example, str), "Expecting a list" for j, malformed in enumerate(example): try: diff --git a/lark/lark.py b/lark/lark.py index 8e879cc..493081a 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,14 +1,13 @@ from __future__ import absolute_import - -from lark.exceptions import ConfigurationError, assert_config - +from abc import ABC, abstractmethod import sys, os, pickle, hashlib from io import open import tempfile from warnings import warn -from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod +from .exceptions import ConfigurationError, assert_config +from .utils import Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files from .tree import Tree from .common import LexerConf, ParserConf @@ -153,7 +152,7 @@ class LarkOptions(Serialize): options[name] = value - if isinstance(options['start'], STRING_TYPE): + if isinstance(options['start'], str): options['start'] = [options['start']] self.__dict__['options'] = options @@ -247,14 +246,11 @@ class Lark(Serialize): cache_fn = None cache_md5 = None - if isinstance(grammar, STRING_TYPE): + if isinstance(grammar, str): self.source_grammar = grammar if self.options.use_bytes: if not isascii(grammar): raise ConfigurationError("Grammar must be ascii only, when use_bytes=True") - if sys.version_info[0] == 2 and self.options.use_bytes != 'force': - raise ConfigurationError("`use_bytes=True` may have issues on python2." - "Use `use_bytes='force'` to use it at your own risk.") if self.options.cache: if self.options.parser != 'lalr': @@ -266,7 +262,7 @@ class Lark(Serialize): s = grammar + options_str + __version__ + str(sys.version_info[:2]) cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest() - if isinstance(self.options.cache, STRING_TYPE): + if isinstance(self.options.cache, str): cache_fn = self.options.cache else: if self.options.cache is not True: diff --git a/lark/lexer.py b/lark/lexer.py index a2aefd2..0349cde 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,8 +1,9 @@ # Lexer Implementation import re +from contextlib import suppress -from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress +from .utils import classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone @@ -110,7 +111,7 @@ class TerminalDef(Serialize): return self.name -class Token(Str): +class Token(str): """A string with meta-information, that is produced by the lexer. 
When parsing text, the resulting chunks of the input that haven't been discarded, @@ -177,9 +178,9 @@ class Token(Str): if isinstance(other, Token) and self.type != other.type: return False - return Str.__eq__(self, other) + return str.__eq__(self, other) - __hash__ = Str.__hash__ + __hash__ = str.__hash__ class LineCounter: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dbf4a1f..e437267 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -8,6 +8,7 @@ from io import open import pkgutil from ast import literal_eval from numbers import Integral +from contextlib import suppress from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -16,7 +17,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import ParsingFrontend from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol -from .utils import classify, suppress, dedup_list, Str +from .utils import classify, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError from .tree import Tree, SlottedTree as ST @@ -539,9 +540,9 @@ class PrepareSymbols(Transformer_InPlace): if isinstance(v, Tree): return v elif v.type == 'RULE': - return NonTerminal(Str(v.value)) + return NonTerminal(str(v.value)) elif v.type == 'TERMINAL': - return Terminal(Str(v.value), filter_out=v.startswith('_')) + return Terminal(str(v.value), filter_out=v.startswith('_')) assert False diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index c86d7d7..2669cd0 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -27,6 +27,7 @@ from __future__ import print_function # from io import open +from abc import ABC, abstractmethod ###} import sys diff --git a/lark/utils.py b/lark/utils.py index 70516e6..06291ac 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,3 @@ -import hashlib import unicodedata import os from functools import reduce @@ -14,14 +13,6 @@ logger.addHandler(logging.StreamHandler()) # By default, we should not output any log messages logger.setLevel(logging.CRITICAL) -if sys.version_info[0]>2: - from abc import ABC, abstractmethod -else: - from abc import ABCMeta, abstractmethod - class ABC(object): # Provide Python27 compatibility - __slots__ = () - __metclass__ = ABCMeta - Py36 = (sys.version_info[:2] >= (3, 6)) @@ -120,28 +111,16 @@ class SerializeMemoizer(Serialize): return _deserialize(data, namespace, memo) -try: - STRING_TYPE = basestring -except NameError: # Python 3 - STRING_TYPE = str - import types from functools import wraps, partial -from contextlib import contextmanager - -Str = type(u'') -try: - classtype = types.ClassType # Python2 -except AttributeError: - classtype = type # Python3 def smart_decorator(f, create_decorator): if isinstance(f, types.FunctionType): return wraps(f)(create_decorator(f, True)) - elif isinstance(f, (classtype, type, types.BuiltinFunctionType)): + elif isinstance(f, (type, types.BuiltinFunctionType)): return wraps(f)(create_decorator(f, False)) elif isinstance(f, types.MethodType): @@ -222,34 +201,12 @@ def dedup_list(l): return [x for x in l if not (x in dedup or dedup.add(x))] -try: - from contextlib import suppress # Python 3 -except ImportError: - @contextmanager - def suppress(*excs): - '''Catch and dismiss the provided exception - - >>> x = 'hello' - >>> with suppress(IndexError): - ... 
x = x[10] - >>> x - 'hello' - ''' - try: - yield - except excs: - pass - - -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - return -1 +def compare(a, b): + if a == b: + return 0 + elif a > b: + return 1 + return -1 class Enumerator(Serialize): From eb453241464a2461f9642f6810ac966a4c7e77b7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 27 Jun 2021 23:42:57 +0300 Subject: [PATCH 02/34] Removed deprecated code --- lark/__init__.py | 3 +-- lark/common.py | 7 ------- lark/exceptions.py | 9 +-------- lark/lark.py | 19 ------------------- lark/lexer.py | 14 ++++---------- lark/parse_tree_builder.py | 15 ++------------- lark/parsers/lalr_interactive_parser.py | 4 ---- lark/parsers/lalr_puppet.py | 3 --- lark/tools/nearley.py | 5 +++-- lark/tools/standalone.py | 9 --------- lark/tree.py | 16 ---------------- lark/visitors.py | 16 ---------------- 12 files changed, 11 insertions(+), 109 deletions(-) delete mode 100644 lark/parsers/lalr_puppet.py diff --git a/lark/__init__.py b/lark/__init__.py index f056182..aff5683 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,10 +1,9 @@ from .utils import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive -from .visitors import InlineTransformer, inline_args # XXX Deprecated from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) from .lexer import Token from .lark import Lark -__version__ = "0.11.4" +__version__ = "1.0.0a" diff --git a/lark/common.py b/lark/common.py index 467acf8..e2cde6b 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,5 +1,3 @@ -from warnings import warn - from .utils import Serialize from .lexer import TerminalDef @@ -23,11 +21,6 @@ class LexerConf(Serialize): self.use_bytes = use_bytes self.lexer_type = None - @property - def tokens(self): - warn("LexerConf.tokens is deprecated. 
Use LexerConf.terminals instead", DeprecationWarning) - return self.terminals - def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} diff --git a/lark/exceptions.py b/lark/exceptions.py index 9099aa3..2ae0859 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,5 +1,3 @@ -from warnings import warn - from .utils import logger, NO_VALUE @@ -193,7 +191,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.pos_in_stream = getattr(token, 'start_pos', None) self.state = state self.token = token @@ -220,11 +218,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): return message - @property - def puppet(self): - warn("UnexpectedToken.puppet attribute has been renamed to interactive_parser", DeprecationWarning) - return self.interactive_parser - class VisitError(LarkError): diff --git a/lark/lark.py b/lark/lark.py index 493081a..f7eb85e 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -4,7 +4,6 @@ from abc import ABC, abstractmethod import sys, os, pickle, hashlib from io import open import tempfile -from warnings import warn from .exceptions import ConfigurationError, assert_config from .utils import Serialize, SerializeMemoizer, FS, isascii, logger @@ -568,23 +567,5 @@ class Lark(Serialize): """ return self.parser.parse(text, start=start, on_error=on_error) - @property - def source(self): - warn("Attribute Lark.source was renamed to Lark.source_path", DeprecationWarning) - return self.source_path - - @source.setter - def source(self, value): - self.source_path = value - - @property - def grammar_source(self): - warn("Attribute Lark.grammar_source was renamed to Lark.source_grammar", DeprecationWarning) - return self.source_grammar - - @grammar_source.setter - def grammar_source(self, value): - self.source_grammar = value - ###} diff --git a/lark/lexer.py b/lark/lexer.py index 0349cde..77f7090 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -7,7 +7,6 @@ from .utils import classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone -from warnings import warn from copy import copy @@ -121,18 +120,18 @@ class Token(str): Attributes: type: Name of the token (as specified in grammar) value: Value of the token (redundant, as ``token.value == token`` will always be true) - pos_in_stream: The index of the token in the text + start_pos: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) end_line: The line where the token ends end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. 
- end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) + end_pos: the index where the token ends (basically ``start_pos + len(token)``) """ __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): + def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: @@ -140,7 +139,7 @@ class Token(str): self = super(Token, cls).__new__(cls, value) self.type = type_ - self.start_pos = start_pos if start_pos is not None else pos_in_stream + self.start_pos = start_pos self.value = value self.line = line self.column = column @@ -149,11 +148,6 @@ class Token(str): self.end_pos = end_pos return self - @property - def pos_in_stream(self): - warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning) - return self.start_pos - def update(self, type_=None, value=None): return Token.new_borrow_pos( type_ if type_ is not None else self.type, diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7a854bc..720315f 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,7 +1,6 @@ from .exceptions import GrammarError, ConfigurationError from .lexer import Token from .tree import Tree -from .visitors import InlineTransformer # XXX Deprecated from .visitors import Transformer_InPlace from .visitors import _vargs_meta, _vargs_meta_inline @@ -297,12 +296,6 @@ class AmbiguousIntermediateExpander: return self.node_builder(children) -def ptb_inline_args(func): - @wraps(func) - def f(children): - return func(*children) - return f - def inplace_transformer(func): @wraps(func) @@ -358,15 +351,11 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.options.template_source or rule.origin.name try: f = getattr(transformer, user_callback_name) - # XXX InlineTransformer is deprecated! 
wrapper = getattr(f, 'visit_wrapper', None) if wrapper is not None: f = apply_visit_wrapper(f, user_callback_name, wrapper) - else: - if isinstance(transformer, InlineTransformer): - f = ptb_inline_args(f) - elif isinstance(transformer, Transformer_InPlace): - f = inplace_transformer(f) + elif isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) except AttributeError: f = partial(self.tree_class, user_callback_name) diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index ce596b5..eeadef8 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -126,7 +126,3 @@ class ImmutableInteractiveParser(InteractiveParser): p = copy(self) return InteractiveParser(p.parser, p.parser_state, p.lexer_state) - -# Deprecated class names for the interactive parser -ParserPuppet = InteractiveParser -ImmutableParserPuppet = ImmutableInteractiveParser diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py deleted file mode 100644 index 6ea6d89..0000000 --- a/lark/parsers/lalr_puppet.py +++ /dev/null @@ -1,3 +0,0 @@ -# Deprecated - -from .lalr_interactive_parser import ParserPuppet, ImmutableParserPuppet \ No newline at end of file diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index f0779dc..f5026e8 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -6,7 +6,7 @@ import codecs import argparse -from lark import Lark, InlineTransformer +from lark import Lark, Transformer, v_args nearley_grammar = r""" start: (ruledef|directive)+ @@ -50,7 +50,8 @@ def _get_rulename(name): name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name) return 'n_' + name.replace('$', '__DOLLAR__').lower() -class NearleyToLark(InlineTransformer): +@v_args(inline=True) +class NearleyToLark(Transformer): def __init__(self): self._count = 0 self.extra_rules = {} diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 2669cd0..170800b 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -37,7 +37,6 @@ from os import path from collections import defaultdict from functools import partial from argparse import ArgumentParser, SUPPRESS -from warnings import warn import lark from lark import Lark @@ -119,11 +118,6 @@ def strip_docstrings(line_gen): return ''.join(res) -def main(fobj, start, print=print): - warn('`lark.tools.standalone.main` is being redesigned. Use `gen_standalone`', DeprecationWarning) - lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) - gen_standalone(lark_inst, print) - def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False): if output is None: output = partial(print, file=out) @@ -186,9 +180,6 @@ def main(): parser.print_help(sys.stderr) sys.exit(1) ns = parser.parse_args() - if ns.old_start is not None: - warn('The syntax `python -m lark.tools.standalone ` is deprecated. Use the -s option') - ns.start.append(ns.old_start) lark_inst, out = build_lalr(ns) gen_standalone(lark_inst, out=out, compress=ns.compress) diff --git a/lark/tree.py b/lark/tree.py index bee53cf..468894a 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -147,22 +147,6 @@ class Tree(object): self.data = data self.children = children - # XXX Deprecated! 
Here for backwards compatibility <0.6.0 - @property - def line(self): - return self.meta.line - - @property - def column(self): - return self.meta.column - - @property - def end_line(self): - return self.meta.end_line - - @property - def end_column(self): - return self.meta.end_column class SlottedTree(Tree): diff --git a/lark/visitors.py b/lark/visitors.py index 23ef64a..22beb47 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -149,18 +149,6 @@ class Transformer(_Decoratable): return token -class InlineTransformer(Transformer): # XXX Deprecated - def _call_userfunc(self, tree, new_children=None): - # Assumes tree is already transformed - children = new_children if new_children is not None else tree.children - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree.data, children, tree.meta) - else: - return f(*children) - - class TransformerChain(object): def __init__(self, *transformers): self.transformers = transformers @@ -363,10 +351,6 @@ def _inline_args__func(func): return smart_decorator(func, create_decorator) -def inline_args(obj): # XXX Deprecated - return _apply_decorator(obj, _inline_args__func) - - def _visitor_args_func_dec(func, visit_wrapper=None, static=False): def create_decorator(_f, with_self): if with_self: From 8d082b577f7b0a797dd8b44dc23b1e9964c94700 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 27 Jun 2021 23:47:02 +0300 Subject: [PATCH 03/34] Remove tests for Python 2 --- .github/workflows/tests.yml | 2 +- setup.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1630c8b..df06b7d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index f23eb0b..affd499 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ Main Features: - CYK parser, for highly ambiguous grammars - EBNF grammar - Unicode fully supported - - Python 2 & 3 compatible + - Python 3 compatible - Automatic line & column tracking - Standard library of terminals (strings, numbers, names, etc.) 
- Import grammars from Nearley.js @@ -59,7 +59,6 @@ Main Features: classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: General", From d54f68530abf463a0fe6e86dfadff156081f4a86 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 00:27:53 +0300 Subject: [PATCH 04/34] Remove tests for Python 2 --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6448cc8..792c6bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,12 @@ dist: xenial language: python python: - - "2.7" - "3.4" - "3.5" - "3.6" - "3.7" - "3.8" - "3.9-dev" - - "pypy2.7-6.0" - "pypy3.5-6.0" install: pip install tox-travis script: From def06dbc08e8340f1eda6117b344d3db7fd2d6fe Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 00:31:07 +0300 Subject: [PATCH 05/34] Remove tests for Python 2 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index df06b7d..c106bc6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy3] steps: - uses: actions/checkout@v2 From ff686fc89a6b14b1239832281ade320523e723a3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 14:35:21 +0300 Subject: [PATCH 06/34] Remove more Python 2 code --- lark/lark.py | 3 --- lark/load_grammar.py | 1 - lark/tools/standalone.py | 9 ++------- lark/utils.py | 1 - 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index f7eb85e..a4d223e 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,8 +1,5 @@ -from __future__ import absolute_import - from abc import ABC, abstractmethod import sys, os, pickle, hashlib -from io import open import tempfile from .exceptions import ConfigurationError, assert_config diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e437267..cb8856b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -4,7 +4,6 @@ import os.path import sys from collections import namedtuple from copy import copy, deepcopy -from io import open import pkgutil from ast import literal_eval from numbers import Integral diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 170800b..1cc8f81 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -1,5 +1,3 @@ -from __future__ import print_function - ###{standalone # # @@ -26,7 +24,6 @@ from __future__ import print_function # # -from io import open from abc import ABC, abstractmethod ###} @@ -36,14 +33,13 @@ import os from os import path from collections import defaultdict from functools import partial -from argparse import ArgumentParser, SUPPRESS +from argparse import ArgumentParser import lark -from lark import Lark from lark.tools import lalr_argparser, build_lalr, make_warnings_comments -from lark.grammar import RuleOptions, Rule +from lark.grammar import Rule from lark.lexer import TerminalDef _dir = path.dirname(__file__) @@ -174,7 +170,6 @@ def main(): make_warnings_comments() parser = ArgumentParser(prog="prog='python -m lark.tools.standalone'", description="Lark Stand-alone Generator Tool", parents=[lalr_argparser], epilog='Look at 
the Lark documentation for more info on the options') - parser.add_argument("old_start", nargs='?', help=SUPPRESS) parser.add_argument('-c', '--compress', action='store_true', default=0, help="Enable compression") if len(sys.argv)==1: parser.print_help(sys.stderr) diff --git a/lark/utils.py b/lark/utils.py index 06291ac..47fe5ca 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -6,7 +6,6 @@ from collections import deque ###{standalone import sys, re import logging -from io import open logger = logging.getLogger("lark") logger.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code From 8d0cdeeb14cac15310991c6acc24788b3d9fb95d Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Mon, 28 Jun 2021 10:00:32 -0700 Subject: [PATCH 07/34] Start merging .pyi files ast_utils, grammar, and indenter --- lark-stubs/ast_utils.pyi | 17 --------------- lark-stubs/grammar.pyi | 14 ------------ lark-stubs/indenter.pyi | 47 ---------------------------------------- lark/ast_utils.py | 6 +++-- lark/grammar.py | 16 ++++++++------ lark/indenter.py | 43 +++++++++++++++++++++++++++++++----- 6 files changed, 51 insertions(+), 92 deletions(-) delete mode 100644 lark-stubs/ast_utils.pyi delete mode 100644 lark-stubs/grammar.pyi delete mode 100644 lark-stubs/indenter.pyi diff --git a/lark-stubs/ast_utils.pyi b/lark-stubs/ast_utils.pyi deleted file mode 100644 index 28246cf..0000000 --- a/lark-stubs/ast_utils.pyi +++ /dev/null @@ -1,17 +0,0 @@ -import types -from typing import Optional - -from .visitors import Transformer - -class Ast(object): - pass - -class AsList(object): - pass - - -def create_transformer( - ast_module: types.ModuleType, - transformer: Optional[Transformer]=None -) -> Transformer: - ... \ No newline at end of file diff --git a/lark-stubs/grammar.pyi b/lark-stubs/grammar.pyi deleted file mode 100644 index 3a3d806..0000000 --- a/lark-stubs/grammar.pyi +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Optional, Tuple - - -class RuleOptions: - keep_all_tokens: bool - expand1: bool - priority: int - template_source: Optional[str] - empty_indices: Tuple[bool, ...] - - -class Symbol: - name: str - is_term: bool diff --git a/lark-stubs/indenter.pyi b/lark-stubs/indenter.pyi deleted file mode 100644 index 3a7aa97..0000000 --- a/lark-stubs/indenter.pyi +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Tuple, List, Iterator, Optional -from abc import ABC, abstractmethod -from .lexer import Token -from .lark import PostLex - - -class Indenter(PostLex, ABC): - paren_level: Optional[int] - indent_level: Optional[List[int]] - - def __init__(self) -> None: - ... - - def handle_NL(self, token: Token) -> Iterator[Token]: - ... - - @property - @abstractmethod - def NL_type(self) -> str: - ... - - @property - @abstractmethod - def OPEN_PAREN_types(self) -> List[str]: - ... - - @property - @abstractmethod - def CLOSE_PAREN_types(self) -> List[str]: - ... - - @property - @abstractmethod - def INDENT_type(self) -> str: - ... - - @property - @abstractmethod - def DEDENT_type(self) -> str: - ... - - @property - @abstractmethod - def tab_len(self) -> int: - ... 
diff --git a/lark/ast_utils.py b/lark/ast_utils.py index 0f2e498..c535f11 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -3,6 +3,8 @@ """ import inspect, re +import types +from typing import Optional from lark import Transformer, v_args @@ -27,7 +29,7 @@ def _call(func, _data, children, _meta): inline = v_args(wrapper=_call) -def create_transformer(ast_module, transformer=None): +def create_transformer(ast_module: types.ModuleType, transformer: Optional[Transformer]=None) -> Transformer: """Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST. For each class, we create a corresponding rule in the transformer, with a matching name. @@ -49,4 +51,4 @@ def create_transformer(ast_module, transformer=None): setattr(t, camel_to_snake(name), obj) - return t \ No newline at end of file + return t diff --git a/lark/grammar.py b/lark/grammar.py index 405086a..8896b17 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + from .utils import Serialize ###{standalone @@ -5,10 +7,10 @@ from .utils import Serialize class Symbol(Serialize): __slots__ = ('name',) - is_term = NotImplemented + is_term: bool = NotImplemented def __init__(self, name): - self.name = name + self.name: str = name def __eq__(self, other): assert isinstance(other, Symbol), other @@ -50,11 +52,11 @@ class RuleOptions(Serialize): __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.priority = priority - self.template_source = template_source - self.empty_indices = empty_indices + self.keep_all_tokens: bool = keep_all_tokens + self.expand1: bool = expand1 + self.priority: int = priority + self.template_source: Optional[str] = template_source + self.empty_indices: Tuple[bool, ...] = empty_indices def __repr__(self): return 'RuleOptions(%r, %r, %r, %r)' % ( diff --git a/lark/indenter.py b/lark/indenter.py index 7e1263d..496f6e7 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -1,5 +1,8 @@ "Provides Indentation services for languages with indentation similar to Python" +from abc import ABC, abstractmethod +from typing import Tuple, List, Iterator, Optional + from .exceptions import LarkError from .lark import PostLex from .lexer import Token @@ -8,13 +11,13 @@ from .lexer import Token class DedentError(LarkError): pass -class Indenter(PostLex): - def __init__(self): - self.paren_level = None - self.indent_level = None +class Indenter(PostLex, ABC): + def __init__(self) -> None: + self.paren_level: Optional[int] = None + self.indent_level: Optional[List[int]] = None assert self.tab_len > 0 - def handle_NL(self, token): + def handle_NL(self, token: Token) -> Iterator[Token]: if self.paren_level > 0: return @@ -64,4 +67,34 @@ class Indenter(PostLex): def always_accept(self): return (self.NL_type,) + @property + @abstractmethod + def NL_type(self) -> str: + ... + + @property + @abstractmethod + def OPEN_PAREN_types(self) -> List[str]: + ... + + @property + @abstractmethod + def CLOSE_PAREN_types(self) -> List[str]: + ... + + @property + @abstractmethod + def INDENT_type(self) -> str: + ... + + @property + @abstractmethod + def DEDENT_type(self) -> str: + ... + + @property + @abstractmethod + def tab_len(self) -> int: + ... 
+ ###} From 9b77270502908736faca41b06f535e893efad0c4 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 20:19:06 +0300 Subject: [PATCH 08/34] Removed code relevant to versions below 3.6 --- .github/workflows/tests.yml | 2 +- lark/lexer.py | 19 +++++-------------- lark/load_grammar.py | 15 ++------------- lark/utils.py | 2 -- tests/test_parser.py | 3 +-- tox.ini | 6 +----- 6 files changed, 10 insertions(+), 37 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c106bc6..a635e58 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy3] + python-version: [3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy3] steps: - uses: actions/checkout@v2 diff --git a/lark/lexer.py b/lark/lexer.py index 77f7090..1b5250e 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from contextlib import suppress -from .utils import classify, get_regexp_width, Py36, Serialize +from .utils import classify, get_regexp_width, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone @@ -38,19 +38,10 @@ class Pattern(Serialize): def max_width(self): raise NotImplementedError() - if Py36: - # Python 3.6 changed syntax for flags in regular expression - def _get_flags(self, value): - for f in self.flags: - value = ('(?%s:%s)' % (f, value)) - return value - - else: - def _get_flags(self, value): - for f in self.flags: - value = ('(?%s)' % f) + value - return value - + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s:%s)' % (f, value)) + return value class PatternStr(Pattern): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index cb8856b..d3c310d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -9,7 +9,7 @@ from ast import literal_eval from numbers import Integral from contextlib import suppress -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -475,18 +475,7 @@ class PrepareLiterals(Transformer_InPlace): def _make_joined_pattern(regexp, flags_set): - # In Python 3.6, a new syntax for flags was introduced, that allows us to restrict the scope - # of flags to a specific regexp group. We are already using it in `lexer.Pattern._get_flags` - # However, for prior Python versions, we still need to use global flags, so we have to make sure - # that there are no flag collisions when we merge several terminals. 
- flags = () - if not Py36: - if len(flags_set) > 1: - raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") - elif len(flags_set) == 1: - flags ,= flags_set - - return PatternRE(regexp, flags) + return PatternRE(regexp, ()) class TerminalTreeToPattern(Transformer): diff --git a/lark/utils.py b/lark/utils.py index 47fe5ca..c210dca 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -13,8 +13,6 @@ logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.CRITICAL) -Py36 = (sys.version_info[:2] >= (3, 6)) - NO_VALUE = object() diff --git a/tests/test_parser.py b/tests/test_parser.py index ff4e064..ac409b3 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -8,7 +8,7 @@ import os import sys from copy import copy, deepcopy -from lark.utils import Py36, isascii +from lark.utils import isascii from lark import Token, Transformer_NonRecursive, LexError @@ -1545,7 +1545,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual( g.parse('"hello"').children, ['"hello"']) self.assertEqual( g.parse("'hello'").children, ["'hello'"]) - @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") def test_join_regex_flags(self): g = r""" start: A diff --git a/tox.ini b/tox.ini index ef19e2c..33f9f64 100644 --- a/tox.ini +++ b/tox.ini @@ -1,16 +1,12 @@ [tox] -envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 +envlist = py36, py37, py38, py39, pypy, pypy3 skip_missing_interpreters=true [travis] -2.7 = py27 -3.4 = py34 -3.5 = py35 3.6 = py36 3.7 = py37 3.8 = py38 3.9 = py39 -pypy = pypy pypy3 = pypy3 [testenv] From d7b819e9cedfb53f4dd33dfe184ae03d96be0852 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Mon, 28 Jun 2021 11:02:05 -0700 Subject: [PATCH 09/34] Continue merging .pyi files load_grammar, reconstruct, and visitors --- lark-stubs/load_grammar.pyi | 31 ----------- lark-stubs/reconstruct.pyi | 39 ------------- lark-stubs/visitors.pyi | 108 ------------------------------------ lark/indenter.py | 3 +- lark/load_grammar.py | 27 ++++----- lark/reconstruct.py | 12 ++-- lark/visitors.py | 59 +++++++++++--------- 7 files changed, 56 insertions(+), 223 deletions(-) delete mode 100644 lark-stubs/load_grammar.pyi delete mode 100644 lark-stubs/reconstruct.pyi delete mode 100644 lark-stubs/visitors.pyi diff --git a/lark-stubs/load_grammar.pyi b/lark-stubs/load_grammar.pyi deleted file mode 100644 index 86a6341..0000000 --- a/lark-stubs/load_grammar.pyi +++ /dev/null @@ -1,31 +0,0 @@ -from typing import List, Tuple, Union, Callable, Dict, Optional - -from .tree import Tree -from .grammar import RuleOptions -from .exceptions import UnexpectedInput - - -class Grammar: - rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] - term_defs: List[Tuple[str, Tuple[Tree, int]]] - ignore: List[str] - - -class GrammarBuilder: - global_keep_all_tokens: bool - import_paths: List[Union[str, Callable]] - used_files: Dict[str, str] - - def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ... - - def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ... - - def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], - base_mangle: Callable[[str], str] = None) -> None: ... - - def validate(self) -> None: ... - - def build(self) -> Grammar: ... 
- - -def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: ... \ No newline at end of file diff --git a/lark-stubs/reconstruct.pyi b/lark-stubs/reconstruct.pyi deleted file mode 100644 index a8d39e3..0000000 --- a/lark-stubs/reconstruct.pyi +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import List, Dict, Union, Callable, Iterable - -from .grammar import Symbol -from .lark import Lark -from .tree import Tree -from .visitors import Transformer_InPlace -from .lexer import TerminalDef - - -class WriteTokensTransformer(Transformer_InPlace): - - def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]] = ...): ... - - -class MatchTree(Tree): - pass - - -class MakeMatchTree: - name: str - expansion: List[TerminalDef] - - def __init__(self, name: str, expansion: List[TerminalDef]): - ... - - def __call__(self, args: List[Union[str, Tree]]): - ... - - -class Reconstructor: - - def __init__(self, parser: Lark, term_subs: Dict[str, Callable[[Symbol], str]] = ...): - ... - - def reconstruct(self, tree: Tree, postproc: Callable[[Iterable[str]], Iterable[str]]=None, - insert_spaces: bool = True) -> str: - ... diff --git a/lark-stubs/visitors.pyi b/lark-stubs/visitors.pyi deleted file mode 100644 index 3a934ee..0000000 --- a/lark-stubs/visitors.pyi +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union -from abc import ABC -from .tree import Tree - -_T = TypeVar('_T') -_R = TypeVar('_R') -_FUNC = Callable[..., _T] -_DECORATED = Union[_FUNC, type] - - -class Transformer(ABC, Generic[_T]): - - def __init__(self, visit_tokens: bool = True) -> None: - ... - - def transform(self, tree: Tree) -> _T: - ... - - def __mul__(self, other: Transformer[_T]) -> TransformerChain[_T]: - ... - - -class TransformerChain(Generic[_T]): - transformers: Tuple[Transformer[_T], ...] - - def __init__(self, *transformers: Transformer[_T]) -> None: - ... - - def transform(self, tree: Tree) -> _T: - ... - - def __mul__(self, other: Transformer[_T]) -> TransformerChain[_T]: - ... - - -class Transformer_InPlace(Transformer): - pass - - -class Transformer_NonRecursive(Transformer): - pass - - -class Transformer_InPlaceRecursive(Transformer): - pass - - -class VisitorBase: - pass - - -class Visitor(VisitorBase, ABC, Generic[_T]): - - def visit(self, tree: Tree) -> Tree: - ... - - def visit_topdown(self, tree: Tree) -> Tree: - ... - - -class Visitor_Recursive(VisitorBase): - - def visit(self, tree: Tree) -> Tree: - ... - - def visit_topdown(self, tree: Tree) -> Tree: - ... - - -class Interpreter(ABC, Generic[_T]): - - def visit(self, tree: Tree) -> _T: - ... - - def visit_children(self, tree: Tree) -> List[_T]: - ... - - -_InterMethod = Callable[[Type[Interpreter], _T], _R] - - -def v_args( - inline: bool = False, - meta: bool = False, - tree: bool = False, - wrapper: Callable = None -) -> Callable[[_DECORATED], _DECORATED]: - ... - - -def visit_children_decor(func: _InterMethod) -> _InterMethod: - ... - - -class Discard(Exception): - pass - - -# Deprecated -class InlineTransformer: - pass - - -# Deprecated -def inline_args(obj: _FUNC) -> _FUNC: - ... 
diff --git a/lark/indenter.py b/lark/indenter.py index 496f6e7..b6f47d6 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -1,13 +1,14 @@ "Provides Indentation services for languages with indentation similar to Python" from abc import ABC, abstractmethod -from typing import Tuple, List, Iterator, Optional from .exceptions import LarkError from .lark import PostLex from .lexer import Token ###{standalone +from typing import Tuple, List, Iterator, Optional + class DedentError(LarkError): pass diff --git a/lark/load_grammar.py b/lark/load_grammar.py index cb8856b..9ee3691 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -8,6 +8,7 @@ import pkgutil from ast import literal_eval from numbers import Integral from contextlib import suppress +from typing import List, Tuple, Union, Callable, Dict, Optional from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -17,7 +18,7 @@ from .parser_frontends import ParsingFrontend from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, dedup_list -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -552,9 +553,9 @@ def nr_deepcopy_tree(t): class Grammar: def __init__(self, rule_defs, term_defs, ignore): - self.term_defs = term_defs - self.rule_defs = rule_defs - self.ignore = ignore + self.term_defs: List[Tuple[str, Tuple[Tree, int]]] = term_defs + self.rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] = rule_defs + self.ignore: List[str] = ignore def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) @@ -874,7 +875,7 @@ def _search_interactive_parser(interactive_parser, predicate): if predicate(p): return path, p -def find_grammar_errors(text, start='start'): +def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: errors = [] def on_error(e): errors.append((e, _error_repr(e))) @@ -923,10 +924,10 @@ def _mangle_exp(exp, mangle): class GrammarBuilder: - def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): - self.global_keep_all_tokens = global_keep_all_tokens - self.import_paths = import_paths or [] - self.used_files = used_files or {} + def __init__(self, global_keep_all_tokens: bool=False, import_paths: List[Union[str, Callable]]=None, used_files: Dict[str, str]=None) -> None: + self.global_keep_all_tokens: bool = global_keep_all_tokens + self.import_paths: List[Union[str, Callable]] = import_paths or [] + self.used_files: Dict[str, str] = used_files or {} self._definitions = {} self._ignore_names = [] @@ -1067,7 +1068,7 @@ class GrammarBuilder: return name, exp, params, opts - def load_grammar(self, grammar_text, grammar_name="", mangle=None): + def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Callable[[str], str]=None) -> None: tree = _parse_grammar(grammar_text, grammar_name) imports = {} @@ -1130,7 +1131,7 @@ class GrammarBuilder: self._definitions = {k: v for k, v in self._definitions.items() if k in _used} - def do_import(self, dotted_path, base_path, aliases, base_mangle=None): + def do_import(self, dotted_path: 
Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Callable[[str], str]=None) -> None: assert dotted_path mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle) grammar_path = os.path.join(*dotted_path) + EXT @@ -1166,7 +1167,7 @@ class GrammarBuilder: assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) - def validate(self): + def validate(self) -> None: for name, (params, exp, _options) in self._definitions.items(): for i, p in enumerate(params): if p in self._definitions: @@ -1195,7 +1196,7 @@ class GrammarBuilder: if not set(self._definitions).issuperset(self._ignore_names): raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions))) - def build(self): + def build(self) -> Grammar: self.validate() rule_defs = [] term_defs = [] diff --git a/lark/reconstruct.py b/lark/reconstruct.py index ab2fb38..2f0911b 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,11 +1,13 @@ """Reconstruct text from a tree, based on Lark grammar""" +from typing import List, Dict, Union, Callable, Iterable import unicodedata +from .lark import Lark from .tree import Tree from .visitors import Transformer_InPlace -from .lexer import Token, PatternStr -from .grammar import Terminal, NonTerminal +from .lexer import Token, PatternStr, TerminalDef +from .grammar import Terminal, NonTerminal, Symbol from .tree_matcher import TreeMatcher, is_discarded_terminal from .utils import is_id_continue @@ -21,7 +23,7 @@ def is_iter_empty(i): class WriteTokensTransformer(Transformer_InPlace): "Inserts discarded tokens into their correct place, according to the rules of grammar" - def __init__(self, tokens, term_subs): + def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None: self.tokens = tokens self.term_subs = term_subs @@ -70,7 +72,7 @@ class Reconstructor(TreeMatcher): term_subs: a dictionary of [Terminal name as str] to [output text as str] """ - def __init__(self, parser, term_subs=None): + def __init__(self, parser: Lark, term_subs: Dict[str, Callable[[Symbol], str]]=None) -> None: TreeMatcher.__init__(self, parser) self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {}) @@ -87,7 +89,7 @@ class Reconstructor(TreeMatcher): else: yield item - def reconstruct(self, tree, postproc=None, insert_spaces=True): + def reconstruct(self, tree: Tree, postproc: Callable[[Iterable[str]], Iterable[str]]=None, insert_spaces: bool=True) -> str: x = self._reconstruct(tree) if postproc: x = postproc(x) diff --git a/lark/visitors.py b/lark/visitors.py index 22beb47..847c468 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -1,3 +1,4 @@ +from abc import ABC from functools import wraps from .utils import smart_decorator, combine_alternatives @@ -7,7 +8,12 @@ from .lexer import Token ###{standalone from inspect import getmembers, getmro +from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union +_T = TypeVar('_T') +_R = TypeVar('_R') +_FUNC = Callable[..., _T] +_DECORATED = Union[_FUNC, type] class Discard(Exception): """When raising the Discard exception in a transformer callback, @@ -46,7 +52,7 @@ class _Decoratable: return cls -class Transformer(_Decoratable): +class Transformer(_Decoratable, ABC, Generic[_T]): """Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. 
Methods are provided by the user via inheritance, and called according to ``tree.data``. @@ -74,7 +80,7 @@ class Transformer(_Decoratable): """ __visit_tokens__ = True # For backwards compatibility - def __init__(self, visit_tokens=True): + def __init__(self, visit_tokens: bool=True) -> None: self.__visit_tokens__ = visit_tokens def _call_userfunc(self, tree, new_children=None): @@ -125,11 +131,11 @@ class Transformer(_Decoratable): children = list(self._transform_children(tree.children)) return self._call_userfunc(tree, children) - def transform(self, tree): + def transform(self, tree: Tree) -> _T: "Transform the given tree, and return the final result" return self._transform_tree(tree) - def __mul__(self, other): + def __mul__(self, other: 'Transformer[_T]') -> 'TransformerChain[_T]': """Chain two transformers together, returning a new transformer. """ return TransformerChain(self, other) @@ -149,16 +155,16 @@ class Transformer(_Decoratable): return token -class TransformerChain(object): +class TransformerChain(Generic[_T]): def __init__(self, *transformers): - self.transformers = transformers + self.transformers: Tuple[Transformer[_T], ...] = transformers - def transform(self, tree): + def transform(self, tree: Tree) -> _T: for t in self.transformers: tree = t.transform(tree) return tree - def __mul__(self, other): + def __mul__(self, other: Transformer[_T]) -> 'TransformerChain[_T]': return TransformerChain(*self.transformers + (other,)) @@ -239,19 +245,19 @@ class VisitorBase: return cls -class Visitor(VisitorBase): +class Visitor(VisitorBase, ABC, Generic[_T]): """Tree visitor, non-recursive (can handle huge trees). Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data`` """ - def visit(self, tree): + def visit(self, tree: Tree) -> Tree: "Visits the tree, starting with the leaves and finally the root (bottom-up)" for subtree in tree.iter_subtrees(): self._call_userfunc(subtree) return tree - def visit_topdown(self,tree): + def visit_topdown(self, tree: Tree) -> Tree: "Visit the tree, starting at the root, and ending at the leaves (top-down)" for subtree in tree.iter_subtrees_topdown(): self._call_userfunc(subtree) @@ -266,7 +272,7 @@ class Visitor_Recursive(VisitorBase): Slightly faster than the non-recursive version. """ - def visit(self, tree): + def visit(self, tree: Tree) -> Tree: "Visits the tree, starting with the leaves and finally the root (bottom-up)" for child in tree.children: if isinstance(child, Tree): @@ -275,7 +281,7 @@ class Visitor_Recursive(VisitorBase): self._call_userfunc(tree) return tree - def visit_topdown(self,tree): + def visit_topdown(self,tree: Tree) -> Tree: "Visit the tree, starting at the root, and ending at the leaves (top-down)" self._call_userfunc(tree) @@ -286,16 +292,7 @@ class Visitor_Recursive(VisitorBase): return tree -def visit_children_decor(func): - "See Interpreter" - @wraps(func) - def inner(cls, tree): - values = cls.visit_children(tree) - return func(cls, values) - return inner - - -class Interpreter(_Decoratable): +class Interpreter(_Decoratable, ABC, Generic[_T]): """Interpreter walks the tree starting at the root. Visits the tree, starting with the root and finally the leaves (top-down) @@ -307,7 +304,7 @@ class Interpreter(_Decoratable): This allows the user to implement branching and loops. 
""" - def visit(self, tree): + def visit(self, tree: Tree) -> _T: f = getattr(self, tree.data) wrapper = getattr(f, 'visit_wrapper', None) if wrapper is not None: @@ -315,7 +312,7 @@ class Interpreter(_Decoratable): else: return f(tree) - def visit_children(self, tree): + def visit_children(self, tree: Tree) -> List[_T]: return [self.visit(child) if isinstance(child, Tree) else child for child in tree.children] @@ -326,6 +323,16 @@ class Interpreter(_Decoratable): return self.visit_children(tree) +_InterMethod = Callable[[Type[Interpreter], _T], _R] + +def visit_children_decor(func: _InterMethod) -> _InterMethod: + "See Interpreter" + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + # Decorators def _apply_decorator(obj, decorator, **kwargs): @@ -380,7 +387,7 @@ def _vargs_tree(f, data, children, meta): return f(Tree(data, children, meta)) -def v_args(inline=False, meta=False, tree=False, wrapper=None): +def v_args(inline: bool=False, meta: bool=False, tree: bool=False, wrapper: Callable[[_DECORATED], _DECORATED]=None) -> Callable[[_DECORATED], _DECORATED]: """A convenience decorator factory for modifying the behavior of user-supplied visitor methods. By default, callback methods of transformers/visitors accept one argument - a list of the node's children. From 089bc2b523ce7286fa3a6ee1046c4324b67d15f4 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Mon, 28 Jun 2021 15:56:39 -0700 Subject: [PATCH 10/34] More .pyi merging exceptions, lark, and tree --- lark-stubs/exceptions.pyi | 65 --------------------------------------- lark-stubs/lark.pyi | 62 ++----------------------------------- lark-stubs/tree.pyi | 62 ++----------------------------------- lark/exceptions.py | 30 ++++++++++++++++-- lark/lark.py | 56 ++++++++++++++++++++++++--------- lark/load_grammar.py | 8 +++-- lark/tree.py | 40 +++++++++++++++++------- 7 files changed, 109 insertions(+), 214 deletions(-) delete mode 100644 lark-stubs/exceptions.pyi diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi deleted file mode 100644 index 1c04fa8..0000000 --- a/lark-stubs/exceptions.pyi +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set -from .tree import Tree -from .lexer import Token -from .parsers.lalr_interactive_parser import InteractiveParser - -class LarkError(Exception): - pass - - -class ConfigurationError(LarkError, ValueError): - pass - - -class GrammarError(LarkError): - pass - - -class ParseError(LarkError): - pass - - -class LexError(LarkError): - pass - - -T = TypeVar('T') - -class UnexpectedEOF(ParseError): - expected: List[Token] - -class UnexpectedInput(LarkError): - line: int - column: int - pos_in_stream: int - state: Any - - def get_context(self, text: str, span: int = ...) -> str: - ... - - def match_examples( - self, - parse_fn: Callable[[str], Tree], - examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], - token_type_match_fallback: bool = False, - use_accepts: bool = False, - ) -> T: - ... 
- - -class UnexpectedToken(ParseError, UnexpectedInput): - expected: Set[str] - considered_rules: Set[str] - interactive_parser: InteractiveParser - accepts: Set[str] - -class UnexpectedCharacters(LexError, UnexpectedInput): - allowed: Set[str] - considered_tokens: Set[Any] - - -class VisitError(LarkError): - obj: Union[Tree, Token] - orig_exc: Exception diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 27c6863..579e802 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -1,19 +1,13 @@ # -*- coding: utf-8 -*- from typing import ( - TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional, + Type, List, Dict, IO, Iterator, Callable, Union, Optional, Literal, Protocol, Tuple, Iterable, ) -from .parsers.lalr_interactive_parser import InteractiveParser from .visitors import Transformer from .lexer import Token, Lexer, TerminalDef -from .tree import Tree -from .exceptions import UnexpectedInput -from .load_grammar import Grammar - -_T = TypeVar('_T') - +from .load_grammar import Grammar, PackageResource class PostLex(Protocol): @@ -22,39 +16,8 @@ class PostLex(Protocol): always_accept: Iterable[str] - class LarkOptions: - start: List[str] - parser: str - lexer: str - transformer: Optional[Transformer] - postlex: Optional[PostLex] - ambiguity: str - regex: bool - debug: bool - keep_all_tokens: bool - propagate_positions: Union[bool, str] - maybe_placeholders: bool - lexer_callbacks: Dict[str, Callable[[Token], Token]] - cache: Union[bool, str] - g_regex_flags: int - use_bytes: bool - import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] - source_path: Optional[str] - - -class PackageResource(object): - pkg_name: str - path: str - - def __init__(self, pkg_name: str, path: str): ... - - -class FromPackageLoader: - def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... - - def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... - + ... class Lark: source_path: str @@ -88,22 +51,3 @@ class Lark: ): ... - def parse(self, text: str, start: Optional[str] = None, on_error: Callable[[UnexpectedInput], bool] = None) -> Tree: - ... - - def parse_interactive(self, text: str = None, start: Optional[str] = None) -> InteractiveParser: - ... - - @classmethod - def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: - ... - - @classmethod - def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T: - ... - - def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]: - ... - - def get_terminal(self, name: str) -> TerminalDef: - ... diff --git a/lark-stubs/tree.pyi b/lark-stubs/tree.pyi index ea99ff6..824e9e7 100644 --- a/lark-stubs/tree.pyi +++ b/lark-stubs/tree.pyi @@ -1,67 +1,9 @@ # -*- coding: utf-8 -*- -from typing import List, Callable, Iterator, Union, Optional, Literal, Any -from .lexer import TerminalDef - -class Meta: - empty: bool - line: int - column: int - start_pos: int - end_line: int - end_column: int - end_pos: int - orig_expansion: List[TerminalDef] - match_tree: bool - +from typing import Literal class Tree: - data: str - children: List[Union[str, Tree]] - meta: Meta - - def __init__( - self, - data: str, - children: List[Union[str, Tree]], - meta: Optional[Meta] = None - ) -> None: - ... - - def pretty(self, indent_str: str = ...) -> str: - ... 
- - def find_pred(self, pred: Callable[[Tree], bool]) -> Iterator[Tree]: - ... - - def find_data(self, data: str) -> Iterator[Tree]: - ... - - def expand_kids_by_index(self, *indices: int) -> None: - ... - - def scan_values(self, pred: Callable[[Union[str, Tree]], bool]) -> Iterator[str]: - ... - - def iter_subtrees(self) -> Iterator[Tree]: - ... - - def iter_subtrees_topdown(self) -> Iterator[Tree]: - ... - - def copy(self) -> Tree: - ... - - def set(self, data: str, children: List[Union[str, Tree]]) -> None: - ... - - def __hash__(self) -> int: - ... - - -class SlottedTree(Tree): - pass - + ... def pydot__tree_to_png( tree: Tree, diff --git a/lark/exceptions.py b/lark/exceptions.py index 2ae0859..3276db5 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -3,6 +3,12 @@ from .utils import logger, NO_VALUE ###{standalone +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, TYPE_CHECKING + +if TYPE_CHECKING: + from .lexer import Token + from .parsers.lalr_interactive_parser import InteractiveParser + from .tree import Tree class LarkError(Exception): pass @@ -28,6 +34,7 @@ class ParseError(LarkError): class LexError(LarkError): pass +T = TypeVar('T') class UnexpectedInput(LarkError): """UnexpectedInput Error. @@ -39,10 +46,13 @@ class UnexpectedInput(LarkError): After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ + line: int + column: int pos_in_stream = None + state: Any _terminals_by_name = None - def get_context(self, text, span=40): + def get_context(self, text: str, span: int=40) -> str: """Returns a pretty string pinpointing the error in the text, with span amount of context characters around it. @@ -63,7 +73,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): + def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> T: """Allows you to detect what's wrong in the input text by matching against example errors. @@ -126,6 +136,9 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): + + expected: 'List[Token]' + def __init__(self, expected, state=None, terminals_by_name=None): self.expected = expected self.state = state @@ -145,6 +158,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): + + allowed: Set[str] + considered_tokens: Set[Any] + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): # TODO considered_tokens and allowed can be figured out using state @@ -187,6 +204,10 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: ``InteractiveParser``. 
""" + expected: Set[str] + considered_rules: Set[str] + interactive_parser: 'InteractiveParser' + def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') @@ -205,7 +226,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__() @property - def accepts(self): + def accepts(self) -> Set[str]: if self._accepts is NO_VALUE: self._accepts = self.interactive_parser and self.interactive_parser.accepts() return self._accepts @@ -228,6 +249,9 @@ class VisitError(LarkError): - orig_exc: the exception that cause it to fail """ + obj: 'Union[Tree, Token]' + orig_exc: Exception + def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index a4d223e..1bd8e52 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,6 +1,10 @@ from abc import ABC, abstractmethod import sys, os, pickle, hashlib import tempfile +from typing import ( + TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, + Tuple, Iterable, TYPE_CHECKING +) from .exceptions import ConfigurationError, assert_config from .utils import Serialize, SerializeMemoizer, FS, isascii, logger @@ -8,7 +12,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_ from .tree import Tree from .common import LexerConf, ParserConf -from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread +from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend, _get_lexer_callbacks from .grammar import Rule @@ -19,14 +23,44 @@ try: except ImportError: regex = None +if TYPE_CHECKING: + from .load_grammar import PackageResource + from .exceptions import UnexpectedInput + from .parsers.lalr_interactive_parser import InteractiveParser + from .visitors import Transformer ###{standalone +class PostLex(ABC): + @abstractmethod + def process(self, stream): + return stream + + always_accept = () class LarkOptions(Serialize): """Specifies the options for Lark """ + + start: List[str] + parser: str + lexer: str + transformer: 'Optional[Transformer]' + postlex: Optional[PostLex] + ambiguity: str + regex: bool + debug: bool + keep_all_tokens: bool + propagate_positions: Union[bool, str] + maybe_placeholders: bool + lexer_callbacks: Dict[str, Callable[[Token], Token]] + cache: Union[bool, str] + g_regex_flags: int + use_bytes: bool + import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]' + source_path: Optional[str] + OPTIONS_DOC = """ **=== General Options ===** @@ -189,13 +223,7 @@ _VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None) _VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest') -class PostLex(ABC): - @abstractmethod - def process(self, stream): - return stream - - always_accept = () - +_T = TypeVar('_T') class Lark(Serialize): """Main interface for the library. 
@@ -476,7 +504,7 @@ class Lark(Serialize): return inst._load({'data': data, 'memo': memo}, **kwargs) @classmethod - def open(cls, grammar_filename, rel_to=None, **options): + def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str]=None, **options) -> _T: """Create an instance of Lark with the grammar given by its filename If ``rel_to`` is provided, the function will find the grammar filename in relation to it. @@ -494,7 +522,7 @@ class Lark(Serialize): return cls(f, **options) @classmethod - def open_from_package(cls, package, grammar_path, search_paths=("",), **options): + def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...]=("",), **options) -> _T: """Create an instance of Lark with the grammar loaded from within the package `package`. This allows grammar loading from zipapps. @@ -515,7 +543,7 @@ class Lark(Serialize): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - def lex(self, text, dont_ignore=False): + def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. @@ -530,11 +558,11 @@ class Lark(Serialize): return self.options.postlex.process(stream) return stream - def get_terminal(self, name): + def get_terminal(self, name: str) -> TerminalDef: """Get information about a terminal""" return self._terminals_dict[name] - def parse_interactive(self, text=None, start=None): + def parse_interactive(self, text: str=None, start: Optional[str]=None) -> 'InteractiveParser': """Start an interactive parsing session. Parameters: @@ -548,7 +576,7 @@ class Lark(Serialize): """ return self.parser.parse_interactive(text, start=start) - def parse(self, text, start=None, on_error=None): + def parse(self, text: str, start: Optional[str]=None, on_error: 'Callable[[UnexpectedInput], bool]'=None) -> Tree: """Parse the given text, according to the options provided. Parameters: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 9ee3691..5073475 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -691,14 +691,18 @@ class FromPackageLoader(object): pkg_name: The name of the package. You can probably provide `__name__` most of the time search_paths: All the path that will be search on absolute imports. """ - def __init__(self, pkg_name, search_paths=("", )): + + pkg_name: str + search_paths: Tuple[str, ...] 
+ + def __init__(self, pkg_name: str, search_paths: Tuple[str, ...]=("", )) -> None: self.pkg_name = pkg_name self.search_paths = search_paths def __repr__(self): return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) - def __call__(self, base_path, grammar_path): + def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: if base_path is None: to_try = self.search_paths else: diff --git a/lark/tree.py b/lark/tree.py index 468894a..cb29c8a 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -8,9 +8,23 @@ from copy import deepcopy ###{standalone from collections import OrderedDict +from typing import List, Callable, Iterator, Union, Optional, Any, TYPE_CHECKING +if TYPE_CHECKING: + from .lexer import TerminalDef class Meta: + + empty: bool + line: int + column: int + start_pos: int + end_line: int + end_column: int + end_pos: int + orig_expansion: 'List[TerminalDef]' + match_tree: bool + def __init__(self): self.empty = True @@ -27,13 +41,17 @@ class Tree(object): meta: Line & Column numbers (if ``propagate_positions`` is enabled). meta attributes: line, column, start_pos, end_line, end_column, end_pos """ - def __init__(self, data, children, meta=None): + + data: str + children: 'List[Union[str, Tree]]' + + def __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Meta=None) -> None: self.data = data self.children = children self._meta = meta @property - def meta(self): + def meta(self) -> Meta: if self._meta is None: self._meta = Meta() return self._meta @@ -57,7 +75,7 @@ class Tree(object): return l - def pretty(self, indent_str=' '): + def pretty(self, indent_str: str=' ') -> str: """Returns an indented string representation of the tree. Great for debugging. @@ -73,10 +91,10 @@ class Tree(object): def __ne__(self, other): return not (self == other) - def __hash__(self): + def __hash__(self) -> int: return hash((self.data, tuple(self.children))) - def iter_subtrees(self): + def iter_subtrees(self) -> 'Iterator[Tree]': """Depth-first iteration. Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). @@ -91,23 +109,23 @@ class Tree(object): del queue return reversed(list(subtrees.values())) - def find_pred(self, pred): + def find_pred(self, pred: 'Callable[[Tree], bool]') -> 'Iterator[Tree]': """Returns all nodes of the tree that evaluate pred(node) as true.""" return filter(pred, self.iter_subtrees()) - def find_data(self, data): + def find_data(self, data: str) -> 'Iterator[Tree]': """Returns all nodes of the tree whose data equals the given data.""" return self.find_pred(lambda t: t.data == data) ###} - def expand_kids_by_index(self, *indices): + def expand_kids_by_index(self, *indices: int) -> None: """Expand (inline) children at the given indices""" for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices kid = self.children[i] self.children[i:i+1] = kid.children - def scan_values(self, pred): + def scan_values(self, pred: 'Callable[[Union[str, Tree]], bool]') -> Iterator[str]: """Return all values in the tree that evaluate pred(value) as true. This can be used to find all the tokens in the tree. 
@@ -140,10 +158,10 @@ class Tree(object): def __deepcopy__(self, memo): return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta) - def copy(self): + def copy(self) -> 'Tree': return type(self)(self.data, self.children) - def set(self, data, children): + def set(self, data: str, children: 'List[Union[str, Tree]]') -> None: self.data = data self.children = children From a5f3ec4d0c2a849ffd071d03800e949fcc439dc0 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 29 Jun 2021 15:50:38 +0300 Subject: [PATCH 11/34] Small fix to setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index affd499..f479614 100644 --- a/setup.py +++ b/setup.py @@ -48,12 +48,13 @@ Main Features: - CYK parser, for highly ambiguous grammars - EBNF grammar - Unicode fully supported - - Python 3 compatible - Automatic line & column tracking - Standard library of terminals (strings, numbers, names, etc.) - Import grammars from Nearley.js - Extensive test suite - And much more! + +Since version 1.0, only Python versions 3.6 and up are supported. ''', classifiers=[ From 433adeeaff80306f5ba4bfb246d5eef8180f62cc Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Tue, 29 Jun 2021 13:08:32 -0700 Subject: [PATCH 12/34] Last batch of .pyi files __init__, lexer, and parsers/__init__ --- lark-stubs/__init__.pyi | 12 --- lark-stubs/lexer.pyi | 161 -------------------------------- lark-stubs/parsers/__init__.pyi | 0 lark/__init__.py | 2 +- lark/common.py | 19 +++- lark/lexer.py | 98 +++++++++++++------ lark/utils.py | 2 +- 7 files changed, 89 insertions(+), 205 deletions(-) delete mode 100644 lark-stubs/__init__.pyi delete mode 100644 lark-stubs/lexer.pyi delete mode 100644 lark-stubs/parsers/__init__.pyi diff --git a/lark-stubs/__init__.pyi b/lark-stubs/__init__.pyi deleted file mode 100644 index c79a6ef..0000000 --- a/lark-stubs/__init__.pyi +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- - -from .tree import * -from .visitors import * -from .exceptions import * -from .lexer import * -from .load_grammar import * -from .lark import * -from logging import Logger as _Logger - -logger: _Logger -__version__: str = ... diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi deleted file mode 100644 index 004865c..0000000 --- a/lark-stubs/lexer.pyi +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -from types import ModuleType -from typing import ( - TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - Pattern as REPattern, -) -from abc import abstractmethod, ABC - -_T = TypeVar('_T') - - -class Pattern(ABC): - value: str - flags: Collection[str] - raw: str - type: str - - def __init__(self, value: str, flags: Collection[str] = (), raw: str = None) -> None: - ... - - @abstractmethod - def to_regexp(self) -> str: - ... - - @property - @abstractmethod - def min_width(self) -> int: - ... - - @property - @abstractmethod - def max_width(self) -> int: - ... - - -class PatternStr(Pattern): - type: str = ... - - def to_regexp(self) -> str: - ... - - @property - def min_width(self) -> int: - ... - - @property - def max_width(self) -> int: - ... - - -class PatternRE(Pattern): - type: str = ... - - def to_regexp(self) -> str: - ... - - @property - def min_width(self) -> int: - ... - - @property - def max_width(self) -> int: - ... - - -class TerminalDef: - name: str - pattern: Pattern - priority: int - - def __init__(self, name: str, pattern: Pattern, priority: int = ...) -> None: - ... - - def user_repr(self) -> str: ... 
- - -class Token(str): - type: str - start_pos: int - value: Any - line: int - column: int - end_line: int - end_column: int - end_pos: int - - def __init__(self, type_: str, value: Any, start_pos: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None) -> None: - ... - - def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> Token: - ... - - @classmethod - def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: Token) -> _T: - ... - - -_Callback = Callable[[Token], Token] - - -class Lexer(ABC): - lex: Callable[..., Iterator[Token]] - - -class LexerConf: - tokens: Collection[TerminalDef] - re_module: ModuleType - ignore: Collection[str] = () - postlex: Any =None - callbacks: Optional[Dict[str, _Callback]] = None - g_regex_flags: int = 0 - skip_validation: bool = False - use_bytes: bool = False - - - -class TraditionalLexer(Lexer): - terminals: Collection[TerminalDef] - ignore_types: FrozenSet[str] - newline_types: FrozenSet[str] - user_callbacks: Dict[str, _Callback] - callback: Dict[str, _Callback] - mres: List[Tuple[REPattern, Dict[int, str]]] - re: ModuleType - - def __init__( - self, - conf: LexerConf - ) -> None: - ... - - def build(self) -> None: - ... - - def match(self, stream: str, pos: int) -> Optional[Tuple[str, str]]: - ... - - def lex(self, stream: str) -> Iterator[Token]: - ... - - def next_token(self, lex_state: Any, parser_state: Any = None) -> Token: - ... - -class ContextualLexer(Lexer): - lexers: Dict[str, TraditionalLexer] - root_lexer: TraditionalLexer - - def __init__( - self, - terminals: Collection[TerminalDef], - states: Dict[str, Collection[str]], - re_: ModuleType, - ignore: Collection[str] = ..., - always_accept: Collection[str] = ..., - user_callbacks: Dict[str, _Callback] = ..., - g_regex_flags: int = ... - ) -> None: - ... - - def lex(self, stream: str, get_parser_state: Callable[[], str]) -> Iterator[Token]: - ... 
diff --git a/lark-stubs/parsers/__init__.pyi b/lark-stubs/parsers/__init__.pyi deleted file mode 100644 index e69de29..0000000 diff --git a/lark/__init__.py b/lark/__init__.py index aff5683..609cfc7 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "1.0.0a" +__version__: str = "1.0.0a" diff --git a/lark/common.py b/lark/common.py index e2cde6b..ccd5e16 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,14 +1,29 @@ from .utils import Serialize -from .lexer import TerminalDef +from .lexer import TerminalDef, Token ###{standalone +from types import ModuleType +from typing import Any, Callable, Collection, Dict, Optional, TYPE_CHECKING +if TYPE_CHECKING: + from .lark import PostLex + +_Callback = Callable[[Token], Token] class LexerConf(Serialize): __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' __serialize_namespace__ = TerminalDef, - def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): + terminals: Collection[TerminalDef] + re_module: ModuleType + ignore: Collection[str] = () + postlex: 'PostLex' = None + callbacks: Optional[Dict[str, _Callback]] = None + g_regex_flags: int = 0 + skip_validation: bool = False + use_bytes: bool = False + + def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'PostLex'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False): self.terminals = terminals self.terminals_by_name = {t.name: t for t in self.terminals} assert len(self.terminals) == len(self.terminals_by_name) diff --git a/lark/lexer.py b/lark/lexer.py index 77f7090..6177d26 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,5 +1,6 @@ # Lexer Implementation +from abc import abstractmethod, ABC import re from contextlib import suppress @@ -9,12 +10,23 @@ from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone from copy import copy +from types import ModuleType +from typing import ( + TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, + Pattern as REPattern, TYPE_CHECKING +) -class Pattern(Serialize): - raw = None - type = None +if TYPE_CHECKING: + from .common import LexerConf - def __init__(self, value, flags=(), raw=None): +class Pattern(Serialize, ABC): + + value: str + flags: Collection[str] + raw: str = None + type: str = None + + def __init__(self, value: str, flags: Collection[str]=(), raw: str=None) -> None: self.value = value self.flags = frozenset(flags) self.raw = raw @@ -29,13 +41,18 @@ class Pattern(Serialize): def __eq__(self, other): return type(self) == type(other) and self.value == other.value and self.flags == other.flags - def to_regexp(self): + @abstractmethod + def to_regexp(self) -> str: raise NotImplementedError() - def min_width(self): + @property + @abstractmethod + def min_width(self) -> int: raise NotImplementedError() - def max_width(self): + @property + @abstractmethod + def max_width(self) -> int: raise NotImplementedError() if Py36: @@ -56,13 +73,13 @@ class Pattern(Serialize): class PatternStr(Pattern): __serialize_fields__ = 'value', 'flags' - type = "str" + type: str = "str" - def to_regexp(self): + def to_regexp(self) -> str: return self._get_flags(re.escape(self.value)) 
@property - def min_width(self): + def min_width(self) -> int: return len(self.value) max_width = min_width @@ -70,9 +87,9 @@ class PatternStr(Pattern): class PatternRE(Pattern): __serialize_fields__ = 'value', 'flags', '_width' - type = "re" + type: str = "re" - def to_regexp(self): + def to_regexp(self) -> str: return self._get_flags(self.value) _width = None @@ -82,11 +99,11 @@ class PatternRE(Pattern): return self._width @property - def min_width(self): + def min_width(self) -> int: return self._get_width()[0] @property - def max_width(self): + def max_width(self) -> int: return self._get_width()[1] @@ -94,7 +111,11 @@ class TerminalDef(Serialize): __serialize_fields__ = 'name', 'pattern', 'priority' __serialize_namespace__ = PatternStr, PatternRE - def __init__(self, name, pattern, priority=1): + name: str + pattern: Pattern + priority: int + + def __init__(self, name: str, pattern: Pattern, priority: int=1) -> None: assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern @@ -103,12 +124,13 @@ class TerminalDef(Serialize): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - def user_repr(self): + def user_repr(self) -> str: if self.name.startswith('__'): # We represent a generated terminal return self.pattern.raw or self.name else: return self.name +_T = TypeVar('_T') class Token(str): """A string with meta-information, that is produced by the lexer. @@ -131,6 +153,15 @@ class Token(str): """ __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') + type: str + start_pos: int + value: Any + line: int + column: int + end_line: int + end_column: int + end_pos: int + def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: self = super(Token, cls).__new__(cls, value) @@ -148,7 +179,7 @@ class Token(str): self.end_pos = end_pos return self - def update(self, type_=None, value=None): + def update(self, type_: Optional[str]=None, value: Optional[Any]=None) -> 'Token': return Token.new_borrow_pos( type_ if type_ is not None else self.type, value if value is not None else self.value, @@ -156,7 +187,7 @@ class Token(str): ) @classmethod - def new_borrow_pos(cls, type_, value, borrow_t): + def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T: return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): @@ -289,14 +320,15 @@ def _regexp_has_newline(r): """ return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) +_Callback = Callable[[Token], Token] -class Lexer(object): +class Lexer(ABC): """Lexer interface Method Signatures: lex(self, text) -> Iterator[Token] """ - lex = NotImplemented + lex: Callable[..., Iterator[Token]] = NotImplemented def make_lexer_state(self, text): line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n') @@ -305,7 +337,14 @@ class Lexer(object): class TraditionalLexer(Lexer): - def __init__(self, conf): + terminals: Collection[TerminalDef] + ignore_types: FrozenSet[str] + newline_types: FrozenSet[str] + user_callbacks: Dict[str, _Callback] + callback: Dict[str, _Callback] + re: ModuleType + + def __init__(self, conf: 'LexerConf') -> None: terminals = list(conf.terminals) assert all(isinstance(t, TerminalDef) for t in terminals), terminals @@ -338,7 +377,7 @@ class TraditionalLexer(Lexer): self._mres = None - def _build(self): + def _build(self) -> None: terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -352,23 +391,23 @@ class TraditionalLexer(Lexer): self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def mres(self): + def mres(self) -> List[Tuple[REPattern, Dict[int, str]]]: if self._mres is None: self._build() return self._mres - def match(self, text, pos): + def match(self, text: str, pos: int) -> Optional[Tuple[str, str]]: for mre, type_from_index in self.mres: m = mre.match(text, pos) if m: return m.group(0), type_from_index[m.lastindex] - def lex(self, state, parser_state): + def lex(self, state: Any, parser_state: Any) -> Iterator[Token]: with suppress(EOFError): while True: yield self.next_token(state, parser_state) - def next_token(self, lex_state, parser_state=None): + def next_token(self, lex_state: Any, parser_state: Any=None) -> Token: line_ctr = lex_state.line_ctr while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) @@ -424,7 +463,10 @@ class LexerState(object): class ContextualLexer(Lexer): - def __init__(self, conf, states, always_accept=()): + lexers: Dict[str, TraditionalLexer] + root_lexer: TraditionalLexer + + def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None: terminals = list(conf.terminals) terminals_by_name = conf.terminals_by_name @@ -452,7 +494,7 @@ class ContextualLexer(Lexer): def make_lexer_state(self, text): return self.root_lexer.make_lexer_state(text) - def lex(self, lexer_state, parser_state): + def lex(self, lexer_state: Any, parser_state: Any) -> Iterator[Token]: try: while True: lexer = self.lexers[parser_state.position] diff --git a/lark/utils.py b/lark/utils.py index 47fe5ca..81c9128 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -6,7 +6,7 @@ from collections import deque ###{standalone import sys, re import logging -logger = logging.getLogger("lark") +logger: logging.Logger = logging.getLogger("lark") logger.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code # By default, we should not output any log messages From be76059015291ac4e3e4bd9e4fe51184c3bf4f74 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Tue, 29 Jun 2021 14:45:38 -0700 Subject: [PATCH 13/34] Merge the Literal types --- lark-stubs/lark.pyi | 53 --------------------------------------- lark-stubs/tree.pyi | 14 ----------- lark/grammar.py | 3 ++- lark/lark.py | 60 +++++++++++++++++++++++++++------------------ lark/tree.py | 6 ++++- 5 files changed, 43 
insertions(+), 93 deletions(-) delete mode 100644 lark-stubs/lark.pyi delete mode 100644 lark-stubs/tree.pyi diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi deleted file mode 100644 index 579e802..0000000 --- a/lark-stubs/lark.pyi +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import ( - Type, List, Dict, IO, Iterator, Callable, Union, Optional, - Literal, Protocol, Tuple, Iterable, -) - -from .visitors import Transformer -from .lexer import Token, Lexer, TerminalDef -from .load_grammar import Grammar, PackageResource - -class PostLex(Protocol): - - def process(self, stream: Iterator[Token]) -> Iterator[Token]: - ... - - always_accept: Iterable[str] - -class LarkOptions: - ... - -class Lark: - source_path: str - source_grammar: str - grammar: Grammar - options: LarkOptions - lexer: Lexer - terminals: List[TerminalDef] - - def __init__( - self, - grammar: Union[Grammar, str, IO[str]], - *, - start: Union[None, str, List[str]] = "start", - parser: Literal["earley", "lalr", "cyk", "auto"] = "auto", - lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]] = "auto", - transformer: Optional[Transformer] = None, - postlex: Optional[PostLex] = None, - ambiguity: Literal["explicit", "resolve"] = "resolve", - regex: bool = False, - debug: bool = False, - keep_all_tokens: bool = False, - propagate_positions: Union[bool, str] = False, - maybe_placeholders: bool = False, - lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, - cache: Union[bool, str] = False, - g_regex_flags: int = ..., - use_bytes: bool = False, - import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ..., - source_path: Optional[str]=None, - ): - ... - diff --git a/lark-stubs/tree.pyi b/lark-stubs/tree.pyi deleted file mode 100644 index 824e9e7..0000000 --- a/lark-stubs/tree.pyi +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Literal - -class Tree: - ... - -def pydot__tree_to_png( - tree: Tree, - filename: str, - rankdir: Literal["TB", "LR", "BT", "RL"] = ..., - **kwargs -) -> None: - ... 
diff --git a/lark/grammar.py b/lark/grammar.py index 8896b17..6045620 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,9 +1,10 @@ -from typing import Optional, Tuple from .utils import Serialize ###{standalone +from typing import Optional, Tuple + class Symbol(Serialize): __slots__ = ('name',) diff --git a/lark/lark.py b/lark/lark.py index 1bd8e52..b4c767b 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,14 +1,10 @@ from abc import ABC, abstractmethod import sys, os, pickle, hashlib import tempfile -from typing import ( - TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, - Tuple, Iterable, TYPE_CHECKING -) -from .exceptions import ConfigurationError, assert_config +from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger -from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files +from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource from .tree import Tree from .common import LexerConf, ParserConf @@ -23,20 +19,27 @@ try: except ImportError: regex = None + +###{standalone +from typing import ( + TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, + Tuple, Iterable, IO, Any, TYPE_CHECKING +) + if TYPE_CHECKING: - from .load_grammar import PackageResource - from .exceptions import UnexpectedInput from .parsers.lalr_interactive_parser import InteractiveParser from .visitors import Transformer - -###{standalone + if sys.version_info >= (3, 8): + from typing import Literal + else: + from typing_extensions import Literal class PostLex(ABC): @abstractmethod - def process(self, stream): + def process(self, stream: Iterator[Token]) -> Iterator[Token]: return stream - always_accept = () + always_accept: Iterable[str] = () class LarkOptions(Serialize): """Specifies the options for Lark @@ -44,20 +47,23 @@ class LarkOptions(Serialize): """ start: List[str] - parser: str - lexer: str - transformer: 'Optional[Transformer]' - postlex: Optional[PostLex] - ambiguity: str - regex: bool debug: bool - keep_all_tokens: bool + transformer: 'Optional[Transformer]' propagate_positions: Union[bool, str] maybe_placeholders: bool - lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] + regex: bool g_regex_flags: int + keep_all_tokens: bool + tree_class: Any + parser: 'Literal["earley", "lalr", "cyk", "auto"]' + lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]' + ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]' + postlex: Optional[PostLex] + priority: 'Optional[Literal["auto", "normal", "invert"]]' + lexer_callbacks: Dict[str, Callable[[Token], Token]] use_bytes: bool + edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]] import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]' source_path: Optional[str] @@ -140,9 +146,7 @@ class LarkOptions(Serialize): # Adding a new option needs to be done in multiple places: # - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts # - In the docstring above. 
It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs - # - In `lark-stubs/lark.pyi`: - # - As attribute to `LarkOptions` - # - As parameter to `Lark.__init__` + # - As an attribute of `LarkOptions` above # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument _defaults = { @@ -238,7 +242,15 @@ class Lark(Serialize): >>> Lark(r'''start: "foo" ''') Lark(...) """ - def __init__(self, grammar, **options): + + source_path: str + source_grammar: str + grammar: 'Grammar' + options: LarkOptions + lexer: Lexer + terminals: List[TerminalDef] + + def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: self.options = LarkOptions(options) # Set regex or re module diff --git a/lark/tree.py b/lark/tree.py index cb29c8a..ff56dea 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -12,6 +12,10 @@ from typing import List, Callable, Iterator, Union, Optional, Any, TYPE_CHECKING if TYPE_CHECKING: from .lexer import TerminalDef + if sys.version_info >= (3, 8): + from typing import Literal + else: + from typing_extensions import Literal class Meta: @@ -171,7 +175,7 @@ class SlottedTree(Tree): __slots__ = 'data', 'children', 'rule', '_meta' -def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): +def pydot__tree_to_png(tree: Tree, filename: str, rankdir: 'Literal["TB", "LR", "BT", "RL"]'="LR", **kwargs) -> None: graph = pydot__tree_to_graph(tree, rankdir, **kwargs) graph.write_png(filename) From bca7c79b1fa3551d80cba294210d914b15772b2f Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Tue, 29 Jun 2021 15:10:48 -0700 Subject: [PATCH 14/34] Declare instance variable types at class level --- lark/grammar.py | 23 +++++++++++++++-------- lark/indenter.py | 8 ++++++-- lark/load_grammar.py | 30 ++++++++++++++++++++---------- lark/reconstruct.py | 11 ++++++++--- lark/visitors.py | 11 +++++++---- 5 files changed, 56 insertions(+), 27 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 6045620..be1aff6 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -8,10 +8,11 @@ from typing import Optional, Tuple class Symbol(Serialize): __slots__ = ('name',) + name: str is_term: bool = NotImplemented - def __init__(self, name): - self.name: str = name + def __init__(self, name: str) -> None: + self.name = name def __eq__(self, other): assert isinstance(other, Symbol), other @@ -52,12 +53,18 @@ class NonTerminal(Symbol): class RuleOptions(Serialize): __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' - def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()): - self.keep_all_tokens: bool = keep_all_tokens - self.expand1: bool = expand1 - self.priority: int = priority - self.template_source: Optional[str] = template_source - self.empty_indices: Tuple[bool, ...] = empty_indices + keep_all_tokens: bool + expand1: bool + priority: Optional[int] + template_source: Optional[str] + empty_indices: Tuple[bool, ...] 
+ + def __init__(self, keep_all_tokens: bool=False, expand1: bool=False, priority: Optional[int]=None, template_source: Optional[str]=None, empty_indices: Tuple[bool, ...]=()) -> None: + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.priority = priority + self.template_source = template_source + self.empty_indices = empty_indices def __repr__(self): return 'RuleOptions(%r, %r, %r, %r)' % ( diff --git a/lark/indenter.py b/lark/indenter.py index b6f47d6..03c5093 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -13,9 +13,13 @@ class DedentError(LarkError): pass class Indenter(PostLex, ABC): + + paren_level: Optional[int] + indent_level: Optional[List[int]] + def __init__(self) -> None: - self.paren_level: Optional[int] = None - self.indent_level: Optional[List[int]] = None + self.paren_level = None + self.indent_level = None assert self.tab_len > 0 def handle_NL(self, token: Token) -> Iterator[Token]: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 5073475..3e9b95a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -552,10 +552,15 @@ def nr_deepcopy_tree(t): class Grammar: - def __init__(self, rule_defs, term_defs, ignore): - self.term_defs: List[Tuple[str, Tuple[Tree, int]]] = term_defs - self.rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] = rule_defs - self.ignore: List[str] = ignore + + term_defs: List[Tuple[str, Tuple[Tree, int]]] + rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] + ignore: List[str] + + def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]], term_defs: List[Tuple[str, Tuple[Tree, int]]], ignore: List[str]) -> None: + self.term_defs = term_defs + self.rule_defs = rule_defs + self.ignore = ignore def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) @@ -928,10 +933,15 @@ def _mangle_exp(exp, mangle): class GrammarBuilder: - def __init__(self, global_keep_all_tokens: bool=False, import_paths: List[Union[str, Callable]]=None, used_files: Dict[str, str]=None) -> None: - self.global_keep_all_tokens: bool = global_keep_all_tokens - self.import_paths: List[Union[str, Callable]] = import_paths or [] - self.used_files: Dict[str, str] = used_files or {} + + global_keep_all_tokens: bool + import_paths: List[Union[str, Callable]] + used_files: Dict[str, str] + + def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None: + self.global_keep_all_tokens = global_keep_all_tokens + self.import_paths = import_paths or [] + self.used_files = used_files or {} self._definitions = {} self._ignore_names = [] @@ -1072,7 +1082,7 @@ class GrammarBuilder: return name, exp, params, opts - def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Callable[[str], str]=None) -> None: + def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: tree = _parse_grammar(grammar_text, grammar_name) imports = {} @@ -1135,7 +1145,7 @@ class GrammarBuilder: self._definitions = {k: v for k, v in self._definitions.items() if k in _used} - def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Callable[[str], str]=None) -> None: + def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Optional[Callable[[str], str]]=None) -> None: assert dotted_path mangle = 
_get_mangle('__'.join(dotted_path), aliases, base_mangle) grammar_path = os.path.join(*dotted_path) + EXT diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 2f0911b..aa8c753 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,6 +1,6 @@ """Reconstruct text from a tree, based on Lark grammar""" -from typing import List, Dict, Union, Callable, Iterable +from typing import List, Dict, Union, Callable, Iterable, Optional import unicodedata from .lark import Lark @@ -23,6 +23,9 @@ def is_iter_empty(i): class WriteTokensTransformer(Transformer_InPlace): "Inserts discarded tokens into their correct place, according to the rules of grammar" + tokens: Dict[str, TerminalDef] + term_subs: Dict[str, Callable[[Symbol], str]] + def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None: self.tokens = tokens self.term_subs = term_subs @@ -72,7 +75,9 @@ class Reconstructor(TreeMatcher): term_subs: a dictionary of [Terminal name as str] to [output text as str] """ - def __init__(self, parser: Lark, term_subs: Dict[str, Callable[[Symbol], str]]=None) -> None: + write_tokens: WriteTokensTransformer + + def __init__(self, parser: Lark, term_subs: Optional[Dict[str, Callable[[Symbol], str]]]=None) -> None: TreeMatcher.__init__(self, parser) self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {}) @@ -89,7 +94,7 @@ class Reconstructor(TreeMatcher): else: yield item - def reconstruct(self, tree: Tree, postproc: Callable[[Iterable[str]], Iterable[str]]=None, insert_spaces: bool=True) -> str: + def reconstruct(self, tree: Tree, postproc: Optional[Callable[[Iterable[str]], Iterable[str]]]=None, insert_spaces: bool=True) -> str: x = self._reconstruct(tree) if postproc: x = postproc(x) diff --git a/lark/visitors.py b/lark/visitors.py index 847c468..2c7309f 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -8,7 +8,7 @@ from .lexer import Token ###{standalone from inspect import getmembers, getmro -from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union +from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional _T = TypeVar('_T') _R = TypeVar('_R') @@ -156,8 +156,11 @@ class Transformer(_Decoratable, ABC, Generic[_T]): class TransformerChain(Generic[_T]): - def __init__(self, *transformers): - self.transformers: Tuple[Transformer[_T], ...] = transformers + + transformers: Tuple[Transformer[_T], ...] + + def __init__(self, *transformers: Transformer[_T]) -> None: + self.transformers = transformers def transform(self, tree: Tree) -> _T: for t in self.transformers: @@ -387,7 +390,7 @@ def _vargs_tree(f, data, children, meta): return f(Tree(data, children, meta)) -def v_args(inline: bool=False, meta: bool=False, tree: bool=False, wrapper: Callable[[_DECORATED], _DECORATED]=None) -> Callable[[_DECORATED], _DECORATED]: +def v_args(inline: bool=False, meta: bool=False, tree: bool=False, wrapper: Optional[Callable]=None) -> Callable[[_DECORATED], _DECORATED]: """A convenience decorator factory for modifying the behavior of user-supplied visitor methods. By default, callback methods of transformers/visitors accept one argument - a list of the node's children. 
From 1c4af01a117f8a010cf88dced77bd6aea60cb88d Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Tue, 29 Jun 2021 15:15:51 -0700 Subject: [PATCH 15/34] Update mypy workflow --- .github/workflows/mypy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 85039a4..f1c667f 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -16,4 +16,4 @@ jobs: python -m pip install --upgrade pip pip install mypy - name: Lint with mypy - run: mypy -p lark-stubs || true + run: mypy -p lark || true From 4bc9445238ed19280dbbf7afc25d36bb6aa254c6 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Wed, 30 Jun 2021 12:49:14 -0700 Subject: [PATCH 16/34] Corrections for PR and some mypy errors --- lark/common.py | 9 +++--- lark/exceptions.py | 4 +-- lark/indenter.py | 8 +++--- lark/lark.py | 9 +++--- lark/lexer.py | 61 ++++++++++++++++++++++------------------ lark/parser_frontends.py | 2 +- lark/tree.py | 7 +++-- lark/utils.py | 2 +- 8 files changed, 56 insertions(+), 46 deletions(-) diff --git a/lark/common.py b/lark/common.py index ccd5e16..0e6ae05 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,8 +1,9 @@ +from types import ModuleType + from .utils import Serialize from .lexer import TerminalDef, Token ###{standalone -from types import ModuleType from typing import Any, Callable, Collection, Dict, Optional, TYPE_CHECKING if TYPE_CHECKING: @@ -17,13 +18,13 @@ class LexerConf(Serialize): terminals: Collection[TerminalDef] re_module: ModuleType ignore: Collection[str] = () - postlex: 'PostLex' = None - callbacks: Optional[Dict[str, _Callback]] = None + postlex: 'Optional[PostLex]' = None + callbacks: Dict[str, _Callback] = {} g_regex_flags: int = 0 skip_validation: bool = False use_bytes: bool = False - def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'PostLex'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False): + def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False): self.terminals = terminals self.terminals_by_name = {t.name: t for t in self.terminals} assert len(self.terminals) == len(self.terminals_by_name) diff --git a/lark/exceptions.py b/lark/exceptions.py index 3276db5..9c4dc1e 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -3,7 +3,7 @@ from .utils import logger, NO_VALUE ###{standalone -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, TYPE_CHECKING +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, TYPE_CHECKING if TYPE_CHECKING: from .lexer import Token @@ -73,7 +73,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> T: + def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> 
Optional[T]: """Allows you to detect what's wrong in the input text by matching against example errors. diff --git a/lark/indenter.py b/lark/indenter.py index 03c5093..69a7ba4 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -14,12 +14,12 @@ class DedentError(LarkError): class Indenter(PostLex, ABC): - paren_level: Optional[int] - indent_level: Optional[List[int]] + paren_level: int + indent_level: List[int] def __init__(self) -> None: - self.paren_level = None - self.indent_level = None + self.paren_level = 0 + self.indent_level = [0] assert self.tab_len > 0 def handle_NL(self, token: Token) -> Iterator[Token]: diff --git a/lark/lark.py b/lark/lark.py index b4c767b..e225bad 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -15,7 +15,7 @@ from .grammar import Rule import re try: - import regex + import regex # type: ignore except ImportError: regex = None @@ -149,7 +149,7 @@ class LarkOptions(Serialize): # - As an attribute of `LarkOptions` above # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument - _defaults = { + _defaults: Dict[str, Any] = { 'debug': False, 'keep_all_tokens': False, 'tree_class': None, @@ -414,6 +414,7 @@ class Lark(Serialize): if cache_fn: logger.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: + assert cache_md5 is not None f.write(cache_md5.encode('utf8') + b'\n') pickle.dump(used_files, f) self.save(f) @@ -574,7 +575,7 @@ class Lark(Serialize): """Get information about a terminal""" return self._terminals_dict[name] - def parse_interactive(self, text: str=None, start: Optional[str]=None) -> 'InteractiveParser': + def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser': """Start an interactive parsing session. Parameters: @@ -588,7 +589,7 @@ class Lark(Serialize): """ return self.parser.parse_interactive(text, start=start) - def parse(self, text: str, start: Optional[str]=None, on_error: 'Callable[[UnexpectedInput], bool]'=None) -> Tree: + def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> Tree: """Parse the given text, according to the options provided. Parameters: diff --git a/lark/lexer.py b/lark/lexer.py index 6177d26..8f05bc7 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -23,10 +23,10 @@ class Pattern(Serialize, ABC): value: str flags: Collection[str] - raw: str = None - type: str = None + raw: Optional[str] = None + type: Optional[str] = None - def __init__(self, value: str, flags: Collection[str]=(), raw: str=None) -> None: + def __init__(self, value: str, flags: Collection[str]=(), raw: Optional[str]=None) -> None: self.value = value self.flags = frozenset(flags) self.raw = raw @@ -81,7 +81,10 @@ class PatternStr(Pattern): @property def min_width(self) -> int: return len(self.value) - max_width = min_width + + @property + def max_width(self) -> int: + return len(self.value) class PatternRE(Pattern): @@ -320,15 +323,36 @@ def _regexp_has_newline(r): """ return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) + +class LexerState(object): + __slots__ = 'text', 'line_ctr', 'last_token' + + def __init__(self, text, line_ctr, last_token=None): + self.text = text + self.line_ctr = line_ctr + self.last_token = last_token + + def __eq__(self, other): + if not isinstance(other, LexerState): + return NotImplemented + + return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token + + def __copy__(self): + return type(self)(self.text, copy(self.line_ctr), self.last_token) + + _Callback = Callable[[Token], Token] class Lexer(ABC): """Lexer interface Method Signatures: - lex(self, text) -> Iterator[Token] + lex(self, lexer_state, parser_state) -> Iterator[Token] """ - lex: Callable[..., Iterator[Token]] = NotImplemented + @abstractmethod + def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: + ... def make_lexer_state(self, text): line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n') @@ -394,6 +418,7 @@ class TraditionalLexer(Lexer): def mres(self) -> List[Tuple[REPattern, Dict[int, str]]]: if self._mres is None: self._build() + assert self._mres is not None return self._mres def match(self, text: str, pos: int) -> Optional[Tuple[str, str]]: @@ -402,12 +427,12 @@ class TraditionalLexer(Lexer): if m: return m.group(0), type_from_index[m.lastindex] - def lex(self, state: Any, parser_state: Any) -> Iterator[Token]: + def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]: with suppress(EOFError): while True: yield self.next_token(state, parser_state) - def next_token(self, lex_state: Any, parser_state: Any=None) -> Token: + def next_token(self, lex_state: LexerState, parser_state: Any=None) -> Token: line_ctr = lex_state.line_ctr while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) @@ -443,24 +468,6 @@ class TraditionalLexer(Lexer): raise EOFError(self) -class LexerState(object): - __slots__ = 'text', 'line_ctr', 'last_token' - - def __init__(self, text, line_ctr, last_token=None): - self.text = text - self.line_ctr = line_ctr - self.last_token = last_token - - def __eq__(self, other): - if not isinstance(other, LexerState): - return NotImplemented - - return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token - - def __copy__(self): - return type(self)(self.text, copy(self.line_ctr), self.last_token) - - class ContextualLexer(Lexer): lexers: Dict[str, TraditionalLexer] @@ -494,7 +501,7 @@ class ContextualLexer(Lexer): def make_lexer_state(self, text): return self.root_lexer.make_lexer_state(text) - def lex(self, lexer_state: Any, parser_state: Any) -> Iterator[Token]: + def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: try: while True: lexer = self.lexers[parser_state.position] diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e066d9a..926997a 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,7 +7,7 @@ from .parsers.lalr_parser import LALR_Parser from .tree import Tree from .common import LexerConf, ParserConf try: - import regex + import regex # type: ignore except ImportError: regex = None import re diff --git a/lark/tree.py b/lark/tree.py index ff56dea..90ec0fe 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -1,8 +1,9 @@ try: - from future_builtins import filter + from future_builtins import filter # type: ignore except ImportError: pass +import sys from copy import deepcopy @@ -49,7 +50,7 @@ class Tree(object): data: str children: 
'List[Union[str, Tree]]' - def __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Meta=None) -> None: + def __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Optional[Meta]=None) -> None: self.data = data self.children = children self._meta = meta @@ -196,7 +197,7 @@ def pydot__tree_to_graph(tree, rankdir="LR", **kwargs): possible attributes, see https://www.graphviz.org/doc/info/attrs.html. """ - import pydot + import pydot # type: ignore graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs) i = [0] diff --git a/lark/utils.py b/lark/utils.py index 81c9128..1214e97 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -134,7 +134,7 @@ def smart_decorator(f, create_decorator): try: - import regex + import regex # type: ignore except ImportError: regex = None From 804114d1ff3a2c982e8fb8793936b9122f1b8fb4 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Wed, 30 Jun 2021 13:21:09 -0700 Subject: [PATCH 17/34] Remove default values on type declarations and mark ClassVars --- lark/common.py | 12 ++++++------ lark/grammar.py | 8 ++++---- lark/lexer.py | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lark/common.py b/lark/common.py index 0e6ae05..6ad38fc 100644 --- a/lark/common.py +++ b/lark/common.py @@ -17,12 +17,12 @@ class LexerConf(Serialize): terminals: Collection[TerminalDef] re_module: ModuleType - ignore: Collection[str] = () - postlex: 'Optional[PostLex]' = None - callbacks: Dict[str, _Callback] = {} - g_regex_flags: int = 0 - skip_validation: bool = False - use_bytes: bool = False + ignore: Collection[str] + postlex: 'Optional[PostLex]' + callbacks: Dict[str, _Callback] + g_regex_flags: int + skip_validation: bool + use_bytes: bool def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False): self.terminals = terminals diff --git a/lark/grammar.py b/lark/grammar.py index be1aff6..25aec17 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -3,13 +3,13 @@ from .utils import Serialize ###{standalone -from typing import Optional, Tuple +from typing import Optional, Tuple, ClassVar class Symbol(Serialize): __slots__ = ('name',) name: str - is_term: bool = NotImplemented + is_term: ClassVar[bool] = NotImplemented def __init__(self, name: str) -> None: self.name = name @@ -33,7 +33,7 @@ class Symbol(Serialize): class Terminal(Symbol): __serialize_fields__ = 'name', 'filter_out' - is_term = True + is_term: ClassVar[bool] = True def __init__(self, name, filter_out=False): self.name = name @@ -47,7 +47,7 @@ class Terminal(Symbol): class NonTerminal(Symbol): __serialize_fields__ = 'name', - is_term = False + is_term: ClassVar[bool] = False class RuleOptions(Serialize): diff --git a/lark/lexer.py b/lark/lexer.py index 8f05bc7..af698cb 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -13,7 +13,7 @@ from copy import copy from types import ModuleType from typing import ( TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - Pattern as REPattern, TYPE_CHECKING + Pattern as REPattern, ClassVar, TYPE_CHECKING ) if TYPE_CHECKING: @@ -23,8 +23,8 @@ class Pattern(Serialize, ABC): value: str flags: Collection[str] - raw: Optional[str] = None - type: Optional[str] = None + raw: Optional[str] + type: ClassVar[str] def __init__(self, value: str, flags: Collection[str]=(), raw: Optional[str]=None) 
-> None: self.value = value @@ -73,7 +73,7 @@ class Pattern(Serialize, ABC): class PatternStr(Pattern): __serialize_fields__ = 'value', 'flags' - type: str = "str" + type: ClassVar[str] = "str" def to_regexp(self) -> str: return self._get_flags(re.escape(self.value)) @@ -90,7 +90,7 @@ class PatternStr(Pattern): class PatternRE(Pattern): __serialize_fields__ = 'value', 'flags', '_width' - type: str = "re" + type: ClassVar[str] = "re" def to_regexp(self) -> str: return self._get_flags(self.value) From ead7a0447378cb7c10b213f770761c791d8dec92 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 18:24:53 +0300 Subject: [PATCH 18/34] Small fix --- lark/exceptions.py | 2 +- lark/lexer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 0dfb659..6f7d32d 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -12,7 +12,7 @@ class ConfigurationError(LarkError, ValueError): pass -def assert_config(value, options, msg='Got %r, expected one of %s'): +def assert_config(value, options: list, msg='Got %r, expected one of %s'): if value not in options: raise ConfigurationError(msg % (value, options)) diff --git a/lark/lexer.py b/lark/lexer.py index a017001..36ed6bc 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -130,7 +130,7 @@ class Token(str): inst = super(Token, cls).__new__(cls, value) inst.type = type_ - inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.start_pos = start_pos inst.value = value inst.line = line inst.column = column @@ -287,7 +287,7 @@ class Scanner: return m.group(0), type_from_index[m.lastindex] -def _regexp_has_newline(r): +def _regexp_has_newline(r: str): r"""Expressions that may indicate newlines in a regexp: - newlines (\n) - escaped newline (\\n) From d0f25985afe7eeedfce36cd2bd9d586c2c279f87 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 22:18:26 +0300 Subject: [PATCH 19/34] Fixup: change typehint to Sequence --- lark/exceptions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 6f7d32d..76db0fc 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,3 +1,5 @@ +from collections.abc import Sequence + from .utils import logger, NO_VALUE @@ -12,7 +14,7 @@ class ConfigurationError(LarkError, ValueError): pass -def assert_config(value, options: list, msg='Got %r, expected one of %s'): +def assert_config(value, options: Sequence, msg='Got %r, expected one of %s'): if value not in options: raise ConfigurationError(msg % (value, options)) From ee75166376c22f061a7363ffd5776484861ef5f3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 22:20:18 +0300 Subject: [PATCH 20/34] Remove tests for versions below 3.6 --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 792c6bd..5784ae8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,6 @@ dist: xenial language: python python: - - "3.4" - - "3.5" - "3.6" - "3.7" - "3.8" From 8aa4bfbd95a6f70ddc5eb551ad6d5e427029345d Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 22:25:43 +0300 Subject: [PATCH 21/34] Fixup: moved import to standalone. 
--- lark/exceptions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 76db0fc..3fbd807 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,9 +1,8 @@ -from collections.abc import Sequence - from .utils import logger, NO_VALUE ###{standalone +from collections.abc import Sequence class LarkError(Exception): From 3e9e5d84497acae2941c4a4f4d5692ca4dcd5f8e Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 19 Jul 2021 17:35:56 +0300 Subject: [PATCH 22/34] Refactor: Split class to move code out of standalone section --- lark/parser_frontends.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 0e53dd5..47f291a 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -32,20 +32,13 @@ class MakeParsingFrontend: self.parser_type = parser_type self.lexer_type = lexer_type - def __call__(self, lexer_conf, parser_conf, options): - assert isinstance(lexer_conf, LexerConf) - assert isinstance(parser_conf, ParserConf) - parser_conf.parser_type = self.parser_type - lexer_conf.lexer_type = self.lexer_type - return ParsingFrontend(lexer_conf, parser_conf, options) - def deserialize(self, data, memo, lexer_conf, callbacks, options): parser_conf = ParserConf.deserialize(data['parser_conf'], memo) parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) parser_conf.callbacks = callbacks return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser) - + # ... Continued later in the module class ParsingFrontend(Serialize): @@ -237,3 +230,12 @@ class CYK_FrontEnd: def _apply_callback(self, tree): return self.callbacks[tree.rule](tree.children) + + +class MakeParsingFrontend(MakeParsingFrontend): + def __call__(self, lexer_conf, parser_conf, options): + assert isinstance(lexer_conf, LexerConf) + assert isinstance(parser_conf, ParserConf) + parser_conf.parser_type = self.parser_type + lexer_conf.lexer_type = self.lexer_type + return ParsingFrontend(lexer_conf, parser_conf, options) \ No newline at end of file From 3f507fc073b938464a6c2266525b8f5d2679fab9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 30 Jul 2021 00:17:17 +0300 Subject: [PATCH 23/34] A few more type annotations, reduce use of inline flags --- lark/grammars/python.lark | 4 ++-- lark/indenter.py | 4 ++++ lark/parse_tree_builder.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lark/grammars/python.lark b/lark/grammars/python.lark index 684193d..e73362d 100644 --- a/lark/grammars/python.lark +++ b/lark/grammars/python.lark @@ -10,8 +10,8 @@ DEC_NUMBER: /0|[1-9][\d_]*/i HEX_NUMBER.2: /0x[\da-f]*/i OCT_NUMBER.2: /0o[0-7]*/i BIN_NUMBER.2 : /0b[0-1]*/i -FLOAT_NUMBER.2: /((\d+\.[\d_]*|\.[\d_]+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i -IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i +FLOAT_NUMBER.2: /((\d+\.[\d_]*|\.[\d_]+)([Ee][-+]?\d+)?|\d+([Ee][-+]?\d+))/ +IMAG_NUMBER.2: /\d+[Jj]/ | FLOAT_NUMBER /[Jj]/ // Comma-separated list (with an optional trailing comma) diff --git a/lark/indenter.py b/lark/indenter.py index 7e1263d..f5a1895 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -9,6 +9,10 @@ class DedentError(LarkError): pass class Indenter(PostLex): + OPEN_PAREN_types: list + CLOSE_PAREN_types: list + DEDENT_type: str + def __init__(self): self.paren_level = None self.indent_level = None diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index e95003a..fa418a9 100644 --- a/lark/parse_tree_builder.py +++ 
b/lark/parse_tree_builder.py @@ -151,7 +151,7 @@ def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: list): # Prepare empty_indices as: How many Nones to insert at each index? if _empty_indices: assert _empty_indices.count(False) == len(expansion) From a8473e7e5d60542f4a68c88ee9fa775ebdb9ffe2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 4 Aug 2021 16:38:13 +0300 Subject: [PATCH 24/34] A tiny bit more typing info --- lark/lexer.py | 2 +- lark/parse_tree_builder.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 36ed6bc..fc50b57 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -184,7 +184,7 @@ class LineCounter: return self.char_pos == other.char_pos and self.newline_char == other.newline_char - def feed(self, token, test_newline=True): + def feed(self, token: Token, test_newline=True): """Consume a token and calculate the new line & column. As an optional optimization, set test_newline=False if token doesn't contain a newline. diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index fa418a9..4342b24 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,3 +1,5 @@ +from typing import List + from .exceptions import GrammarError, ConfigurationError from .lexer import Token from .tree import Tree @@ -151,7 +153,7 @@ def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: list): +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: List[bool]): # Prepare empty_indices as: How many Nones to insert at each index? 
if _empty_indices: assert _empty_indices.count(False) == len(expansion) From 1457e01e7e088b1984825bad531b110fead80ea2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 17 Aug 2021 11:01:40 +0300 Subject: [PATCH 25/34] Fixes to typing and tests --- lark/common.py | 8 ++++---- lark/exceptions.py | 8 +++----- lark/grammar.py | 3 +-- lark/indenter.py | 3 +-- lark/lark.py | 24 ++++++++++++------------ lark/lexer.py | 15 +++++++-------- lark/tools/standalone.py | 7 +++++++ lark/tree.py | 9 +++++---- lark/visitors.py | 2 +- tests/test_tools.py | 2 +- 10 files changed, 42 insertions(+), 39 deletions(-) diff --git a/lark/common.py b/lark/common.py index 12149b6..6c3962e 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,14 +1,14 @@ from copy import deepcopy from types import ModuleType +from typing import Callable, Collection, Dict, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from .lark import PostLex from .utils import Serialize from .lexer import TerminalDef, Token ###{standalone -from typing import Any, Callable, Collection, Dict, Optional, TYPE_CHECKING - -if TYPE_CHECKING: - from .lark import PostLex _Callback = Callable[[Token], Token] diff --git a/lark/exceptions.py b/lark/exceptions.py index 797d5cb..55e9a3a 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,16 +1,14 @@ from .utils import logger, NO_VALUE - - -###{standalone -from collections.abc import Sequence - from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, TYPE_CHECKING +from collections.abc import Sequence if TYPE_CHECKING: from .lexer import Token from .parsers.lalr_interactive_parser import InteractiveParser from .tree import Tree +###{standalone + class LarkError(Exception): pass diff --git a/lark/grammar.py b/lark/grammar.py index 25aec17..3d6f0ff 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,10 +1,9 @@ +from typing import Optional, Tuple, ClassVar from .utils import Serialize ###{standalone -from typing import Optional, Tuple, ClassVar - class Symbol(Serialize): __slots__ = ('name',) diff --git a/lark/indenter.py b/lark/indenter.py index b7b3369..0a18347 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -1,19 +1,18 @@ "Provides Indentation services for languages with indentation similar to Python" from abc import ABC, abstractmethod +from typing import List, Iterator from .exceptions import LarkError from .lark import PostLex from .lexer import Token ###{standalone -from typing import List, Iterator class DedentError(LarkError): pass class Indenter(PostLex, ABC): - paren_level: int indent_level: List[int] diff --git a/lark/lark.py b/lark/lark.py index 8b8af4e..aed3346 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,7 +1,18 @@ from abc import ABC, abstractmethod import sys, os, pickle, hashlib import tempfile - +from typing import ( + TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, + Tuple, Iterable, IO, Any, TYPE_CHECKING +) +if TYPE_CHECKING: + from .parsers.lalr_interactive_parser import InteractiveParser + from .visitors import Transformer + if sys.version_info >= (3, 8): + from typing import Literal + else: + from typing_extensions import Literal + from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource @@ -21,18 +32,7 @@ except ImportError: ###{standalone -from typing import ( - TypeVar, Type, List, Dict, Iterator, Callable, Union, 
Optional, - Tuple, Iterable, IO, Any, TYPE_CHECKING -) -if TYPE_CHECKING: - from .parsers.lalr_interactive_parser import InteractiveParser - from .visitors import Transformer - if sys.version_info >= (3, 8): - from typing import Literal - else: - from typing_extensions import Literal class PostLex(ABC): @abstractmethod diff --git a/lark/lexer.py b/lark/lexer.py index f826e06..173b3f5 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,6 +3,13 @@ from abc import abstractmethod, ABC import re from contextlib import suppress +from typing import ( + TypeVar, Type, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, + Pattern as REPattern, ClassVar, TYPE_CHECKING +) +from types import ModuleType +if TYPE_CHECKING: + from .common import LexerConf from .utils import classify, get_regexp_width, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken @@ -10,14 +17,6 @@ from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone from copy import copy -from types import ModuleType -from typing import ( - TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - Pattern as REPattern, ClassVar, TYPE_CHECKING -) - -if TYPE_CHECKING: - from .common import LexerConf class Pattern(Serialize, ABC): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 1cc8f81..7282699 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -25,6 +25,13 @@ # from abc import ABC, abstractmethod +from collections.abc import Sequence +from types import ModuleType +from typing import ( + TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, + Union, Iterable, IO, TYPE_CHECKING, + Pattern as REPattern, ClassVar, Set, +) ###} import sys diff --git a/lark/tree.py b/lark/tree.py index 90ec0fe..1ca0c62 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -1,3 +1,4 @@ + try: from future_builtins import filter # type: ignore except ImportError: @@ -6,10 +7,7 @@ except ImportError: import sys from copy import deepcopy - -###{standalone -from collections import OrderedDict -from typing import List, Callable, Iterator, Union, Optional, Any, TYPE_CHECKING +from typing import List, Callable, Iterator, Union, Optional, TYPE_CHECKING if TYPE_CHECKING: from .lexer import TerminalDef @@ -18,6 +16,9 @@ if TYPE_CHECKING: else: from typing_extensions import Literal +###{standalone +from collections import OrderedDict + class Meta: empty: bool diff --git a/lark/visitors.py b/lark/visitors.py index 2c7309f..954886a 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -1,3 +1,4 @@ +from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional from abc import ABC from functools import wraps @@ -8,7 +9,6 @@ from .lexer import Token ###{standalone from inspect import getmembers, getmro -from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional _T = TypeVar('_T') _R = TypeVar('_R') diff --git a/tests/test_tools.py b/tests/test_tools.py index 7a732d1..fd42b1c 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -24,7 +24,7 @@ class TestStandalone(TestCase): standalone.gen_standalone(Lark(grammar, parser='lalr'), out=code_buf, compress=compress) code = code_buf.getvalue() - context = {'__doc__': None} + context = {'__doc__': None, '__name__': 'test_standalone'} exec(code, context) return context From 0fddb7fef6de455bca7110266633df8341e90f21 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 17 Aug 2021 11:14:39 +0300 
Subject: [PATCH 26/34] Replace '...' with 'NotImplemented' --- lark/indenter.py | 12 ++++++------ lark/lexer.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lark/indenter.py b/lark/indenter.py index 0a18347..a4bbb24 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -74,31 +74,31 @@ class Indenter(PostLex, ABC): @property @abstractmethod def NL_type(self) -> str: - ... + return NotImplemented @property @abstractmethod def OPEN_PAREN_types(self) -> List[str]: - ... + return NotImplemented @property @abstractmethod def CLOSE_PAREN_types(self) -> List[str]: - ... + return NotImplemented @property @abstractmethod def INDENT_type(self) -> str: - ... + return NotImplemented @property @abstractmethod def DEDENT_type(self) -> str: - ... + return NotImplemented @property @abstractmethod def tab_len(self) -> int: - ... + return NotImplemented ###} diff --git a/lark/lexer.py b/lark/lexer.py index 173b3f5..512e8ff 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -359,7 +359,7 @@ class Lexer(ABC): """ @abstractmethod def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: - ... + return NotImplemented def make_lexer_state(self, text): line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n') From e30eef7a5325666eb5747e09841afd04def0c84a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 17 Aug 2021 11:24:59 +0300 Subject: [PATCH 27/34] Commit latest master to v1.0 --- README.md | 2 +- docs/classes.rst | 2 + docs/index.rst | 2 +- docs/json_tutorial.md | 8 +- docs/visitors.rst | 5 + examples/advanced/python3.lark | 141 +++++++++++++++--------- examples/standalone/json_parser_main.py | 4 +- lark/ast_utils.py | 4 +- lark/common.py | 12 ++ lark/exceptions.py | 45 +++++--- lark/lark.py | 24 ++-- lark/lexer.py | 135 +++++++++++++---------- lark/load_grammar.py | 135 +++++++++++++++++++++-- lark/parse_tree_builder.py | 61 +++++----- lark/parser_frontends.py | 19 ++-- lark/parsers/lalr_interactive_parser.py | 2 +- lark/parsers/lalr_parser.py | 4 +- lark/utils.py | 52 ++++++--- tests/test_grammar.py | 49 +++++++- tests/test_parser.py | 40 +++---- 20 files changed, 510 insertions(+), 236 deletions(-) diff --git a/README.md b/README.md index 8ec22ed..82f6148 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) +- [Online IDE](https://lark-parser.github.io/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/classes.rst b/docs/classes.rst index 7b18460..1287896 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -66,6 +66,8 @@ UnexpectedInput .. autoclass:: lark.exceptions.UnexpectedCharacters +.. autoclass:: lark.exceptions.UnexpectedEOF + InteractiveParser ----------------- diff --git a/docs/index.rst b/docs/index.rst index 39ecd5a..e8bd6b2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html +.. _Online IDE: https://lark-parser.github.io/ide .. 
_How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 65c6c78..668d9de 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M | | Lark - LALR(1) | 8s | 1.53s | 453M | 266M | | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M | -| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M | -| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | -| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M | +| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M | +| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | +| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M | I added a few other parsers for comparison. PyParsing and funcparselib fair pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). @@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective. This is the end of the tutorial. I hoped you liked it and learned a little about Lark. -To see what else you can do with Lark, check out the [examples](examples). +To see what else you can do with Lark, check out the [examples](/examples). For questions or any other subject, feel free to email me at erezshin at gmail dot com. diff --git a/docs/visitors.rst b/docs/visitors.rst index a0e1711..f263712 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -107,3 +107,8 @@ Discard ------- .. autoclass:: lark.visitors.Discard + +VisitError +------- + +.. autoclass:: lark.exceptions.VisitError \ No newline at end of file diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark index 0fc5949..7fb5ae5 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -21,7 +21,7 @@ decorators: decorator+ decorated: decorators (classdef | funcdef | async_funcdef) async_funcdef: "async" funcdef -funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite +funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | starparams @@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] -kwparams: "**" typedparam +kwparams: "**" typedparam ","? -?paramvalue: typedparam ["=" test] -?typedparam: NAME [":" test] +?paramvalue: typedparam ("=" test)? +?typedparam: NAME (":" test)? 
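// NOTE: with maybe_placeholders=True (which becomes the default later in this patch
// series), [item] inserts a None placeholder into the tree when the item is missing,
// whereas (item)? simply omits it. The two rules above were presumably switched to
// (...)? so that an absent default value or annotation adds nothing to the tree.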
-varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] - | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] - | "**" vfpdef [","]) -vfpdef: NAME +lambdef: "lambda" [lambda_params] ":" test +lambdef_nocond: "lambda" [lambda_params] ":" test_nocond +lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] + | lambda_starparams + | lambda_kwparams +?lambda_paramvalue: NAME ("=" test)? +lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] +lambda_kwparams: "**" NAME ","? + ?stmt: simple_stmt | compound_stmt ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) - | ("=" (yield_expr|testlist_star_expr))*) -annassign: ":" test ["=" test] -?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] -!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr +assign_stmt: annassign | augassign | assign + +annassign: testlist_star_expr ":" test ["=" test] +assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ +augassign: testlist_star_expr augassign_op (yield_expr|testlist) +!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" +?testlist_star_expr: test_or_star_expr + | test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple + | test_or_star_expr "," -> tuple + // For normal and annotated assignments, additional restrictions enforced by the interpreter del_stmt: "del" exprlist pass_stmt: "pass" @@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* nonlocal_stmt: "nonlocal" NAME ("," NAME)* assert_stmt: "assert" test ["," test] -compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +if_stmt: "if" test ":" suite elifs ["else" ":" suite] +elifs: elif_* +elif_: "elif" test ":" suite while_stmt: "while" test ":" suite ["else" ":" suite] for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) -with_stmt: "with" with_item ("," with_item)* ":" suite +try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] + | "try" ":" suite finally -> try_finally +finally: "finally" ":" suite +except_clauses: except_clause+ +except_clause: "except" [test ["as" NAME]] ":" suite + +with_stmt: "with" with_items ":" suite +with_items: with_item ("," with_item)* with_item: test ["as" expr] // NB compile.c makes sure that the default except clause is last -except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ("if" or_test "else" test)? | lambdef +?test: or_test ("if" or_test "else" test)? 
+ | lambdef ?test_nocond: or_test | lambdef_nocond -lambdef: "lambda" [varargslist] ":" test -lambdef_nocond: "lambda" [varargslist] ":" test_nocond + ?or_test: and_test ("or" and_test)* ?and_test: not_test ("and" not_test)* -?not_test: "not" not_test -> not +?not_test: "not" not_test -> not_test | comparison -?comparison: expr (_comp_op expr)* +?comparison: expr (comp_op expr)* star_expr: "*" expr -?expr: xor_expr ("|" xor_expr)* + +?expr: or_expr +?or_expr: xor_expr ("|" xor_expr)* ?xor_expr: and_expr ("^" and_expr)* ?and_expr: shift_expr ("&" shift_expr)* ?shift_expr: arith_expr (_shift_op arith_expr)* ?arith_expr: term (_add_op term)* ?term: factor (_mul_op factor)* -?factor: _factor_op factor | power +?factor: _unary_op factor | power -!_factor_op: "+"|"-"|"~" +!_unary_op: "+"|"-"|"~" !_add_op: "+"|"-" !_shift_op: "<<"|">>" !_mul_op: "*"|"@"|"/"|"%"|"//" // <> isn't actually a valid comparison operator in Python. It's here for the // sake of a __future__ import described in PEP 401 (which really works :-) -!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" ?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr @@ -118,61 +138,75 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple - | "[" [testlist_comp] "]" -> list - | "{" [dict_comp] "}" -> dict - | "{" set_comp "}" -> set +?atom: "(" yield_expr ")" + | "(" _tuple_inner? ")" -> tuple + | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension + | "[" _testlist_comp? "]" -> list + | "[" comprehension{test_or_star_expr} "]" -> list_comprehension + | "{" _dict_exprlist? "}" -> dict + | "{" comprehension{key_value} "}" -> dict_comprehension + | "{" _set_exprlist "}" -> set + | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | number | string+ + | number + | string_concat | "(" test ")" | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false -?testlist_comp: test | tuplelist_comp -tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") + +?string_concat: string+ + +_testlist_comp: test | _tuple_inner +_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") + + +?test_or_star_expr: test + | star_expr + ?subscriptlist: subscript | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -subscript: test | ([test] ":" [test] [sliceop]) -> slice +?subscript: test | ([test] ":" [test] [sliceop]) -> slice sliceop: ":" [test] -exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple -testlist: test | testlist_tuple +?exprlist: (expr|star_expr) + | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") +?testlist: test | testlist_tuple testlist_tuple: test (("," test)+ [","] | ",") -dict_comp: key_value comp_for - | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] +_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] key_value: test ":" test -set_comp: test comp_for - | (test|star_expr) ("," (test | star_expr))* [","] +_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] classdef: "class" NAME ["(" [arguments] ")"] ":" suite + + arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? 
| starargs | kwargs - | test comp_for + | comprehension{test} -starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] +starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] +stararg: "*" test kwargs: "**" test ?argvalue: test ("=" test)? - -comp_iter: comp_for | comp_if | async_for -async_for: "async" "for" exprlist "in" or_test [comp_iter] -comp_for: "for" exprlist "in" or_test [comp_iter] -comp_if: "if" test_nocond [comp_iter] +comprehension{comp_result}: comp_result comp_fors [comp_if] +comp_fors: comp_for+ +comp_for: [ASYNC] "for" exprlist "in" or_test +ASYNC: "async" +?comp_if: "if" test_nocond // not used in grammar, but may appear in "node" passed from Parser to Compiler encoding_decl: NAME -yield_expr: "yield" [yield_arg] -yield_arg: "from" test | testlist - +yield_expr: "yield" [testlist] + | "yield" "from" test -> yield_from number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING @@ -181,6 +215,7 @@ string: STRING | LONG_STRING %import python (NAME, COMMENT, STRING, LONG_STRING) %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) + // Other terminals _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py index 503b249..3d9b5a6 100644 --- a/examples/standalone/json_parser_main.py +++ b/examples/standalone/json_parser_main.py @@ -10,7 +10,9 @@ Standalone Parser import sys -from json_parser import Lark_StandAlone, Transformer, inline_args +from json_parser import Lark_StandAlone, Transformer, v_args + +inline_args = v_args(inline=True) class TreeToJson(Transformer): @inline_args diff --git a/lark/ast_utils.py b/lark/ast_utils.py index c535f11..abd7384 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -38,8 +38,8 @@ def create_transformer(ast_module: types.ModuleType, transformer: Optional[Trans Classes starting with an underscore (`_`) will be skipped. Parameters: - ast_module - A Python module containing all the subclasses of `ast_utils.Ast` - transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. + ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` + transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. 
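    Example (a minimal sketch; ``my_ast_module`` stands in for your own module of
    ``ast_utils.Ast`` subclasses, and ``parser``/``text`` for a ``Lark`` instance and
    its input):

        import my_ast_module
        from lark import ast_utils, Transformer

        transformer = ast_utils.create_transformer(my_ast_module, Transformer())
        ast = transformer.transform(parser.parse(text))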
""" t = transformer or Transformer() diff --git a/lark/common.py b/lark/common.py index 6ad38fc..f5db5f7 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from types import ModuleType +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef, Token @@ -40,6 +41,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/exceptions.py b/lark/exceptions.py index 9c4dc1e..662d55a 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -41,8 +41,9 @@ class UnexpectedInput(LarkError): Used as a base class for the following exceptions: - - ``UnexpectedToken``: The parser received an unexpected token - ``UnexpectedCharacters``: The lexer encountered an unexpected string + - ``UnexpectedToken``: The parser received an unexpected token + - ``UnexpectedEOF``: The parser expected a token, but the input ended After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ @@ -136,10 +137,13 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): - + """An exception that is raised by the parser, when the input ends while it still expects a token. + """ expected: 'List[Token]' def __init__(self, expected, state=None, terminals_by_name=None): + super(UnexpectedEOF, self).__init__() + self.expected = expected self.state = state from .lexer import Token @@ -149,7 +153,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput): self.column = -1 self._terminals_by_name = terminals_by_name - super(UnexpectedEOF, self).__init__() def __str__(self): message = "Unexpected end-of-input. " @@ -158,12 +161,17 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): + """An exception that is raised by the lexer, when it cannot match the next + string of characters to any of its terminals. + """ allowed: Set[str] considered_tokens: Set[Any] def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): + super(UnexpectedCharacters, self).__init__() + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column @@ -182,7 +190,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.char = seq[lex_pos] self._context = self.get_context(seq) - super(UnexpectedCharacters, self).__init__() def __str__(self): message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) @@ -198,10 +205,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): """An exception that is raised by the parser, when the token it received doesn't match any valid step forward. - The parser provides an interactive instance through `interactive_parser`, - which is initialized to the point of failture, and can be used for debugging and error handling. 
+ Parameters: + token: The mismatched token + expected: The set of expected tokens + considered_rules: Which rules were considered, to deduce the expected tokens + state: A value representing the parser state. Do not rely on its value or type. + interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, + and can be used for debugging and error handling. - see: ``InteractiveParser``. + Note: These parameters are available as attributes of the instance. """ expected: Set[str] @@ -209,6 +221,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): interactive_parser: 'InteractiveParser' def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): + super(UnexpectedToken, self).__init__() + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -223,7 +237,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() @property def accepts(self) -> Set[str]: @@ -245,18 +258,24 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ obj: 'Union[Tree, Token]' orig_exc: Exception def __init__(self, rule, obj, orig_exc): - self.obj = obj - self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule + self.obj = obj + self.orig_exc = orig_exc + ###} diff --git a/lark/lark.py b/lark/lark.py index e225bad..78ed2ea 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -79,7 +79,7 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. @@ -137,7 +137,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -195,7 +195,7 @@ class LarkOptions(Serialize): assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. 
LALR)') if o: @@ -484,11 +484,11 @@ class Lark(Serialize): d = f else: d = pickle.load(f) - memo = d['memo'] + memo_json = d['memo'] data = d['data'] - assert memo - memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) + assert memo_json + memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): raise ConfigurationError("Some options are not allowed when loading a Parser: {}" @@ -545,11 +545,11 @@ class Lark(Serialize): Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) """ - package = FromPackageLoader(package, search_paths) - full_path, text = package(None, grammar_path) + package_loader = FromPackageLoader(package, search_paths) + full_path, text = package_loader(None, grammar_path) options.setdefault('source_path', full_path) options.setdefault('import_paths', []) - options['import_paths'].append(package) + options['import_paths'].append(package_loader) return cls(text, **options) def __repr__(self): @@ -560,6 +560,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -602,6 +604,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. 
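        Example of error recovery with ``on_error`` (a sketch; ``parser`` and ``text``
        are assumed, and error recovery is only meaningful with parser='lalr'):

            def keep_going(e):
                print(e)       # e is an UnexpectedInput instance
                return True    # a true return value tells the parser to resume

            tree = parser.parse(text, on_error=keep_going)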
+ """ return self.parser.parse(text, start=start, on_error=on_error) diff --git a/lark/lexer.py b/lark/lexer.py index 90a2047..7906a70 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -158,20 +158,20 @@ class Token(str): def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: - self = super(Token, cls).__new__(cls, value) + inst = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: value = value.decode('latin1') - self = super(Token, cls).__new__(cls, value) - - self.type = type_ - self.start_pos = start_pos - self.value = value - self.line = line - self.column = column - self.end_line = end_line - self.end_column = end_column - self.end_pos = end_pos - return self + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst def update(self, type_: Optional[str]=None, value: Optional[Any]=None) -> 'Token': return Token.new_borrow_pos( @@ -234,15 +234,13 @@ class LineCounter: class UnlessCallback: - def __init__(self, mres): - self.mres = mres + def __init__(self, scanner): + self.scanner = scanner def __call__(self, t): - for mre, type_from_index in self.mres: - m = mre.match(t.value) - if m: - t.type = type_from_index[m.lastindex] - break + res = self.scanner.match(t.value, 0) + if res: + _value, t.type = res return t @@ -257,6 +255,11 @@ class CallChain: return self.callback2(t) if self.cond(t2) else t2 +def _get_match(re_, regexp, s, flags): + m = re_.match(regexp, s, flags) + if m: + return m.group(0) + def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() @@ -268,40 +271,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) - if m and m.group(0) == s: + if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags): unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - - terminals = [t for t in terminals if t not in embedded_strs] - return terminals, callback - - -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. - postfix = '$' if match_whole else '' - mres = [] - while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) - if use_bytes: - pattern = pattern.encode('latin-1') - try: - mre = re_.compile(pattern, g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. 
:/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - mres.append((mre, {i: n for n, i in mre.groupindex.items()})) - terminals = terminals[max_size:] - return mres + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback -def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) + +class Scanner: + def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + self.terminals = terminals + self.g_regex_flags = g_regex_flags + self.re_ = re_ + self.use_bytes = use_bytes + self.match_whole = match_whole + + self.allowed_types = {t.name for t in self.terminals} + + self._mres = self._build_mres(terminals, len(terminals)) + + def _build_mres(self, terminals, max_size): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! + # This function recursively tries less and less groups until it's successful. + postfix = '$' if self.match_whole else '' + mres = [] + while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if self.use_bytes: + pattern = pattern.encode('latin-1') + try: + mre = self.re_.compile(pattern, self.g_regex_flags) + except AssertionError: # Yes, this is what Python provides us.. :/ + return self._build_mres(terminals, max_size//2) + + mres.append((mre, {i: n for n, i in mre.groupindex.items()})) + terminals = terminals[max_size:] + return mres + + def match(self, text, pos): + for mre, type_from_index in self._mres: + m = mre.match(text, pos) + if m: + return m.group(0), type_from_index[m.lastindex] def _regexp_has_newline(r): @@ -390,9 +407,9 @@ class TraditionalLexer(Lexer): self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._mres = None + self._scanner = None - def _build(self) -> None: + def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -403,20 +420,16 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) + self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def mres(self) -> List[Tuple[REPattern, Dict[int, str]]]: - if self._mres is None: - self._build() - assert self._mres is not None - return self._mres - - def match(self, text: str, pos: int) -> Optional[Tuple[str, str]]: - for mre, type_from_index in self.mres: - m = mre.match(text, pos) - if m: - return m.group(0), type_from_index[m.lastindex] + def scanner(self): + if self._scanner is None: + self._build_scanner() + return self._scanner + + def match(self, text, pos): + return self.scanner.match(text, pos) def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]: with suppress(EOFError): @@ -428,7 +441,7 @@ class TraditionalLexer(Lexer): while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) if not res: - allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types + allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} raise 
UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 31b2c35..3b46426 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -10,7 +10,7 @@ from numbers import Integral from contextlib import suppress from typing import List, Tuple, Union, Callable, Dict, Optional -from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -176,27 +176,136 @@ RULES = { } +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 +# The Threshold whether repeat via ~ are split up into different rules +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. +# (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 + + @inline_args class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + + def _add_repeat_rule(self, a, b, target, atom): + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. + + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b + + Example rule when a=3, b=4: + + new_rule: target target target atom atom atom atom + + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """Creates a rule that matches atom 0 to (a*n+b)-1 times. 
+ + When target matches n times atom, and target_opt 0 to n-1 times target_opt, + + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom + + The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + Example rule when a=3, b=4: + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom + + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target]*i + [target_opt]) for i in range(a) + ] + [ + ST('expansion', [target]*a + [atom]*i) for i in range(b) + ]) + return self._add_rule(key, new_name, tree) + + def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. + """ + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] + # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats + mn_target = rule + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 
up to 1 -1 times) + for a, b in diff_factors[:-1]: + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = self._add_repeat_rule(a, b, diff_target, rule) + + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) + def expr(self, rule, op, *args): if op.value == '?': empty = ST('expansion', []) @@ -221,7 +330,9 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) + + return self._generate_repeats(rule, mn, mx) + assert False, op def maybe(self, rule): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 720315f..e95003a 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -22,54 +22,59 @@ class ExpandSingleChild: class PropagatePositions: - def __init__(self, node_builder): + def __init__(self, node_builder, node_filter=None): self.node_builder = node_builder + self.node_filter = node_filter def __call__(self, children): res = self.node_builder(children) - # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): - res_meta = res.meta + # Calculate positions while the tree is streaming, according to the rule: + # - nodes start at the start of their first child's container, + # and end at the end of their last child's container. + # Containers are nodes that take up space in text, but have been inlined in the tree. - src_meta = self._pp_get_meta(children) - if src_meta is not None: - res_meta.line = src_meta.line - res_meta.column = src_meta.column - res_meta.start_pos = src_meta.start_pos - res_meta.empty = False + res_meta = res.meta - src_meta = self._pp_get_meta(reversed(children)) - if src_meta is not None: - res_meta.end_line = src_meta.end_line - res_meta.end_column = src_meta.end_column - res_meta.end_pos = src_meta.end_pos - res_meta.empty = False + first_meta = self._pp_get_meta(children) + if first_meta is not None: + if not hasattr(res_meta, 'line'): + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) + res_meta.line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.column = getattr(first_meta, 'container_column', first_meta.column) + res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) + res_meta.empty = False + + res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) + + last_meta = self._pp_get_meta(reversed(children)) + if last_meta is not None: + if not hasattr(res_meta, 'end_line'): + res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) + res_meta.empty = False + + res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) return res def _pp_get_meta(self, children): for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue if isinstance(c, Tree): if not c.meta.empty: return c.meta elif isinstance(c, Token): return c -class PropagatePositions_IgnoreWs(PropagatePositions): - def _pp_get_meta(self, children): - for c in children: - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - if c and not c.isspace(): # Disregard whitespace-only tokens - return c - - def make_propagate_positions(option): - if option == "ignore_ws": - return PropagatePositions_IgnoreWs + if callable(option): + return partial(PropagatePositions, node_filter=option) elif option is True: return PropagatePositions elif option is False: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 926997a..475f70d 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -39,8 +39,7 @@ class MakeParsingFrontend: lexer_conf.lexer_type = self.lexer_type return ParsingFrontend(lexer_conf, parser_conf, options) - @classmethod - def deserialize(cls, data, memo, lexer_conf, callbacks, options): + def deserialize(self, data, memo, lexer_conf, callbacks, options): parser_conf = ParserConf.deserialize(data['parser_conf'], memo) parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) parser_conf.callbacks = callbacks @@ -92,26 +91,26 @@ class ParsingFrontend(Serialize): def _verify_start(self, start=None): if start is None: - start = self.parser_conf.start - if len(start) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) - start ,= start + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls elif start not in self.parser_conf.start: raise ConfigurationError("Unknown start rule %s. 
Must be one of %r" % (start, self.parser_conf.start)) return start def parse(self, text, start=None, on_error=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} - return self.parser.parse(stream, start, **kw) + return self.parser.parse(stream, chosen_start, **kw) def parse_interactive(self, text=None, start=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") stream = text if self.skip_lexer else LexerThread(self.lexer, text) - return self.parser.parse_interactive(stream, start) + return self.parser.parse_interactive(stream, chosen_start) def get_frontend(parser, lexer): diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index eeadef8..99dfc92 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -65,7 +65,7 @@ class InteractiveParser(object): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Parser choices:"] for k, v in self.choices().items(): - out.append('\t- %s -> %s' % (k, v)) + out.append('\t- %s -> %r' % (k, v)) out.append('stack size: %s' % len(self.parser_state.state_stack)) return '\n'.join(out) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index fe40791..d916b46 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -178,8 +178,8 @@ class _Parser(object): for token in state.lexer.lex(state): state.feed_token(token) - token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) - return state.feed_token(token, True) + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) except UnexpectedInput as e: try: e.interactive_parser = InteractiveParser(self, state, state.lexer) diff --git a/lark/utils.py b/lark/utils.py index a75d485..2b6e3b6 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -61,14 +61,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -82,9 +81,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst @@ -163,7 +163,7 @@ def get_regexp_width(expr): return 1, sre_constants.MAXREPEAT else: return 0, sre_constants.MAXREPEAT - + ###} @@ -198,14 +198,6 @@ def dedup_list(l): return [x for x in l if not (x in dedup or dedup.add(x))] -def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - return -1 - - class Enumerator(Serialize): def __init__(self): self.enums = {} @@ 
-253,7 +245,7 @@ except ImportError: class FS: exists = os.path.exists - + @staticmethod def open(name, mode="r", **kwargs): if atomicwrites and "w" in mode: @@ -324,3 +316,29 @@ def _serialize(value, memo): return {key:_serialize(elem, memo) for key, elem in value.items()} # assert value is None or isinstance(value, (int, float, str, tuple)), value return value + + + + +def small_factors(n, max_factor): + """ + Splits n up into smaller factors and summands <= max_factor. + Returns a list of [(a, b), ...] + so that the following code returns n: + + n = 1 + for a, b in values: + n = n * a + b + + Currently, we also keep a + b <= max_factor, but that might change + """ + assert n >= 0 + assert max_factor > 2 + if n <= max_factor: + return [(n, 0)] + + for a in range(max_factor, 1, -1): + r, b = divmod(n, a) + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] + assert False, "Failed to factorize %s" % n diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a643117..3ae65f2 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import sys from unittest import TestCase, main -from lark import Lark, Token, Tree +from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors from lark.load_grammar import FromPackageLoader @@ -198,6 +198,53 @@ class TestGrammar(TestCase): x = find_grammar_errors(text) assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + def test_ranged_repeat_terms(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + def test_ranged_repeat_large(self): + g = u"""!start: "A"~60 + """ + l = Lark(g, parser='lalr') + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + + g = u"""!start: "A"~15..100 + """ + l = Lark(g, parser='lalr') + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) if __name__ == '__main__': diff --git a/tests/test_parser.py b/tests/test_parser.py index ac409b3..ebc6152 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,26 @@ class 
TestParsers(unittest.TestCase): r = g.parse('a') self.assertEqual( r.children[0].meta.line, 1 ) + def test_propagate_positions2(self): + g = Lark("""start: a + a: b + ?b: "(" t ")" + !t: "t" + """, propagate_positions=True) + + start = g.parse("(t)") + a ,= start.children + t ,= a.children + assert t.children[0] == "t" + + assert t.meta.column == 2 + assert t.meta.end_column == 3 + + assert start.meta.column == a.meta.column == 1 + assert start.meta.end_column == a.meta.end_column == 4 + + + def test_expand1(self): g = Lark("""start: a @@ -2183,27 +2203,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - def test_ranged_repeat_terms(self): - g = u"""!start: AAA - AAA: "A"~3 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') - g = u"""!start: AABB CC - AABB: "A"~0..2 "B"~2 - CC: "C"~1..2 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self): From a8900c13b71f44ac27f7441203ce96461a7c553f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 17 Aug 2021 14:59:30 +0300 Subject: [PATCH 28/34] Changed sequence to collection --- lark/exceptions.py | 5 ++--- lark/utils.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 7a331ad..2660e35 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,6 +1,5 @@ from .utils import logger, NO_VALUE -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, TYPE_CHECKING -from collections.abc import Sequence +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, Collection, TYPE_CHECKING if TYPE_CHECKING: from .lexer import Token @@ -17,7 +16,7 @@ class ConfigurationError(LarkError, ValueError): pass -def assert_config(value, options: Sequence, msg='Got %r, expected one of %s'): +def assert_config(value, options: Collection, msg='Got %r, expected one of %s'): if value not in options: raise ConfigurationError(msg % (value, options)) diff --git a/lark/utils.py b/lark/utils.py index 2b6e3b6..ffaa593 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -241,7 +241,7 @@ def combine_alternatives(lists): try: import atomicwrites except ImportError: - atomicwrites = None + atomicwrites = None # type: ignore class FS: exists = os.path.exists From 343c22e21802fa19d44cc0fff1a0d8c6f1b07244 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 17 Aug 2021 15:00:38 +0300 Subject: [PATCH 29/34] NotImplemented -> NotImplmementedError --- lark/indenter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lark/indenter.py b/lark/indenter.py index a4bbb24..1a9e587 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -74,31 +74,31 @@ class Indenter(PostLex, ABC): @property @abstractmethod def NL_type(self) -> str: - return 
NotImplemented + raise NotImplementedError() @property @abstractmethod def OPEN_PAREN_types(self) -> List[str]: - return NotImplemented + raise NotImplementedError() @property @abstractmethod def CLOSE_PAREN_types(self) -> List[str]: - return NotImplemented + raise NotImplementedError() @property @abstractmethod def INDENT_type(self) -> str: - return NotImplemented + raise NotImplementedError() @property @abstractmethod def DEDENT_type(self) -> str: - return NotImplemented + raise NotImplementedError() @property @abstractmethod def tab_len(self) -> int: - return NotImplemented + raise NotImplementedError() ###} From 4e96b96bb55fee49a249e9e4417df03d83676177 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Sep 2021 20:41:45 +0100 Subject: [PATCH 30/34] Various backwards incompatible fixes for v1.0 --- lark/exceptions.py | 11 +++++++---- lark/lark.py | 9 +++------ lark/load_grammar.py | 4 +--- lark/visitors.py | 2 +- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 662d55a..deea929 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -74,7 +74,11 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> Optional[T]: + def match_examples(self, parse_fn: 'Callable[[str], Tree]', + examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], + token_type_match_fallback: bool=False, + use_accepts: bool=True + ) -> Optional[T]: """Allows you to detect what's wrong in the input text by matching against example errors. @@ -89,8 +93,7 @@ class UnexpectedInput(LarkError): Parameters: parse_fn: parse function (usually ``lark_instance.parse``) examples: dictionary of ``{'example_string': value}``. - use_accepts: Recommended to call this with ``use_accepts=True``. - The default is ``False`` for backwards compatibility. + use_accepts: Recommended to keep this as ``use_accepts=True``. """ assert self.state is not None, "Not supported for this exception" @@ -106,7 +109,7 @@ class UnexpectedInput(LarkError): parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: - if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: + if use_accepts and hasattr(self, 'accepts') and hasattr(ut, 'accepts') and ut.accepts != self.accepts: logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % (self.state, self.accepts, ut.accepts, i, j)) continue diff --git a/lark/lark.py b/lark/lark.py index 78ed2ea..8c6fcd3 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -82,9 +82,8 @@ class LarkOptions(Serialize): Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. - When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. - (default= ``False``. Recommended to set to ``True``) + (default= ``True``) cache Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
@@ -164,7 +163,7 @@ class LarkOptions(Serialize): 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, - 'maybe_placeholders': False, + 'maybe_placeholders': True, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False, @@ -304,7 +303,7 @@ class Lark(Serialize): if self.options.cache is not True: raise ConfigurationError("cache argument must be bool or str") # Python2.7 doesn't support * syntax in tuples - cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2]) + cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (cache_md5, *sys.version_info[:2]) if FS.exists(cache_fn): logger.debug('Loading grammar from cache: %s', cache_fn) @@ -368,7 +367,6 @@ class Lark(Serialize): if self.options.priority not in _VALID_PRIORITY_OPTIONS: raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS)) - assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS: raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS)) @@ -387,7 +385,6 @@ class Lark(Serialize): self._terminals_dict = {t.name: t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. - # This replaces the old 'resolve__antiscore_sum' option. if self.options.priority == 'invert': for rule in self.rules: if rule.options.priority is not None: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index c0503e6..4d73e61 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,7 +6,6 @@ from collections import namedtuple from copy import copy, deepcopy import pkgutil from ast import literal_eval -from numbers import Integral from contextlib import suppress from typing import List, Tuple, Union, Callable, Dict, Optional @@ -1067,8 +1066,7 @@ class GrammarBuilder: if self._is_term(name): if options is None: options = 1 - # if we don't use Integral here, we run into python2.7/python3 problems with long vs int - elif not isinstance(options, Integral): + elif not isinstance(options, int): raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),)) else: if options is None: diff --git a/lark/visitors.py b/lark/visitors.py index 2c7309f..60923b3 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -385,7 +385,7 @@ def _vargs_inline(f, _data, children, _meta): def _vargs_meta_inline(f, _data, children, meta): return f(meta, *children) def _vargs_meta(f, _data, children, meta): - return f(children, meta) # TODO swap these for consistency? Backwards incompatible! 
+ return f(meta, children) def _vargs_tree(f, data, children, meta): return f(Tree(data, children, meta)) From 5eecb7f5c91df83ae055722d229c6dd2744731d2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Sep 2021 20:45:07 +0100 Subject: [PATCH 31/34] Remove old comment --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 8c6fcd3..380c32a 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -302,7 +302,7 @@ class Lark(Serialize): else: if self.options.cache is not True: raise ConfigurationError("cache argument must be bool or str") - # Python2.7 doesn't support * syntax in tuples + cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (cache_md5, *sys.version_info[:2]) if FS.exists(cache_fn): From 5eb348481b4579c9bfcd03f4b7e05f406b3f7adc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Sep 2021 20:55:59 +0100 Subject: [PATCH 32/34] Added CHANGELOG.md --- lark/CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 lark/CHANGELOG.md diff --git a/lark/CHANGELOG.md b/lark/CHANGELOG.md new file mode 100644 index 0000000..9fdb072 --- /dev/null +++ b/lark/CHANGELOG.md @@ -0,0 +1,6 @@ +v1.0 + +- `maybe_placeholders` is now True by default + +- `use_accepts` in `UnexpectedInput.match_examples()` is now True by default + From 19b2aa934fc69ce1229c85887f9e6c07a12b1272 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Sep 2021 21:02:00 +0100 Subject: [PATCH 33/34] Fix tests for PR --- tests/test_parser.py | 4 ++-- tests/test_reconstructor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index ebc6152..dab69f7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -208,11 +208,11 @@ class TestParsers(unittest.TestCase): @v_args(meta=True) class T1(Transformer): - def a(self, children, meta): + def a(self, meta, children): assert not children return meta.line - def start(self, children, meta): + def start(self, meta, children): return children @v_args(meta=True, inline=True) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index e2f2dbe..4df1cb9 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -183,8 +183,8 @@ class TestReconstructor(TestCase): keyword x += y """ - l1 = Lark(g1, parser='lalr') - l2 = Lark(g2, parser='lalr') + l1 = Lark(g1, parser='lalr', maybe_placeholders=False) + l2 = Lark(g2, parser='lalr', maybe_placeholders=False) r = Reconstructor(l2) tree = l1.parse(code) From 36c957595b71c9223c4ec378b70496a82fe80acb Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Sep 2021 21:31:00 +0100 Subject: [PATCH 34/34] Token priority is now 0 by default --- lark/CHANGELOG.md | 3 +++ lark/grammar.py | 2 ++ lark/lexer.py | 3 ++- lark/load_grammar.py | 6 +++--- lark/parser_frontends.py | 2 +- tests/test_nearley/nearley | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lark/CHANGELOG.md b/lark/CHANGELOG.md index 9fdb072..1c547e4 100644 --- a/lark/CHANGELOG.md +++ b/lark/CHANGELOG.md @@ -4,3 +4,6 @@ v1.0 - `use_accepts` in `UnexpectedInput.match_examples()` is now True by default +- Token priority is now 0 by default + +- `v_args(meta=True)` now gives meta as the first argument. i.e. 
`(meta, children)` \ No newline at end of file diff --git a/lark/grammar.py b/lark/grammar.py index 3d6f0ff..108f347 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -3,6 +3,8 @@ from typing import Optional, Tuple, ClassVar from .utils import Serialize ###{standalone +TOKEN_DEFAULT_PRIORITY = 0 + class Symbol(Serialize): __slots__ = ('name',) diff --git a/lark/lexer.py b/lark/lexer.py index 0b7bef8..292bb35 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .utils import classify, get_regexp_width, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken +from .grammar import TOKEN_DEFAULT_PRIORITY ###{standalone from copy import copy @@ -108,7 +109,7 @@ class TerminalDef(Serialize): pattern: Pattern priority: int - def __init__(self, name: str, pattern: Pattern, priority: int=1) -> None: + def __init__(self, name: str, pattern: Pattern, priority: int=TOKEN_DEFAULT_PRIORITY) -> None: assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 4d73e61..461b21e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -15,7 +15,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import ParsingFrontend from .common import LexerConf, ParserConf -from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .utils import classify, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput @@ -1121,7 +1121,7 @@ class GrammarBuilder: name = '__IGNORE_%d'% len(self._ignore_names) self._ignore_names.append(name) - self._definitions[name] = ((), t, 1) + self._definitions[name] = ((), t, TOKEN_DEFAULT_PRIORITY) def _declare(self, *names): for name in names: @@ -1172,7 +1172,7 @@ class GrammarBuilder: else: name = tree.children[0].value params = () # TODO terminal templates - opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority + opts = int(tree.children[1]) if len(tree.children) == 3 else TOKEN_DEFAULT_PRIORITY # priority exp = tree.children[-1] if mangle is not None: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 37d59ae..f79ea36 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -162,7 +162,7 @@ class EarleyRegexpMatcher: def __init__(self, lexer_conf): self.regexps = {} for t in lexer_conf.terminals: - if t.priority != 1: + if t.priority: raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) regexp = t.pattern.to_regexp() try: diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index a46b374..3268316 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 +Subproject commit 326831689826cb1b9a4d21d1ce0d5db9278e9636
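
For readers applying this series, a minimal sketch of how the two v1.0 default changes listed in the CHANGELOG behave in user code. It is illustrative only and assumes Lark as patched above; the grammar and transformer below are hypothetical examples, not part of the patches.

    from lark import Lark, Transformer, v_args

    # maybe_placeholders now defaults to True: an unmatched [..] yields None
    parser = Lark(r'''
        start: "(" [NUMBER] ")"
        NUMBER: /[0-9]+/
    ''', parser='lalr')
    assert parser.parse("()").children == [None]
    assert parser.parse("(42)").children == ["42"]

    # v_args(meta=True) now passes meta first: (meta, children)
    @v_args(meta=True)
    class DropParens(Transformer):
        def start(self, meta, children):
            return children

    assert DropParens().transform(parser.parse("(42)")) == ["42"]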
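
Patch 29 replaces `return NotImplemented` with `raise NotImplementedError()` in the Indenter's abstract properties, so reaching an unimplemented definition now fails loudly instead of returning the `NotImplemented` sentinel. Existing post-lexers are unaffected as long as they define all six names, typically as plain class attributes. A sketch, with hypothetical terminal names that would have to match the grammar in use:

    from lark.indenter import Indenter

    class MyIndenter(Indenter):
        # hypothetical terminal names; they must be declared in the accompanying grammar
        NL_type = '_NEWLINE'
        OPEN_PAREN_types = ['LPAR']
        CLOSE_PAREN_types = ['RPAR']
        INDENT_type = '_INDENT'
        DEDENT_type = '_DEDENT'
        tab_len = 8

    # usage: Lark(grammar, parser='lalr', postlex=MyIndenter())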
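
The small_factors helper added to lark/utils.py (apparently what allows "A"~8191 in test_ranged_repeat_large to expand into more than one rule) promises that folding its (a, b) pairs with n = n * a + b reproduces the input. A quick standalone check of that invariant, assuming the patched lark.utils:

    from lark.utils import small_factors

    def rebuild(pairs):
        # fold the pairs back together, as described in the docstring
        n = 1
        for a, b in pairs:
            n = n * a + b
        return n

    for n in (0, 1, 7, 60, 8191):
        pairs = small_factors(n, 10)
        assert rebuild(pairs) == n
        assert all(a + b <= 10 for a, b in pairs)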