From 070c54dc8e82440e9297ee32bc661e26c6e80635 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 13 Sep 2021 11:23:48 +0100 Subject: [PATCH 1/5] yield -> yield from --- lark/indenter.py | 3 +-- lark/reconstruct.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/indenter.py b/lark/indenter.py index 1a9e587..1121104 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -44,8 +44,7 @@ class Indenter(PostLex, ABC): def _process(self, stream): for token in stream: if token.type == self.NL_type: - for t in self.handle_NL(token): - yield t + yield from self.handle_NL(token) else: yield token diff --git a/lark/reconstruct.py b/lark/reconstruct.py index aa8c753..02b4947 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -89,8 +89,7 @@ class Reconstructor(TreeMatcher): for item in res: if isinstance(item, Tree): # TODO use orig_expansion.rulename to support templates - for x in self._reconstruct(item): - yield x + yield from self._reconstruct(item) else: yield item From 61c4512cf37dfcba07c311e1db0592360525a348 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 13 Sep 2021 11:34:06 +0100 Subject: [PATCH 2/5] Rename lexer: Standard/Traditional -> Basic --- docs/grammar.md | 2 +- docs/how_to_develop.md | 2 +- docs/json_tutorial.md | 2 +- docs/parsers.md | 2 +- docs/recipes.md | 2 +- examples/advanced/_json_parser.py | 4 ++-- examples/advanced/conf_earley.py | 2 +- examples/advanced/conf_lalr.py | 2 +- examples/advanced/python_parser.py | 2 +- examples/advanced/qscintilla_json.py | 2 +- examples/json_parser.py | 6 +++--- lark/lark.py | 22 +++++++++++----------- lark/lexer.py | 10 +++++----- lark/load_grammar.py | 2 +- lark/parser_frontends.py | 14 +++++++------- lark/tools/__init__.py | 2 +- lark/tools/nearley.py | 2 +- lark/tools/standalone.py | 2 +- tests/test_parser.py | 20 ++++++++++---------- 19 files changed, 51 insertions(+), 51 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index 4ac5c77..6d5394e 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -124,7 +124,7 @@ Regexps/strings of different flags can only be concatenated in Python 3.6+ #### Notes for when using a lexer: -When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence: +When using a lexer (basic or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence: 1. Highest priority first (priority is specified as: TERM.number: ...) 2. Length of match (for regexps, the longest theoretical match is used) diff --git a/docs/how_to_develop.md b/docs/how_to_develop.md index b161e0c..b43fe19 100644 --- a/docs/how_to_develop.md +++ b/docs/how_to_develop.md @@ -32,7 +32,7 @@ For a list of supported interpreters, you can consult the `tox.ini` file. 
You can also run a single unittest using its class and method name, for example: ```bash ## test_package test_class_name.test_function_name -python -m tests TestLalrStandard.test_lexer_error_recovering +python -m tests TestLalrBasic.test_keep_all_tokens ``` ### tox diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 668d9de..6fc3bc8 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -332,7 +332,7 @@ class TreeToJson(Transformer): true = lambda self, _: True false = lambda self, _: False -json_parser = Lark(json_grammar, start='value', lexer='standard') +json_parser = Lark(json_grammar, start='value', lexer='basic') if __name__ == '__main__': with open(sys.argv[1]) as f: diff --git a/docs/parsers.md b/docs/parsers.md index cf2c066..9a24bcb 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`. -It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'` +It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'` **SPPF & Ambiguity resolution** diff --git a/docs/recipes.md b/docs/recipes.md index 2d2f83a..1aadd04 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -43,7 +43,7 @@ It accepts a dictionary of the form Where callback is of type `f(Token) -> Token` -It only works with the standard and contextual lexers. +It only works with the basic and contextual lexers. This has the same effect of using a transformer, but can also process ignored tokens. diff --git a/examples/advanced/_json_parser.py b/examples/advanced/_json_parser.py index 80d9101..27253e2 100644 --- a/examples/advanced/_json_parser.py +++ b/examples/advanced/_json_parser.py @@ -53,9 +53,9 @@ class TreeToJson(Transformer): ### Create the JSON parser with Lark, using the LALR algorithm json_parser = Lark(json_grammar, parser='lalr', - # Using the standard lexer isn't required, and isn't usually recommended. + # Using the basic lexer isn't required, and isn't usually recommended. # But, it's good enough for JSON, and it's slightly faster. - lexer='standard', + lexer='basic', # Disabling propagate_positions and placeholders slightly improves speed propagate_positions=False, maybe_placeholders=False, diff --git a/examples/advanced/conf_earley.py b/examples/advanced/conf_earley.py index 348ce3b..9b511fa 100644 --- a/examples/advanced/conf_earley.py +++ b/examples/advanced/conf_earley.py @@ -5,7 +5,7 @@ Earley’s dynamic lexer Demonstrates the power of Earley’s dynamic lexer on a toy configuration language Using a lexer for configuration files is tricky, because values don't -have to be surrounded by delimiters. Using a standard lexer for this just won't work. 
+have to be surrounded by delimiters. Using a basic lexer for this just won't work. In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity. diff --git a/examples/advanced/conf_lalr.py b/examples/advanced/conf_lalr.py index b0e164c..450c644 100644 --- a/examples/advanced/conf_lalr.py +++ b/examples/advanced/conf_lalr.py @@ -6,7 +6,7 @@ This example demonstrates the power of LALR's contextual lexer, by parsing a toy configuration language. The terminals `NAME` and `VALUE` overlap. They can match the same input. -A standard lexer would arbitrarily choose one over the other, based on priority, +A basic lexer would arbitrarily choose one over the other, based on priority, which would lead to a (confusing) parse error. However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows which one of them to expect at each point during the parse. diff --git a/examples/advanced/python_parser.py b/examples/advanced/python_parser.py index a37bade..b86fa01 100644 --- a/examples/advanced/python_parser.py +++ b/examples/advanced/python_parser.py @@ -28,7 +28,7 @@ kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input') python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs) python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs) -python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs) +python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='basic', **kwargs) try: xrange diff --git a/examples/advanced/qscintilla_json.py b/examples/advanced/qscintilla_json.py index b876d4c..6e66070 100644 --- a/examples/advanced/qscintilla_json.py +++ b/examples/advanced/qscintilla_json.py @@ -77,7 +77,7 @@ class LexerJson(QsciLexerCustom): %ignore WS ''' - self.lark = Lark(grammar, parser=None, lexer='standard') + self.lark = Lark(grammar, parser=None, lexer='basic') # All tokens: print([t.name for t in self.lark.parser.lexer.tokens]) def defaultPaper(self, style): diff --git a/examples/json_parser.py b/examples/json_parser.py index c3573f3..2f02edd 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -50,15 +50,15 @@ class TreeToJson(Transformer): ### Create the JSON parser with Lark, using the Earley algorithm -# json_parser = Lark(json_grammar, parser='earley', lexer='standard') +# json_parser = Lark(json_grammar, parser='earley', lexer='basic') # def parse(x): # return TreeToJson().transform(json_parser.parse(x)) ### Create the JSON parser with Lark, using the LALR algorithm json_parser = Lark(json_grammar, parser='lalr', - # Using the standard lexer isn't required, and isn't usually recommended. + # Using the basic lexer isn't required, and isn't usually recommended. # But, it's good enough for JSON, and it's slightly faster. 
- lexer='standard', + lexer='basic', # Disabling propagate_positions and placeholders slightly improves speed propagate_positions=False, maybe_placeholders=False, diff --git a/lark/lark.py b/lark/lark.py index b6b45a0..0d143df 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -19,7 +19,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_ from .tree import Tree from .common import LexerConf, ParserConf -from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token +from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend, _get_lexer_callbacks from .grammar import Rule @@ -57,7 +57,7 @@ class LarkOptions(Serialize): keep_all_tokens: bool tree_class: Any parser: 'Literal["earley", "lalr", "cyk", "auto"]' - lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]' + lexer: 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]' ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]' postlex: Optional[PostLex] priority: 'Optional[Literal["auto", "normal", "invert"]]' @@ -108,7 +108,7 @@ class LarkOptions(Serialize): Decides whether or not to use a lexer stage - "auto" (default): Choose for me based on the parser - - "standard": Use a standard lexer + - "basic": Use a basic lexer - "contextual": Stronger lexer (only works with parser="lalr") - "dynamic": Flexible and powerful (only with parser="earley") - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. @@ -123,7 +123,7 @@ class LarkOptions(Serialize): **=== Misc. / Domain Specific Options ===** postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. + Lexer post-processing (Default: None) Only works with the basic and contextual lexers. priority How priorities should be evaluated - auto, none, normal, invert (Default: auto) lexer_callbacks @@ -339,22 +339,22 @@ class Lark(Serialize): self.options.lexer = 'contextual' elif self.options.parser == 'earley': if self.options.postlex is not None: - logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. " + logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. " "Consider using lalr with contextual instead of earley") - self.options.lexer = 'standard' + self.options.lexer = 'basic' else: self.options.lexer = 'dynamic' elif self.options.parser == 'cyk': - self.options.lexer = 'standard' + self.options.lexer = 'basic' else: assert False, self.options.parser lexer = self.options.lexer if isinstance(lexer, type): assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance else: - assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete')) + assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete')) if self.options.postlex is not None and 'dynamic' in lexer: - raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead") + raise ConfigurationError("Can't use postlex with a dynamic lexer. 
Use basic or contextual instead") if self.options.ambiguity == 'auto': if self.options.parser == 'earley': @@ -429,7 +429,7 @@ class Lark(Serialize): from copy import copy lexer_conf = copy(lexer_conf) lexer_conf.ignore = () - return TraditionalLexer(lexer_conf) + return BasicLexer(lexer_conf) def _prepare_callbacks(self): self._callbacks = {} @@ -556,7 +556,7 @@ class Lark(Serialize): def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: - """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' + """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. diff --git a/lark/lexer.py b/lark/lexer.py index 292bb35..ac485a6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -366,7 +366,7 @@ class Lexer(ABC): return LexerState(text, line_ctr) -class TraditionalLexer(Lexer): +class BasicLexer(Lexer): terminals: Collection[TerminalDef] ignore_types: FrozenSet[str] @@ -473,8 +473,8 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - lexers: Dict[str, TraditionalLexer] - root_lexer: TraditionalLexer + lexers: Dict[str, BasicLexer] + root_lexer: BasicLexer def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None: terminals = list(conf.terminals) @@ -493,13 +493,13 @@ class ContextualLexer(Lexer): accepts = set(accepts) | set(conf.ignore) | set(always_accept) lexer_conf = copy(trad_conf) lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] - lexer = TraditionalLexer(lexer_conf) + lexer = BasicLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer assert trad_conf.terminals is terminals - self.root_lexer = TraditionalLexer(trad_conf) + self.root_lexer = BasicLexer(trad_conf) def make_lexer_state(self, text): return self.root_lexer.make_lexer_state(text) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 94bb97f..402a64d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -918,7 +918,7 @@ def _get_parser(): import re lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) parser_conf = ParserConf(rules, callback, ['start']) - lexer_conf.lexer_type = 'standard' + lexer_conf.lexer_type = 'basic' parser_conf.parser_type = 'lalr' _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) return _get_parser.cache diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index f79ea36..06533e0 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,7 +1,7 @@ from .exceptions import ConfigurationError, GrammarError, assert_config from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer -from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef +from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, Token, TerminalDef from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser from .tree import Tree @@ -70,7 +70,7 @@ class ParsingFrontend(Serialize): try: create_lexer = { - 'standard': create_traditional_lexer, + 'basic': create_basic_lexer, 'contextual': create_contextual_lexer, }[lexer_type] except KeyError: @@ -110,9 +110,9 @@ def get_frontend(parser, lexer): assert_config(parser, ('lalr', 'earley', 'cyk')) if not isinstance(lexer, type): # not custom lexer? 
expected = { - 'lalr': ('standard', 'contextual'), - 'earley': ('standard', 'dynamic', 'dynamic_complete'), - 'cyk': ('standard', ), + 'lalr': ('basic', 'contextual'), + 'earley': ('basic', 'dynamic', 'dynamic_complete'), + 'cyk': ('basic', ), }[parser] assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser) @@ -141,8 +141,8 @@ class PostLexConnector: -def create_traditional_lexer(lexer_conf, parser, postlex): - return TraditionalLexer(lexer_conf) +def create_basic_lexer(lexer_conf, parser, postlex): + return BasicLexer(lexer_conf) def create_contextual_lexer(lexer_conf, parser, postlex): states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()} diff --git a/lark/tools/__init__.py b/lark/tools/__init__.py index 4ecf13d..3fe66bd 100644 --- a/lark/tools/__init__.py +++ b/lark/tools/__init__.py @@ -25,7 +25,7 @@ options = ['start', 'lexer'] lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times") lalr_argparser.add_argument('-s', '--start', action='append', default=[]) -lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual')) +lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual')) k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {} lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)') lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file') diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index f5026e8..8b8ef89 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -44,7 +44,7 @@ nearley_grammar = r""" """ -nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard') +nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') def _get_rulename(name): name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 7282699..84277cc 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -3,7 +3,7 @@ # # Lark Stand-alone Generator Tool # ---------------------------------- -# Generates a stand-alone LALR(1) parser with a standard lexer +# Generates a stand-alone LALR(1) parser # # Git: https://github.com/erezsh/lark # Author: Erez Shinan (erezshin@gmail.com) diff --git a/tests/test_parser.py b/tests/test_parser.py index dab69f7..6c12b79 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -36,7 +36,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte from lark.tree import Tree from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive from lark.grammar import Rule -from lark.lexer import TerminalDef, Lexer, TraditionalLexer +from lark.lexer import TerminalDef, Lexer, BasicLexer from lark.indenter import Indenter __all__ = ['TestParsers'] @@ -465,7 +465,7 @@ def _make_full_earley_test(LEXER): empty_tree = Tree('empty', [Tree('empty2', [])]) self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) - @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") + @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer") def test_earley_explicit_ambiguity(self): # This was a sneaky bug! 
@@ -481,7 +481,7 @@ def _make_full_earley_test(LEXER): self.assertEqual( ambig_tree.data, '_ambig') self.assertEqual( len(ambig_tree.children), 2) - @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") + @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer") def test_ambiguity1(self): grammar = """ start: cd+ "e" @@ -497,7 +497,7 @@ def _make_full_earley_test(LEXER): assert ambig_tree.data == '_ambig', ambig_tree assert len(ambig_tree.children) == 2 - @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") + @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer") def test_ambiguity2(self): grammar = """ ANY: /[a-zA-Z0-9 ]+/ @@ -1019,9 +1019,9 @@ def _make_parser_test(LEXER, PARSER): def _Lark_open(gfilename, **kwargs): return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) - if (LEXER, PARSER) == ('standard', 'earley'): + if (LEXER, PARSER) == ('basic', 'earley'): # Check that the `lark.lark` grammar represents can parse every example used in these tests. - # Standard-Earley was an arbitrary choice, to make sure it only ran once. + # basic-Earley was an arbitrary choice, to make sure it only ran once. lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr') def wrap_with_test_grammar(f): def _f(x, **kwargs): @@ -1736,7 +1736,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(len(tree.children), 2) - @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority") + # @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority") def test_lexer_prioritization(self): "Tests effect of priority on result" @@ -2505,9 +2505,9 @@ def _make_parser_test(LEXER, PARSER): __all__.append(_NAME) _TO_TEST = [ - ('standard', 'earley'), - ('standard', 'cyk'), - ('standard', 'lalr'), + ('basic', 'earley'), + ('basic', 'cyk'), + ('basic', 'lalr'), ('dynamic', 'earley'), ('dynamic_complete', 'earley'), From 81fa0398723966b77bb2df87aeb998cdef5bf7d9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 13 Sep 2021 11:49:25 +0100 Subject: [PATCH 3/5] Refactor: Eradicated inheritance from object --- lark/ast_utils.py | 6 +++--- lark/lexer.py | 4 ++-- lark/load_grammar.py | 2 +- lark/parsers/cyk.py | 10 +++++----- lark/parsers/earley_common.py | 19 +++---------------- lark/parsers/earley_forest.py | 5 ++--- lark/parsers/grammar_analysis.py | 6 +++--- lark/parsers/lalr_interactive_parser.py | 2 +- lark/parsers/lalr_parser.py | 6 +++--- lark/tree.py | 2 +- lark/utils.py | 2 +- tests/test_parser.py | 7 ++++--- 12 files changed, 29 insertions(+), 42 deletions(-) diff --git a/lark/ast_utils.py b/lark/ast_utils.py index 9131087..53342ef 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -8,20 +8,20 @@ from typing import Optional, Callable from lark import Transformer, v_args -class Ast(object): +class Ast: """Abstract class Subclasses will be collected by `create_transformer()` """ pass -class AsList(object): +class AsList: """Abstract class Subclasses will be instanciated with the parse results as a single list, instead of as arguments. """ -class WithMeta(object): +class WithMeta: """Abstract class Subclasses will be instanciated with the Meta instance of the tree. (see ``v_args`` for more detail) diff --git a/lark/lexer.py b/lark/lexer.py index ac485a6..afdbade 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -331,7 +331,7 @@ def _regexp_has_newline(r: str): return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) -class LexerState(object): +class LexerState: __slots__ = 'text', 'line_ctr', 'last_token' def __init__(self, text, line_ctr, last_token=None): @@ -521,7 +521,7 @@ class ContextualLexer(Lexer): except UnexpectedCharacters: raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. -class LexerThread(object): +class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser""" def __init__(self, lexer, text): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 402a64d..b7c5452 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -791,7 +791,7 @@ class Grammar: PackageResource = namedtuple('PackageResource', 'pkg_name path') -class FromPackageLoader(object): +class FromPackageLoader: """ Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. This allows them to be compatible even from within zip files. diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index ff0924f..82818cc 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -23,7 +23,7 @@ def match(t, s): return t.name == s.type -class Rule(object): +class Rule: """Context-free grammar rule.""" def __init__(self, lhs, rhs, weight, alias): @@ -51,7 +51,7 @@ class Rule(object): return not (self == other) -class Grammar(object): +class Grammar: """Context-free grammar.""" def __init__(self, rules): @@ -68,7 +68,7 @@ class Grammar(object): # Parse tree data structures -class RuleNode(object): +class RuleNode: """A node in the parse tree, which also contains the full rhs rule.""" def __init__(self, rule, children, weight=0): @@ -81,7 +81,7 @@ class RuleNode(object): -class Parser(object): +class Parser: """Parser wrapper.""" def __init__(self, rules): @@ -186,7 +186,7 @@ def _parse(s, g): # * Empty rules (epsilon rules) -class CnfWrapper(object): +class CnfWrapper: """CNF wrapper for grammar. Validates that the input grammar is CNF and provides helper data structures. diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py index 6bd614b..101c9e8 100644 --- a/lark/parsers/earley_common.py +++ b/lark/parsers/earley_common.py @@ -1,21 +1,8 @@ -"This module implements an Earley Parser" +"""This module implements useful building blocks for the Earley parser +""" -# The parser uses a parse-forest to keep track of derivations and ambiguations. -# When the parse ends successfully, a disambiguation stage resolves all ambiguity -# (right now ambiguity resolution is not developed beyond the needs of lark) -# Afterwards the parse tree is reduced (transformed) according to user callbacks. -# I use the no-recursion version of Transformer, because the tree might be -# deeper than Python's recursion limit (a bit absurd, but that's life) -# -# The algorithm keeps track of each state set, using a corresponding Column instance. -# Column keeps track of new items using NewsList instances. -# -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com -from ..grammar import NonTerminal, Terminal - -class Item(object): +class Item: "An Earley Item, the atom of the algorithm." 
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index f39f4eb..3ab7988 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -8,7 +8,6 @@ http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ """ from random import randint -from math import isinf from collections import deque from operator import attrgetter from importlib import import_module @@ -20,7 +19,7 @@ from ..lexer import Token from ..utils import logger from ..tree import Tree -class ForestNode(object): +class ForestNode: pass class SymbolNode(ForestNode): @@ -173,7 +172,7 @@ class PackedNode(ForestNode): symbol = self.s.name return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order) -class ForestVisitor(object): +class ForestVisitor: """ An abstract base class for building forest visitors. diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 737cb02..b526e47 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -5,7 +5,7 @@ from ..exceptions import GrammarError from ..grammar import Rule, Terminal, NonTerminal -class RulePtr(object): +class RulePtr: __slots__ = ('rule', 'index') def __init__(self, rule, index): @@ -38,7 +38,7 @@ class RulePtr(object): # state generation ensures no duplicate LR0ItemSets -class LR0ItemSet(object): +class LR0ItemSet: __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') def __init__(self, kernel, closure): @@ -121,7 +121,7 @@ def calculate_sets(rules): return FIRST, FOLLOW, NULLABLE -class GrammarAnalyzer(object): +class GrammarAnalyzer: def __init__(self, parser_conf, debug=False): self.debug = debug diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index 99dfc92..a82dbe6 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -6,7 +6,7 @@ from .. import Token from ..exceptions import UnexpectedToken -class InteractiveParser(object): +class InteractiveParser: """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR. For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``. diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index d916b46..8403225 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -69,7 +69,7 @@ class LALR_Parser(Serialize): e = e2 -class ParseConf(object): +class ParseConf: __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' def __init__(self, parse_table, callbacks, start): @@ -83,7 +83,7 @@ class ParseConf(object): self.start = start -class ParserState(object): +class ParserState: __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): @@ -157,7 +157,7 @@ class ParserState(object): if is_end and state_stack[-1] == end_state: return value_stack[-1] -class _Parser(object): +class _Parser: def __init__(self, parse_table, callbacks, debug=False): self.parse_table = parse_table self.callbacks = callbacks diff --git a/lark/tree.py b/lark/tree.py index 48e1c24..17b2aa9 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -35,7 +35,7 @@ class Meta: self.empty = True -class Tree(object): +class Tree: """The main tree class. Creates a new tree, and stores "data" and "children" in attributes of the same name. 
diff --git a/lark/utils.py b/lark/utils.py
index f81f3ba..db6cb7c 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -41,7 +41,7 @@ def _deserialize(data, namespace, memo):
     return data


-class Serialize(object):
+class Serialize:
    """Safe-ish serialization interface that doesn't rely on Pickle

    Attributes:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 6c12b79..8591cfb 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -918,7 +918,7 @@ class CustomLexerNew(Lexer):
    so it uses the traditionalparser as implementation without custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(copy(lexer_conf))
+        self.lexer = BasicLexer(copy(lexer_conf))
    def lex(self, lexer_state, parser_state):
        return self.lexer.lex(lexer_state, parser_state)

@@ -930,7 +930,7 @@ class CustomLexerOld(Lexer):
    so it uses the traditionalparser as implementation without custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(copy(lexer_conf))
+        self.lexer = BasicLexer(copy(lexer_conf))
    def lex(self, text):
        ls = self.lexer.make_lexer_state(text)
        return self.lexer.lex(ls, None)
@@ -1736,7 +1736,8 @@ def _make_parser_test(LEXER, PARSER):
        self.assertEqual(len(tree.children), 2)


-    # @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
+    # TODO: Remove after merging priority for Dynamic Earley
+    @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
    def test_lexer_prioritization(self):
        "Tests effect of priority on result"


From a9129f33c27b475de703ebe3706c09ac769e7fd4 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 13 Sep 2021 11:53:10 +0100
Subject: [PATCH 4/5] Remove Py2-related unicode patch

---
 lark/lexer.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index afdbade..91c523a 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -157,12 +157,7 @@ class Token(str):
    end_pos: int

    def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
-        try:
-            inst = super(Token, cls).__new__(cls, value)
-        except UnicodeDecodeError:
-            value = value.decode('latin1')
-            inst = super(Token, cls).__new__(cls, value)
-
+        inst = super(Token, cls).__new__(cls, value)
    inst.type = type_
    inst.start_pos = start_pos
    inst.value = value

From 2ed4d5adce5fba9fbf1ea3454ba4099513bd1353 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 13 Sep 2021 11:59:32 +0100
Subject: [PATCH 5/5] pip install lark-parser -> lark

---
 README.md        | 2 +-
 docs/classes.rst | 2 +-
 docs/index.rst   | 2 +-
 docs/parsers.md  | 2 +-
 docs/tools.md    | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 156a671..6e764af 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h

 ### Install Lark

-    $ pip install lark-parser --upgrade
+    $ pip install lark --upgrade

 Lark has no dependencies.

diff --git a/docs/classes.rst b/docs/classes.rst
index 1287896..2a1c770 100644
--- a/docs/classes.rst
+++ b/docs/classes.rst
@@ -13,7 +13,7 @@ Using Unicode character classes with ``regex``
 Python's builtin ``re`` module has a few persistent known bugs and also won't parse
 advanced regex features such as character classes.
-With ``pip install lark-parser[regex]``, the ``regex`` module will be
+With ``pip install lark[regex]``, the ``regex`` module will be
 installed alongside lark and can act as a drop-in replacement to ``re``.
Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``. diff --git a/docs/index.rst b/docs/index.rst index e8bd6b2..e691e30 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,7 +56,7 @@ Install Lark .. code:: bash - $ pip install lark-parser + $ pip install lark Syntax Highlighting ------------------- diff --git a/docs/parsers.md b/docs/parsers.md index 9a24bcb..6c99ac5 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`. -It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'` +It's possible to bypass the dynamic lexing, and use the regular Earley parser with a basic lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'` **SPPF & Ambiguity resolution** diff --git a/docs/tools.md b/docs/tools.md index 3ea5176..ee9d2cf 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -23,7 +23,7 @@ Lark comes with a tool to convert grammars from [Nearley](https://github.com/Har 1. Install Lark with the `nearley` component: ```bash -pip install lark-parser[nearley] +pip install lark[nearley] ``` 2. Acquire a copy of the Nearley codebase. This can be done using:
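
Taken together, the user-visible surface after this series is easy to smoke-test. Below is a minimal sketch — not part of the patches themselves; the grammar and input are illustrative only — assuming the package is installed under its new name per PATCH 5 (`pip install lark`):

```python
from lark import Lark

# PATCH 2: lexer='basic' is the new name for the old lexer='standard'
# (internally, TraditionalLexer is renamed to BasicLexer).
parser = Lark(r"""
    start: WORD ("," WORD)*
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
""", parser='lalr', lexer='basic')

print(parser.parse("hello, world").pretty())
```

With the series applied, passing the old name `lexer='standard'` is rejected by the `assert_config` call changed in `lark/lark.py` above, which raises a `ConfigurationError` listing the accepted values (`basic`, `contextual`, `dynamic`, `dynamic_complete`).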