@@ -124,7 +124,7 @@ Regexps/strings of different flags can only be concatenated in Python 3.6+
 #### Notes for when using a lexer:
-When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
+When using a lexer (basic or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
 1. Highest priority first (priority is specified as: TERM.number: ...)
 2. Length of match (for regexps, the longest theoretical match is used)
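
A minimal sketch (not part of this diff) of how that precedence plays out with the renamed basic lexer; the grammar and terminal names below are invented for illustration:

```python
from lark import Lark

# Hypothetical toy grammar: the keyword "if" also matches NAME, so IF is given
# an explicit priority (IF.2) to win the collision under rule 1 above.
parser = Lark(r"""
    start: (IF | NAME)+

    IF.2: "if"
    NAME: /[a-z]+/

    %ignore /\s+/
""", parser='lalr', lexer='basic')

# "if" tokenizes as IF (priority 2), "foo" as NAME
print(parser.parse("if foo").children)
```
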
@@ -32,7 +32,7 @@ For a list of supported interpreters, you can consult the `tox.ini` file.
 You can also run a single unittest using its class and method name, for example:
 ```bash
 ## test_package test_class_name.test_function_name
-python -m tests TestLalrStandard.test_lexer_error_recovering
+python -m tests TestLalrBasic.test_keep_all_tokens
 ```
 ### tox
@@ -332,7 +332,7 @@ class TreeToJson(Transformer):
 true = lambda self, _: True
 false = lambda self, _: False
-json_parser = Lark(json_grammar, start='value', lexer='standard')
+json_parser = Lark(json_grammar, start='value', lexer='basic')
 if __name__ == '__main__':
 with open(sys.argv[1]) as f:
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.
-It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'`
 **SPPF & Ambiguity resolution**
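
As a hedged illustration of the renamed option (the toy grammar is not from the diff): the same input parsed once with Earley's default dynamic lexer and once with the tokenize-first lexer that is now spelled `lexer='basic'`.

```python
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
"""

# Default for Earley: the dynamic (skipping) lexer described above
dynamic_parser = Lark(grammar, parser='earley')

# Bypass dynamic lexing: tokenize as an independent first step
# (previously lexer='standard', now lexer='basic')
basic_parser = Lark(grammar, parser='earley', lexer='basic')

assert dynamic_parser.parse("hello world") == basic_parser.parse("hello world")
```
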
@@ -43,7 +43,7 @@ It accepts a dictionary of the form
 Where callback is of type `f(Token) -> Token`
-It only works with the standard and contextual lexers.
+It only works with the basic and contextual lexers.
 This has the same effect of using a transformer, but can also process ignored tokens.
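
A sketch of the usage this paragraph refers to, adapted from Lark's lexer_callbacks recipe (the grammar below is illustrative, not part of the diff): collecting the `%ignore`'d COMMENT tokens while parsing with the basic lexer.

```python
from lark import Lark

comments = []

parser = Lark("""
    start: INT*

    COMMENT: /#.*/

    %import common (INT, WS)
    %ignore COMMENT
    %ignore WS
""", parser='lalr', lexer='basic',
    lexer_callbacks={'COMMENT': comments.append})

parser.parse("1 2 3  # hello\n4 5  # world")
print(comments)  # both COMMENT tokens are captured, even though they are ignored
```
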
@@ -53,9 +53,9 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,
@@ -5,7 +5,7 @@ Earley’s dynamic lexer
 Demonstrates the power of Earley’s dynamic lexer on a toy configuration language
 Using a lexer for configuration files is tricky, because values don't
-have to be surrounded by delimiters. Using a standard lexer for this just won't work.
+have to be surrounded by delimiters. Using a basic lexer for this just won't work.
 In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.
@@ -6,7 +6,7 @@ This example demonstrates the power of LALR's contextual lexer,
 by parsing a toy configuration language.
 The terminals `NAME` and `VALUE` overlap. They can match the same input.
-A standard lexer would arbitrarily choose one over the other, based on priority,
+A basic lexer would arbitrarily choose one over the other, based on priority,
 which would lead to a (confusing) parse error.
 However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
 which one of them to expect at each point during the parse.
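
A hedged sketch of the situation that docstring describes (toy grammar, hypothetical terminal definitions, not the actual example file): NAME and VALUE overlap, and the contextual lexer resolves the collision from the parser state instead of from priorities.

```python
from lark import Lark

# "answer" matches both NAME and VALUE. The contextual lexer only tries the
# terminals the LALR(1) parser can accept at each point, so the overlap never
# produces a wrong token or a parse error.
parser = Lark(r"""
    start: (pair NEWLINE)*
    pair: NAME "=" VALUE

    NAME: /[a-zA-Z_]\w*/
    VALUE: /\S[^\n]*/
    NEWLINE: /\n/

    %ignore /[ \t]+/
""", parser='lalr', lexer='contextual')

print(parser.parse("answer = 42\n").pretty())
```
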
@@ -28,7 +28,7 @@ kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input')
 python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
-python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)
+python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='basic', **kwargs)
 try:
 xrange
@@ -77,7 +77,7 @@ class LexerJson(QsciLexerCustom):
 %ignore WS
 '''
-self.lark = Lark(grammar, parser=None, lexer='standard')
+self.lark = Lark(grammar, parser=None, lexer='basic')
 # All tokens: print([t.name for t in self.lark.parser.lexer.tokens])
 def defaultPaper(self, style):
@@ -50,15 +50,15 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the Earley algorithm
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+# json_parser = Lark(json_grammar, parser='earley', lexer='basic')
 # def parse(x):
 # return TreeToJson().transform(json_parser.parse(x))
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,
@@ -19,7 +19,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token
+from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -57,7 +57,7 @@ class LarkOptions(Serialize):
 keep_all_tokens: bool
 tree_class: Any
 parser: 'Literal["earley", "lalr", "cyk", "auto"]'
-lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
+lexer: 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
 ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
 postlex: Optional[PostLex]
 priority: 'Optional[Literal["auto", "normal", "invert"]]'
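
The `Type[Lexer]` half of that annotation covers user-supplied lexers, which are unaffected by the rename. A minimal, hypothetical sketch modeled on Lark's custom-lexer example (the class and terminal names are invented, not from the diff):

```python
from lark import Lark, Token
from lark.lexer import Lexer

class WhitespaceLexer(Lexer):
    """Hypothetical custom lexer: splits the input on whitespace, no regexps."""
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        for word in data.split():
            yield Token('WORD', word)

parser = Lark("""
    start: WORD+
    %declare WORD
""", parser='lalr', lexer=WhitespaceLexer)

print(parser.parse("a custom lexer").children)
```
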
@@ -108,7 +108,7 @@ class LarkOptions(Serialize):
 Decides whether or not to use a lexer stage
 - "auto" (default): Choose for me based on the parser
-- "standard": Use a standard lexer
+- "basic": Use a basic lexer
 - "contextual": Stronger lexer (only works with parser="lalr")
 - "dynamic": Flexible and powerful (only with parser="earley")
 - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
@@ -123,7 +123,7 @@ class LarkOptions(Serialize):
 **=== Misc. / Domain Specific Options ===**
 postlex
-Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
+Lexer post-processing (Default: None) Only works with the basic and contextual lexers.
 priority
 How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 lexer_callbacks
@@ -339,22 +339,22 @@ class Lark(Serialize):
 self.options.lexer = 'contextual'
 elif self.options.parser == 'earley':
 if self.options.postlex is not None:
-logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. "
+logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
 "Consider using lalr with contextual instead of earley")
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 self.options.lexer = 'dynamic'
 elif self.options.parser == 'cyk':
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 assert False, self.options.parser
 lexer = self.options.lexer
 if isinstance(lexer, type):
 assert issubclass(lexer, Lexer)     # XXX Is this really important? Maybe just ensure interface compliance
 else:
-assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete'))
+assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
 if self.options.postlex is not None and 'dynamic' in lexer:
-raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead")
+raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
 if self.options.ambiguity == 'auto':
 if self.options.parser == 'earley':
@@ -429,7 +429,7 @@ class Lark(Serialize):
 from copy import copy
 lexer_conf = copy(lexer_conf)
 lexer_conf.ignore = ()
-return TraditionalLexer(lexer_conf)
+return BasicLexer(lexer_conf)
 def _prepare_callbacks(self):
 self._callbacks = {}
@@ -556,7 +556,7 @@ class Lark(Serialize):
 def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
-"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
+"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
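
A small usage sketch for this method (the grammar is invented for illustration): tokenize without parsing, which per the docstring above now requires `lexer='basic'`.

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    WORD: /[a-z]+/
    %ignore /\s+/
""", parser='lalr', lexer='basic')

# Only lex (and postlex) -- no parse tree is built
for tok in parser.lex("hello world"):
    print(tok.type, repr(str(tok)))

# With dont_ignore=True, even the %ignore'd whitespace tokens are returned
all_tokens = list(parser.lex("hello world", dont_ignore=True))
```
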
@@ -366,7 +366,7 @@ class Lexer(ABC):
 return LexerState(text, line_ctr)
-class TraditionalLexer(Lexer):
+class BasicLexer(Lexer):
 terminals: Collection[TerminalDef]
 ignore_types: FrozenSet[str]
@@ -473,8 +473,8 @@ class TraditionalLexer(Lexer):
 class ContextualLexer(Lexer):
-lexers: Dict[str, TraditionalLexer]
-root_lexer: TraditionalLexer
+lexers: Dict[str, BasicLexer]
+root_lexer: BasicLexer
 def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None:
 terminals = list(conf.terminals)
@@ -493,13 +493,13 @@ class ContextualLexer(Lexer):
 accepts = set(accepts) | set(conf.ignore) | set(always_accept)
 lexer_conf = copy(trad_conf)
 lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
-lexer = TraditionalLexer(lexer_conf)
+lexer = BasicLexer(lexer_conf)
 lexer_by_tokens[key] = lexer
 self.lexers[state] = lexer
 assert trad_conf.terminals is terminals
-self.root_lexer = TraditionalLexer(trad_conf)
+self.root_lexer = BasicLexer(trad_conf)
 def make_lexer_state(self, text):
 return self.root_lexer.make_lexer_state(text)
@@ -918,7 +918,7 @@ def _get_parser():
 import re
 lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH'])
 parser_conf = ParserConf(rules, callback, ['start'])
-lexer_conf.lexer_type = 'standard'
+lexer_conf.lexer_type = 'basic'
 parser_conf.parser_type = 'lalr'
 _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
 return _get_parser.cache
@@ -1,7 +1,7 @@
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree
@@ -70,7 +70,7 @@ class ParsingFrontend(Serialize):
 try:
 create_lexer = {
-'standard': create_traditional_lexer,
+'basic': create_basic_lexer,
 'contextual': create_contextual_lexer,
 }[lexer_type]
 except KeyError:
@@ -110,9 +110,9 @@ def get_frontend(parser, lexer):
 assert_config(parser, ('lalr', 'earley', 'cyk'))
 if not isinstance(lexer, type): # not custom lexer?
 expected = {
-'lalr': ('standard', 'contextual'),
-'earley': ('standard', 'dynamic', 'dynamic_complete'),
-'cyk': ('standard', ),
+'lalr': ('basic', 'contextual'),
+'earley': ('basic', 'dynamic', 'dynamic_complete'),
+'cyk': ('basic', ),
 }[parser]
 assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
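
For reference, a sketch of the combinations that `expected` mapping now accepts (the one-line grammar is a placeholder; the option values mirror the table above):

```python
from lark import Lark

grammar = 'start: "a" "b"'

Lark(grammar, parser='lalr', lexer='contextual')           # LALR default
Lark(grammar, parser='lalr', lexer='basic')
Lark(grammar, parser='earley', lexer='dynamic')            # Earley default
Lark(grammar, parser='earley', lexer='dynamic_complete')
Lark(grammar, parser='earley', lexer='basic')
Lark(grammar, parser='cyk', lexer='basic')                 # CYK only supports 'basic'

# Any other pairing, e.g. parser='cyk' with lexer='contextual', is rejected
# by assert_config with a ConfigurationError.
```
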
@@ -141,8 +141,8 @@ class PostLexConnector:
-def create_traditional_lexer(lexer_conf, parser, postlex):
-return TraditionalLexer(lexer_conf)
+def create_basic_lexer(lexer_conf, parser, postlex):
+return BasicLexer(lexer_conf)
 def create_contextual_lexer(lexer_conf, parser, postlex):
 states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
@@ -25,7 +25,7 @@ options = ['start', 'lexer']
 lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
 lalr_argparser.add_argument('-s', '--start', action='append', default=[])
-lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual'))
+lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
 k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {}
 lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)')
 lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file')
@@ -44,7 +44,7 @@ nearley_grammar = r"""
 """
-nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
+nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
 def _get_rulename(name):
 name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
@@ -3,7 +3,7 @@
 #
 # Lark Stand-alone Generator Tool
 # ----------------------------------
-# Generates a stand-alone LALR(1) parser with a standard lexer
+# Generates a stand-alone LALR(1) parser
 #
 # Git: https://github.com/erezsh/lark
 # Author: Erez Shinan (erezshin@gmail.com)
@@ -36,7 +36,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive
 from lark.grammar import Rule
-from lark.lexer import TerminalDef, Lexer, TraditionalLexer
+from lark.lexer import TerminalDef, Lexer, BasicLexer
 from lark.indenter import Indenter
 __all__ = ['TestParsers']
@@ -465,7 +465,7 @@ def _make_full_earley_test(LEXER):
 empty_tree = Tree('empty', [Tree('empty2', [])])
 self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_earley_explicit_ambiguity(self):
 # This was a sneaky bug!
@@ -481,7 +481,7 @@ def _make_full_earley_test(LEXER):
 self.assertEqual( ambig_tree.data, '_ambig')
 self.assertEqual( len(ambig_tree.children), 2)
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity1(self):
 grammar = """
 start: cd+ "e"
@@ -497,7 +497,7 @@ def _make_full_earley_test(LEXER):
 assert ambig_tree.data == '_ambig', ambig_tree
 assert len(ambig_tree.children) == 2
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity2(self):
 grammar = """
 ANY: /[a-zA-Z0-9 ]+/
@@ -1019,9 +1019,9 @@ def _make_parser_test(LEXER, PARSER):
 def _Lark_open(gfilename, **kwargs):
 return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
-if (LEXER, PARSER) == ('standard', 'earley'):
+if (LEXER, PARSER) == ('basic', 'earley'):
 # Check that the `lark.lark` grammar represents can parse every example used in these tests.
-# Standard-Earley was an arbitrary choice, to make sure it only ran once.
+# basic-Earley was an arbitrary choice, to make sure it only ran once.
 lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr')
 def wrap_with_test_grammar(f):
 def _f(x, **kwargs):
@@ -1736,7 +1736,7 @@ def _make_parser_test(LEXER, PARSER):
 self.assertEqual(len(tree.children), 2)
-@unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+# @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
 def test_lexer_prioritization(self):
 "Tests effect of priority on result"
@@ -2505,9 +2505,9 @@ def _make_parser_test(LEXER, PARSER):
 __all__.append(_NAME)
 _TO_TEST = [
-('standard', 'earley'),
-('standard', 'cyk'),
-('standard', 'lalr'),
+('basic', 'earley'),
+('basic', 'cyk'),
+('basic', 'lalr'),
 ('dynamic', 'earley'),
 ('dynamic_complete', 'earley'),