@@ -124,7 +124,7 @@ Regexps/strings of different flags can only be concatenated in Python 3.6+
 #### Notes for when using a lexer:
-When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
+When using a lexer (basic or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
 1. Highest priority first (priority is specified as: TERM.number: ...)
 2. Length of match (for regexps, the longest theoretical match is used)
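
A minimal sketch (not part of this diff) of how that precedence plays out with the renamed basic lexer; the grammar and terminal names below are invented for illustration:

```python
from lark import Lark

# Hypothetical toy grammar: the keyword "if" also matches NAME, so IF is given
# an explicit priority (IF.2) to win the collision under rule 1 above.
parser = Lark(r"""
    start: (IF | NAME)+

    IF.2: "if"
    NAME: /[a-z]+/

    %ignore /\s+/
""", parser='lalr', lexer='basic')

# "if" tokenizes as IF (priority 2), "foo" as NAME
print(parser.parse("if foo").children)
```
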
@@ -32,7 +32,7 @@ For a list of supported interpreters, you can consult the `tox.ini` file.
 You can also run a single unittest using its class and method name, for example:
 ```bash
 ## test_package test_class_name.test_function_name
-python -m tests TestLalrStandard.test_lexer_error_recovering
+python -m tests TestLalrBasic.test_keep_all_tokens
 ```
 ### tox
@@ -332,7 +332,7 @@ class TreeToJson(Transformer):
 true = lambda self, _: True
 false = lambda self, _: False
-json_parser = Lark(json_grammar, start='value', lexer='standard')
+json_parser = Lark(json_grammar, start='value', lexer='basic')
 if __name__ == '__main__':
 with open(sys.argv[1]) as f:
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.
-It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'`
 **SPPF & Ambiguity resolution**
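
As a hedged illustration of the renamed option (the toy grammar is not from the diff): the same input parsed once with Earley's default dynamic lexer and once with the tokenize-first lexer that is now spelled `lexer='basic'`.

```python
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
"""

# Default for Earley: the dynamic (skipping) lexer described above
dynamic_parser = Lark(grammar, parser='earley')

# Bypass dynamic lexing: tokenize as an independent first step
# (previously lexer='standard', now lexer='basic')
basic_parser = Lark(grammar, parser='earley', lexer='basic')

assert dynamic_parser.parse("hello world") == basic_parser.parse("hello world")
```
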
@@ -43,7 +43,7 @@ It accepts a dictionary of the form
 Where callback is of type `f(Token) -> Token`
-It only works with the standard and contextual lexers.
+It only works with the basic and contextual lexers.
 This has the same effect of using a transformer, but can also process ignored tokens.
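
A sketch of the usage this paragraph refers to, adapted from Lark's lexer_callbacks recipe (the grammar below is illustrative, not part of the diff): collecting the `%ignore`'d COMMENT tokens while parsing with the basic lexer.

```python
from lark import Lark

comments = []

parser = Lark("""
    start: INT*

    COMMENT: /#.*/

    %import common (INT, WS)
    %ignore COMMENT
    %ignore WS
""", parser='lalr', lexer='basic',
    lexer_callbacks={'COMMENT': comments.append})

parser.parse("1 2 3  # hello\n4 5  # world")
print(comments)  # both COMMENT tokens are captured, even though they are ignored
```
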
@@ -53,9 +53,9 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,
@@ -5,7 +5,7 @@ Earley’s dynamic lexer
 Demonstrates the power of Earley’s dynamic lexer on a toy configuration language
 Using a lexer for configuration files is tricky, because values don't
-have to be surrounded by delimiters. Using a standard lexer for this just won't work.
+have to be surrounded by delimiters. Using a basic lexer for this just won't work.
 In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.
@@ -6,7 +6,7 @@ This example demonstrates the power of LALR's contextual lexer,
 by parsing a toy configuration language.
 The terminals `NAME` and `VALUE` overlap. They can match the same input.
-A standard lexer would arbitrarily choose one over the other, based on priority,
+A basic lexer would arbitrarily choose one over the other, based on priority,
 which would lead to a (confusing) parse error.
 However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
 which one of them to expect at each point during the parse.
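
A hedged sketch of the situation that docstring describes (toy grammar, hypothetical terminal definitions, not the actual example file): NAME and VALUE overlap, and the contextual lexer resolves the collision from the parser state instead of from priorities.

```python
from lark import Lark

# "answer" matches both NAME and VALUE. The contextual lexer only tries the
# terminals the LALR(1) parser can accept at each point, so the overlap never
# produces a wrong token or a parse error.
parser = Lark(r"""
    start: (pair NEWLINE)*
    pair: NAME "=" VALUE

    NAME: /[a-zA-Z_]\w*/
    VALUE: /\S[^\n]*/
    NEWLINE: /\n/

    %ignore /[ \t]+/
""", parser='lalr', lexer='contextual')

print(parser.parse("answer = 42\n").pretty())
```
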
@@ -28,7 +28,7 @@ kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input')
 python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
-python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)
+python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='basic', **kwargs)
 try:
 xrange
@@ -77,7 +77,7 @@ class LexerJson(QsciLexerCustom):
 %ignore WS
 '''
-self.lark = Lark(grammar, parser=None, lexer='standard')
+self.lark = Lark(grammar, parser=None, lexer='basic')
 # All tokens: print([t.name for t in self.lark.parser.lexer.tokens])
 def defaultPaper(self, style):
@@ -50,15 +50,15 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the Earley algorithm
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+# json_parser = Lark(json_grammar, parser='earley', lexer='basic')
 # def parse(x):
 # return TreeToJson().transform(json_parser.parse(x))
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,
@@ -19,7 +19,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token
+from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -57,7 +57,7 @@ class LarkOptions(Serialize):
 keep_all_tokens: bool
 tree_class: Any
 parser: 'Literal["earley", "lalr", "cyk", "auto"]'
-lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
+lexer: 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
 ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
 postlex: Optional[PostLex]
 priority: 'Optional[Literal["auto", "normal", "invert"]]'
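
The `Type[Lexer]` half of that annotation covers user-supplied lexers, which are unaffected by the rename. A minimal, hypothetical sketch modeled on Lark's custom-lexer example (the class and terminal names are invented, not from the diff):

```python
from lark import Lark, Token
from lark.lexer import Lexer

class WhitespaceLexer(Lexer):
    """Hypothetical custom lexer: splits the input on whitespace, no regexps."""
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        for word in data.split():
            yield Token('WORD', word)

parser = Lark("""
    start: WORD+
    %declare WORD
""", parser='lalr', lexer=WhitespaceLexer)

print(parser.parse("a custom lexer").children)
```
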
@@ -108,7 +108,7 @@ class LarkOptions(Serialize):
 Decides whether or not to use a lexer stage
 - "auto" (default): Choose for me based on the parser
-- "standard": Use a standard lexer
+- "basic": Use a basic lexer
 - "contextual": Stronger lexer (only works with parser="lalr")
 - "dynamic": Flexible and powerful (only with parser="earley")
 - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
@@ -123,7 +123,7 @@ class LarkOptions(Serialize):
 **=== Misc. / Domain Specific Options ===**
 postlex
-Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
+Lexer post-processing (Default: None) Only works with the basic and contextual lexers.
 priority
 How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 lexer_callbacks
@@ -339,22 +339,22 @@ class Lark(Serialize):
 self.options.lexer = 'contextual'
 elif self.options.parser == 'earley':
 if self.options.postlex is not None:
-logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. "
+logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
 "Consider using lalr with contextual instead of earley")
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 self.options.lexer = 'dynamic'
 elif self.options.parser == 'cyk':
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 assert False, self.options.parser
 lexer = self.options.lexer
 if isinstance(lexer, type):
 assert issubclass(lexer, Lexer)     # XXX Is this really important? Maybe just ensure interface compliance
 else:
-assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete'))
+assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
 if self.options.postlex is not None and 'dynamic' in lexer:
-raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead")
+raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
 if self.options.ambiguity == 'auto':
 if self.options.parser == 'earley':
@@ -429,7 +429,7 @@ class Lark(Serialize):
 from copy import copy
 lexer_conf = copy(lexer_conf)
 lexer_conf.ignore = ()
-return TraditionalLexer(lexer_conf)
+return BasicLexer(lexer_conf)
 def _prepare_callbacks(self):
 self._callbacks = {}
@@ -556,7 +556,7 @@ class Lark(Serialize):
 def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
-"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
+"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
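
A small usage sketch for this method (the grammar is invented for illustration): tokenize without parsing, which per the docstring above now requires `lexer='basic'`.

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    WORD: /[a-z]+/
    %ignore /\s+/
""", parser='lalr', lexer='basic')

# Only lex (and postlex) -- no parse tree is built
for tok in parser.lex("hello world"):
    print(tok.type, repr(str(tok)))

# With dont_ignore=True, even the %ignore'd whitespace tokens are returned
all_tokens = list(parser.lex("hello world", dont_ignore=True))
```
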
@@ -366,7 +366,7 @@ class Lexer(ABC):
 return LexerState(text, line_ctr)
-class TraditionalLexer(Lexer):
+class BasicLexer(Lexer):
 terminals: Collection[TerminalDef]
 ignore_types: FrozenSet[str]
@@ -473,8 +473,8 @@ class TraditionalLexer(Lexer):
 class ContextualLexer(Lexer):
-lexers: Dict[str, TraditionalLexer]
-root_lexer: TraditionalLexer
+lexers: Dict[str, BasicLexer]
+root_lexer: BasicLexer
 def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None:
 terminals = list(conf.terminals)
@@ -493,13 +493,13 @@ class ContextualLexer(Lexer):
 accepts = set(accepts) | set(conf.ignore) | set(always_accept)
 lexer_conf = copy(trad_conf)
 lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
-lexer = TraditionalLexer(lexer_conf)
+lexer = BasicLexer(lexer_conf)
 lexer_by_tokens[key] = lexer
 self.lexers[state] = lexer
 assert trad_conf.terminals is terminals
-self.root_lexer = TraditionalLexer(trad_conf)
+self.root_lexer = BasicLexer(trad_conf)
 def make_lexer_state(self, text):
 return self.root_lexer.make_lexer_state(text)
@@ -918,7 +918,7 @@ def _get_parser():
 import re
 lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH'])
 parser_conf = ParserConf(rules, callback, ['start'])
-lexer_conf.lexer_type = 'standard'
+lexer_conf.lexer_type = 'basic'
 parser_conf.parser_type = 'lalr'
 _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
 return _get_parser.cache
@@ -1,7 +1,7 @@
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree
@@ -70,7 +70,7 @@ class ParsingFrontend(Serialize):
 try:
 create_lexer = {
-'standard': create_traditional_lexer,
+'basic': create_basic_lexer,
 'contextual': create_contextual_lexer,
 }[lexer_type]
 except KeyError:
@@ -110,9 +110,9 @@ def get_frontend(parser, lexer):
 assert_config(parser, ('lalr', 'earley', 'cyk'))
 if not isinstance(lexer, type): # not custom lexer?
 expected = {
-'lalr': ('standard', 'contextual'),
-'earley': ('standard', 'dynamic', 'dynamic_complete'),
-'cyk': ('standard', ),
+'lalr': ('basic', 'contextual'),
+'earley': ('basic', 'dynamic', 'dynamic_complete'),
+'cyk': ('basic', ),
 }[parser]
 assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
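
For reference, a sketch of the combinations that `expected` mapping now accepts (the one-line grammar is a placeholder; the option values mirror the table above):

```python
from lark import Lark

grammar = 'start: "a" "b"'

Lark(grammar, parser='lalr', lexer='contextual')           # LALR default
Lark(grammar, parser='lalr', lexer='basic')
Lark(grammar, parser='earley', lexer='dynamic')            # Earley default
Lark(grammar, parser='earley', lexer='dynamic_complete')
Lark(grammar, parser='earley', lexer='basic')
Lark(grammar, parser='cyk', lexer='basic')                 # CYK only supports 'basic'

# Any other pairing, e.g. parser='cyk' with lexer='contextual', is rejected
# by assert_config with a ConfigurationError.
```
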
@@ -141,8 +141,8 @@ class PostLexConnector:
-def create_traditional_lexer(lexer_conf, parser, postlex):
-return TraditionalLexer(lexer_conf)
+def create_basic_lexer(lexer_conf, parser, postlex):
+return BasicLexer(lexer_conf)
 def create_contextual_lexer(lexer_conf, parser, postlex):
 states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
@@ -25,7 +25,7 @@ options = ['start', 'lexer']
 lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
 lalr_argparser.add_argument('-s', '--start', action='append', default=[])
-lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual'))
+lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
 k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {}
 lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)')
 lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file')
@@ -44,7 +44,7 @@ nearley_grammar = r"""
 """
-nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
+nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
 def _get_rulename(name):
 name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
@@ -3,7 +3,7 @@
 #
 # Lark Stand-alone Generator Tool
 # ----------------------------------
-# Generates a stand-alone LALR(1) parser with a standard lexer
+# Generates a stand-alone LALR(1) parser
 #
 # Git: https://github.com/erezsh/lark
 # Author: Erez Shinan (erezshin@gmail.com)
@@ -36,7 +36,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive
 from lark.grammar import Rule
-from lark.lexer import TerminalDef, Lexer, TraditionalLexer
+from lark.lexer import TerminalDef, Lexer, BasicLexer
 from lark.indenter import Indenter
 __all__ = ['TestParsers']
@@ -465,7 +465,7 @@ def _make_full_earley_test(LEXER):
 empty_tree = Tree('empty', [Tree('empty2', [])])
 self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_earley_explicit_ambiguity(self):
 # This was a sneaky bug!
@@ -481,7 +481,7 @@ def _make_full_earley_test(LEXER):
 self.assertEqual( ambig_tree.data, '_ambig')
 self.assertEqual( len(ambig_tree.children), 2)
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity1(self):
 grammar = """
 start: cd+ "e"
@@ -497,7 +497,7 @@ def _make_full_earley_test(LEXER):
 assert ambig_tree.data == '_ambig', ambig_tree
 assert len(ambig_tree.children) == 2
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity2(self):
 grammar = """
 ANY: /[a-zA-Z0-9 ]+/
@@ -1019,9 +1019,9 @@ def _make_parser_test(LEXER, PARSER):
 def _Lark_open(gfilename, **kwargs):
 return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
-if (LEXER, PARSER) == ('standard', 'earley'):
+if (LEXER, PARSER) == ('basic', 'earley'):
 # Check that the `lark.lark` grammar represents can parse every example used in these tests.
-# Standard-Earley was an arbitrary choice, to make sure it only ran once.
+# basic-Earley was an arbitrary choice, to make sure it only ran once.
 lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr')
 def wrap_with_test_grammar(f):
 def _f(x, **kwargs):
@@ -1736,7 +1736,7 @@ def _make_parser_test(LEXER, PARSER):
 self.assertEqual(len(tree.children), 2)
-@unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+# @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
 def test_lexer_prioritization(self):
 "Tests effect of priority on result"
@@ -2505,9 +2505,9 @@ def _make_parser_test(LEXER, PARSER):
 __all__.append(_NAME)
 _TO_TEST = [
-('standard', 'earley'),
-('standard', 'cyk'),
-('standard', 'lalr'),
+('basic', 'earley'),
+('basic', 'cyk'),
+('basic', 'lalr'),
 ('dynamic', 'earley'),
 ('dynamic_complete', 'earley'),