A bunch of refactors and fixes for v1.0

@@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
 ### Install Lark
-$ pip install lark-parser --upgrade
+$ pip install lark --upgrade
 Lark has no dependencies.

@@ -13,7 +13,7 @@ Using Unicode character classes with ``regex``
 Python's builtin ``re`` module has a few persistent known bugs and also won't parse
 advanced regex features such as character classes.
-With ``pip install lark-parser[regex]``, the ``regex`` module will be
+With ``pip install lark[regex]``, the ``regex`` module will be
 installed alongside lark and can act as a drop-in replacement to ``re``.
 Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``.
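
For context, here is a minimal sketch of the feature being documented (the grammar is illustrative, not from the repo). With the `regex` extra installed and `regex=True`, terminals may use Unicode character classes such as `\p{L}`:

```python
from lark import Lark

# Requires `pip install lark[regex]`. regex=True makes Lark compile terminal
# patterns with the third-party `regex` module instead of the stdlib `re`.
parser = Lark(r"""
    start: WORD+
    WORD: /\p{L}+/      // Unicode "letter" property class, supported by `regex`
    %ignore /\s+/
""", regex=True)

print(parser.parse("Hello мир").children)   # two WORD tokens
```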

@@ -124,7 +124,7 @@ Regexps/strings of different flags can only be concatenated in Python 3.6+
 #### Notes for when using a lexer:
-When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
+When using a lexer (basic or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
 1. Highest priority first (priority is specified as: TERM.number: ...)
 2. Length of match (for regexps, the longest theoretical match is used)
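
As a quick illustration of rule 1 (a hedged sketch; this grammar is made up for the example), a terminal's priority is declared with the `TERM.number` syntax:

```python
from lark import Lark

# NUMBER.2 gives NUMBER priority 2, so it beats NAME when both terminals
# match the same text with the same length (e.g. "123").
parser = Lark(r"""
    start: (NAME | NUMBER)+
    NUMBER.2: /[0-9]+/
    NAME: /\w+/
    %ignore /\s+/
""", parser='lalr', lexer='basic')

print([t.type for t in parser.parse("abc 123").children])   # ['NAME', 'NUMBER']
```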

@@ -32,7 +32,7 @@ For a list of supported interpreters, you can consult the `tox.ini` file.
 You can also run a single unittest using its class and method name, for example:
 ```bash
 ## test_package test_class_name.test_function_name
-python -m tests TestLalrStandard.test_lexer_error_recovering
+python -m tests TestLalrBasic.test_keep_all_tokens
 ```
 ### tox

@@ -56,7 +56,7 @@ Install Lark
 .. code:: bash
-$ pip install lark-parser
+$ pip install lark
 Syntax Highlighting
 -------------------

@@ -332,7 +332,7 @@ class TreeToJson(Transformer):
 true = lambda self, _: True
 false = lambda self, _: False
-json_parser = Lark(json_grammar, start='value', lexer='standard')
+json_parser = Lark(json_grammar, start='value', lexer='basic')
 if __name__ == '__main__':
 with open(sys.argv[1]) as f:

@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.
-It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a basic lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'`
 **SPPF & Ambiguity resolution**
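
To make the two modes concrete, here is a minimal sketch (toy grammar, not from the repo) of requesting the default dynamic lexer versus bypassing it with the basic lexer:

```python
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /[a-z]+/
    %ignore " "
"""

# Default for Earley: the dynamic (skipping chart) lexer.
dynamic_parser = Lark(grammar, parser='earley', lexer='dynamic')

# Bypass dynamic lexing: tokenize first with the basic lexer, then parse.
# Faster, but loses Earley's ability to resolve tokenization ambiguities.
basic_parser = Lark(grammar, parser='earley', lexer='basic')

assert dynamic_parser.parse("ab cd") == basic_parser.parse("ab cd")
```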

@@ -43,7 +43,7 @@ It accepts a dictionary of the form
 Where callback is of type `f(Token) -> Token`
-It only works with the standard and contextual lexers.
+It only works with the basic and contextual lexers.
 This has the same effect of using a transformer, but can also process ignored tokens.
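
A small sketch of the option in use (names here are made up for illustration): a callback keyed by terminal name receives every matched token of that type, including ignored ones:

```python
from lark import Lark, Token

comments = []

def collect_comment(token: Token) -> Token:
    # Called for every COMMENT token, even though COMMENT is %ignore-d.
    comments.append(token.value)
    return token

parser = Lark(r"""
    start: INT+
    INT: /[0-9]+/
    COMMENT: /#[^\n]*/
    %ignore COMMENT
    %ignore /\s+/
""", parser='lalr', lexer='contextual',
    lexer_callbacks={'COMMENT': collect_comment})

parser.parse("1 2 # a note\n3")
print(comments)   # ['# a note']
```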

@@ -23,7 +23,7 @@ Lark comes with a tool to convert grammars from [Nearley](https://github.com/Har
 1. Install Lark with the `nearley` component:
 ```bash
-pip install lark-parser[nearley]
+pip install lark[nearley]
 ```
 2. Acquire a copy of the Nearley codebase. This can be done using:

@@ -53,9 +53,9 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,

@@ -5,7 +5,7 @@ Earley’s dynamic lexer
 Demonstrates the power of Earley’s dynamic lexer on a toy configuration language
 Using a lexer for configuration files is tricky, because values don't
-have to be surrounded by delimiters. Using a standard lexer for this just won't work.
+have to be surrounded by delimiters. Using a basic lexer for this just won't work.
 In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.

@@ -6,7 +6,7 @@ This example demonstrates the power of LALR's contextual lexer,
 by parsing a toy configuration language.
 The terminals `NAME` and `VALUE` overlap. They can match the same input.
-A standard lexer would arbitrarily choose one over the other, based on priority,
+A basic lexer would arbitrarily choose one over the other, based on priority,
 which would lead to a (confusing) parse error.
 However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
 which one of them to expect at each point during the parse.
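
For readers skimming the diff, a condensed sketch of the idea (this grammar is a stand-in, not the example file itself):

```python
from lark import Lark

# NAME and VALUE overlap: a bare word like "host" matches both terminals.
# The contextual lexer only tries the terminals the LALR parser can accept
# in its current state: NAME at the start of a pair, VALUE only after "=".
config_grammar = r"""
    start: (pair _NL)*
    pair: NAME "=" VALUE
    NAME: /[a-z_]+/
    VALUE: /\S[^\n]*/
    _NL: /\n/
    %ignore " "
"""

parser = Lark(config_grammar, parser='lalr', lexer='contextual')
print(parser.parse("host = example.com\nport = 8080\n").pretty())

# With lexer='basic', VALUE (the longer match) would swallow the whole line
# at its start, producing the confusing parse error described above.
```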

@@ -28,7 +28,7 @@ kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input')
 python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
-python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)
+python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='basic', **kwargs)
 try:
 xrange

@@ -77,7 +77,7 @@ class LexerJson(QsciLexerCustom):
 %ignore WS
 '''
-self.lark = Lark(grammar, parser=None, lexer='standard')
+self.lark = Lark(grammar, parser=None, lexer='basic')
 # All tokens: print([t.name for t in self.lark.parser.lexer.tokens])
 def defaultPaper(self, style):

@@ -50,15 +50,15 @@ class TreeToJson(Transformer):
 ### Create the JSON parser with Lark, using the Earley algorithm
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+# json_parser = Lark(json_grammar, parser='earley', lexer='basic')
 # def parse(x):
 # return TreeToJson().transform(json_parser.parse(x))
 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-# Using the standard lexer isn't required, and isn't usually recommended.
+# Using the basic lexer isn't required, and isn't usually recommended.
 # But, it's good enough for JSON, and it's slightly faster.
-lexer='standard',
+lexer='basic',
 # Disabling propagate_positions and placeholders slightly improves speed
 propagate_positions=False,
 maybe_placeholders=False,

@@ -8,20 +8,20 @@ from typing import Optional, Callable
 from lark import Transformer, v_args
-class Ast(object):
+class Ast:
 """Abstract class
 Subclasses will be collected by `create_transformer()`
 """
 pass
-class AsList(object):
+class AsList:
 """Abstract class
 Subclasses will be instanciated with the parse results as a single list, instead of as arguments.
 """
-class WithMeta(object):
+class WithMeta:
 """Abstract class
 Subclasses will be instanciated with the Meta instance of the tree. (see ``v_args`` for more detail)

@@ -44,8 +44,7 @@ class Indenter(PostLex, ABC):
 def _process(self, stream):
 for token in stream:
 if token.type == self.NL_type:
-for t in self.handle_NL(token):
-yield t
+yield from self.handle_NL(token)
 else:
 yield token
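
For readers unfamiliar with the postlex machinery being touched here, this is roughly how `Indenter` is used (a sketch modeled on Lark's indented-tree example; the grammar and names are illustrative):

```python
from lark import Lark
from lark.indenter import Indenter

class TreeIndenter(Indenter):
    # handle_NL (called via `yield from` above) turns changes of indentation
    # after each _NL token into _INDENT / _DEDENT tokens.
    NL_type = '_NL'
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

grammar = r"""
    start: node+
    node: NAME _NL [_INDENT node+ _DEDENT]
    NAME: /\w+/
    _NL: /(\r?\n[\t ]*)+/
    %declare _INDENT _DEDENT
"""

parser = Lark(grammar, parser='lalr', postlex=TreeIndenter())
print(parser.parse("a\n  b\n  c\nd\n").pretty())
```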

@@ -19,7 +19,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token
+from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule

@@ -57,7 +57,7 @@ class LarkOptions(Serialize):
 keep_all_tokens: bool
 tree_class: Any
 parser: 'Literal["earley", "lalr", "cyk", "auto"]'
-lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
+lexer: 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
 ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
 postlex: Optional[PostLex]
 priority: 'Optional[Literal["auto", "normal", "invert"]]'

@@ -108,7 +108,7 @@ class LarkOptions(Serialize):
 Decides whether or not to use a lexer stage
 - "auto" (default): Choose for me based on the parser
-- "standard": Use a standard lexer
+- "basic": Use a basic lexer
 - "contextual": Stronger lexer (only works with parser="lalr")
 - "dynamic": Flexible and powerful (only with parser="earley")
 - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.

@@ -123,7 +123,7 @@ class LarkOptions(Serialize):
 **=== Misc. / Domain Specific Options ===**
 postlex
-Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
+Lexer post-processing (Default: None) Only works with the basic and contextual lexers.
 priority
 How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 lexer_callbacks

@@ -339,22 +339,22 @@ class Lark(Serialize):
 self.options.lexer = 'contextual'
 elif self.options.parser == 'earley':
 if self.options.postlex is not None:
-logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. "
+logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
 "Consider using lalr with contextual instead of earley")
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 self.options.lexer = 'dynamic'
 elif self.options.parser == 'cyk':
-self.options.lexer = 'standard'
+self.options.lexer = 'basic'
 else:
 assert False, self.options.parser
 lexer = self.options.lexer
 if isinstance(lexer, type):
 assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance
 else:
-assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete'))
+assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
 if self.options.postlex is not None and 'dynamic' in lexer:
-raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead")
+raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
 if self.options.ambiguity == 'auto':
 if self.options.parser == 'earley':

@@ -429,7 +429,7 @@ class Lark(Serialize):
 from copy import copy
 lexer_conf = copy(lexer_conf)
 lexer_conf.ignore = ()
-return TraditionalLexer(lexer_conf)
+return BasicLexer(lexer_conf)
 def _prepare_callbacks(self):
 self._callbacks = {}

@@ -556,7 +556,7 @@ class Lark(Serialize):
 def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
-"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
+"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
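
A short sketch of what `Lark.lex` does for callers (grammar made up for this example):

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    WORD: /[a-z]+/
    COMMENT: /#[^\n]*/
    %ignore COMMENT
    %ignore /\s+/
""", parser='lalr', lexer='basic')

print([t.type for t in parser.lex("hello world # hi")])
# ['WORD', 'WORD'] -- COMMENT and whitespace are skipped

print([t.type for t in parser.lex("hello world # hi", dont_ignore=True)])
# includes the COMMENT (and whitespace) tokens as well
```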

@@ -157,12 +157,7 @@ class Token(str):
 end_pos: int
 def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
-try:
-inst = super(Token, cls).__new__(cls, value)
-except UnicodeDecodeError:
-value = value.decode('latin1')
-inst = super(Token, cls).__new__(cls, value)
+inst = super(Token, cls).__new__(cls, value)
 inst.type = type_
 inst.start_pos = start_pos
 inst.value = value
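
For context on what this constructor builds (a small sketch): `Token` subclasses `str`, so it behaves like its value while also carrying lexing metadata:

```python
from lark import Token

tok = Token('NUMBER', '42', start_pos=0, line=1, column=1)
assert tok == '42'            # compares like the underlying string
assert tok.type == 'NUMBER'   # but also carries its terminal name, position, etc.
```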

@@ -331,7 +326,7 @@ def _regexp_has_newline(r: str):
 return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
-class LexerState(object):
+class LexerState:
 __slots__ = 'text', 'line_ctr', 'last_token'
 def __init__(self, text, line_ctr, last_token=None):

@@ -366,7 +361,7 @@ class Lexer(ABC):
 return LexerState(text, line_ctr)
-class TraditionalLexer(Lexer):
+class BasicLexer(Lexer):
 terminals: Collection[TerminalDef]
 ignore_types: FrozenSet[str]

@@ -473,8 +468,8 @@ class TraditionalLexer(Lexer):
 class ContextualLexer(Lexer):
-lexers: Dict[str, TraditionalLexer]
-root_lexer: TraditionalLexer
+lexers: Dict[str, BasicLexer]
+root_lexer: BasicLexer
 def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None:
 terminals = list(conf.terminals)

@@ -493,13 +488,13 @@ class ContextualLexer(Lexer):
 accepts = set(accepts) | set(conf.ignore) | set(always_accept)
 lexer_conf = copy(trad_conf)
 lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
-lexer = TraditionalLexer(lexer_conf)
+lexer = BasicLexer(lexer_conf)
 lexer_by_tokens[key] = lexer
 self.lexers[state] = lexer
 assert trad_conf.terminals is terminals
-self.root_lexer = TraditionalLexer(trad_conf)
+self.root_lexer = BasicLexer(trad_conf)
 def make_lexer_state(self, text):
 return self.root_lexer.make_lexer_state(text)

@@ -521,7 +516,7 @@ class ContextualLexer(Lexer):
 except UnexpectedCharacters:
 raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
-class LexerThread(object):
+class LexerThread:
 """A thread that ties a lexer instance and a lexer state, to be used by the parser"""
 def __init__(self, lexer, text):

@@ -790,7 +790,7 @@ class Grammar:
 PackageResource = namedtuple('PackageResource', 'pkg_name path')
-class FromPackageLoader(object):
+class FromPackageLoader:
 """
 Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
 This allows them to be compatible even from within zip files.
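
Roughly how this loader is meant to be used (a sketch; `my_package`, its `grammars/` directory, and `my_grammar.lark` are hypothetical):

```python
from lark import Lark
from lark.load_grammar import FromPackageLoader

# Resolve %import-ed grammars through pkgutil.get_data, so they are found even
# when my_package is installed as a zip/wheel and plain open() would fail.
loader = FromPackageLoader('my_package', ('grammars',))

parser = Lark("""
    start: rule
    %import my_grammar.rule
""", import_paths=[loader])
```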

@@ -917,7 +917,7 @@ def _get_parser():
 import re
 lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH'])
 parser_conf = ParserConf(rules, callback, ['start'])
-lexer_conf.lexer_type = 'standard'
+lexer_conf.lexer_type = 'basic'
 parser_conf.parser_type = 'lalr'
 _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
 return _get_parser.cache

@@ -1,7 +1,7 @@
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree

@@ -70,7 +70,7 @@ class ParsingFrontend(Serialize):
 try:
 create_lexer = {
-'standard': create_traditional_lexer,
+'basic': create_basic_lexer,
 'contextual': create_contextual_lexer,
 }[lexer_type]
 except KeyError:

@@ -110,9 +110,9 @@ def get_frontend(parser, lexer):
 assert_config(parser, ('lalr', 'earley', 'cyk'))
 if not isinstance(lexer, type): # not custom lexer?
 expected = {
-'lalr': ('standard', 'contextual'),
-'earley': ('standard', 'dynamic', 'dynamic_complete'),
-'cyk': ('standard', ),
+'lalr': ('basic', 'contextual'),
+'earley': ('basic', 'dynamic', 'dynamic_complete'),
+'cyk': ('basic', ),
 }[parser]
 assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)

@@ -141,8 +141,8 @@ class PostLexConnector:
-def create_traditional_lexer(lexer_conf, parser, postlex):
-return TraditionalLexer(lexer_conf)
+def create_basic_lexer(lexer_conf, parser, postlex):
+return BasicLexer(lexer_conf)
 def create_contextual_lexer(lexer_conf, parser, postlex):
 states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}

@@ -23,7 +23,7 @@ def match(t, s):
 return t.name == s.type
-class Rule(object):
+class Rule:
 """Context-free grammar rule."""
 def __init__(self, lhs, rhs, weight, alias):

@@ -51,7 +51,7 @@ class Rule(object):
 return not (self == other)
-class Grammar(object):
+class Grammar:
 """Context-free grammar."""
 def __init__(self, rules):

@@ -68,7 +68,7 @@ class Grammar(object):
 # Parse tree data structures
-class RuleNode(object):
+class RuleNode:
 """A node in the parse tree, which also contains the full rhs rule."""
 def __init__(self, rule, children, weight=0):

@@ -81,7 +81,7 @@ class RuleNode(object):
-class Parser(object):
+class Parser:
 """Parser wrapper."""
 def __init__(self, rules):

@@ -186,7 +186,7 @@ def _parse(s, g):
 # * Empty rules (epsilon rules)
-class CnfWrapper(object):
+class CnfWrapper:
 """CNF wrapper for grammar.
 Validates that the input grammar is CNF and provides helper data structures.

@@ -1,21 +1,8 @@
-"This module implements an Earley Parser"
+"""This module implements useful building blocks for the Earley parser
+"""
-# The parser uses a parse-forest to keep track of derivations and ambiguations.
-# When the parse ends successfully, a disambiguation stage resolves all ambiguity
-# (right now ambiguity resolution is not developed beyond the needs of lark)
-# Afterwards the parse tree is reduced (transformed) according to user callbacks.
-# I use the no-recursion version of Transformer, because the tree might be
-# deeper than Python's recursion limit (a bit absurd, but that's life)
-#
-# The algorithm keeps track of each state set, using a corresponding Column instance.
-# Column keeps track of new items using NewsList instances.
-#
-# Author: Erez Shinan (2017)
-# Email : erezshin@gmail.com
 from ..grammar import NonTerminal, Terminal
-class Item(object):
+class Item:
 "An Earley Item, the atom of the algorithm."
 __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')

@@ -8,7 +8,6 @@ http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
 """
 from random import randint
-from math import isinf
 from collections import deque
 from operator import attrgetter
 from importlib import import_module

@@ -20,7 +19,7 @@ from ..lexer import Token
 from ..utils import logger
 from ..tree import Tree
-class ForestNode(object):
+class ForestNode:
 pass
 class SymbolNode(ForestNode):

@@ -173,7 +172,7 @@ class PackedNode(ForestNode):
 symbol = self.s.name
 return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
-class ForestVisitor(object):
+class ForestVisitor:
 """
 An abstract base class for building forest visitors.

@@ -5,7 +5,7 @@ from ..exceptions import GrammarError
 from ..grammar import Rule, Terminal, NonTerminal
-class RulePtr(object):
+class RulePtr:
 __slots__ = ('rule', 'index')
 def __init__(self, rule, index):

@@ -38,7 +38,7 @@ class RulePtr(object):
 # state generation ensures no duplicate LR0ItemSets
-class LR0ItemSet(object):
+class LR0ItemSet:
 __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
 def __init__(self, kernel, closure):

@@ -121,7 +121,7 @@ def calculate_sets(rules):
 return FIRST, FOLLOW, NULLABLE
-class GrammarAnalyzer(object):
+class GrammarAnalyzer:
 def __init__(self, parser_conf, debug=False):
 self.debug = debug

@@ -6,7 +6,7 @@ from .. import Token
 from ..exceptions import UnexpectedToken
-class InteractiveParser(object):
+class InteractiveParser:
 """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.
 For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.

@@ -69,7 +69,7 @@ class LALR_Parser(Serialize):
 e = e2
-class ParseConf(object):
+class ParseConf:
 __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
 def __init__(self, parse_table, callbacks, start):

@@ -83,7 +83,7 @@ class ParseConf(object):
 self.start = start
-class ParserState(object):
+class ParserState:
 __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
 def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):

@@ -157,7 +157,7 @@ class ParserState(object):
 if is_end and state_stack[-1] == end_state:
 return value_stack[-1]
-class _Parser(object):
+class _Parser:
 def __init__(self, parse_table, callbacks, debug=False):
 self.parse_table = parse_table
 self.callbacks = callbacks

@@ -89,8 +89,7 @@ class Reconstructor(TreeMatcher):
 for item in res:
 if isinstance(item, Tree):
 # TODO use orig_expansion.rulename to support templates
-for x in self._reconstruct(item):
-yield x
+yield from self._reconstruct(item)
 else:
 yield item

@@ -25,7 +25,7 @@ options = ['start', 'lexer']
 lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
 lalr_argparser.add_argument('-s', '--start', action='append', default=[])
-lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual'))
+lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
 k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {}
 lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)')
 lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file')

@@ -44,7 +44,7 @@ nearley_grammar = r"""
 """
-nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
+nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
 def _get_rulename(name):
 name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)

@@ -3,7 +3,7 @@
 #
 # Lark Stand-alone Generator Tool
 # ----------------------------------
-# Generates a stand-alone LALR(1) parser with a standard lexer
+# Generates a stand-alone LALR(1) parser
 #
 # Git: https://github.com/erezsh/lark
 # Author: Erez Shinan (erezshin@gmail.com)

@@ -35,7 +35,7 @@ class Meta:
 self.empty = True
-class Tree(object):
+class Tree:
 """The main tree class.
 Creates a new tree, and stores "data" and "children" in attributes of the same name.

@@ -41,7 +41,7 @@ def _deserialize(data, namespace, memo):
 return data
-class Serialize(object):
+class Serialize:
 """Safe-ish serialization interface that doesn't rely on Pickle
 Attributes:

@@ -36,7 +36,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive
 from lark.grammar import Rule
-from lark.lexer import TerminalDef, Lexer, TraditionalLexer
+from lark.lexer import TerminalDef, Lexer, BasicLexer
 from lark.indenter import Indenter
 __all__ = ['TestParsers']

@@ -465,7 +465,7 @@ def _make_full_earley_test(LEXER):
 empty_tree = Tree('empty', [Tree('empty2', [])])
 self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_earley_explicit_ambiguity(self):
 # This was a sneaky bug!

@@ -481,7 +481,7 @@ def _make_full_earley_test(LEXER):
 self.assertEqual( ambig_tree.data, '_ambig')
 self.assertEqual( len(ambig_tree.children), 2)
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity1(self):
 grammar = """
 start: cd+ "e"

@@ -497,7 +497,7 @@ def _make_full_earley_test(LEXER):
 assert ambig_tree.data == '_ambig', ambig_tree
 assert len(ambig_tree.children) == 2
-@unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+@unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
 def test_ambiguity2(self):
 grammar = """
 ANY: /[a-zA-Z0-9 ]+/

@@ -918,7 +918,7 @@ class CustomLexerNew(Lexer):
 so it uses the traditionalparser as implementation without custom lexing behaviour.
 """
 def __init__(self, lexer_conf):
-self.lexer = TraditionalLexer(copy(lexer_conf))
+self.lexer = BasicLexer(copy(lexer_conf))
 def lex(self, lexer_state, parser_state):
 return self.lexer.lex(lexer_state, parser_state)

@@ -930,7 +930,7 @@ class CustomLexerOld(Lexer):
 so it uses the traditionalparser as implementation without custom lexing behaviour.
 """
 def __init__(self, lexer_conf):
-self.lexer = TraditionalLexer(copy(lexer_conf))
+self.lexer = BasicLexer(copy(lexer_conf))
 def lex(self, text):
 ls = self.lexer.make_lexer_state(text)
 return self.lexer.lex(ls, None)

@@ -1019,9 +1019,9 @@ def _make_parser_test(LEXER, PARSER):
 def _Lark_open(gfilename, **kwargs):
 return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
-if (LEXER, PARSER) == ('standard', 'earley'):
+if (LEXER, PARSER) == ('basic', 'earley'):
 # Check that the `lark.lark` grammar represents can parse every example used in these tests.
-# Standard-Earley was an arbitrary choice, to make sure it only ran once.
+# basic-Earley was an arbitrary choice, to make sure it only ran once.
 lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr')
 def wrap_with_test_grammar(f):
 def _f(x, **kwargs):

@@ -1736,7 +1736,8 @@ def _make_parser_test(LEXER, PARSER):
 self.assertEqual(len(tree.children), 2)
-@unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+# TODO: Remove after merging priority for Dynamic Earley
+@unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
 def test_lexer_prioritization(self):
 "Tests effect of priority on result"

@@ -2514,9 +2515,9 @@ def _make_parser_test(LEXER, PARSER):
 __all__.append(_NAME)
 _TO_TEST = [
-('standard', 'earley'),
-('standard', 'cyk'),
-('standard', 'lalr'),
+('basic', 'earley'),
+('basic', 'cyk'),
+('basic', 'lalr'),
 ('dynamic', 'earley'),
 ('dynamic_complete', 'earley'),