
Merge pull request #995 from lark-parser/more_changes

A bunch of refactors and fixes for v1.0
Erez Shinan 3 years ago
committed by GitHub
parent commit 9ba30f23b2
34 changed files with 86 additions and 106 deletions
  1. README.md (+1, -1)
  2. docs/classes.rst (+1, -1)
  3. docs/grammar.md (+1, -1)
  4. docs/how_to_develop.md (+1, -1)
  5. docs/index.rst (+1, -1)
  6. docs/json_tutorial.md (+1, -1)
  7. docs/parsers.md (+1, -1)
  8. docs/recipes.md (+1, -1)
  9. docs/tools.md (+1, -1)
  10. examples/advanced/_json_parser.py (+2, -2)
  11. examples/advanced/conf_earley.py (+1, -1)
  12. examples/advanced/conf_lalr.py (+1, -1)
  13. examples/advanced/python_parser.py (+1, -1)
  14. examples/advanced/qscintilla_json.py (+1, -1)
  15. examples/json_parser.py (+3, -3)
  16. lark/ast_utils.py (+3, -3)
  17. lark/indenter.py (+1, -2)
  18. lark/lark.py (+11, -11)
  19. lark/lexer.py (+8, -13)
  20. lark/load_grammar.py (+2, -2)
  21. lark/parser_frontends.py (+7, -7)
  22. lark/parsers/cyk.py (+5, -5)
  23. lark/parsers/earley_common.py (+3, -16)
  24. lark/parsers/earley_forest.py (+2, -3)
  25. lark/parsers/grammar_analysis.py (+3, -3)
  26. lark/parsers/lalr_interactive_parser.py (+1, -1)
  27. lark/parsers/lalr_parser.py (+3, -3)
  28. lark/reconstruct.py (+1, -2)
  29. lark/tools/__init__.py (+1, -1)
  30. lark/tools/nearley.py (+1, -1)
  31. lark/tools/standalone.py (+1, -1)
  32. lark/tree.py (+1, -1)
  33. lark/utils.py (+1, -1)
  34. tests/test_parser.py (+13, -12)

README.md (+1, -1)

@@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h

 ### Install Lark

-    $ pip install lark-parser --upgrade
+    $ pip install lark --upgrade

 Lark has no dependencies.


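Since the PyPI package is now published as `lark` rather than `lark-parser`, a quick way to confirm which distribution you ended up with is a minimal sketch like the following (the grammar is invented for the check):

```python
from lark import Lark
import lark

print(lark.__version__)  # should report a 1.x version once `lark` is installed

# a throwaway grammar, just to confirm the install works end to end
parser = Lark(r'''
    start: "hello" "world"
    %ignore " "
''')
print(parser.parse("hello world").pretty())
```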


docs/classes.rst (+1, -1)

@@ -13,7 +13,7 @@ Using Unicode character classes with ``regex``

 Python's builtin ``re`` module has a few persistent known bugs and also won't parse
 advanced regex features such as character classes.
-With ``pip install lark-parser[regex]``, the ``regex`` module will be
+With ``pip install lark[regex]``, the ``regex`` module will be
 installed alongside lark and can act as a drop-in replacement to ``re``.

 Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``.

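As a rough sketch of what the ``regex=True`` switch enables, the terminal below uses a Unicode property class (``\p{L}``), which ``re`` cannot compile but ``regex`` can; the grammar itself is invented for the example:

```python
from lark import Lark  # requires: pip install lark[regex]

parser = Lark(r'''
    start: WORD+
    WORD: /[\p{L}]+/
    %ignore " "
''', regex=True)

print(parser.parse("καλημέρα κόσμε").children)
```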

docs/grammar.md (+1, -1)

@@ -124,7 +124,7 @@ Regexps/strings of different flags can only be concatenated in Python 3.6+

 #### Notes for when using a lexer:

-When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
+When using a lexer (basic or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:

 1. Highest priority first (priority is specified as: TERM.number: ...)
 2. Length of match (for regexps, the longest theoretical match is used)

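A small illustration of the priority rule mentioned in that precedence list: both terminals below match a run of digits, and the ``.2`` suffix is what makes ``NUMBER`` win when a basic lexer is used (the grammar is invented for this example):

```python
from lark import Lark

parser = Lark(r'''
    start: (NAME | NUMBER)+
    NUMBER.2: /[0-9]+/
    NAME: /\w+/
    %ignore " "
''', parser='lalr', lexer='basic')

tree = parser.parse("foo 123")
print([t.type for t in tree.children])  # ['NAME', 'NUMBER'] thanks to NUMBER's higher priority
```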

docs/how_to_develop.md (+1, -1)

@@ -32,7 +32,7 @@ For a list of supported interpreters, you can consult the `tox.ini` file.
 You can also run a single unittest using its class and method name, for example:
 ```bash
 ## test_package test_class_name.test_function_name
-python -m tests TestLalrStandard.test_lexer_error_recovering
+python -m tests TestLalrBasic.test_keep_all_tokens
 ```

 ### tox


docs/index.rst (+1, -1)

@@ -56,7 +56,7 @@ Install Lark

 .. code:: bash

-    $ pip install lark-parser
+    $ pip install lark

 Syntax Highlighting
 -------------------


docs/json_tutorial.md (+1, -1)

@@ -332,7 +332,7 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False

-json_parser = Lark(json_grammar, start='value', lexer='standard')
+json_parser = Lark(json_grammar, start='value', lexer='basic')

 if __name__ == '__main__':
     with open(sys.argv[1]) as f:

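For readers following along, the tutorial's parser works unchanged with the renamed option; below is a self-contained restatement of the tutorial's grammar and transformer using `lexer='basic'`, with a sample input modeled on the tutorial's:

```python
from lark import Lark, Transformer

json_grammar = r"""
    ?value: dict
          | list
          | string
          | SIGNED_NUMBER      -> number
          | "true"             -> true
          | "false"            -> false
          | "null"             -> null

    list : "[" [value ("," value)*] "]"
    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value
    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
"""

class TreeToJson(Transformer):
    def string(self, s):
        (s,) = s
        return s[1:-1]        # strip the surrounding quotes
    def number(self, n):
        (n,) = n
        return float(n)
    list = list
    pair = tuple
    dict = dict
    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, start='value', lexer='basic')
tree = json_parser.parse('{"key": ["item0", 3.14, true]}')
print(TreeToJson().transform(tree))  # -> {'key': ['item0', 3.14, True]}
```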

docs/parsers.md (+1, -1)

@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser

 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.

-It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a basic lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='basic'`

 **SPPF & Ambiguity resolution**


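A quick sketch of the opt-out described above; the toy grammar is invented, and the relevant part is passing `lexer='basic'` alongside `parser='earley'`:

```python
from lark import Lark

# Earley, but with a separate tokenization pass instead of the dynamic lexer
parser = Lark(r'''
    start: WORD ("," WORD)*
    WORD: /[a-z]+/
    %ignore " "
''', parser='earley', lexer='basic')

print(parser.parse("hello, world").children)
```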


docs/recipes.md (+1, -1)

@@ -43,7 +43,7 @@ It accepts a dictionary of the form

 Where callback is of type `f(Token) -> Token`

-It only works with the standard and contextual lexers.
+It only works with the basic and contextual lexers.

 This has the same effect of using a transformer, but can also process ignored tokens.


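As a hedged sketch of the recipe above (collecting comments even though the grammar ignores them; the grammar and terminal names are made up here):

```python
from lark import Lark, Token

comments = []

def collect_comment(token: Token) -> Token:
    # called for every COMMENT token, even though COMMENT is %ignore'd
    comments.append(token)
    return token

parser = Lark(r'''
    start: INT+
    COMMENT: /#[^\n]*/
    %import common.INT
    %import common.WS
    %ignore WS
    %ignore COMMENT
''', parser='lalr', lexer='contextual',
    lexer_callbacks={'COMMENT': collect_comment})

parser.parse("1 2 # a comment\n3")
print(comments)  # [Token('COMMENT', '# a comment')]
```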


docs/tools.md (+1, -1)

@@ -23,7 +23,7 @@ Lark comes with a tool to convert grammars from [Nearley](https://github.com/Har

 1. Install Lark with the `nearley` component:
 ```bash
-pip install lark-parser[nearley]
+pip install lark[nearley]
 ```

 2. Acquire a copy of the Nearley codebase. This can be done using:


examples/advanced/_json_parser.py (+2, -2)

@@ -53,9 +53,9 @@ class TreeToJson(Transformer):

 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-                   # Using the standard lexer isn't required, and isn't usually recommended.
+                   # Using the basic lexer isn't required, and isn't usually recommended.
                    # But, it's good enough for JSON, and it's slightly faster.
-                   lexer='standard',
+                   lexer='basic',
                    # Disabling propagate_positions and placeholders slightly improves speed
                    propagate_positions=False,
                    maybe_placeholders=False,


examples/advanced/conf_earley.py (+1, -1)

@@ -5,7 +5,7 @@ Earley’s dynamic lexer
 Demonstrates the power of Earley’s dynamic lexer on a toy configuration language

 Using a lexer for configuration files is tricky, because values don't
-have to be surrounded by delimiters. Using a standard lexer for this just won't work.
+have to be surrounded by delimiters. Using a basic lexer for this just won't work.

 In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.




examples/advanced/conf_lalr.py (+1, -1)

@@ -6,7 +6,7 @@ This example demonstrates the power of LALR's contextual lexer,
 by parsing a toy configuration language.

 The terminals `NAME` and `VALUE` overlap. They can match the same input.
-A standard lexer would arbitrarily choose one over the other, based on priority,
+A basic lexer would arbitrarily choose one over the other, based on priority,
 which would lead to a (confusing) parse error.
 However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
 which one of them to expect at each point during the parse.

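A stripped-down sketch of the overlap the example's docstring describes (the two-terminal grammar here is invented; the point is that `lexer='contextual'` lets the LALR parser tell the lexer which of the overlapping terminals to expect):

```python
from lark import Lark

parser = Lark(r'''
    start: NAME "=" VALUE
    NAME: /[a-z]+/
    VALUE: /[a-z0-9]+/
    %ignore " "
''', parser='lalr', lexer='contextual')

# "key" matches both NAME and VALUE; the contextual lexer picks NAME
# because that is the only terminal the parser can accept at that point.
print(parser.parse("key = value1"))
```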

examples/advanced/python_parser.py (+1, -1)

@@ -28,7 +28,7 @@ kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input')

 python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
-python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)
+python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='basic', **kwargs)

 try:
     xrange


examples/advanced/qscintilla_json.py (+1, -1)

@@ -77,7 +77,7 @@ class LexerJson(QsciLexerCustom):
             %ignore WS
         '''

-        self.lark = Lark(grammar, parser=None, lexer='standard')
+        self.lark = Lark(grammar, parser=None, lexer='basic')
         # All tokens: print([t.name for t in self.lark.parser.lexer.tokens])

     def defaultPaper(self, style):


examples/json_parser.py (+3, -3)

@@ -50,15 +50,15 @@ class TreeToJson(Transformer):


 ### Create the JSON parser with Lark, using the Earley algorithm
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+# json_parser = Lark(json_grammar, parser='earley', lexer='basic')
 # def parse(x):
 #     return TreeToJson().transform(json_parser.parse(x))

 ### Create the JSON parser with Lark, using the LALR algorithm
 json_parser = Lark(json_grammar, parser='lalr',
-                   # Using the standard lexer isn't required, and isn't usually recommended.
+                   # Using the basic lexer isn't required, and isn't usually recommended.
                    # But, it's good enough for JSON, and it's slightly faster.
-                   lexer='standard',
+                   lexer='basic',
                    # Disabling propagate_positions and placeholders slightly improves speed
                    propagate_positions=False,
                    maybe_placeholders=False,


lark/ast_utils.py (+3, -3)

@@ -8,20 +8,20 @@ from typing import Optional, Callable

 from lark import Transformer, v_args

-class Ast(object):
+class Ast:
     """Abstract class

     Subclasses will be collected by `create_transformer()`
     """
     pass

-class AsList(object):
+class AsList:
     """Abstract class

     Subclasses will be instanciated with the parse results as a single list, instead of as arguments.
     """

-class WithMeta(object):
+class WithMeta:
     """Abstract class

     Subclasses will be instanciated with the Meta instance of the tree. (see ``v_args`` for more detail)

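For context, these marker classes are consumed by `ast_utils.create_transformer()`; the rough, hedged sketch below shows the intended usage (the AST class names and module wiring are invented, modeled on lark's `create_ast` example):

```python
import sys
from dataclasses import dataclass

from lark import ast_utils, Transformer

this_module = sys.modules[__name__]

class _Ast(ast_utils.Ast):
    # marker base: subclasses below get picked up by create_transformer()
    pass

@dataclass
class Name(_Ast):
    name: str

@dataclass
class CodeBlock(_Ast, ast_utils.AsList):
    # AsList: constructed with all matched children as a single list
    statements: list

class ToAst(Transformer):
    # plain Transformer methods for anything not covered by the Ast classes
    pass

# Maps snake_cased rule names ("name", "code_block") to the classes above
transformer = ast_utils.create_transformer(this_module, ToAst())
```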

lark/indenter.py (+1, -2)

@@ -44,8 +44,7 @@ class Indenter(PostLex, ABC):
     def _process(self, stream):
         for token in stream:
             if token.type == self.NL_type:
-                for t in self.handle_NL(token):
-                    yield t
+                yield from self.handle_NL(token)
             else:
                 yield token




lark/lark.py (+11, -11)

@@ -19,7 +19,7 @@ from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_
 from .tree import Tree
 from .common import LexerConf, ParserConf

-from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread, Token
+from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule

@@ -57,7 +57,7 @@ class LarkOptions(Serialize):
     keep_all_tokens: bool
     tree_class: Any
     parser: 'Literal["earley", "lalr", "cyk", "auto"]'
-    lexer: 'Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
+    lexer: 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
     ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
     postlex: Optional[PostLex]
     priority: 'Optional[Literal["auto", "normal", "invert"]]'

@@ -108,7 +108,7 @@ class LarkOptions(Serialize):
         Decides whether or not to use a lexer stage

         - "auto" (default): Choose for me based on the parser
-        - "standard": Use a standard lexer
+        - "basic": Use a basic lexer
         - "contextual": Stronger lexer (only works with parser="lalr")
         - "dynamic": Flexible and powerful (only with parser="earley")
         - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.

@@ -123,7 +123,7 @@ class LarkOptions(Serialize):
     **=== Misc. / Domain Specific Options ===**

     postlex
-        Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
+        Lexer post-processing (Default: None) Only works with the basic and contextual lexers.
     priority
         How priorities should be evaluated - auto, none, normal, invert (Default: auto)
     lexer_callbacks

@@ -339,22 +339,22 @@ class Lark(Serialize):
                 self.options.lexer = 'contextual'
             elif self.options.parser == 'earley':
                 if self.options.postlex is not None:
-                    logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. "
+                    logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
                                 "Consider using lalr with contextual instead of earley")
-                    self.options.lexer = 'standard'
+                    self.options.lexer = 'basic'
                 else:
                     self.options.lexer = 'dynamic'
             elif self.options.parser == 'cyk':
-                self.options.lexer = 'standard'
+                self.options.lexer = 'basic'
             else:
                 assert False, self.options.parser
         lexer = self.options.lexer
         if isinstance(lexer, type):
             assert issubclass(lexer, Lexer)     # XXX Is this really important? Maybe just ensure interface compliance
         else:
-            assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete'))
+            assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
             if self.options.postlex is not None and 'dynamic' in lexer:
-                raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead")
+                raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")

         if self.options.ambiguity == 'auto':
             if self.options.parser == 'earley':

@@ -429,7 +429,7 @@ class Lark(Serialize):
             from copy import copy
             lexer_conf = copy(lexer_conf)
             lexer_conf.ignore = ()
-            return TraditionalLexer(lexer_conf)
+            return BasicLexer(lexer_conf)

     def _prepare_callbacks(self):
         self._callbacks = {}

@@ -556,7 +556,7 @@ class Lark(Serialize):

     def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
-        """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
+        """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'

         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.



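As a hedged sketch of what the renamed option and `Lark.lex()` look like from the user's side (toy grammar, invented for this note):

```python
from lark import Lark

p = Lark(r'''
    start: WORD+
    WORD: /\w+/
    %ignore " "
''', parser='lalr', lexer='basic')

# Only tokenize; no parse tree is built
for tok in p.lex("hello world"):
    print(tok.type, repr(str(tok)))
```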

lark/lexer.py (+8, -13)

@@ -157,12 +157,7 @@ class Token(str):
     end_pos: int

     def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
-        try:
-            inst = super(Token, cls).__new__(cls, value)
-        except UnicodeDecodeError:
-            value = value.decode('latin1')
-            inst = super(Token, cls).__new__(cls, value)
-
+        inst = super(Token, cls).__new__(cls, value)
         inst.type = type_
         inst.start_pos = start_pos
         inst.value = value

@@ -331,7 +326,7 @@ def _regexp_has_newline(r: str):
     return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)


-class LexerState(object):
+class LexerState:
     __slots__ = 'text', 'line_ctr', 'last_token'

     def __init__(self, text, line_ctr, last_token=None):

@@ -366,7 +361,7 @@ class Lexer(ABC):
         return LexerState(text, line_ctr)


-class TraditionalLexer(Lexer):
+class BasicLexer(Lexer):

     terminals: Collection[TerminalDef]
     ignore_types: FrozenSet[str]

@@ -473,8 +468,8 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

-    lexers: Dict[str, TraditionalLexer]
-    root_lexer: TraditionalLexer
+    lexers: Dict[str, BasicLexer]
+    root_lexer: BasicLexer

     def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None:
         terminals = list(conf.terminals)

@@ -493,13 +488,13 @@ class ContextualLexer(Lexer):
             accepts = set(accepts) | set(conf.ignore) | set(always_accept)
             lexer_conf = copy(trad_conf)
             lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
-            lexer = TraditionalLexer(lexer_conf)
+            lexer = BasicLexer(lexer_conf)
             lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

         assert trad_conf.terminals is terminals
-        self.root_lexer = TraditionalLexer(trad_conf)
+        self.root_lexer = BasicLexer(trad_conf)

     def make_lexer_state(self, text):
         return self.root_lexer.make_lexer_state(text)

@@ -521,7 +516,7 @@ class ContextualLexer(Lexer):
         except UnexpectedCharacters:
             raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

-class LexerThread(object):
+class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser"""

     def __init__(self, lexer, text):


lark/load_grammar.py (+2, -2)

@@ -790,7 +790,7 @@ class Grammar:
 PackageResource = namedtuple('PackageResource', 'pkg_name path')


-class FromPackageLoader(object):
+class FromPackageLoader:
     """
     Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
     This allows them to be compatible even from within zip files.

@@ -917,7 +917,7 @@ def _get_parser():
         import re
         lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH'])
         parser_conf = ParserConf(rules, callback, ['start'])
-        lexer_conf.lexer_type = 'standard'
+        lexer_conf.lexer_type = 'basic'
         parser_conf.parser_type = 'lalr'
         _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
     return _get_parser.cache
return _get_parser.cache return _get_parser.cache

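For orientation, `FromPackageLoader` is the hook the `%import` machinery uses for package-relative grammar loading; a rough usage sketch, with the package name, search path, and imported grammar all hypothetical:

```python
from lark import Lark
from lark.load_grammar import FromPackageLoader

# Look up %import'ed grammars inside my_package/grammars/ (hypothetical package)
custom_loader = FromPackageLoader('my_package', ('grammars',))

parser = Lark(r'''
    start: startrule
    %import my_grammar.startrule
''', import_paths=[custom_loader])
```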

lark/parser_frontends.py (+7, -7)

@@ -1,7 +1,7 @@
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree

@@ -70,7 +70,7 @@ class ParsingFrontend(Serialize):

         try:
             create_lexer = {
-                'standard': create_traditional_lexer,
+                'basic': create_basic_lexer,
                 'contextual': create_contextual_lexer,
             }[lexer_type]
         except KeyError:

@@ -110,9 +110,9 @@ def get_frontend(parser, lexer):
     assert_config(parser, ('lalr', 'earley', 'cyk'))
     if not isinstance(lexer, type):     # not custom lexer?
         expected = {
-            'lalr': ('standard', 'contextual'),
-            'earley': ('standard', 'dynamic', 'dynamic_complete'),
-            'cyk': ('standard', ),
+            'lalr': ('basic', 'contextual'),
+            'earley': ('basic', 'dynamic', 'dynamic_complete'),
+            'cyk': ('basic', ),
         }[parser]
         assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)

@@ -141,8 +141,8 @@ class PostLexConnector:


-def create_traditional_lexer(lexer_conf, parser, postlex):
-    return TraditionalLexer(lexer_conf)
+def create_basic_lexer(lexer_conf, parser, postlex):
+    return BasicLexer(lexer_conf)

 def create_contextual_lexer(lexer_conf, parser, postlex):
     states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}


lark/parsers/cyk.py (+5, -5)

@@ -23,7 +23,7 @@ def match(t, s):
     return t.name == s.type


-class Rule(object):
+class Rule:
     """Context-free grammar rule."""

     def __init__(self, lhs, rhs, weight, alias):

@@ -51,7 +51,7 @@ class Rule(object):
         return not (self == other)


-class Grammar(object):
+class Grammar:
     """Context-free grammar."""

     def __init__(self, rules):

@@ -68,7 +68,7 @@ class Grammar(object):


 # Parse tree data structures
-class RuleNode(object):
+class RuleNode:
     """A node in the parse tree, which also contains the full rhs rule."""

     def __init__(self, rule, children, weight=0):

@@ -81,7 +81,7 @@ class RuleNode(object):


-class Parser(object):
+class Parser:
     """Parser wrapper."""

     def __init__(self, rules):

@@ -186,7 +186,7 @@ def _parse(s, g):
     # * Empty rules (epsilon rules)


-class CnfWrapper(object):
+class CnfWrapper:
     """CNF wrapper for grammar.

     Validates that the input grammar is CNF and provides helper data structures.


lark/parsers/earley_common.py (+3, -16)

@@ -1,21 +1,8 @@
-"This module implements an Earley Parser"
+"""This module implements useful building blocks for the Earley parser
+"""

-# The parser uses a parse-forest to keep track of derivations and ambiguations.
-# When the parse ends successfully, a disambiguation stage resolves all ambiguity
-# (right now ambiguity resolution is not developed beyond the needs of lark)
-# Afterwards the parse tree is reduced (transformed) according to user callbacks.
-# I use the no-recursion version of Transformer, because the tree might be
-# deeper than Python's recursion limit (a bit absurd, but that's life)
-#
-# The algorithm keeps track of each state set, using a corresponding Column instance.
-# Column keeps track of new items using NewsList instances.
-#
-# Author: Erez Shinan (2017)
-# Email : erezshin@gmail.com
-
-from ..grammar import NonTerminal, Terminal

-class Item(object):
+class Item:
     "An Earley Item, the atom of the algorithm."

     __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')


lark/parsers/earley_forest.py (+2, -3)

@@ -8,7 +8,6 @@ http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
 """

 from random import randint
-from math import isinf
 from collections import deque
 from operator import attrgetter
 from importlib import import_module

@@ -20,7 +19,7 @@ from ..lexer import Token
 from ..utils import logger
 from ..tree import Tree

-class ForestNode(object):
+class ForestNode:
     pass

 class SymbolNode(ForestNode):

@@ -173,7 +172,7 @@ class PackedNode(ForestNode):
         symbol = self.s.name
         return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)

-class ForestVisitor(object):
+class ForestVisitor:
     """
     An abstract base class for building forest visitors.




lark/parsers/grammar_analysis.py (+3, -3)

@@ -5,7 +5,7 @@ from ..exceptions import GrammarError
 from ..grammar import Rule, Terminal, NonTerminal


-class RulePtr(object):
+class RulePtr:
     __slots__ = ('rule', 'index')

     def __init__(self, rule, index):

@@ -38,7 +38,7 @@ class RulePtr(object):


 # state generation ensures no duplicate LR0ItemSets
-class LR0ItemSet(object):
+class LR0ItemSet:
     __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

     def __init__(self, kernel, closure):

@@ -121,7 +121,7 @@ def calculate_sets(rules):
     return FIRST, FOLLOW, NULLABLE


-class GrammarAnalyzer(object):
+class GrammarAnalyzer:
     def __init__(self, parser_conf, debug=False):
         self.debug = debug




lark/parsers/lalr_interactive_parser.py (+1, -1)

@@ -6,7 +6,7 @@ from .. import Token
 from ..exceptions import UnexpectedToken


-class InteractiveParser(object):
+class InteractiveParser:
     """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.

     For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.


lark/parsers/lalr_parser.py (+3, -3)

@@ -69,7 +69,7 @@ class LALR_Parser(Serialize):
             e = e2


-class ParseConf(object):
+class ParseConf:
     __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

     def __init__(self, parse_table, callbacks, start):

@@ -83,7 +83,7 @@ class ParseConf(object):
         self.start = start


-class ParserState(object):
+class ParserState:
     __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

     def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):

@@ -157,7 +157,7 @@ class ParserState(object):
         if is_end and state_stack[-1] == end_state:
             return value_stack[-1]

-class _Parser(object):
+class _Parser:
     def __init__(self, parse_table, callbacks, debug=False):
         self.parse_table = parse_table
         self.callbacks = callbacks


lark/reconstruct.py (+1, -2)

@@ -89,8 +89,7 @@ class Reconstructor(TreeMatcher):
         for item in res:
             if isinstance(item, Tree):
                 # TODO use orig_expansion.rulename to support templates
-                for x in self._reconstruct(item):
-                    yield x
+                yield from self._reconstruct(item)
             else:
                 yield item




lark/tools/__init__.py (+1, -1)

@@ -25,7 +25,7 @@ options = ['start', 'lexer']

 lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
 lalr_argparser.add_argument('-s', '--start', action='append', default=[])
-lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual'))
+lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
 k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {}
 lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)')
 lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file')


lark/tools/nearley.py (+1, -1)

@@ -44,7 +44,7 @@ nearley_grammar = r"""

 """

-nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
+nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')

 def _get_rulename(name):
     name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)


lark/tools/standalone.py (+1, -1)

@@ -3,7 +3,7 @@
 #
 # Lark Stand-alone Generator Tool
 # ----------------------------------
-# Generates a stand-alone LALR(1) parser with a standard lexer
+# Generates a stand-alone LALR(1) parser
 #
 # Git: https://github.com/erezsh/lark
 # Author: Erez Shinan (erezshin@gmail.com)


lark/tree.py (+1, -1)

@@ -35,7 +35,7 @@ class Meta:
         self.empty = True


-class Tree(object):
+class Tree:
     """The main tree class.

     Creates a new tree, and stores "data" and "children" in attributes of the same name.


lark/utils.py (+1, -1)

@@ -41,7 +41,7 @@ def _deserialize(data, namespace, memo):
     return data


-class Serialize(object):
+class Serialize:
     """Safe-ish serialization interface that doesn't rely on Pickle

     Attributes:


tests/test_parser.py (+13, -12)

@@ -36,7 +36,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive
 from lark.grammar import Rule
-from lark.lexer import TerminalDef, Lexer, TraditionalLexer
+from lark.lexer import TerminalDef, Lexer, BasicLexer
 from lark.indenter import Indenter

 __all__ = ['TestParsers']

@@ -465,7 +465,7 @@ def _make_full_earley_test(LEXER):
         empty_tree = Tree('empty', [Tree('empty2', [])])
         self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

-    @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+    @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
     def test_earley_explicit_ambiguity(self):
         # This was a sneaky bug!

@@ -481,7 +481,7 @@ def _make_full_earley_test(LEXER):
         self.assertEqual( ambig_tree.data, '_ambig')
         self.assertEqual( len(ambig_tree.children), 2)

-    @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+    @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
     def test_ambiguity1(self):
         grammar = """
         start: cd+ "e"

@@ -497,7 +497,7 @@ def _make_full_earley_test(LEXER):
         assert ambig_tree.data == '_ambig', ambig_tree
         assert len(ambig_tree.children) == 2

-    @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
+    @unittest.skipIf(LEXER=='basic', "Requires dynamic lexer")
     def test_ambiguity2(self):
         grammar = """
         ANY: /[a-zA-Z0-9 ]+/

@@ -918,7 +918,7 @@ class CustomLexerNew(Lexer):
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
     def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(copy(lexer_conf))
+        self.lexer = BasicLexer(copy(lexer_conf))
     def lex(self, lexer_state, parser_state):
         return self.lexer.lex(lexer_state, parser_state)

@@ -930,7 +930,7 @@ class CustomLexerOld(Lexer):
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(copy(lexer_conf))
+        self.lexer = BasicLexer(copy(lexer_conf))
     def lex(self, text):
         ls = self.lexer.make_lexer_state(text)
         return self.lexer.lex(ls, None)

@@ -1019,9 +1019,9 @@ def _make_parser_test(LEXER, PARSER):
     def _Lark_open(gfilename, **kwargs):
         return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

-    if (LEXER, PARSER) == ('standard', 'earley'):
+    if (LEXER, PARSER) == ('basic', 'earley'):
         # Check that the `lark.lark` grammar represents can parse every example used in these tests.
-        # Standard-Earley was an arbitrary choice, to make sure it only ran once.
+        # basic-Earley was an arbitrary choice, to make sure it only ran once.
         lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr')
         def wrap_with_test_grammar(f):
             def _f(x, **kwargs):

@@ -1736,7 +1736,8 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(len(tree.children), 2)


-    @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+    # TODO: Remove after merging priority for Dynamic Earley
+    @unittest.skipIf(LEXER != 'basic', "Only basic lexers care about token priority")
     def test_lexer_prioritization(self):
         "Tests effect of priority on result"

@@ -2514,9 +2515,9 @@ def _make_parser_test(LEXER, PARSER):
     __all__.append(_NAME)

 _TO_TEST = [
-    ('standard', 'earley'),
-    ('standard', 'cyk'),
-    ('standard', 'lalr'),
+    ('basic', 'earley'),
+    ('basic', 'cyk'),
+    ('basic', 'lalr'),

     ('dynamic', 'earley'),
     ('dynamic_complete', 'earley'),

