Fixed Lark.lex(), added dont_ignore option, added tests for it.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh, 4 years ago
commit a7c71f32cc
5 changed files with 50 additions and 13 deletions
  1. lark-stubs/lark.pyi    +5   -5
  2. lark/lark.py           +20  -8
  3. lark/lexer.py          +1   -0
  4. tests/__main__.py      +1   -0
  5. tests/test_lexer.py    +23  -0
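
As a quick illustration of what the new dont_ignore option does (this example is not part of the commit; it simply mirrors the added test), lexing with and without it looks like this:

    from lark import Lark

    parser = Lark("""
        start: "a" "b" "c" "d"
        %ignore " "
    """)

    # Default behaviour: tokens matched by %ignore'd terminals (the space) are dropped.
    print([t.value for t in parser.lex("abc cba dd")])
    # -> ['a', 'b', 'c', 'c', 'b', 'a', 'd', 'd']

    # With dont_ignore=True, the ignored tokens are emitted as well.
    print([t.value for t in parser.lex("abc cba dd", dont_ignore=True)])
    # -> ['a', 'b', 'c', ' ', 'c', 'b', 'a', ' ', 'd', 'd']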

lark-stubs/lark.pyi   +5 -5

@@ -15,7 +15,7 @@ class PostLex(Protocol):

    def process(self, stream: Iterator[Token]) -> Iterator[Token]:
        ...
    always_accept: Iterable[str]


@@ -42,12 +42,12 @@ class LarkOptions:
class PackageResource(object):
    pkg_name: str
    path: str
    def __init__(self, pkg_name: str, path: str): ...

class FromPackageLoader:
    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
    def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...


@@ -88,12 +88,12 @@ class Lark:
    @classmethod
    def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
        ...
    @classmethod
    def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
        ...

    def lex(self, text: str) -> Iterator[Token]:
    def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
        ...

    def get_terminal(self, name: str) -> TerminalDef:


lark/lark.py   +20 -8

@@ -11,7 +11,7 @@ from .load_grammar import load_grammar, FromPackageLoader
from .tree import Tree
from .common import LexerConf, ParserConf

from .lexer import Lexer, TraditionalLexer, TerminalDef
from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule
@@ -355,8 +355,13 @@ class Lark(Serialize):

    __serialize_fields__ = 'parser', 'rules', 'options'

    def _build_lexer(self):
        return TraditionalLexer(self.lexer_conf)
    def _build_lexer(self, dont_ignore=False):
        lexer_conf = self.lexer_conf
        if dont_ignore:
            from copy import copy
            lexer_conf = copy(lexer_conf)
            lexer_conf.ignore = ()
        return TraditionalLexer(lexer_conf)

    def _prepare_callbacks(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -419,6 +424,7 @@ class Lark(Serialize):
            self._callbacks,
            self.options, # Not all, but multiple attributes are used
        )
        self.lexer_conf = self.parser.lexer_conf
        self.terminals = self.parser.lexer_conf.terminals
        self._terminals_dict = {t.name: t for t in self.terminals}
        return self
@@ -468,11 +474,17 @@ class Lark(Serialize):
        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


    def lex(self, text):
        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
        if not hasattr(self, 'lexer'):
            self.lexer = self._build_lexer()
        stream = self.lexer.lex(text)
    def lex(self, text, dont_ignore=False):
        """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'

        When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
        """
        if not hasattr(self, 'lexer') or dont_ignore:
            lexer = self._build_lexer(dont_ignore)
        else:
            lexer = self.lexer
        lexer_thread = LexerThread(lexer, text)
        stream = lexer_thread.lex(None)
        if self.options.postlex:
            return self.options.postlex.process(stream)
        return stream
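
Two notes on the implementation above (interpretation, not stated in the commit itself): the switch from self.lexer.lex(text) to LexerThread presumably follows the newer lexer interface, in which the lexer runs against a lexer state rather than raw text, and is likely what "Fixed Lark.lex()" in the commit message refers to; and the shallow copy in _build_lexer is what keeps dont_ignore from leaking into normal parsing. A minimal sketch of that second point (the helper name here is illustrative only):

    from copy import copy

    def non_ignoring_conf(lexer_conf):
        # Copy first, so the Lark instance's shared lexer_conf keeps its %ignore list;
        # only this throwaway copy has its ignore set cleared.
        conf = copy(lexer_conf)
        conf.ignore = ()
        return conf

Because the non-ignoring lexer is built per call and never stored on self, later plain lex() calls and regular parses still see the original ignore list.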


lark/lexer.py   +1 -0

@@ -10,6 +10,7 @@ from copy import copy


class Pattern(Serialize):
    raw = None

    def __init__(self, value, flags=(), raw=None):
        self.value = value


tests/__main__.py   +1 -0

@@ -10,6 +10,7 @@ from .test_cache import TestCache
from .test_grammar import TestGrammar
from .test_reconstructor import TestReconstructor
from .test_tree_forest_transformer import TestTreeForestTransformer
from .test_lexer import TestLexer

try:
    from .test_nearley.test_nearley import TestNearley


tests/test_lexer.py   +23 -0

@@ -0,0 +1,23 @@
from unittest import TestCase, main

from lark import Lark, Tree

class TestLexer(TestCase):
    def setUp(self):
        pass

    def test_basic(self):
        p = Lark("""
            start: "a" "b" "c" "d"
            %ignore " "
            """)

        res = list(p.lex("abc cba dd"))
        assert res == list('abccbadd')

        res = list(p.lex("abc cba dd", dont_ignore=True))
        assert res == list('abc cba dd')


if __name__ == '__main__':
    main()
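
The new module is also registered in tests/__main__.py above, so it runs with the full suite; for a quicker check of just this behaviour, something like the following should work (an assumption: it relies on tests/ being an importable package, which the relative imports in tests/__main__.py suggest it is):

    # Run only the new lexer tests with the standard unittest runner.
    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName("tests.test_lexer")
    unittest.TextTestRunner(verbosity=2).run(suite)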
