diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index b310dc5..ecbbb09 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -15,7 +15,7 @@ class PostLex(Protocol):
 
     def process(self, stream: Iterator[Token]) -> Iterator[Token]:
         ...
-    
+
     always_accept: Iterable[str]
 
 
@@ -42,12 +42,12 @@ class LarkOptions:
 class PackageResource(object):
     pkg_name: str
     path: str
-    
+
     def __init__(self, pkg_name: str, path: str): ...
 
 class FromPackageLoader:
     def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
-    
+
     def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...
 
 
@@ -88,12 +88,12 @@ class Lark:
     @classmethod
     def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
         ...
-    
+
     @classmethod
     def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
         ...
 
-    def lex(self, text: str) -> Iterator[Token]:
+    def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
         ...
 
     def get_terminal(self, name: str) -> TerminalDef:
diff --git a/lark/lark.py b/lark/lark.py
index 80327d9..715eb60 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -11,7 +11,7 @@ from .load_grammar import load_grammar, FromPackageLoader
 from .tree import Tree
 from .common import LexerConf, ParserConf
 
-from .lexer import Lexer, TraditionalLexer, TerminalDef
+from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -355,8 +355,13 @@ class Lark(Serialize):
 
     __serialize_fields__ = 'parser', 'rules', 'options'
 
-    def _build_lexer(self):
-        return TraditionalLexer(self.lexer_conf)
+    def _build_lexer(self, dont_ignore=False):
+        lexer_conf = self.lexer_conf
+        if dont_ignore:
+            from copy import copy
+            lexer_conf = copy(lexer_conf)
+            lexer_conf.ignore = ()
+        return TraditionalLexer(lexer_conf)
 
     def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -419,6 +424,7 @@ class Lark(Serialize):
             self._callbacks,
             self.options,  # Not all, but multiple attributes are used
         )
+        self.lexer_conf = self.parser.lexer_conf
         self.terminals = self.parser.lexer_conf.terminals
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
@@ -468,11 +474,17 @@ class Lark(Serialize):
         return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
-    def lex(self, text):
-        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
-        if not hasattr(self, 'lexer'):
-            self.lexer = self._build_lexer()
-        stream = self.lexer.lex(text)
+    def lex(self, text, dont_ignore=False):
+        """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
+
+        When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
+        """
+        if not hasattr(self, 'lexer') or dont_ignore:
+            lexer = self._build_lexer(dont_ignore)
+        else:
+            lexer = self.lexer
+        lexer_thread = LexerThread(lexer, text)
+        stream = lexer_thread.lex(None)
         if self.options.postlex:
             return self.options.postlex.process(stream)
         return stream
diff --git a/lark/lexer.py b/lark/lexer.py
index 114b4ce..730d95e 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -10,6 +10,7 @@ from copy import copy
 
 
 class Pattern(Serialize):
+    raw = None
 
     def __init__(self, value, flags=(), raw=None):
         self.value = value
diff --git a/tests/__main__.py b/tests/__main__.py
index b779457..b8d3971 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -10,6 +10,7 @@ from .test_cache import TestCache
 from .test_grammar import TestGrammar
 from .test_reconstructor import TestReconstructor
 from .test_tree_forest_transformer import TestTreeForestTransformer
+from .test_lexer import TestLexer
 
 try:
     from .test_nearley.test_nearley import TestNearley
diff --git a/tests/test_lexer.py b/tests/test_lexer.py
new file mode 100644
index 0000000..411ef94
--- /dev/null
+++ b/tests/test_lexer.py
@@ -0,0 +1,23 @@
+from unittest import TestCase, main
+
+from lark import Lark, Tree
+
+class TestLexer(TestCase):
+    def setUp(self):
+        pass
+
+    def test_basic(self):
+        p = Lark("""
+            start: "a" "b" "c" "d"
+            %ignore " "
+        """)
+
+        res = list(p.lex("abc cba dd"))
+        assert res == list('abccbadd')
+
+        res = list(p.lex("abc cba dd", dont_ignore=True))
+        assert res == list('abc cba dd')
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file