From 53b3e12bba9359aa6ac4cd5ab973fac21428148f Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Sat, 26 Sep 2020 21:04:54 +0200 Subject: [PATCH 1/9] Added `import_sources` --- lark-stubs/lark.pyi | 4 +++- lark/lark.py | 5 ++++- lark/load_grammar.py | 46 +++++++++++++++++++++++++++++--------------- tests/test_parser.py | 18 +++++++++++++++++ 4 files changed, 56 insertions(+), 17 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index c39ae3d..deb8849 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -2,7 +2,7 @@ from typing import ( TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional, - Literal, Protocol, + Literal, Protocol, Tuple, ) from .visitors import Transformer from .lexer import Token, Lexer, TerminalDef @@ -32,6 +32,7 @@ class LarkOptions: cache: Union[bool, str] g_regex_flags: int use_bytes: bool + import_sources: List[Union[str, Callable[[str, str], str]]] class Lark: @@ -60,6 +61,7 @@ class Lark: cache: Union[bool, str] = False, g_regex_flags: int = ..., use_bytes: bool = False, + import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ..., ): ... diff --git a/lark/lark.py b/lark/lark.py index 8799610..9877b00 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -90,6 +90,8 @@ class LarkOptions(Serialize): Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). edit_terminals A callback for editing the terminals before parse. + import_sources + A List of either paths or loader functions to specify from where grammars are imported **=== End Options ===** """ @@ -115,6 +117,7 @@ class LarkOptions(Serialize): 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False, + 'import_sources': [], } def __init__(self, options_dict): @@ -267,7 +270,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'forest', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source, re_module) + self.grammar = load_grammar(grammar, self.source, re_module, self.options.import_sources) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index cd36e4b..bb5f71a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -20,7 +20,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transfo inline_args = v_args(inline=True) __path__ = os.path.dirname(__file__) -IMPORT_PATHS = [os.path.join(__path__, 'grammars')] +IMPORT_PATHS = ['grammars'] EXT = '.lark' @@ -648,19 +648,35 @@ class Grammar: return terminals, compiled_rules, self.ignore +def stdlib_loader(base_paths, grammar_path): + import pkgutil + for path in IMPORT_PATHS: + text = pkgutil.get_data('lark', path + '/' + grammar_path) + if text is None: + continue + return '', text.decode() + raise FileNotFoundError() + _imported_grammars = {} -def import_grammar(grammar_path, re_, base_paths=[]): +def import_grammar(grammar_path, re_, base_paths=(), import_sources=()): if grammar_path not in _imported_grammars: - import_paths = base_paths + IMPORT_PATHS - for import_path in import_paths: - with suppress(IOError): - joined_path = os.path.join(import_path, grammar_path) - with open(joined_path, encoding='utf8') as f: - text = f.read() - grammar = load_grammar(text, joined_path, re_) - _imported_grammars[grammar_path] = grammar - break + import_paths = import_sources + base_paths + [stdlib_loader] + for source in import_paths: + if isinstance(source, str): + with suppress(IOError): + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + grammar = load_grammar(text, joined_path, re_, import_sources) + _imported_grammars[grammar_path] = grammar + break + else: + with suppress(IOError): + joined_path, text = source(base_paths, grammar_path) + grammar = load_grammar(text, joined_path, re_, import_sources) + _imported_grammars[grammar_path] = grammar + break else: open(grammar_path, encoding='utf8') assert False @@ -817,7 +833,7 @@ class GrammarLoader: self.canonize_tree = CanonizeTree() self.re_module = re_module - def load_grammar(self, grammar_text, grammar_name=''): + def load_grammar(self, grammar_text, grammar_name='', import_sources=[]): "Parse grammar_text, verify, and create Grammar object. Display nice messages on error." try: @@ -901,7 +917,7 @@ class GrammarLoader: # import grammars for dotted_path, (base_paths, aliases) in imports.items(): grammar_path = os.path.join(*dotted_path) + EXT - g = import_grammar(grammar_path, self.re_module, base_paths=base_paths) + g = import_grammar(grammar_path, self.re_module, base_paths=base_paths, import_sources=import_sources) new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) term_defs += new_td @@ -981,5 +997,5 @@ class GrammarLoader: -def load_grammar(grammar, source, re_): - return GrammarLoader(re_).load_grammar(grammar, source) +def load_grammar(grammar, source, re_, import_sources): + return GrammarLoader(re_).load_grammar(grammar, source, import_sources) diff --git a/tests/test_parser.py b/tests/test_parser.py index 83336c5..6779f64 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1782,6 +1782,24 @@ def _make_parser_test(LEXER, PARSER): """ self.assertRaises(IOError, _Lark, grammar) + def test_import_custom_sources(self): + def custom_loader(base_paths, grammar_path): + import pkgutil + text = pkgutil.get_data('tests', 'grammars/' + grammar_path) + if text is None: + raise FileNotFoundError() + return '', text.decode() + + grammar = """ + start: startab + + %import ab.startab + """ + + p = _Lark(grammar, import_sources=[custom_loader]) + self.assertEqual(p.parse('ab'), + Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])])) + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") def test_earley_prioritization(self): "Tests effect of priority on result" From c9b54431270f6b59a271203b42bcb2838e3140c5 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Sat, 26 Sep 2020 21:12:12 +0200 Subject: [PATCH 2/9] Fix for python2.7 --- lark/load_grammar.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index bb5f71a..8849f76 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -663,17 +663,17 @@ def import_grammar(grammar_path, re_, base_paths=(), import_sources=()): if grammar_path not in _imported_grammars: import_paths = import_sources + base_paths + [stdlib_loader] for source in import_paths: - if isinstance(source, str): + if callable(source): with suppress(IOError): - joined_path = os.path.join(source, grammar_path) - with open(joined_path, encoding='utf8') as f: - text = f.read() + joined_path, text = source(base_paths, grammar_path) grammar = load_grammar(text, joined_path, re_, import_sources) _imported_grammars[grammar_path] = grammar break else: with suppress(IOError): - joined_path, text = source(base_paths, grammar_path) + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() grammar = load_grammar(text, joined_path, re_, import_sources) _imported_grammars[grammar_path] = grammar break From 009cc105907987a1afab22b769685eb7c89b0e82 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Sun, 27 Sep 2020 16:03:39 +0200 Subject: [PATCH 3/9] Added `FromPackageLoader` and `open_from_package` --- lark-stubs/lark.pyi | 12 +++++++ lark/lark.py | 38 +++++++++++++++++++--- lark/load_grammar.py | 75 ++++++++++++++++++++++++++++++++------------ tests/test_parser.py | 26 +++++++++++---- 4 files changed, 120 insertions(+), 31 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index deb8849..5cb94b2 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,6 +33,13 @@ class LarkOptions: g_regex_flags: int use_bytes: bool import_sources: List[Union[str, Callable[[str, str], str]]] + source: Optional[str] + + +class FromPackageLoader: + def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... + + def __call__(self, base_paths: List[str], grammar_path: str) -> Tuple[str, str]: ... class Lark: @@ -62,6 +69,7 @@ class Lark: g_regex_flags: int = ..., use_bytes: bool = False, import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ..., + source: Optional[str], ): ... @@ -71,6 +79,10 @@ class Lark: @classmethod def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: ... + + @classmethod + def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T: + ... def lex(self, text: str) -> Iterator[Token]: ... diff --git a/lark/lark.py b/lark/lark.py index 9877b00..9f53841 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -5,7 +5,7 @@ from io import open from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger -from .load_grammar import load_grammar +from .load_grammar import load_grammar, FromPackageLoader from .tree import Tree from .common import LexerConf, ParserConf @@ -92,6 +92,8 @@ class LarkOptions(Serialize): A callback for editing the terminals before parse. import_sources A List of either paths or loader functions to specify from where grammars are imported + source + Override the source of from where the grammar was loaded. Usefull for relative imports and unconventional grammar loading **=== End Options ===** """ @@ -118,6 +120,7 @@ class LarkOptions(Serialize): 'g_regex_flags': 0, 'use_bytes': False, 'import_sources': [], + 'source': None, } def __init__(self, options_dict): @@ -193,10 +196,13 @@ class Lark(Serialize): re_module = re # Some, but not all file-like objects have a 'name' attribute - try: - self.source = grammar.name - except AttributeError: - self.source = '' + if self.options.source is None: + try: + self.source = grammar.name + except AttributeError: + self.source = '' + else: + self.source = self.options.source # Drain file-like objects to get their contents try: @@ -404,6 +410,28 @@ class Lark(Serialize): grammar_filename = os.path.join(basepath, grammar_filename) with open(grammar_filename, encoding='utf8') as f: return cls(f, **options) + + @classmethod + def open_from_package(cls, package, grammar_path, search_paths=("",), **options): + """Create an instance of Lark with the grammar loaded from within the package `package`. + This allows grammar loading from zipapps. + + Will also create a `FromPackageLoader` instance and add it to the `import_sources` to simplify importing + + ``search_paths`` is passed to `FromPackageLoader` + + Example: + + Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) + """ + package = FromPackageLoader(package, search_paths) + full_path, text = package([], grammar_path) + options.setdefault('source', full_path) + if 'import_sources' in options: + options['import_sources'].append(package) + else: + options['import_sources'] = [package] + return cls(text, **options) def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8849f76..ba1f0f2 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -4,6 +4,7 @@ import os.path import sys from copy import copy, deepcopy from io import open +import pkgutil from .utils import bfs, eval_escaping, Py36, logger, classify_bool from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -648,35 +649,69 @@ class Grammar: return terminals, compiled_rules, self.ignore -def stdlib_loader(base_paths, grammar_path): - import pkgutil - for path in IMPORT_PATHS: - text = pkgutil.get_data('lark', path + '/' + grammar_path) - if text is None: - continue - return '', text.decode() - raise FileNotFoundError() +class FromPackageLoader(object): + """ + Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. + This allows them to be compatible even from within zip files. + + Relative imports are handled, so you can just freely use them. + + pkg_name: The name of the package. You can probably provide `__name__` most of the time + search_paths: All the path that will be search on absolute imports. + """ + def __init__(self, pkg_name, search_paths=("", )): + self.pkg_name = pkg_name + self.search_paths = search_paths + + def __repr__(self): + return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) + + def __call__(self, base_paths, grammar_path): + if len(base_paths) == 0: + to_try = self.search_paths + else: + assert len(base_paths) == 1 + if not base_paths[0].startswith('<%s:' % (self.pkg_name,)): + # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway + raise IOError() + base_path = base_paths[0].partition(':')[2] + if base_path and base_path[0] == '/': + base_path = base_path[1:] + to_try = [base_path] + for path in to_try: + full_path = os.path.join(path, grammar_path) + text = None + with suppress(IOError): + text = pkgutil.get_data(self.pkg_name, full_path) + if text is None: + continue + return '<%s:/%s>' % (self.pkg_name, full_path), text.decode() + raise IOError() + +stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) _imported_grammars = {} -def import_grammar(grammar_path, re_, base_paths=(), import_sources=()): +def import_grammar(grammar_path, re_, base_paths=[], import_sources=[]): if grammar_path not in _imported_grammars: - import_paths = import_sources + base_paths + [stdlib_loader] + # import_sources take priority over base_paths since they should handle relative imports and ignore everthing else. + # Question: should the stdlib_loader really be pushed to the end? + import_paths = import_sources + base_paths + [stdlib_loader] for source in import_paths: - if callable(source): - with suppress(IOError): + text = None + with suppress(IOError): + if callable(source): joined_path, text = source(base_paths, grammar_path) - grammar = load_grammar(text, joined_path, re_, import_sources) - _imported_grammars[grammar_path] = grammar - break - else: - with suppress(IOError): + else: joined_path = os.path.join(source, grammar_path) with open(joined_path, encoding='utf8') as f: text = f.read() - grammar = load_grammar(text, joined_path, re_, import_sources) - _imported_grammars[grammar_path] = grammar - break + if text is not None: + # Don't load the grammar from within the suppress statement. Otherwise the underlying error message will be swallowed + # and the wrong file will be reported as missing + grammar = load_grammar(text, joined_path, re_, import_sources) + _imported_grammars[grammar_path] = grammar + break else: open(grammar_path, encoding='utf8') assert False diff --git a/tests/test_parser.py b/tests/test_parser.py index 6779f64..0406f46 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -11,6 +11,7 @@ from copy import copy, deepcopy from lark.utils import Py36, isascii from lark import Token +from lark.load_grammar import FromPackageLoader try: from cStringIO import StringIO as cStringIO @@ -1783,12 +1784,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises(IOError, _Lark, grammar) def test_import_custom_sources(self): - def custom_loader(base_paths, grammar_path): - import pkgutil - text = pkgutil.get_data('tests', 'grammars/' + grammar_path) - if text is None: - raise FileNotFoundError() - return '', text.decode() + custom_loader = FromPackageLoader('tests', ('grammars', )) grammar = """ start: startab @@ -1800,6 +1796,24 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(p.parse('ab'), Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])])) + grammar = """ + start: rule_to_import + + %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import + """ + p = _Lark(grammar, import_sources=[custom_loader]) + x = p.parse('N') + self.assertEqual(next(x.find_data('rule_to_import')).children, ['N']) + + custom_loader2 = FromPackageLoader('tests') + grammar = """ + %import .test_relative_import (start, WS) + %ignore WS + """ + p = _Lark(grammar, import_sources=[custom_loader2]) + x = p.parse('12 capybaras') + self.assertEqual(x.children, ['12', 'capybaras']) + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") def test_earley_prioritization(self): "Tests effect of priority on result" From a50fc107737f7a89e60711d7be1f186e1c24b1fb Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Tue, 29 Sep 2020 15:24:00 +0200 Subject: [PATCH 4/9] import_paths->sources, source->source_path, various implementation changes --- lark-stubs/lark.pyi | 14 ++++----- lark/lark.py | 49 ++++++++++++++++------------- lark/load_grammar.py | 75 +++++++++++++++++++++++++------------------- tests/test_parser.py | 6 ++-- 4 files changed, 81 insertions(+), 63 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 5cb94b2..96eeda4 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -32,19 +32,19 @@ class LarkOptions: cache: Union[bool, str] g_regex_flags: int use_bytes: bool - import_sources: List[Union[str, Callable[[str, str], str]]] - source: Optional[str] + import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] + source_path: Optional[str] class FromPackageLoader: def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... - def __call__(self, base_paths: List[str], grammar_path: str) -> Tuple[str, str]: ... + def __call__(self, base_paths: str, grammar_path: str) -> Tuple[str, str]: ... class Lark: - source: str - grammar_source: str + source_path: str + source_code: str options: LarkOptions lexer: Lexer terminals: List[TerminalDef] @@ -68,8 +68,8 @@ class Lark: cache: Union[bool, str] = False, g_regex_flags: int = ..., use_bytes: bool = False, - import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ..., - source: Optional[str], + import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] = ..., + source_path: Optional[str], ): ... diff --git a/lark/lark.py b/lark/lark.py index 9f53841..8107e34 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -2,7 +2,7 @@ from __future__ import absolute_import import sys, os, pickle, hashlib from io import open - +from warnings import warn from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar, FromPackageLoader @@ -90,10 +90,10 @@ class LarkOptions(Serialize): Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). edit_terminals A callback for editing the terminals before parse. - import_sources + import_paths A List of either paths or loader functions to specify from where grammars are imported - source - Override the source of from where the grammar was loaded. Usefull for relative imports and unconventional grammar loading + source_path + Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading **=== End Options ===** """ @@ -119,8 +119,8 @@ class LarkOptions(Serialize): 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False, - 'import_sources': [], - 'source': None, + 'import_paths': [], + 'source_path': None, } def __init__(self, options_dict): @@ -196,13 +196,13 @@ class Lark(Serialize): re_module = re # Some, but not all file-like objects have a 'name' attribute - if self.options.source is None: + if self.options.source_path is None: try: - self.source = grammar.name + self.source_path = grammar.name except AttributeError: - self.source = '' + self.source_path = '' else: - self.source = self.options.source + self.source_path = self.options.source_path # Drain file-like objects to get their contents try: @@ -213,7 +213,7 @@ class Lark(Serialize): grammar = read() assert isinstance(grammar, STRING_TYPE) - self.grammar_source = grammar + self.source_code = grammar if self.options.use_bytes: if not isascii(grammar): raise ValueError("Grammar must be ascii only, when use_bytes=True") @@ -276,7 +276,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'forest', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source, re_module, self.options.import_sources) + self.grammar = load_grammar(grammar, self.source_path, re_module, self.options.import_paths) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) @@ -374,7 +374,7 @@ class Lark(Serialize): self.options = LarkOptions.deserialize(options, memo) re_module = regex if self.options.regex else re self.rules = [Rule.deserialize(r, memo) for r in data['rules']] - self.source = '' + self.source_path = '' self._prepare_callbacks() self.parser = self.parser_class.deserialize( data['parser'], @@ -416,9 +416,7 @@ class Lark(Serialize): """Create an instance of Lark with the grammar loaded from within the package `package`. This allows grammar loading from zipapps. - Will also create a `FromPackageLoader` instance and add it to the `import_sources` to simplify importing - - ``search_paths`` is passed to `FromPackageLoader` + Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader` Example: @@ -426,15 +424,15 @@ class Lark(Serialize): """ package = FromPackageLoader(package, search_paths) full_path, text = package([], grammar_path) - options.setdefault('source', full_path) - if 'import_sources' in options: - options['import_sources'].append(package) + options.setdefault('source_path', full_path) + if 'import_paths' in options: + options['import_paths'].append(package) else: - options['import_sources'] = [package] + options['import_paths'] = [package] return cls(text, **options) def __repr__(self): - return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) def lex(self, text): @@ -481,6 +479,15 @@ class Lark(Serialize): # Prevent infinite loop raise e2 e = e2 + + @property + def source(self): + warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning) + return self.source_path + + @source.setter + def source(self, value): + self.source_path = value ###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ba1f0f2..022e024 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -666,50 +666,61 @@ class FromPackageLoader(object): def __repr__(self): return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) - def __call__(self, base_paths, grammar_path): - if len(base_paths) == 0: + def __call__(self, base_path, grammar_path): + if base_path is None: to_try = self.search_paths else: - assert len(base_paths) == 1 - if not base_paths[0].startswith('<%s:' % (self.pkg_name,)): + # Check whether or not the importing grammar was loaded by this module. + if not base_path.startswith('<%s:' % (self.pkg_name,)): # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway raise IOError() - base_path = base_paths[0].partition(':')[2] - if base_path and base_path[0] == '/': - base_path = base_path[1:] + # Separate the path and the pkg_name and throw away the slash. `pkgutil.get_data` doesn't like it. (see below) + base_path = base_path.partition(':')[2].lstrip('/') to_try = [base_path] for path in to_try: full_path = os.path.join(path, grammar_path) - text = None - with suppress(IOError): + try: text = pkgutil.get_data(self.pkg_name, full_path) - if text is None: + except IOError: continue - return '<%s:/%s>' % (self.pkg_name, full_path), text.decode() + else: + # Custom format `<{pkg_name}:/{full_path}>` + # These are the arguments to `pkgutil.get_data(pkg_name, full_path)` + # Required since we can not easily provided a actual file path for all package data (e.g. from inside a zip) + + # The additional slash after the `:` is to allow `os.path.split` to work on this without accidentally + # throwing away the `pkg_name`. (As it would inside of `GrammarLoader.load_grammar` otherwise when relative imports + # are resolved. + # Without the slash `""` would turn into `""`, losing the pacakge information + # With the slash `""` turns into `""` into `"' % (self.pkg_name, full_path), text.decode() raise IOError() stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) _imported_grammars = {} -def import_grammar(grammar_path, re_, base_paths=[], import_sources=[]): +def import_grammar(grammar_path, re_, base_path=None, import_paths=[]): if grammar_path not in _imported_grammars: - # import_sources take priority over base_paths since they should handle relative imports and ignore everthing else. + # import_paths take priority over base_path since they should handle relative imports and ignore everything else. # Question: should the stdlib_loader really be pushed to the end? - import_paths = import_sources + base_paths + [stdlib_loader] - for source in import_paths: - text = None - with suppress(IOError): + to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: if callable(source): - joined_path, text = source(base_paths, grammar_path) + joined_path, text = source(base_path, grammar_path) else: joined_path = os.path.join(source, grammar_path) with open(joined_path, encoding='utf8') as f: text = f.read() - if text is not None: - # Don't load the grammar from within the suppress statement. Otherwise the underlying error message will be swallowed + except IOError: + continue + else: + # Don't load the grammar from within the try statement. Otherwise the underlying error message will be swallowed # and the wrong file will be reported as missing - grammar = load_grammar(text, joined_path, re_, import_sources) + grammar = load_grammar(text, joined_path, re_, import_paths) _imported_grammars[grammar_path] = grammar break else: @@ -868,7 +879,7 @@ class GrammarLoader: self.canonize_tree = CanonizeTree() self.re_module = re_module - def load_grammar(self, grammar_text, grammar_name='', import_sources=[]): + def load_grammar(self, grammar_text, grammar_name='', import_paths=[]): "Parse grammar_text, verify, and create Grammar object. Display nice messages on error." try: @@ -922,7 +933,7 @@ class GrammarLoader: aliases = {name: arg1 or name} # Aliases if exist if path_node.data == 'import_lib': # Import from library - base_paths = [] + base_path = None else: # Relative import if grammar_name == '': # Import relative to script file path if grammar is coded in script try: @@ -932,16 +943,16 @@ class GrammarLoader: else: base_file = grammar_name # Import relative to grammar file path if external grammar file if base_file: - base_paths = [os.path.split(base_file)[0]] + base_path = os.path.split(base_file)[0] else: - base_paths = [os.path.abspath(os.path.curdir)] + base_path = os.path.abspath(os.path.curdir) try: - import_base_paths, import_aliases = imports[dotted_path] - assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path) + import_base_path, import_aliases = imports[dotted_path] + assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) import_aliases.update(aliases) except KeyError: - imports[dotted_path] = base_paths, aliases + imports[dotted_path] = base_path, aliases elif stmt.data == 'declare': for t in stmt.children: @@ -950,9 +961,9 @@ class GrammarLoader: assert False, stmt # import grammars - for dotted_path, (base_paths, aliases) in imports.items(): + for dotted_path, (base_path, aliases) in imports.items(): grammar_path = os.path.join(*dotted_path) + EXT - g = import_grammar(grammar_path, self.re_module, base_paths=base_paths, import_sources=import_sources) + g = import_grammar(grammar_path, self.re_module, base_path=base_path, import_paths=import_paths) new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) term_defs += new_td @@ -1032,5 +1043,5 @@ class GrammarLoader: -def load_grammar(grammar, source, re_, import_sources): - return GrammarLoader(re_).load_grammar(grammar, source, import_sources) +def load_grammar(grammar, source, re_, import_paths): + return GrammarLoader(re_).load_grammar(grammar, source, import_paths) diff --git a/tests/test_parser.py b/tests/test_parser.py index 0406f46..6aaee4d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1792,7 +1792,7 @@ def _make_parser_test(LEXER, PARSER): %import ab.startab """ - p = _Lark(grammar, import_sources=[custom_loader]) + p = _Lark(grammar, import_paths=[custom_loader]) self.assertEqual(p.parse('ab'), Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])])) @@ -1801,7 +1801,7 @@ def _make_parser_test(LEXER, PARSER): %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import """ - p = _Lark(grammar, import_sources=[custom_loader]) + p = _Lark(grammar, import_paths=[custom_loader]) x = p.parse('N') self.assertEqual(next(x.find_data('rule_to_import')).children, ['N']) @@ -1810,7 +1810,7 @@ def _make_parser_test(LEXER, PARSER): %import .test_relative_import (start, WS) %ignore WS """ - p = _Lark(grammar, import_sources=[custom_loader2]) + p = _Lark(grammar, import_paths=[custom_loader2]) x = p.parse('12 capybaras') self.assertEqual(x.children, ['12', 'capybaras']) From f7d466dc7d394d29f4a4ddd9eb11a3a2d1efd6ed Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Tue, 29 Sep 2020 16:40:37 +0200 Subject: [PATCH 5/9] added PackageResource --- lark-stubs/lark.pyi | 12 +++++++++--- lark/load_grammar.py | 38 +++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 96eeda4..2d551c2 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -32,14 +32,20 @@ class LarkOptions: cache: Union[bool, str] g_regex_flags: int use_bytes: bool - import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] + import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] source_path: Optional[str] +class PackageResource(object): + pkg_name: str + path: str + + def __init__(self, pkg_name: str, path: str): + class FromPackageLoader: def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... - def __call__(self, base_paths: str, grammar_path: str) -> Tuple[str, str]: ... + def __call__(self, base_paths: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... class Lark: @@ -68,7 +74,7 @@ class Lark: cache: Union[bool, str] = False, g_regex_flags: int = ..., use_bytes: bool = False, - import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] = ..., + import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ..., source_path: Optional[str], ): ... diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 022e024..dfd0f11 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -649,6 +649,20 @@ class Grammar: return terminals, compiled_rules, self.ignore +class PackageResource(object): + """ + Represents a path inside a Package. Used by `FromPackageLoader` + """ + def __init__(self, pkg_name, path): + self.pkg_name = pkg_name + self.path = path + + def __str__(self): + return "<%s: %s>" % (self.pkg_name, self.path) + + def __repr__(self): + return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path) + class FromPackageLoader(object): """ Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. @@ -671,12 +685,10 @@ class FromPackageLoader(object): to_try = self.search_paths else: # Check whether or not the importing grammar was loaded by this module. - if not base_path.startswith('<%s:' % (self.pkg_name,)): + if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name: # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway raise IOError() - # Separate the path and the pkg_name and throw away the slash. `pkgutil.get_data` doesn't like it. (see below) - base_path = base_path.partition(':')[2].lstrip('/') - to_try = [base_path] + to_try = [base_path.path] for path in to_try: full_path = os.path.join(path, grammar_path) try: @@ -684,18 +696,7 @@ class FromPackageLoader(object): except IOError: continue else: - # Custom format `<{pkg_name}:/{full_path}>` - # These are the arguments to `pkgutil.get_data(pkg_name, full_path)` - # Required since we can not easily provided a actual file path for all package data (e.g. from inside a zip) - - # The additional slash after the `:` is to allow `os.path.split` to work on this without accidentally - # throwing away the `pkg_name`. (As it would inside of `GrammarLoader.load_grammar` otherwise when relative imports - # are resolved. - # Without the slash `""` would turn into `""`, losing the pacakge information - # With the slash `""` turns into `""` into `"' % (self.pkg_name, full_path), text.decode() + return PackageResource(self.pkg_name, full_path), text.decode() raise IOError() stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) @@ -943,7 +944,10 @@ class GrammarLoader: else: base_file = grammar_name # Import relative to grammar file path if external grammar file if base_file: - base_path = os.path.split(base_file)[0] + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] else: base_path = os.path.abspath(os.path.curdir) From a4260110ff17b6daf87b9e64cd8548d6917446b0 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Tue, 29 Sep 2020 16:40:37 +0200 Subject: [PATCH 6/9] added PackageResource --- lark-stubs/lark.pyi | 12 +++++++++--- lark/load_grammar.py | 38 +++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 96eeda4..d28a8a0 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -32,14 +32,20 @@ class LarkOptions: cache: Union[bool, str] g_regex_flags: int use_bytes: bool - import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] + import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] source_path: Optional[str] +class PackageResource(object): + pkg_name: str + path: str + + def __init__(self, pkg_name: str, path: str): + class FromPackageLoader: def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... - def __call__(self, base_paths: str, grammar_path: str) -> Tuple[str, str]: ... + def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... class Lark: @@ -68,7 +74,7 @@ class Lark: cache: Union[bool, str] = False, g_regex_flags: int = ..., use_bytes: bool = False, - import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] = ..., + import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ..., source_path: Optional[str], ): ... diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 022e024..dfd0f11 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -649,6 +649,20 @@ class Grammar: return terminals, compiled_rules, self.ignore +class PackageResource(object): + """ + Represents a path inside a Package. Used by `FromPackageLoader` + """ + def __init__(self, pkg_name, path): + self.pkg_name = pkg_name + self.path = path + + def __str__(self): + return "<%s: %s>" % (self.pkg_name, self.path) + + def __repr__(self): + return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path) + class FromPackageLoader(object): """ Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. @@ -671,12 +685,10 @@ class FromPackageLoader(object): to_try = self.search_paths else: # Check whether or not the importing grammar was loaded by this module. - if not base_path.startswith('<%s:' % (self.pkg_name,)): + if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name: # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway raise IOError() - # Separate the path and the pkg_name and throw away the slash. `pkgutil.get_data` doesn't like it. (see below) - base_path = base_path.partition(':')[2].lstrip('/') - to_try = [base_path] + to_try = [base_path.path] for path in to_try: full_path = os.path.join(path, grammar_path) try: @@ -684,18 +696,7 @@ class FromPackageLoader(object): except IOError: continue else: - # Custom format `<{pkg_name}:/{full_path}>` - # These are the arguments to `pkgutil.get_data(pkg_name, full_path)` - # Required since we can not easily provided a actual file path for all package data (e.g. from inside a zip) - - # The additional slash after the `:` is to allow `os.path.split` to work on this without accidentally - # throwing away the `pkg_name`. (As it would inside of `GrammarLoader.load_grammar` otherwise when relative imports - # are resolved. - # Without the slash `""` would turn into `""`, losing the pacakge information - # With the slash `""` turns into `""` into `"' % (self.pkg_name, full_path), text.decode() + return PackageResource(self.pkg_name, full_path), text.decode() raise IOError() stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) @@ -943,7 +944,10 @@ class GrammarLoader: else: base_file = grammar_name # Import relative to grammar file path if external grammar file if base_file: - base_path = os.path.split(base_file)[0] + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] else: base_path = os.path.abspath(os.path.curdir) From 5f7a5d428ac0600c3be2aa8c113f5a19e2a0200e Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 1 Oct 2020 16:12:50 +0200 Subject: [PATCH 7/9] renamed `source_code` -> `source_grammar` --- lark-stubs/lark.pyi | 4 ++-- lark/lark.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index d28a8a0..8f373e8 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -50,7 +50,7 @@ class FromPackageLoader: class Lark: source_path: str - source_code: str + source_grammar: str options: LarkOptions lexer: Lexer terminals: List[TerminalDef] @@ -75,7 +75,7 @@ class Lark: g_regex_flags: int = ..., use_bytes: bool = False, import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ..., - source_path: Optional[str], + source_path: Optional[str]=None, ): ... diff --git a/lark/lark.py b/lark/lark.py index 8107e34..f53ac37 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -213,7 +213,7 @@ class Lark(Serialize): grammar = read() assert isinstance(grammar, STRING_TYPE) - self.source_code = grammar + self.source_grammar = grammar if self.options.use_bytes: if not isascii(grammar): raise ValueError("Grammar must be ascii only, when use_bytes=True") From ee9857cabb9869620819d9785833736602c2a959 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Tue, 6 Oct 2020 02:35:08 +0200 Subject: [PATCH 8/9] Added backwards-compatibility property with DeprecationWarning --- lark/lark.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lark/lark.py b/lark/lark.py index f53ac37..3ceddcb 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -489,5 +489,14 @@ class Lark(Serialize): def source(self, value): self.source_path = value + @property + def grammar_source(self): + warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning) + return self.source_grammar + + @grammar_source.setter + def grammar_source(self, value): + self.source_grammar = value + ###} From bc3923aed85fc33b788790d0f61b66cb7298471a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 25 Oct 2020 11:15:03 +0200 Subject: [PATCH 9/9] Added more terminals to grammars/ folder --- examples/advanced/python3.lark | 18 +++++------------- lark/grammars/common.lark | 11 ++++++++++- lark/grammars/python.lark | 19 +++++++++++++++++++ lark/lark.py | 4 ++-- 4 files changed, 36 insertions(+), 16 deletions(-) create mode 100644 lark/grammars/python.lark diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark index 78c9875..9be6b43 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -163,22 +163,14 @@ yield_arg: "from" test | testlist number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING -// Tokens - -NAME: /[a-zA-Z_]\w*/ -COMMENT: /#[^\n]*/ -_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ +// Import terminals from standard library (grammars/python.lark) +%import python (NAME, COMMENT, STRING, LONG_STRING) +%import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) -STRING : /[ubf]?r?("(?!"").*?(?