From a50fc107737f7a89e60711d7be1f186e1c24b1fb Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Tue, 29 Sep 2020 15:24:00 +0200
Subject: [PATCH] import_sources->import_paths, source->source_path, various
 implementation changes

---
 lark-stubs/lark.pyi  | 14 ++++-----
 lark/lark.py         | 49 ++++++++++++++++-------------
 lark/load_grammar.py | 75 +++++++++++++++++++++++++-------------------
 tests/test_parser.py |  6 ++--
 4 files changed, 81 insertions(+), 63 deletions(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 5cb94b2..96eeda4 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -32,19 +32,19 @@ class LarkOptions:
     cache: Union[bool, str]
     g_regex_flags: int
     use_bytes: bool
-    import_sources: List[Union[str, Callable[[str, str], str]]]
-    source: Optional[str]
+    import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]]
+    source_path: Optional[str]
 
 
 class FromPackageLoader:
     def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
 
-    def __call__(self, base_paths: List[str], grammar_path: str) -> Tuple[str, str]: ...
+    def __call__(self, base_path: Optional[str], grammar_path: str) -> Tuple[str, str]: ...
 
 
 class Lark:
-    source: str
-    grammar_source: str
+    source_path: str
+    source_code: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]
@@ -68,8 +68,8 @@ class Lark:
         cache: Union[bool, str] = False,
         g_regex_flags: int = ...,
         use_bytes: bool = False,
-        import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ...,
-        source: Optional[str],
+        import_paths: List[Union[str, Callable[[Optional[str], str], Tuple[str, str]]]] = ...,
+        source_path: Optional[str] = None,
     ): ...


diff --git a/lark/lark.py b/lark/lark.py
index 9f53841..8107e34 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 
 import sys, os, pickle, hashlib
 from io import open
-
+from warnings import warn
 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
 
 from .load_grammar import load_grammar, FromPackageLoader
@@ -90,10 +90,10 @@ class LarkOptions(Serialize):
         Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
     edit_terminals
         A callback for editing the terminals before parse.
-    import_sources
+    import_paths
         A List of either paths or loader functions to specify from where grammars are imported
-    source
-        Override the source of from where the grammar was loaded. Usefull for relative imports and unconventional grammar loading
+    source_path
+        Override the source path from which the grammar was loaded. Useful for relative imports and unconventional grammar loading
 
     **=== End Options ===**
     """
@@ -119,8 +119,8 @@ class LarkOptions(Serialize):
         'edit_terminals': None,
         'g_regex_flags': 0,
         'use_bytes': False,
-        'import_sources': [],
-        'source': None,
+        'import_paths': [],
+        'source_path': None,
     }
 
     def __init__(self, options_dict):
@@ -196,13 +196,13 @@ class Lark(Serialize):
             re_module = re
 
         # Some, but not all file-like objects have a 'name' attribute
-        if self.options.source is None:
+        if self.options.source_path is None:
             try:
-                self.source = grammar.name
+                self.source_path = grammar.name
             except AttributeError:
-                self.source = '<string>'
+                self.source_path = '<string>'
         else:
-            self.source = self.options.source
+            self.source_path = self.options.source_path
 
         # Drain file-like objects to get their contents
         try:
@@ -213,7 +213,7 @@ class Lark(Serialize):
             grammar = read()
 
         assert isinstance(grammar, STRING_TYPE)
-        self.grammar_source = grammar
+        self.source_code = grammar
         if self.options.use_bytes:
             if not isascii(grammar):
                 raise ValueError("Grammar must be ascii only, when use_bytes=True")
@@ -276,7 +276,7 @@ class Lark(Serialize):
             assert self.options.ambiguity in ('resolve', 'explicit', 'forest', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, re_module, self.options.import_sources)
+        self.grammar = load_grammar(grammar, self.source_path, re_module, self.options.import_paths)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -374,7 +374,7 @@ class Lark(Serialize):
         self.options = LarkOptions.deserialize(options, memo)
         re_module = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
-        self.source = '<deserialized>'
+        self.source_path = '<deserialized>'
         self._prepare_callbacks()
         self.parser = self.parser_class.deserialize(
             data['parser'],
@@ -416,9 +416,7 @@ class Lark(Serialize):
         """Create an instance of Lark with the grammar loaded from within the package `package`.
         This allows grammar loading from zipapps.
 
-        Will also create a `FromPackageLoader` instance and add it to the `import_sources` to simplify importing
-
-        ``search_paths`` is passed to `FromPackageLoader`
+        Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`
 
         Example:
 
         """
         package = FromPackageLoader(package, search_paths)
-        full_path, text = package([], grammar_path)
-        options.setdefault('source', full_path)
-        if 'import_sources' in options:
-            options['import_sources'].append(package)
+        full_path, text = package(None, grammar_path)
+        options.setdefault('source_path', full_path)
+        if 'import_paths' in options:
+            options['import_paths'].append(package)
         else:
-            options['import_sources'] = [package]
+            options['import_paths'] = [package]
         return cls(text, **options)
 
     def __repr__(self):
-        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
+        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
     def lex(self, text):
@@ -481,6 +479,15 @@ class Lark(Serialize):
                 # Prevent infinite loop
                 raise e2
             e = e2
+
+    @property
+    def source(self):
+        warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning)
+        return self.source_path
+
+    @source.setter
+    def source(self, value):
+        self.source_path = value
 
 ###}
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index ba1f0f2..022e024 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -666,50 +666,61 @@ class FromPackageLoader(object):
     def __repr__(self):
         return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)
 
-    def __call__(self, base_paths, grammar_path):
-        if len(base_paths) == 0:
+    def __call__(self, base_path, grammar_path):
+        if base_path is None:
             to_try = self.search_paths
         else:
-            assert len(base_paths) == 1
-            if not base_paths[0].startswith('<%s:' % (self.pkg_name,)):
+            # Check whether or not the importing grammar was loaded by this module.
+            if not base_path.startswith('<%s:' % (self.pkg_name,)):
                 # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway
                 raise IOError()
-            base_path = base_paths[0].partition(':')[2]
-            if base_path and base_path[0] == '/':
-                base_path = base_path[1:]
+            # Separate the path from the pkg_name and strip the leading slash; `pkgutil.get_data` doesn't like it. (see below)
+            base_path = base_path.partition(':')[2].lstrip('/')
             to_try = [base_path]
         for path in to_try:
             full_path = os.path.join(path, grammar_path)
-            text = None
-            with suppress(IOError):
+            try:
                 text = pkgutil.get_data(self.pkg_name, full_path)
-            if text is None:
+            except IOError:
                 continue
-            return '<%s:/%s>' % (self.pkg_name, full_path), text.decode()
+            else:
+                # Custom format `<{pkg_name}:/{full_path}>`
+                # These are the arguments to `pkgutil.get_data(pkg_name, full_path)`
+                # Required since we cannot easily provide an actual file path for all package data (e.g. from inside a zip)
+
+                # The additional slash after the `:` is to allow `os.path.split` to work on this without accidentally
+                # throwing away the `pkg_name`. (As it would otherwise inside of `GrammarLoader.load_grammar` when relative imports
+                # are resolved.)
+                # Without the slash, `os.path.split` would turn `"<pkg_name:grammar.lark>"` into `("", ...)`, losing the package information
+                # With the slash, `"<pkg_name:/grammar.lark>"` turns into `("<pkg_name:", ...)`, keeping it
+                return '<%s:/%s>' % (self.pkg_name, full_path), text.decode()
         raise IOError()
 
 
 stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)
 
 _imported_grammars = {}
-def import_grammar(grammar_path, re_, base_paths=[], import_sources=[]):
+def import_grammar(grammar_path, re_, base_path=None, import_paths=[]):
     if grammar_path not in _imported_grammars:
-        # import_sources take priority over base_paths since they should handle relative imports and ignore everthing else.
+        # import_paths take priority over base_path since they should handle relative imports and ignore everything else.
         # Question: should the stdlib_loader really be pushed to the end?
-        import_paths = import_sources + base_paths + [stdlib_loader]
-        for source in import_paths:
-            text = None
-            with suppress(IOError):
+        to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
+        for source in to_try:
+            try:
                 if callable(source):
-                    joined_path, text = source(base_paths, grammar_path)
+                    joined_path, text = source(base_path, grammar_path)
                 else:
                     joined_path = os.path.join(source, grammar_path)
                     with open(joined_path, encoding='utf8') as f:
                         text = f.read()
-            if text is not None:
-                # Don't load the grammar from within the suppress statement. Otherwise the underlying error message will be swallowed
+            except IOError:
+                continue
+            else:
+                # Don't load the grammar from within the try statement. Otherwise the underlying error message will be swallowed
                 # and the wrong file will be reported as missing
-                grammar = load_grammar(text, joined_path, re_, import_sources)
+                grammar = load_grammar(text, joined_path, re_, import_paths)
                 _imported_grammars[grammar_path] = grammar
                 break
         else:
@@ -868,7 +879,7 @@ class GrammarLoader:
         self.canonize_tree = CanonizeTree()
         self.re_module = re_module
 
-    def load_grammar(self, grammar_text, grammar_name='<?>', import_sources=[]):
+    def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
 
         try:
@@ -922,7 +933,7 @@ class GrammarLoader:
                 aliases = {name: arg1 or name}  # Aliases if exist
                 if path_node.data == 'import_lib':  # Import from library
-                    base_paths = []
+                    base_path = None
                 else:  # Relative import
                     if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                         try:
                             base_file = os.path.abspath(sys.modules['__main__'].__file__)
                         except AttributeError:
                             base_file = None
                     else:
                         base_file = grammar_name  # Import relative to grammar file path if external grammar file
                     if base_file:
-                        base_paths = [os.path.split(base_file)[0]]
+                        base_path = os.path.split(base_file)[0]
                     else:
-                        base_paths = [os.path.abspath(os.path.curdir)]
+                        base_path = os.path.abspath(os.path.curdir)
 
                 try:
-                    import_base_paths, import_aliases = imports[dotted_path]
-                    assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
+                    import_base_path, import_aliases = imports[dotted_path]
+                    assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
                     import_aliases.update(aliases)
                 except KeyError:
-                    imports[dotted_path] = base_paths, aliases
+                    imports[dotted_path] = base_path, aliases
 
             elif stmt.data == 'declare':
                 for t in stmt.children:
                     term_defs.append([t.value, (None, None)])
             else:
                 assert False, stmt
 
         # import grammars
-        for dotted_path, (base_paths, aliases) in imports.items():
+        for dotted_path, (base_path, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths, import_sources=import_sources)
+            g = import_grammar(grammar_path, self.re_module, base_path=base_path, import_paths=import_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -1032,5 +1043,5 @@
 
 
-def load_grammar(grammar, source, re_, import_sources):
-    return GrammarLoader(re_).load_grammar(grammar, source, import_sources)
+def load_grammar(grammar, source, re_, import_paths):
+    return GrammarLoader(re_).load_grammar(grammar, source, import_paths)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 0406f46..6aaee4d 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1792,7 +1792,7 @@ def _make_parser_test(LEXER, PARSER):
             %import ab.startab
         """
 
-        p = _Lark(grammar, import_sources=[custom_loader])
+        p = _Lark(grammar, import_paths=[custom_loader])
         self.assertEqual(p.parse('ab'),
                          Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))
 
@@ -1801,7 +1801,7 @@ def _make_parser_test(LEXER, PARSER):
             %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
         """
 
-        p = _Lark(grammar, import_sources=[custom_loader])
+        p = _Lark(grammar, import_paths=[custom_loader])
         x = p.parse('N')
         self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
 
@@ -1810,7 +1810,7 @@ def _make_parser_test(LEXER, PARSER):
             %import .test_relative_import (start, WS)
             %ignore WS
         """
-        p = _Lark(grammar, import_sources=[custom_loader2])
+        p = _Lark(grammar, import_paths=[custom_loader2])
         x = p.parse('12 capybaras')
         self.assertEqual(x.children, ['12', 'capybaras'])
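
Usage sketch (not part of the patch): a minimal example of the renamed API as this commit defines it. The names `dict_loader`, `FAKE_FS`, and the `'<in-memory>'` label are hypothetical, chosen for illustration. A loader callable in `import_paths` receives `(base_path, grammar_path)`, where `base_path` is now a single `Optional[str]` (None for library-style imports) instead of a list, and either returns a `(joined_path, text)` tuple or raises `IOError` so that `import_grammar` moves on to the next loader.

    from lark import Lark

    # Hypothetical in-memory "filesystem", keyed by the grammar_path that
    # import_grammar() computes for `%import ab.startab` (here: 'ab.lark').
    FAKE_FS = {
        'ab.lark': 'startab: expr\n\nexpr: A B\nA: "a"\nB: "b"\n',
    }

    def dict_loader(base_path, grammar_path):
        # base_path is None for absolute imports such as `%import ab.startab`;
        # for relative imports it is the importing grammar's directory.
        try:
            return grammar_path, FAKE_FS[grammar_path]
        except KeyError:
            raise IOError()  # "not found here"; the next loader gets a turn

    grammar = """
    start: startab

    %import ab.startab
    """

    parser = Lark(grammar, import_paths=[dict_loader], source_path='<in-memory>')
    print(parser.parse('ab'))  # Tree('start', [Tree('startab', ...)])

For package-bundled grammars the same mechanism is wrapped by `Lark.open_from_package`, which appends a `FromPackageLoader` to `import_paths` and sets `source_path` to the resolved location, so relative `%import`s keep working even from inside a zipapp.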