from __future__ import absolute_import

import sys, os, pickle, hashlib, logging
from io import open

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule

import re
try:
    import regex
except ImportError:
    regex = None

###{standalone
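
# The '###{standalone' ... '###}' markers delimit the section that Lark's
# standalone-parser generator copies into its generated output.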

class LarkOptions(Serialize):
    """Specifies the options for Lark

    """
    OPTIONS_DOC = """
# General

    start - The start symbol. Either a string, or a list of strings for
            multiple possible starts (Default: "start")
    debug - Display debug information, such as warnings (default: False)
    transformer - Applies the transformer to every parse tree (equivalent to
                  applying it after the parse, but faster)
    propagate_positions - Propagates (line, column, end_line, end_column)
                          attributes into all tree branches.
    maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
                         When `False`, `[]` behaves like the `?` operator,
                         and returns no value at all.
                         (default=`False`. Recommended to set to `True`)
    regex - When True, uses the `regex` module instead of the stdlib `re`.
    cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
            LALR only for now.
            When `False`, does nothing (default)
            When `True`, caches to a temporary file in the local directory
            When given a string, caches to the path pointed by the string
    g_regex_flags - Flags that are applied to all terminals
                    (both regex and strings)
    keep_all_tokens - Prevent the tree builder from automagically
                      removing "punctuation" tokens (default: False)

# Algorithm

    parser - Decides which parser engine to use
             Accepts "earley" or "lalr". (Default: "earley")
             (there is also a "cyk" option for legacy)
    lexer - Decides whether or not to use a lexer stage
        "auto" (default): Choose for me based on the parser
        "standard": Use a standard lexer
        "contextual": Stronger lexer (only works with parser="lalr")
        "dynamic": Flexible and powerful (only with parser="earley")
        "dynamic_complete": Same as dynamic, but tries *every* variation
                            of tokenizing possible.
    ambiguity - Decides how to handle ambiguity in the parse.
                Only relevant if parser="earley"
        "resolve": The parser will automatically choose the simplest
                   derivation (it chooses consistently: greedy for
                   tokens, non-greedy for rules)
        "explicit": The parser will return all derivations wrapped
                    in "_ambig" tree nodes (i.e. a forest).

# Domain Specific

    postlex - Lexer post-processing (Default: None) Only works with the
              standard and contextual lexers.
    priority - How priorities should be evaluated - auto, none, normal,
               invert (Default: auto)
    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                      tokens during lexing. Use with caution.
    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
    edit_terminals - A callback that is applied to each terminal after the
                     grammar is compiled, allowing terminals to be edited
                     before the lexer is built (Default: None)
"""
    if __doc__:
        __doc__ += OPTIONS_DOC
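
    # _defaults doubles as the canonical set of recognized options:
    # any keyword not listed here makes __init__ raise ValueError below.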
    _defaults = {
        'debug': False,
        'keep_all_tokens': False,
        'tree_class': None,
        'cache': False,
        'postlex': None,
        'parser': 'earley',
        'lexer': 'auto',
        'transformer': None,
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': False,
        'edit_terminals': None,
        'g_regex_flags': 0,
        'use_bytes': False,
    }
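
    # Note: 'cache' and 'use_bytes' are exempt from the bool() coercion in
    # __init__ below because they also accept non-boolean values (a cache
    # path string, and 'force' on Python 2, respectively).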
    def __init__(self, options_dict):
        o = dict(options_dict)

        options = {}
        for name, default in self._defaults.items():
            if name in o:
                value = o.pop(name)
                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                    value = bool(value)
            else:
                value = default

            options[name] = value

        if isinstance(options['start'], STRING_TYPE):
            options['start'] = [options['start']]

        self.__dict__['options'] = options

        assert self.parser in ('earley', 'lalr', 'cyk', None)

        if self.parser == 'earley' and self.transformer:
            raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm. '
                             'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

        if o:
            raise ValueError("Unknown options: %s" % o.keys())

    def __getattr__(self, name):
        try:
            return self.options[name]
        except KeyError as e:
            raise AttributeError(e)

    def __setattr__(self, name, value):
        assert name in self.options
        self.options[name] = value

    def serialize(self, memo):
        return self.options

    @classmethod
    def deserialize(cls, data, memo):
        return cls(data)
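
# A minimal usage sketch (the grammar below is adapted from Lark's docs):
#
#     parser = Lark('''start: "hello" WORD
#                      %import common.WORD
#                      %ignore " "
#                   ''')
#     tree = parser.parse("hello world")   # -> Tree('start', [Token('WORD', 'world')])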

class Lark(Serialize):
    def __init__(self, grammar, **options):
        """
            grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
            options : a dictionary controlling various aspects of Lark.
        """

        self.options = LarkOptions(options)

        # Set regex or re module
        use_regex = self.options.regex
        if use_regex:
            if regex:
                re_module = regex
            else:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
        else:
            re_module = re

        # Some, but not all file-like objects have a 'name' attribute
        try:
            self.source = grammar.name
        except AttributeError:
            self.source = '<string>'

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)
        self.grammar_source = grammar
        if self.options.use_bytes:
            if not isascii(grammar):
                raise ValueError("Grammar must be ascii only, when use_bytes=True")
            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
                raise NotImplementedError("`use_bytes=True` may have issues on python2. "
                                          "Use `use_bytes='force'` to use it at your own risk.")
        cache_fn = None
        if self.options.cache:
            if self.options.parser != 'lalr':
                raise NotImplementedError("cache only works with parser='lalr' for now")
            if isinstance(self.options.cache, STRING_TYPE):
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ValueError("cache argument must be bool or str")
                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                from . import __version__
                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
                s = grammar + options_str + __version__
                md5 = hashlib.md5(s.encode()).hexdigest()
                cache_fn = '.lark_cache_%s.tmp' % md5

            if FS.exists(cache_fn):
                logging.debug('Loading grammar from cache: %s', cache_fn)
                with FS.open(cache_fn, 'rb') as f:
                    self._load(f, self.options.transformer, self.options.postlex)
                return
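
        # If we reach this point, either caching is disabled or the cache
        # missed: the grammar is analyzed from scratch below, and saved to
        # cache_fn at the end of __init__ when caching is enabled.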
        if self.options.lexer == 'auto':
            if self.options.parser == 'lalr':
                self.options.lexer = 'contextual'
            elif self.options.parser == 'earley':
                self.options.lexer = 'dynamic'
            elif self.options.parser == 'cyk':
                self.options.lexer = 'standard'
            else:
                assert False, self.options.parser
        lexer = self.options.lexer
        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

        if self.options.ambiguity == 'auto':
            if self.options.parser == 'earley':
                self.options.ambiguity = 'resolve'
        else:
            disambig_parsers = ['earley', 'cyk']
            assert self.options.parser in disambig_parsers, (
                'Only %s supports disambiguation right now') % ', '.join(disambig_parsers)

        if self.options.priority == 'auto':
            if self.options.parser in ('earley', 'cyk', ):
                self.options.priority = 'normal'
            elif self.options.parser in ('lalr', ):
                self.options.priority = None
        elif self.options.priority in ('invert', 'normal'):
            assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time"

        assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority)
        assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"'
        assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
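
        # By this point, the 'auto' values for lexer, ambiguity and priority
        # have been resolved to concrete settings (given a parser) and validated.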

        # Parse the grammar file and compose the grammars (TODO)
        self.grammar = load_grammar(grammar, self.source, re_module)

        # Compile the EBNF grammar into BNF
        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

        if self.options.edit_terminals:
            for t in self.terminals:
                self.options.edit_terminals(t)

        self._terminals_dict = {t.name: t for t in self.terminals}

        # If the user asked to invert the priorities, negate them all here.
        # This replaces the old 'resolve__antiscore_sum' option.
        if self.options.priority == 'invert':
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = -rule.options.priority
        # Else, if the user asked to disable priorities, strip them from the
        # rules. This allows the Earley parsers to skip an extra forest walk
        # for improved performance, if you don't need them (or didn't specify any).
        elif self.options.priority is None:
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = None

        # TODO Deprecate lexer_callbacks?
        lexer_callbacks = dict(self.options.lexer_callbacks)
        if self.options.transformer:
            t = self.options.transformer
            for term in self.terminals:
                if hasattr(t, term.name):
                    lexer_callbacks[term.name] = getattr(t, term.name)

        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

        if self.options.parser:
            self.parser = self._build_parser()
        elif lexer:
            self.lexer = self._build_lexer()

        if cache_fn:
            logging.debug('Saving grammar to cache: %s', cache_fn)
            with FS.open(cache_fn, 'wb') as f:
                self.save(f)

    if __init__.__doc__:
        __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC
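
    # Only these three fields are pickled by save(); everything else
    # (callbacks, lexer configuration, parser frontend) is rebuilt in _load().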
    __serialize_fields__ = 'parser', 'rules', 'options'

    def _build_lexer(self):
        return TraditionalLexer(self.lexer_conf)

    def _prepare_callbacks(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser != 'lalr' and self.options.ambiguity == 'explicit', self.options.maybe_placeholders)
        self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)

    def _build_parser(self):
        self._prepare_callbacks()
        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

    def save(self, f):
        data, m = self.memo_serialize([TerminalDef, Rule])
        pickle.dump({'data': data, 'memo': m}, f)

    @classmethod
    def load(cls, f):
        inst = cls.__new__(cls)
        return inst._load(f)
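
    # Save/load round-trip sketch (the file name is illustrative):
    #
    #     with open('my_parser.bin', 'wb') as f:
    #         parser.save(f)
    #     with open('my_parser.bin', 'rb') as f:
    #         parser = Lark.load(f)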

    def _load(self, f, transformer=None, postlex=None):
        if isinstance(f, dict):
            d = f
        else:
            d = pickle.load(f)
        memo = d['memo']
        data = d['data']

        assert memo
        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
        options = dict(data['options'])
        if transformer is not None:
            options['transformer'] = transformer
        if postlex is not None:
            options['postlex'] = postlex
        self.options = LarkOptions.deserialize(options, memo)
        re_module = regex if self.options.regex else re
        self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
        self.source = '<deserialized>'
        self._prepare_callbacks()
        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
        return self

    @classmethod
    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
        inst = cls.__new__(cls)
        return inst._load({'data': data, 'memo': memo}, transformer, postlex)
    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        """Create an instance of Lark with the grammar given by its filename

        If rel_to is provided, the function will find the grammar filename in relation to it.

        Example:

            >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
            Lark(...)

        """
        if rel_to:
            basepath = os.path.dirname(rel_to)
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)

    def __repr__(self):
        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)

    def lex(self, text):
        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
        if not hasattr(self, 'lexer'):
            self.lexer = self._build_lexer()
        stream = self.lexer.lex(text)
        if self.options.postlex:
            return self.options.postlex.process(stream)
        return stream
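
    # For example (assuming a Lark instance built with lexer='standard'),
    # lex() yields Token objects:
    #
    #     for token in parser.lex("hello world"):
    #         print(token.type, token.value)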

    def get_terminal(self, name):
        "Get information about a terminal"
        return self._terminals_dict[name]

    def parse(self, text, start=None, on_error=None):
        """Parse the given text, according to the options provided.

        Parameters:
            start: str - required if Lark was given multiple possible start symbols (using the start option).
            on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only.

        Returns a tree, unless specified otherwise.
        """
        try:
            return self.parser.parse(text, start=start)
        except UnexpectedToken as e:
            if on_error is None:
                raise

            while True:
                if not on_error(e):
                    raise e
                try:
                    return e.puppet.resume_parse()
                except UnexpectedToken as e2:
                    e = e2

###}