from __future__ import absolute_import

import sys, os, pickle, hashlib, logging
from io import open

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule

import re
try:
    import regex
except ImportError:
    regex = None

###{standalone

class LarkOptions(Serialize):
    """Specifies the options for Lark"""

    OPTIONS_DOC = """
# General

    start - The start symbol. Either a string, or a list of strings for
            multiple possible starts (Default: "start")
    debug - Display debug information, such as warnings (default: False)
    transformer - Applies the transformer to every parse tree (equivalent to
                  applying it after the parse, but faster)
    propagate_positions - Propagates (line, column, end_line, end_column)
                          attributes into all tree branches.
    maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
                         When `False`, `[]` behaves like the `?` operator,
                         and returns no value at all.
                         (default=`False`. Recommended to set to `True`)
    regex - When True, uses the `regex` module instead of the stdlib `re`.
    cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
            LALR only for now.
            When `False`, does nothing (default)
            When `True`, caches to a temporary file in the local directory
            When given a string, caches to the path pointed by the string
    g_regex_flags - Flags that are applied to all terminals
                    (both regex and strings)
    keep_all_tokens - Prevent the tree builder from automagically
                      removing "punctuation" tokens (default: False)

# Algorithm

    parser - Decides which parser engine to use
             Accepts "earley" or "lalr". (Default: "earley")
             (there is also a "cyk" option for legacy)
    lexer - Decides whether or not to use a lexer stage
        "auto" (default): Choose for me based on the parser
        "standard": Use a standard lexer
        "contextual": Stronger lexer (only works with parser="lalr")
        "dynamic": Flexible and powerful (only with parser="earley")
        "dynamic_complete": Same as dynamic, but tries *every* variation
                            of tokenizing possible.
    ambiguity - Decides how to handle ambiguity in the parse.
                Only relevant if parser="earley"
        "resolve": The parser will automatically choose the simplest
                   derivation (it chooses consistently: greedy for
                   tokens, non-greedy for rules)
        "explicit": The parser will return all derivations wrapped
                    in "_ambig" tree nodes (i.e. a forest).

# Domain Specific

    postlex - Lexer post-processing (Default: None) Only works with the
              standard and contextual lexers.
    priority - How priorities should be evaluated - auto, none, normal,
               invert (Default: auto)
    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                      tokens during lexing. Use with caution.
    edit_terminals - A callback for editing the terminals before parse.
    """
    if __doc__:
        __doc__ += OPTIONS_DOC

    _defaults = {
        'debug': False,
        'keep_all_tokens': False,
        'tree_class': None,
        'cache': False,
        'postlex': None,
        'parser': 'earley',
        'lexer': 'auto',
        'transformer': None,
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': False,
        'edit_terminals': None,
        'g_regex_flags': 0,
        'use_bytes': False,
    }

    def __init__(self, options_dict):
        o = dict(options_dict)

        options = {}
        for name, default in self._defaults.items():
            if name in o:
                value = o.pop(name)
                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                    value = bool(value)
            else:
                value = default

            options[name] = value

        if isinstance(options['start'], STRING_TYPE):
            options['start'] = [options['start']]

        self.__dict__['options'] = options

        assert self.parser in ('earley', 'lalr', 'cyk', None)

        if self.parser == 'earley' and self.transformer:
            raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm. '
                             'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

        if o:
            raise ValueError("Unknown options: %s" % o.keys())

    def __getattr__(self, name):
        try:
            return self.options[name]
        except KeyError as e:
            raise AttributeError(e)

    def __setattr__(self, name, value):
        assert name in self.options
        self.options[name] = value

    def serialize(self, memo):
        return self.options

    @classmethod
    def deserialize(cls, data, memo):
        return cls(data)
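

# A minimal usage sketch (commented out so it doesn't execute at import
# time). The options documented above are passed straight through as keyword
# arguments to Lark(); the toy grammar is a hypothetical example, not one
# shipped with Lark:
#
#     parser = Lark(r'''
#         start: WORD "," WORD "!"
#         WORD: /\w+/
#         %ignore " "
#     ''', parser='lalr', maybe_placeholders=True)
#     tree = parser.parse("hello, world!")
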
class Lark(Serialize):
    def __init__(self, grammar, **options):
        """
            grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
            options : a dictionary controlling various aspects of Lark.
        """

        self.options = LarkOptions(options)

        # Set regex or re module
        use_regex = self.options.regex
        if use_regex:
            if regex:
                re_module = regex
            else:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
        else:
            re_module = re

        # Some, but not all file-like objects have a 'name' attribute
        try:
            self.source = grammar.name
        except AttributeError:
            self.source = '<string>'

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)
        self.grammar_source = grammar
        if self.options.use_bytes:
            assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only"
            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
                raise NotImplementedError("`use_bytes=True` is not fully supported on python2.7 "
                                          "and might have weird behaviour. Use `use_bytes='force'` "
                                          "to use it anyway.")

        cache_fn = None
        if self.options.cache:
            if self.options.parser != 'lalr':
                raise NotImplementedError("cache only works with parser='lalr' for now")
            if isinstance(self.options.cache, STRING_TYPE):
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ValueError("cache must be bool or str")
                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                from . import __version__
                options_str = ''.join(k + str(v) for k, v in options.items() if k not in unhashable)
                s = grammar + options_str + __version__
                md5 = hashlib.md5(s.encode()).hexdigest()
                cache_fn = '.lark_cache_%s.tmp' % md5

            if FS.exists(cache_fn):
                logging.debug('Loading grammar from cache: %s', cache_fn)
                with FS.open(cache_fn, 'rb') as f:
                    self._load(f, self.options.transformer, self.options.postlex)
                return

        if self.options.lexer == 'auto':
            if self.options.parser == 'lalr':
                self.options.lexer = 'contextual'
            elif self.options.parser == 'earley':
                self.options.lexer = 'dynamic'
            elif self.options.parser == 'cyk':
                self.options.lexer = 'standard'
            else:
                assert False, self.options.parser
        lexer = self.options.lexer
        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

        if self.options.ambiguity == 'auto':
            if self.options.parser == 'earley':
                self.options.ambiguity = 'resolve'
        else:
            disambig_parsers = ['earley', 'cyk']
            assert self.options.parser in disambig_parsers, (
                'Only %s supports disambiguation right now' % ', '.join(disambig_parsers))

        if self.options.priority == 'auto':
            if self.options.parser in ('earley', 'cyk'):
                self.options.priority = 'normal'
            elif self.options.parser == 'lalr':
                self.options.priority = None
        elif self.options.priority in ('invert', 'normal'):
            assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time"

        assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority)
        assert self.options.ambiguity not in ('resolve__antiscore_sum',), 'resolve__antiscore_sum has been replaced with the option priority="invert"'
        assert self.options.ambiguity in ('resolve', 'explicit', 'auto')

        # Parse the grammar file and compose the grammars (TODO)
        self.grammar = load_grammar(grammar, self.source, re_module)

        # Compile the EBNF grammar into BNF
        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

        if self.options.edit_terminals:
            for t in self.terminals:
                self.options.edit_terminals(t)

        self._terminals_dict = {t.name: t for t in self.terminals}

        # If the user asked to invert the priorities, negate them all here.
        # This replaces the old 'resolve__antiscore_sum' option.
        if self.options.priority == 'invert':
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = -rule.options.priority
        # Else, if the user asked to disable priorities, strip them from the
        # rules. This allows the Earley parsers to skip an extra forest walk
        # for improved performance, if you don't need them (or didn't specify any).
        elif self.options.priority is None:
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = None

        # TODO Deprecate lexer_callbacks?
        lexer_callbacks = dict(self.options.lexer_callbacks)
        if self.options.transformer:
            t = self.options.transformer
            for term in self.terminals:
                if hasattr(t, term.name):
                    lexer_callbacks[term.name] = getattr(t, term.name)

        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

        if self.options.parser:
            self.parser = self._build_parser()
        elif lexer:
            self.lexer = self._build_lexer()

        if cache_fn:
            logging.debug('Saving grammar to cache: %s', cache_fn)
            with FS.open(cache_fn, 'wb') as f:
                self.save(f)

    if __init__.__doc__:
        __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC
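
    # A hedged sketch of the cache behaviour implemented above (commented
    # out). With cache=True and parser='lalr', the analyzed grammar is
    # pickled to a '.lark_cache_<md5>.tmp' file in the current directory,
    # and reloaded on the next construction with the same grammar, options
    # and lark version. `grammar_text` is a placeholder for your own
    # grammar string:
    #
    #     fast_parser = Lark(grammar_text, parser='lalr', cache=True)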

    __serialize_fields__ = 'parser', 'rules', 'options'

    def _build_lexer(self):
        return TraditionalLexer(self.lexer_conf)

    def _prepare_callbacks(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser != 'lalr' and self.options.ambiguity == 'explicit', self.options.maybe_placeholders)
        self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)

    def _build_parser(self):
        self._prepare_callbacks()
        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

    def save(self, f):
        data, m = self.memo_serialize([TerminalDef, Rule])
        pickle.dump({'data': data, 'memo': m}, f)

    @classmethod
    def load(cls, f):
        inst = cls.__new__(cls)
        return inst._load(f)

    def _load(self, f, transformer=None, postlex=None):
        if isinstance(f, dict):
            d = f
        else:
            d = pickle.load(f)
        memo = d['memo']
        data = d['data']

        assert memo
        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
        options = dict(data['options'])
        if transformer is not None:
            options['transformer'] = transformer
        if postlex is not None:
            options['postlex'] = postlex
        self.options = LarkOptions.deserialize(options, memo)
        re_module = regex if self.options.regex else re
        self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
        self.source = '<deserialized>'
        self._prepare_callbacks()
        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
        return self

    @classmethod
    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
        inst = cls.__new__(cls)
        return inst._load({'data': data, 'memo': memo}, transformer, postlex)
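
    # A minimal save/load roundtrip sketch (commented out; the file name is
    # hypothetical). save() pickles the serialized parser, and Lark.load()
    # reconstructs a Lark instance from it without re-analyzing the grammar:
    #
    #     with open('my_parser.pickle', 'wb') as f:
    #         parser.save(f)
    #     with open('my_parser.pickle', 'rb') as f:
    #         parser2 = Lark.load(f)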

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        """Create an instance of Lark with the grammar given by its filename

        If rel_to is provided, the function will find the grammar filename in relation to it.

        Example:

            >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
            Lark(...)
        """
        if rel_to:
            basepath = os.path.dirname(rel_to)
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)

    def __repr__(self):
        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)

    def lex(self, text):
        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
        if not hasattr(self, 'lexer'):
            self.lexer = self._build_lexer()
        stream = self.lexer.lex(text)
        if self.options.postlex:
            return self.options.postlex.process(stream)
        return stream
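
    # A short lex() sketch (commented out), assuming a parser built with
    # lexer='standard'. It yields Token objects without running the parser:
    #
    #     for token in parser.lex("hello, world!"):
    #         print(token.type, repr(token))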

    def get_terminal(self, name):
        "Get information about a terminal"
        return self._terminals_dict[name]

    def parse(self, text, start=None, on_error=None):
        """Parse the given text, according to the options provided.

        Parameters:
            start: str - required if Lark was given multiple possible start symbols (using the start option).
            on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only.

        Returns a tree, unless specified otherwise.
        """
        try:
            return self.parser.parse(text, start=start)
        except UnexpectedToken as e:
            if on_error is None:
                raise

            while True:
                if not on_error(e):
                    raise e
                try:
                    return e.puppet.resume_parse()
                except UnexpectedToken as e2:
                    e = e2

###}
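
# A hedged sketch of the on_error protocol above (commented out; LALR only).
# The handler receives the UnexpectedToken error and returns a truthy value
# to resume parsing via the parser puppet, or a falsy one to re-raise.
# `text_with_errors` is a placeholder input:
#
#     def skip_bad_token(e):
#         print('skipping', e.token)
#         return True   # resume parsing past the offending token
#
#     tree = parser.parse(text_with_errors, on_error=skip_bad_token)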