This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2330 lines
92 KiB

  1. # The file was automatically generated by Lark v0.9.0
  2. __version__ = "0.9.0"
  3. #
  4. #
  5. # Lark Stand-alone Generator Tool
  6. # ----------------------------------
  7. # Generates a stand-alone LALR(1) parser with a standard lexer
  8. #
  9. # Git: https://github.com/erezsh/lark
  10. # Author: Erez Shinan (erezshin@gmail.com)
  11. #
  12. #
  13. # >>> LICENSE
  14. #
  15. # This tool and its generated code use a separate license from Lark,
  16. # and are subject to the terms of the Mozilla Public License, v. 2.0.
  17. # If a copy of the MPL was not distributed with this
  18. # file, You can obtain one at https://mozilla.org/MPL/2.0/.
  19. #
  20. # If you wish to purchase a commercial license for this tool and its
  21. # generated code, you may contact me via email or otherwise.
  22. #
  23. # If MPL2 is incompatible with your free or open-source project,
  24. # contact me and we'll work it out.
  25. #
  26. #
  27. import os
  28. from io import open
  29. import logging
class LarkError(Exception):
    """Base class for all exceptions raised by Lark."""
    pass
class GrammarError(LarkError):
    """Raised for errors in the grammar definition itself."""
    pass
class ParseError(LarkError):
    """Base class for errors raised while parsing."""
    pass
class LexError(LarkError):
    """Base class for errors raised while lexing."""
    pass
  38. class UnexpectedEOF(ParseError):
  39. def __init__(self, expected):
  40. self.expected = expected
  41. message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
  42. super(UnexpectedEOF, self).__init__(message)
class UnexpectedInput(LarkError):
    """Base for errors that point at a position in the input stream.

    Subclasses set `pos_in_stream` (and usually `state`) so that
    `get_context` and `match_examples` can be used on the instance.
    """
    pos_in_stream = None
    def get_context(self, text, span=40):
        # Returns the text surrounding the error position, plus a '^'
        # marker on the following line pointing at the offending character.
        pos = self.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
        if not isinstance(text, bytes):
            before = text[start:pos].rsplit('\n', 1)[-1]
            after = text[pos:end].split('\n', 1)[0]
            return before + after + '\n' + ' ' * len(before) + '^\n'
        else:
            # Bytes input: same logic with byte literals, decoded for display
            # (backslashreplace keeps non-ascii bytes readable).
            before = text[start:pos].rsplit(b'\n', 1)[-1]
            after = text[pos:end].split(b'\n', 1)[0]
            return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")
    def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
        """ Given a parser instance and a dictionary mapping some label with
        some malformed syntax examples, it'll return the label for the
        example that bests matches the current error.
        It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
        """
        assert self.state is not None, "Not supported for this exception"
        if isinstance(examples, dict):
            examples = examples.items()
        # candidate holds (label, matched_by_token_type_fallback)
        candidate = (None, False)
        for i, (label, example) in enumerate(examples):
            assert not isinstance(example, STRING_TYPE)
            for j, malformed in enumerate(example):
                try:
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    # Only errors that stopped in the same parser state compete.
                    if ut.state == self.state:
                        if use_accepts and ut.accepts != self.accepts:
                            logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                          (self.state, self.accepts, ut.accepts, i, j))
                            continue
                        try:
                            if ut.token == self.token:  # Try exact match first
                                logging.debug("Exact Match at example [%s][%s]" % (i, j))
                                return label
                            if token_type_match_fallback:
                                # Fallback to token types match
                                if (ut.token.type == self.token.type) and not candidate[-1]:
                                    logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
                                    candidate = label, True
                        except AttributeError:
                            # Some UnexpectedInput subclasses carry no `token`.
                            pass
                        if not candidate[0]:
                            logging.debug("Same State match at example [%s][%s]" % (i, j))
                            candidate = label, False
        return candidate[0]
class UnexpectedCharacters(LexError, UnexpectedInput):
    """Raised by the lexer when no terminal matches at the current position."""
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
        self.line = line
        self.column = column
        self.pos_in_stream = lex_pos
        self.state = state
        self.allowed = allowed
        self.considered_tokens = considered_tokens
        # For bytes input, decode just the offending byte for display.
        if isinstance(seq, bytes):
            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
        else:
            _s = seq[lex_pos]
        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
        message += '\n\n' + self.get_context(seq)
        if allowed:
            message += '\nExpecting: %s\n' % allowed
        if token_history:
            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
        super(UnexpectedCharacters, self).__init__(message)
class UnexpectedToken(ParseError, UnexpectedInput):
    """Raised by the parser when it receives a token it cannot accept."""
    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
        # Position info is borrowed from the token when available.
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)
        self.state = state
        self.token = token
        self.expected = expected  # XXX deprecate? `accepts` is better
        self.considered_rules = considered_rules
        self.puppet = puppet
        # TODO Only calculate `accepts()` when we need to display it to the user
        # This will improve performance when doing automatic error handling
        self.accepts = puppet and puppet.accepts()
        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
        super(UnexpectedToken, self).__init__(message)
class VisitError(LarkError):
    """VisitError is raised when visitors are interrupted by an exception
    It provides the following attributes for inspection:
    - obj: the tree node or token it was processing when the exception was raised
    - orig_exc: the exception that caused it to fail
    """
    def __init__(self, rule, obj, orig_exc):
        self.obj = obj
        self.orig_exc = orig_exc
        message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
        super(VisitError, self).__init__(message)
  140. def classify(seq, key=None, value=None):
  141. d = {}
  142. for item in seq:
  143. k = key(item) if (key is not None) else item
  144. v = value(item) if (value is not None) else item
  145. if k in d:
  146. d[k].append(v)
  147. else:
  148. d[k] = [v]
  149. return d
  150. def _deserialize(data, namespace, memo):
  151. if isinstance(data, dict):
  152. if '__type__' in data: # Object
  153. class_ = namespace[data['__type__']]
  154. return class_.deserialize(data, memo)
  155. elif '@' in data:
  156. return memo[data['@']]
  157. return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
  158. elif isinstance(data, list):
  159. return [_deserialize(value, namespace, memo) for value in data]
  160. return data
class Serialize(object):
    """Mixin providing dict-based (de)serialization, driven by the class
    attributes `__serialize_fields__` and (optionally)
    `__serialize_namespace__`. Relies on the module-level `_serialize` helper.
    """
    def memo_serialize(self, types_to_memoize):
        # Serialize self plus a side-table of memoized shared objects.
        memo = SerializeMemoizer(types_to_memoize)
        return self.serialize(memo), memo.serialize()
    def serialize(self, memo=None):
        # Memoized objects are emitted as a back-reference {'@': id}.
        if memo and memo.in_types(self):
            return {'@': memo.memoized.get(self)}
        fields = getattr(self, '__serialize_fields__')
        res = {f: _serialize(getattr(self, f), memo) for f in fields}
        res['__type__'] = type(self).__name__
        # Subclasses may define _serialize() to post-process the dict.
        postprocess = getattr(self, '_serialize', None)
        if postprocess:
            postprocess(res, memo)
        return res
    @classmethod
    def deserialize(cls, data, memo):
        namespace = getattr(cls, '__serialize_namespace__', {})
        namespace = {c.__name__:c for c in namespace}
        fields = getattr(cls, '__serialize_fields__')
        if '@' in data:
            # Back-reference to an already-restored object.
            return memo[data['@']]
        # Bypass __init__; fields are restored directly.
        inst = cls.__new__(cls)
        for f in fields:
            try:
                setattr(inst, f, _deserialize(data[f], namespace, memo))
            except KeyError as e:
                raise KeyError("Cannot find key for class", cls, e)
        # Subclasses may define _deserialize() for post-restore fixups.
        postprocess = getattr(inst, '_deserialize', None)
        if postprocess:
            postprocess()
        return inst
class SerializeMemoizer(Serialize):
    """Tracks objects of selected types during serialization so repeated
    objects are emitted once and referenced by id.
    Relies on the module-level `Enumerator` and `_serialize` helpers.
    """
    __serialize_fields__ = 'memoized',
    def __init__(self, types_to_memoize):
        self.types_to_memoize = tuple(types_to_memoize)
        self.memoized = Enumerator()
    def in_types(self, value):
        # True if `value` should be memoized rather than inlined.
        return isinstance(value, self.types_to_memoize)
    def serialize(self):
        # Serialize the reverse mapping (id -> object).
        return _serialize(self.memoized.reversed(), None)
    @classmethod
    def deserialize(cls, data, namespace, memo):
        return _deserialize(data, namespace, memo)
# Python 2/3 compatibility shims.
try:
    STRING_TYPE = basestring  # Python 2: covers str and unicode
except NameError:   # Python 3
    STRING_TYPE = str
import types
from functools import wraps, partial
from contextlib import contextmanager
# `Str` is the unicode text type on both Python 2 and 3.
Str = type(u'')
try:
    classtype = types.ClassType  # Python2
except AttributeError:
    classtype = type  # Python3
def smart_decorator(f, create_decorator):
    """Wrap `f` via `create_decorator`, adapting to what kind of callable
    `f` is (plain function, class/builtin, bound method, partial, other).

    `create_decorator(fn, with_self)` receives the unwrapped callable and a
    flag saying whether the wrapper should forward a `self` argument.
    """
    if isinstance(f, types.FunctionType):
        return wraps(f)(create_decorator(f, True))
    elif isinstance(f, (classtype, type, types.BuiltinFunctionType)):
        # Classes and builtins take no separate `self`.
        return wraps(f)(create_decorator(f, False))
    elif isinstance(f, types.MethodType):
        # Unwrap the bound method; the wrapper re-supplies `self`.
        return wraps(f)(create_decorator(f.__func__, True))
    elif isinstance(f, partial):
        # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
        return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
    else:
        # Fallback: any other callable object.
        return create_decorator(f.__func__.__call__, True)
# Optional `regex` module (a superset of `re`) enables Unicode-category
# escapes; absence is tolerated and checked at use time.
try:
    import regex
except ImportError:
    regex = None
import sys, re
Py36 = (sys.version_info[:2] >= (3, 6))
import sre_parse
import sre_constants
# Matches Unicode category escapes like \p{Lu}, which `sre_parse` can't handle.
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
  237. def get_regexp_width(expr):
  238. if regex:
  239. # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
  240. # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
  241. # match here below.
  242. regexp_final = re.sub(categ_pattern, 'A', expr)
  243. else:
  244. if re.search(categ_pattern, expr):
  245. raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
  246. regexp_final = expr
  247. try:
  248. return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
  249. except sre_constants.error:
  250. raise ValueError(expr)
  251. from collections import OrderedDict
class Meta:
    """Holder for a tree node's metadata; `empty` is True until populated."""
    def __init__(self):
        self.empty = True
class Tree(object):
    """A parse-tree node: a `data` label plus a list of `children`
    (sub-Trees or leaf values). Metadata is created lazily on first access.
    """
    def __init__(self, data, children, meta=None):
        self.data = data
        self.children = children
        self._meta = meta
    @property
    def meta(self):
        # Lazily allocate the Meta record so unannotated trees stay cheap.
        if self._meta is None:
            self._meta = Meta()
        return self._meta
    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)
    def _pretty_label(self):
        return self.data
    def _pretty(self, level, indent_str):
        # Single non-tree child: render inline on the same row.
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n']
        l = [ indent_str*level, self._pretty_label(), '\n' ]
        for n in self.children:
            if isinstance(n, Tree):
                l += n._pretty(level+1, indent_str)
            else:
                l += [ indent_str*(level+1), '%s' % (n,), '\n' ]
        return l
    def pretty(self, indent_str=' '):
        """Return an indented multi-line string rendering of the tree."""
        return ''.join(self._pretty(0, indent_str))
    def __eq__(self, other):
        try:
            return self.data == other.data and self.children == other.children
        except AttributeError:
            # `other` isn't tree-like.
            return False
    def __ne__(self, other):
        return not (self == other)
    def __hash__(self):
        return hash((self.data, tuple(self.children)))
    def iter_subtrees(self):
        # Collect breadth-first, de-duplicated by id, then yield reversed
        # so children come before their parents (bottom-up order).
        queue = [self]
        subtrees = OrderedDict()
        for subtree in queue:
            subtrees[id(subtree)] = subtree
            queue += [c for c in reversed(subtree.children)
                      if isinstance(c, Tree) and id(c) not in subtrees]
        del queue
        return reversed(list(subtrees.values()))
    def find_pred(self, pred):
        "Find all nodes where pred(tree) == True"
        return filter(pred, self.iter_subtrees())
    def find_data(self, data):
        "Find all nodes where tree.data == data"
        return self.find_pred(lambda t: t.data == data)
  305. from inspect import getmembers, getmro
class Discard(Exception):
    """Raised inside a transformer callback to drop the current node
    (caught in Transformer._transform_children)."""
    pass
  308. # Transformers
class _Decoratable:
    "Provides support for decorating methods with @v_args"
    @classmethod
    def _apply_decorator(cls, decorator, **kwargs):
        # Decorate every public callback defined on `cls` itself.
        mro = getmro(cls)
        assert mro[0] is cls
        # Names provided by base classes (unless overridden on cls).
        libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
        for name, value in getmembers(cls):
            # Make sure the function isn't inherited (unless it's overwritten)
            if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
                continue
            if not callable(value):
                continue
            # Skip if v_args already applied (at the function level)
            if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'):
                continue
            # static/classmethod wrappers need different handling downstream.
            static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
            setattr(cls, name, decorator(value, static=static, **kwargs))
        return cls
    def __class_getitem__(cls, _):
        # Allow subscripting (e.g. for typing) as a no-op.
        return cls
class Transformer(_Decoratable):
    """Visits the tree recursively, starting with the leaves and finally the root (bottom-up)
    Calls its methods (provided by user via inheritance) according to tree.data
    The returned value replaces the old one in the structure.
    Can be used to implement map or reduce.
    """
    __visit_tokens__ = True  # For backwards compatibility
    def __init__(self, visit_tokens=True):
        self.__visit_tokens__ = visit_tokens
    def _call_userfunc(self, tree, new_children=None):
        # Assumes tree is already transformed
        children = new_children if new_children is not None else tree.children
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            # No user callback for this rule name.
            return self.__default__(tree.data, children, tree.meta)
        else:
            try:
                # v_args-decorated callbacks carry a `visit_wrapper` that
                # controls how arguments are passed.
                wrapper = getattr(f, 'visit_wrapper', None)
                if wrapper is not None:
                    return f.visit_wrapper(f, tree.data, children, tree.meta)
                else:
                    return f(children)
            except (GrammarError, Discard):
                # Control-flow exceptions: propagate unchanged.
                raise
            except Exception as e:
                raise VisitError(tree.data, tree, e)
    def _call_userfunc_token(self, token):
        # Dispatch on token.type, falling back to __default_token__.
        try:
            f = getattr(self, token.type)
        except AttributeError:
            return self.__default_token__(token)
        else:
            try:
                return f(token)
            except (GrammarError, Discard):
                raise
            except Exception as e:
                raise VisitError(token.type, token, e)
    def _transform_children(self, children):
        # Yield transformed children; a callback raising Discard drops one.
        for c in children:
            try:
                if isinstance(c, Tree):
                    yield self._transform_tree(c)
                elif self.__visit_tokens__ and isinstance(c, Token):
                    yield self._call_userfunc_token(c)
                else:
                    yield c
            except Discard:
                pass
    def _transform_tree(self, tree):
        children = list(self._transform_children(tree.children))
        return self._call_userfunc(tree, children)
    def transform(self, tree):
        """Transform `tree` bottom-up and return the result."""
        return self._transform_tree(tree)
    def __mul__(self, other):
        # `t1 * t2` chains transformers, applying t1 first.
        return TransformerChain(self, other)
    def __default__(self, data, children, meta):
        "Default operation on tree (for override)"
        return Tree(data, children, meta)
    def __default_token__(self, token):
        "Default operation on token (for override)"
        return token
class InlineTransformer(Transformer):  # XXX Deprecated
    """Like Transformer, but calls user callbacks with children unpacked
    as positional arguments instead of a single list."""
    def _call_userfunc(self, tree, new_children=None):
        # Assumes tree is already transformed
        children = new_children if new_children is not None else tree.children
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, children, tree.meta)
        else:
            return f(*children)
  403. class TransformerChain(object):
  404. def __init__(self, *transformers):
  405. self.transformers = transformers
  406. def transform(self, tree):
  407. for t in self.transformers:
  408. tree = t.transform(tree)
  409. return tree
  410. def __mul__(self, other):
  411. return TransformerChain(*self.transformers + (other,))
class Transformer_InPlace(Transformer):
    "Non-recursive. Changes the tree in-place instead of returning new instances"
    def _transform_tree(self, tree):  # Cancel recursion
        return self._call_userfunc(tree)
    def transform(self, tree):
        # iter_subtrees yields children before parents, so each node's
        # children are already transformed when the node is visited.
        for subtree in tree.iter_subtrees():
            subtree.children = list(self._transform_children(subtree.children))
        return self._transform_tree(tree)
class Transformer_NonRecursive(Transformer):
    "Non-recursive. Doesn't change the original tree."
    def transform(self, tree):
        # Tree to postfix
        rev_postfix = []
        q = [tree]
        while q:
            t = q.pop()
            rev_postfix.append( t )
            if isinstance(t, Tree):
                q += t.children
        # Postfix to tree
        stack = []
        for x in reversed(rev_postfix):
            if isinstance(x, Tree):
                size = len(x.children)
                if size:
                    # This node's (already transformed) children sit on top
                    # of the stack.
                    args = stack[-size:]
                    del stack[-size:]
                else:
                    args = []
                stack.append(self._call_userfunc(x, args))
            else:
                # Leaves pass through unchanged.
                stack.append(x)
        t ,= stack  # We should have only one tree remaining
        return t
class Transformer_InPlaceRecursive(Transformer):
    "Recursive. Changes the tree in-place instead of returning new instances"
    def _transform_tree(self, tree):
        # Mutates children in place before invoking the callback.
        tree.children = list(self._transform_children(tree.children))
        return self._call_userfunc(tree)
  451. # Visitors
  452. class VisitorBase:
  453. def _call_userfunc(self, tree):
  454. return getattr(self, tree.data, self.__default__)(tree)
  455. def __default__(self, tree):
  456. "Default operation on tree (for override)"
  457. return tree
  458. def __class_getitem__(cls, _):
  459. return cls
class Visitor(VisitorBase):
    """Bottom-up visitor, non-recursive
    Visits the tree, starting with the leaves and finally the root (bottom-up)
    Calls its methods (provided by user via inheritance) according to tree.data
    """
    def visit(self, tree):
        # iter_subtrees yields children before parents (bottom-up).
        for subtree in tree.iter_subtrees():
            self._call_userfunc(subtree)
        return tree
    def visit_topdown(self,tree):
        # NOTE(review): relies on tree.iter_subtrees_topdown(), which is not
        # defined on the Tree class in this chunk — presumably provided
        # elsewhere; verify before use.
        for subtree in tree.iter_subtrees_topdown():
            self._call_userfunc(subtree)
        return tree
class Visitor_Recursive(VisitorBase):
    """Bottom-up visitor, recursive
    Visits the tree, starting with the leaves and finally the root (bottom-up)
    Calls its methods (provided by user via inheritance) according to tree.data
    """
    def visit(self, tree):
        # Recurse into children first, then handle this node (bottom-up).
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)
        self._call_userfunc(tree)
        return tree
    def visit_topdown(self,tree):
        # Handle this node first, then recurse (top-down).
        self._call_userfunc(tree)
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit_topdown(child)
        return tree
  490. def visit_children_decor(func):
  491. "See Interpreter"
  492. @wraps(func)
  493. def inner(cls, tree):
  494. values = cls.visit_children(tree)
  495. return func(cls, values)
  496. return inner
class Interpreter(_Decoratable):
    """Top-down visitor, recursive
    Visits the tree, starting with the root and finally the leaves (top-down)
    Calls its methods (provided by user via inheritance) according to tree.data
    Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
    The user has to explicitly call visit, visit_children, or use the @visit_children_decor
    """
    def visit(self, tree):
        f = getattr(self, tree.data)
        # v_args-decorated callbacks carry a `visit_wrapper` controlling
        # how arguments are passed.
        wrapper = getattr(f, 'visit_wrapper', None)
        if wrapper is not None:
            return f.visit_wrapper(f, tree.data, tree.children, tree.meta)
        else:
            return f(tree)
    def visit_children(self, tree):
        # Non-Tree children are passed through unchanged.
        return [self.visit(child) if isinstance(child, Tree) else child
                for child in tree.children]
    def __getattr__(self, name):
        # Any undefined rule method falls back to __default__.
        return self.__default__
    def __default__(self, tree):
        return self.visit_children(tree)
  518. # Decorators
  519. def _apply_decorator(obj, decorator, **kwargs):
  520. try:
  521. _apply = obj._apply_decorator
  522. except AttributeError:
  523. return decorator(obj, **kwargs)
  524. else:
  525. return _apply(decorator, **kwargs)
def _inline_args__func(func):
    # Build a wrapper that unpacks the `children` list into positional
    # arguments, adapting via smart_decorator to the kind of callable given.
    @wraps(func)
    def create_decorator(_f, with_self):
        if with_self:
            def f(self, children):
                return _f(self, *children)
        else:
            def f(self, children):
                return _f(*children)
        return f
    return smart_decorator(func, create_decorator)
def inline_args(obj):  # XXX Deprecated
    # Applies children-unpacking decoration to a function or class.
    return _apply_decorator(obj, _inline_args__func)
def _visitor_args_func_dec(func, visit_wrapper=None, static=False):
    # Wrap a user visitor callback, tagging the result with `vargs_applied`
    # (so it isn't wrapped twice; see _Decoratable._apply_decorator) and
    # `visit_wrapper` (the argument-passing strategy; see Transformer).
    def create_decorator(_f, with_self):
        if with_self:
            def f(self, *args, **kwargs):
                return _f(self, *args, **kwargs)
        else:
            def f(self, *args, **kwargs):
                return _f(*args, **kwargs)
        return f
    if static:
        # static/classmethods never receive `self`.
        f = wraps(func)(create_decorator(func, False))
    else:
        f = smart_decorator(func, create_decorator)
    f.vargs_applied = True
    f.visit_wrapper = visit_wrapper
    return f
def _vargs_inline(f, data, children, meta):
    # visit_wrapper: pass children as positional arguments.
    return f(*children)
def _vargs_meta_inline(f, data, children, meta):
    # visit_wrapper: meta first, then children as positional arguments.
    return f(meta, *children)
def _vargs_meta(f, data, children, meta):
    # visit_wrapper: children list plus meta.
    return f(children, meta)  # TODO swap these for consistency? Backwards incompatible!
def _vargs_tree(f, data, children, meta):
    # visit_wrapper: rebuild a Tree and pass it whole.
    return f(Tree(data, children, meta))
def v_args(inline=False, meta=False, tree=False, wrapper=None):
    "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
    if tree and (meta or inline):
        raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")
    # Select the argument-passing strategy (a _vargs_* visit_wrapper);
    # `wrapper` is mutually exclusive with the boolean flags.
    func = None
    if meta:
        if inline:
            func = _vargs_meta_inline
        else:
            func = _vargs_meta
    elif inline:
        func = _vargs_inline
    elif tree:
        func = _vargs_tree
    if wrapper is not None:
        if func is not None:
            raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
        func = wrapper
    def _visitor_args_dec(obj):
        # Works on a single function or a whole _Decoratable class.
        return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func)
    return _visitor_args_dec
class Indenter:
    """Postlexer that converts newline tokens into INDENT/DEDENT tokens
    (Python-style significant indentation).

    Subclasses are expected to define: NL_type, OPEN_PAREN_types,
    CLOSE_PAREN_types, INDENT_type, DEDENT_type and tab_len (all read below).
    """
    def __init__(self):
        # Real state is initialized in process(); these mark "not started".
        self.paren_level = None
        self.indent_level = None
        assert self.tab_len > 0
    def handle_NL(self, token):
        # Newlines inside parentheses are not significant.
        if self.paren_level > 0:
            return
        yield token
        indent_str = token.rsplit('\n', 1)[1]  # Tabs and spaces
        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
        if indent > self.indent_level[-1]:
            self.indent_level.append(indent)
            yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
        else:
            # Emit one DEDENT per closed indentation level.
            while indent < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
    def _process(self, stream):
        for token in stream:
            if token.type == self.NL_type:
                for t in self.handle_NL(token):
                    yield t
            else:
                yield token
            # Track parenthesis nesting to suppress indentation inside it.
            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0
        # At end of input, close all remaining indentation levels.
        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')
        assert self.indent_level == [0], self.indent_level
    def process(self, stream):
        self.paren_level = 0
        self.indent_level = [0]
        return self._process(stream)
    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)
class Symbol(Serialize):
    """Base class for grammar symbols; `is_term` distinguishes Terminal
    from NonTerminal subclasses.
    """
    __slots__ = ('name',)
    is_term = NotImplemented
    def __init__(self, name):
        self.name = name
    def __eq__(self, other):
        assert isinstance(other, Symbol), other
        # Equal only when both kind (terminal-ness) and name match.
        return self.is_term == other.is_term and self.name == other.name
    def __ne__(self, other):
        return not (self == other)
    def __hash__(self):
        return hash(self.name)
    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.name)
    fullrepr = property(__repr__)
class Terminal(Symbol):
    """A terminal grammar symbol.
    `filter_out` presumably marks tokens to be dropped from the parse tree
    — consumers of this flag are outside this chunk.
    """
    __serialize_fields__ = 'name', 'filter_out'
    is_term = True
    def __init__(self, name, filter_out=False):
        self.name = name
        self.filter_out = filter_out
    @property
    def fullrepr(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
class NonTerminal(Symbol):
    """A nonterminal (rule) grammar symbol."""
    __serialize_fields__ = 'name',
    is_term = False
class RuleOptions(Serialize):
    """Per-rule flags collected from the grammar definition."""
    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'
    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1
        self.priority = priority
        self.template_source = template_source
        self.empty_indices = empty_indices
    def __repr__(self):
        # NOTE(review): empty_indices is not shown here, though it is serialized.
        return 'RuleOptions(%r, %r, %r, %r)' % (
            self.keep_all_tokens,
            self.expand1,
            self.priority,
            self.template_source
        )
class Rule(Serialize):
    """
    origin : a symbol
    expansion : a list of symbols
    order : index of this expansion amongst all rules of the same name
    """
    __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
    __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
    def __init__(self, origin, expansion, order=0, alias=None, options=None):
        self.origin = origin
        self.expansion = expansion
        self.alias = alias
        self.order = order
        self.options = options or RuleOptions()
        # Hash is cached; it covers only origin + expansion, matching __eq__.
        self._hash = hash((self.origin, tuple(self.expansion)))
    def _deserialize(self):
        # Post-deserialization hook (see Serialize.deserialize): restore
        # the cached hash, which is not serialized.
        self._hash = hash((self.origin, tuple(self.expansion)))
    def __str__(self):
        return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
    def __repr__(self):
        return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
    def __hash__(self):
        return self._hash
    def __eq__(self, other):
        # alias/options/order are deliberately excluded, consistent with __hash__.
        if not isinstance(other, Rule):
            return False
        return self.origin == other.origin and self.expansion == other.expansion
  697. from copy import copy
class Pattern(Serialize):
    """Abstract terminal pattern: a value plus a frozenset of regex flags.
    Subclasses implement `to_regexp`.
    """
    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)
    def __repr__(self):
        return repr(self.to_regexp())
    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))
    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags
    def to_regexp(self):
        raise NotImplementedError()
    if Py36:
        # Python 3.6 changed syntax for flags in regular expression
        def _get_flags(self, value):
            # Scoped inline flags: (?f:...)
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value
    else:
        def _get_flags(self, value):
            # Global inline flag prefix: (?f)...
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value
class PatternStr(Pattern):
    """A literal-string pattern; regexp-escaped when compiled."""
    __serialize_fields__ = 'value', 'flags'
    type = "str"
    def to_regexp(self):
        return self._get_flags(re.escape(self.value))
    @property
    def min_width(self):
        return len(self.value)
    # A literal always matches exactly its own length.
    max_width = min_width
class PatternRE(Pattern):
    """A regular-expression pattern; match width is computed lazily."""
    __serialize_fields__ = 'value', 'flags', '_width'
    type = "re"
    def to_regexp(self):
        return self._get_flags(self.value)
    # Cached [min, max] width; filled on first access.
    _width = None
    def _get_width(self):
        if self._width is None:
            self._width = get_regexp_width(self.to_regexp())
        return self._width
    @property
    def min_width(self):
        return self._get_width()[0]
    @property
    def max_width(self):
        return self._get_width()[1]
class TerminalDef(Serialize):
    """Definition of a terminal: name, pattern and matching priority."""
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE
    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority
    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
class Token(Str):
    """A lexed token. Subclasses the string type, so it compares and behaves
    like its text value while carrying type and position attributes.
    """
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
        try:
            self = super(Token, cls).__new__(cls, value)
        except UnicodeDecodeError:
            # Py2 bytes that fail default decoding: fall back to latin1.
            value = value.decode('latin1')
            self = super(Token, cls).__new__(cls, value)
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = end_line
        self.end_column = end_column
        self.end_pos = end_pos
        return self
    def update(self, type_=None, value=None):
        # Return a copy with type and/or value replaced, positions kept.
        return Token.new_borrow_pos(
            type_ if type_ is not None else self.type,
            value if value is not None else self.value,
            self
        )
    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        """Alternate constructor: copy all position info from `borrow_t`."""
        return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
    def __reduce__(self):
        # Pickle support; note the end_* attributes are not preserved.
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
    def __deepcopy__(self, memo):
        # Shallow field copy; end_* attributes are not preserved here either.
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
    def __eq__(self, other):
        # Two Tokens with equal text but different types are unequal.
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)
    __hash__ = Str.__hash__
  794. class LineCounter:
  795. def __init__(self, newline_char):
  796. self.newline_char = newline_char
  797. self.char_pos = 0
  798. self.line = 1
  799. self.column = 1
  800. self.line_start_pos = 0
  801. def feed(self, token, test_newline=True):
  802. """Consume a token and calculate the new line & column.
  803. As an optional optimization, set test_newline=False is token doesn't contain a newline.
  804. """
  805. if test_newline:
  806. newlines = token.count(self.newline_char)
  807. if newlines:
  808. self.line += newlines
  809. self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
  810. self.char_pos += len(token)
  811. self.column = self.char_pos - self.line_start_pos + 1
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state  # parser state; used only for error reporting
    def lex(self, stream, newline_types, ignore_types):
        """Yield Tokens from *stream* until it is exhausted.

        ``newline_types``: terminal names whose matches may contain newlines
        (they need a full line/column recount on feed).
        ``ignore_types``: terminal names that are matched but never yielded.
        Raises UnexpectedCharacters when nothing matches at the current position.
        """
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
        last_token = None
        while line_ctr.char_pos < len(stream):
            # Re-read self.lexer each iteration: the ContextualLexer swaps it between tokens.
            lexer = self.lexer
            res = lexer.match(stream, line_ctr.char_pos)
            if not res:
                # Collect the terminals that were allowed here, for the error message.
                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
            value, type_ = res
            if type_ not in ignore_types:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                line_ctr.feed(value, type_ in newline_types)
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in lexer.callback:
                    # User callbacks may transform the token, but must return a Token.
                    t = lexer.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                yield t
                last_token = t
            else:
                # Ignored terminal: still run its callback (if any) for side effects,
                # but don't yield it.
                if type_ in lexer.callback:
                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    lexer.callback[type_](t2)
                line_ctr.feed(value, type_ in newline_types)
  848. class UnlessCallback:
  849. def __init__(self, mres):
  850. self.mres = mres
  851. def __call__(self, t):
  852. for mre, type_from_index in self.mres:
  853. m = mre.match(t.value)
  854. if m:
  855. t.type = type_from_index[m.lastindex]
  856. break
  857. return t
  858. class CallChain:
  859. def __init__(self, callback1, callback2, cond):
  860. self.callback1 = callback1
  861. self.callback2 = callback2
  862. self.cond = cond
  863. def __call__(self, t):
  864. t2 = self.callback1(t)
  865. return self.callback2(t) if self.cond(t2) else t2
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    """Implement the implicit 'unless' behavior between literal and regexp terminals.

    When a string-literal terminal is fully matched by a regexp terminal of
    equal-or-higher priority, attach an UnlessCallback to the regexp terminal so
    exact literal matches get re-typed, and drop literals the regexp subsumes
    (compatible flags) from the terminal list.
    Returns (filtered_terminals, callbacks_by_terminal_name).
    """
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                # Higher-priority literals win on their own; no callback needed.
                continue
            s = strtok.pattern.value
            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
            if m and m.group(0) == s:
                # The regexp matches the literal in full.
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
    # Literals fully embedded in a regexp terminal don't need their own matcher.
    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback
  886. def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
  887. # Python sets an unreasonable group limit (currently 100) in its re module
  888. # Worse, the only way to know we reached it is by catching an AssertionError!
  889. # This function recursively tries less and less groups until it's successful.
  890. postfix = '$' if match_whole else ''
  891. mres = []
  892. while terminals:
  893. pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
  894. if use_bytes:
  895. pattern = pattern.encode('latin-1')
  896. try:
  897. mre = re_.compile(pattern, g_regex_flags)
  898. except AssertionError: # Yes, this is what Python provides us.. :/
  899. return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
  900. # terms_from_name = {t.name: t for t in terminals[:max_size]}
  901. mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
  902. terminals = terminals[max_size:]
  903. return mres
  904. def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
  905. return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
  906. def _regexp_has_newline(r):
  907. r"""Expressions that may indicate newlines in a regexp:
  908. - newlines (\n)
  909. - escaped newline (\\n)
  910. - anything but ([^...])
  911. - any-char (.) when the flag (?s) exists
  912. - spaces (\s)
  913. """
  914. return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
class Lexer(object):
    """Lexer interface
    Method Signatures:
        lex(self, stream) -> Iterator[Token]
    """
    # Abstract: subclasses must provide a real `lex` implementation.
    lex = NotImplemented
class TraditionalLexer(Lexer):
    """A standard lexer: tries the whole terminal set at every input position."""
    def __init__(self, conf):
        terminals = list(conf.tokens)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals
        self.re = conf.re_module
        if not conf.skip_validation:
            # Sanitization
            for t in terminals:
                try:
                    self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
                except self.re.error:
                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
                if t.pattern.min_width == 0:
                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
            assert set(conf.ignore) <= {t.name for t in terminals}
        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(conf.ignore)
        # Match order: priority first, then widest pattern, then longest literal.
        terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes
        self._mres = None  # compiled lazily on first use (see the `mres` property)
        # self.build(g_regex_flags)
    def _build(self):
        # Set up 'unless' callbacks, merge in user callbacks, then compile the regexps.
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
        assert all(self.callback.values())
        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f
        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
    @property
    def mres(self):
        # Lazy compilation of the combined terminal regexps.
        if self._mres is None:
            self._build()
        return self._mres
    def match(self, stream, pos):
        """Return (matched_text, terminal_name) at *pos*, or None if nothing matches."""
        for mre, type_from_index in self.mres:
            m = mre.match(stream, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]
    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
class ContextualLexer(Lexer):
    """A lexer that only matches terminals acceptable in the current parser state.

    Builds one TraditionalLexer per distinct accept-set (shared between parser
    states that accept the same terminals), plus a root lexer over the whole
    terminal set, used to produce nicer error messages.
    """
    def __init__(self, conf, states, always_accept=()):
        terminals = list(conf.tokens)
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t
        trad_conf = copy(conf)
        trad_conf.tokens = terminals
        lexer_by_tokens = {}  # cache: frozen accept-set -> shared TraditionalLexer
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                # Ignored terminals and postlex-required terminals are always acceptable.
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer_conf = copy(trad_conf)
                lexer_conf.tokens = state_tokens
                lexer = TraditionalLexer(lexer_conf)
                lexer_by_tokens[key] = lexer
            self.lexers[state] = lexer
        assert trad_conf.tokens is terminals
        self.root_lexer = TraditionalLexer(trad_conf)
    def lex(self, stream, get_parser_state):
        """Tokenize *stream*, switching sub-lexers as the parser state advances."""
        parser_state = get_parser_state()
        l = _Lex(self.lexers[parser_state], parser_state)
        try:
            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
                yield x
                # After each token, resync to the parser's new state.
                parser_state = get_parser_state()
                l.lexer = self.lexers[parser_state]
                l.state = parser_state # For debug only, no need to worry about multithreading
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
            # but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            root_match = self.root_lexer.match(stream, e.pos_in_stream)
            if not root_match:
                raise
            value, type_ = root_match
            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
            raise UnexpectedToken(t, e.allowed, state=e.state)
class LexerConf(Serialize):
    """Plain configuration bundle handed to the lexer classes."""
    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
    __serialize_namespace__ = TerminalDef,
    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
        self.tokens = tokens # TODO should be terminals
        self.ignore = ignore                    # terminal names to match but not emit
        self.postlex = postlex                  # optional token-stream post-processor
        self.callbacks = callbacks or {}        # per-terminal user callbacks
        self.g_regex_flags = g_regex_flags      # flags applied to every terminal regexp
        self.re_module = re_module              # `re`, or the `regex` package
        self.skip_validation = skip_validation  # True when loading a pre-validated (serialized) grammar
        self.use_bytes = use_bytes              # lex `bytes` input instead of `str`
  1024. from functools import partial, wraps
  1025. from itertools import repeat, product
  1026. class ExpandSingleChild:
  1027. def __init__(self, node_builder):
  1028. self.node_builder = node_builder
  1029. def __call__(self, children):
  1030. if len(children) == 1:
  1031. return children[0]
  1032. else:
  1033. return self.node_builder(children)
class PropagatePositions:
    """Wrapper: copy position metadata from the first/last positioned child onto the new node."""
    def __init__(self, node_builder):
        self.node_builder = node_builder
    def __call__(self, children):
        res = self.node_builder(children)
        # local reference to Tree.meta reduces number of presence checks
        if isinstance(res, Tree):
            res_meta = res.meta
            # The first child carrying position info supplies line/column/start_pos.
            for c in children:
                if isinstance(c, Tree):
                    child_meta = c.meta
                    if not child_meta.empty:
                        res_meta.line = child_meta.line
                        res_meta.column = child_meta.column
                        res_meta.start_pos = child_meta.start_pos
                        res_meta.empty = False
                        break
                elif isinstance(c, Token):
                    res_meta.line = c.line
                    res_meta.column = c.column
                    res_meta.start_pos = c.pos_in_stream
                    res_meta.empty = False
                    break
            # The last child carrying position info supplies end_line/end_column/end_pos.
            for c in reversed(children):
                if isinstance(c, Tree):
                    child_meta = c.meta
                    if not child_meta.empty:
                        res_meta.end_line = child_meta.end_line
                        res_meta.end_column = child_meta.end_column
                        res_meta.end_pos = child_meta.end_pos
                        res_meta.empty = False
                        break
                elif isinstance(c, Token):
                    res_meta.end_line = c.end_line
                    res_meta.end_column = c.end_column
                    res_meta.end_pos = c.end_pos
                    res_meta.empty = False
                    break
        return res
  1073. class ChildFilter:
  1074. def __init__(self, to_include, append_none, node_builder):
  1075. self.node_builder = node_builder
  1076. self.to_include = to_include
  1077. self.append_none = append_none
  1078. def __call__(self, children):
  1079. filtered = []
  1080. for i, to_expand, add_none in self.to_include:
  1081. if add_none:
  1082. filtered += [None] * add_none
  1083. if to_expand:
  1084. filtered += children[i].children
  1085. else:
  1086. filtered.append(children[i])
  1087. if self.append_none:
  1088. filtered += [None] * self.append_none
  1089. return self.node_builder(filtered)
  1090. class ChildFilterLALR(ChildFilter):
  1091. "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
  1092. def __call__(self, children):
  1093. filtered = []
  1094. for i, to_expand, add_none in self.to_include:
  1095. if add_none:
  1096. filtered += [None] * add_none
  1097. if to_expand:
  1098. if filtered:
  1099. filtered += children[i].children
  1100. else: # Optimize for left-recursion
  1101. filtered = children[i].children
  1102. else:
  1103. filtered.append(children[i])
  1104. if self.append_none:
  1105. filtered += [None] * self.append_none
  1106. return self.node_builder(filtered)
  1107. class ChildFilterLALR_NoPlaceholders(ChildFilter):
  1108. "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
  1109. def __init__(self, to_include, node_builder):
  1110. self.node_builder = node_builder
  1111. self.to_include = to_include
  1112. def __call__(self, children):
  1113. filtered = []
  1114. for i, to_expand in self.to_include:
  1115. if to_expand:
  1116. if filtered:
  1117. filtered += children[i].children
  1118. else: # Optimize for left-recursion
  1119. filtered = children[i].children
  1120. else:
  1121. filtered.append(children[i])
  1122. return self.node_builder(filtered)
  1123. def _should_expand(sym):
  1124. return not sym.is_term and sym.name.startswith('_')
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices):
    """Return a ChildFilter factory for this rule expansion, or None when no filtering is needed.

    ``_empty_indices`` (set by the maybe_placeholders option) is a boolean list
    marking where a `[]` matched nothing and a None placeholder must be inserted.
    """
    # Prepare empty_indices as: How many Nones to insert at each index?
    if _empty_indices:
        assert _empty_indices.count(False) == len(expansion)
        # Encode as a 0/1 string and count runs of 1s between the 0s (real symbols).
        s = ''.join(str(int(b)) for b in _empty_indices)
        empty_indices = [len(ones) for ones in s.split('0')]
        assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion))
    else:
        empty_indices = [0] * (len(expansion)+1)
    to_include = []
    nones_to_add = 0
    for i, sym in enumerate(expansion):
        nones_to_add += empty_indices[i]
        if keep_all_tokens or not (sym.is_term and sym.filter_out):
            to_include.append((i, _should_expand(sym), nones_to_add))
            nones_to_add = 0
    nones_to_add += empty_indices[len(expansion)]
    # Only build a filter when something is dropped, expanded, or placeholder-filled.
    if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include):
        if _empty_indices or ambiguous:
            return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add)
        else:
            # LALR without placeholders
            return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include])
class AmbiguousExpander:
    """Deal with the case where we're expanding children ('_rule') into a parent but the children
    are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
    ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children
    into the right parents in the right places, essentially shifting the ambiguity up the tree."""
    def __init__(self, to_expand, tree_class, node_builder):
        self.node_builder = node_builder
        self.tree_class = tree_class
        self.to_expand = to_expand  # indices of children that get inlined into the parent
    def __call__(self, children):
        def _is_ambig_tree(child):
            return hasattr(child, 'data') and child.data == '_ambig'
        #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities.
        # All children of an _ambig node should be a derivation of that ambig node, hence
        # it is safe to assume that if we see an _ambig node nested within an ambig node
        # it is safe to simply expand it into the parent _ambig node as an alternative derivation.
        ambiguous = []
        for i, child in enumerate(children):
            if _is_ambig_tree(child):
                if i in self.to_expand:
                    ambiguous.append(i)
                # Flatten nested _ambig nodes into this one.
                to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)]
                child.expand_kids_by_index(*to_expand)
        if not ambiguous:
            return self.node_builder(children)
        # Build one parent copy per combination of ambiguous alternatives.
        expand = [ iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children) ]
        return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))])
  1175. def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
  1176. to_expand = [i for i, sym in enumerate(expansion)
  1177. if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))]
  1178. if to_expand:
  1179. return partial(AmbiguousExpander, to_expand, tree_class)
  1180. def ptb_inline_args(func):
  1181. @wraps(func)
  1182. def f(children):
  1183. return func(*children)
  1184. return f
  1185. def inplace_transformer(func):
  1186. @wraps(func)
  1187. def f(children):
  1188. # function name in a Transformer is a rule name.
  1189. tree = Tree(func.__name__, children)
  1190. return func(tree)
  1191. return f
  1192. def apply_visit_wrapper(func, name, wrapper):
  1193. if wrapper is _vargs_meta or wrapper is _vargs_meta_inline:
  1194. raise NotImplementedError("Meta args not supported for internal transformer")
  1195. @wraps(func)
  1196. def f(children):
  1197. return wrapper(func, name, children, None)
  1198. return f
class ParseTreeBuilder:
    """Compiles, per grammar rule, the chain of tree-building callbacks
    (child filtering, position propagation, ambiguity expansion, user transformer)."""
    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
        self.tree_class = tree_class
        self.propagate_positions = propagate_positions
        self.always_keep_all_tokens = keep_all_tokens
        self.ambiguous = ambiguous
        self.maybe_placeholders = maybe_placeholders
        self.rule_builders = list(self._init_builders(rules))
    def _init_builders(self, rules):
        # Yield (rule, wrapper_chain). Falsy entries are filtered out, so each
        # wrapper is applied only when its feature is enabled for this rule.
        for rule in rules:
            options = rule.options
            keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
            expand_single_child = options.expand1
            wrapper_chain = list(filter(None, [
                (expand_single_child and not rule.alias) and ExpandSingleChild,
                maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
                self.propagate_positions and PropagatePositions,
                self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
            ]))
            yield rule, wrapper_chain
    def create_callback(self, transformer=None):
        """Build the callback for every rule, optionally routed through *transformer*."""
        callbacks = {}
        for rule, wrapper_chain in self.rule_builders:
            user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
            try:
                f = getattr(transformer, user_callback_name)
                # XXX InlineTransformer is deprecated!
                wrapper = getattr(f, 'visit_wrapper', None)
                if wrapper is not None:
                    f = apply_visit_wrapper(f, user_callback_name, wrapper)
                else:
                    if isinstance(transformer, InlineTransformer):
                        f = ptb_inline_args(f)
                    elif isinstance(transformer, Transformer_InPlace):
                        f = inplace_transformer(f)
            except AttributeError:
                # No transformer method for this rule: just build a tree node.
                f = partial(self.tree_class, user_callback_name)
            for w in wrapper_chain:
                f = w(f)
            if rule in callbacks:
                raise GrammarError("Rule '%s' already exists" % (rule,))
            callbacks[rule] = f
        return callbacks
class LALR_Parser(object):
    """Thin wrapper tying LALR analysis (grammar -> parse table) to the runtime _Parser."""
    def __init__(self, parser_conf, debug=False):
        assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
        analysis = LALR_Analyzer(parser_conf, debug=debug)
        analysis.compute_lalr()
        callbacks = parser_conf.callbacks
        self._parse_table = analysis.parse_table
        self.parser_conf = parser_conf
        self.parser = _Parser(analysis.parse_table, callbacks, debug)
    @classmethod
    def deserialize(cls, data, memo, callbacks):
        # Rebuild a parser from a serialized parse table, skipping grammar analysis.
        inst = cls.__new__(cls)
        inst._parse_table = IntParseTable.deserialize(data, memo)
        inst.parser = _Parser(inst._parse_table, callbacks)
        return inst
    def serialize(self, memo):
        return self._parse_table.serialize(memo)
    def parse(self, *args):
        return self.parser.parse(*args)
class _Parser:
    """The LALR(1) runtime: drives shift/reduce over a token stream using a parse table."""
    def __init__(self, parse_table, callbacks, debug=False):
        self.parse_table = parse_table
        self.callbacks = callbacks  # rule -> tree-building callback
        self.debug = debug
    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
        """Parse token sequence *seq* from start symbol *start*; return the root value.

        ``set_state`` (if given) is notified of every shift, which is how the
        contextual lexer follows the parser. Raises UnexpectedToken on error.
        """
        token = None
        stream = iter(seq)
        states = self.parse_table.states
        start_state = self.parse_table.start_states[start]
        end_state = self.parse_table.end_states[start]
        state_stack = state_stack or [start_state]
        value_stack = value_stack or []
        if set_state: set_state(start_state)
        def get_action(token):
            # Look up (action, argument) for the current state and token type.
            state = state_stack[-1]
            try:
                return states[state][token.type]
            except KeyError:
                # Terminal names are uppercase; everything else is a rule (goto entry).
                expected = {s for s in states[state].keys() if s.isupper()}
                try:
                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
                except NameError: # For standalone parser
                    puppet = None
                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
        def reduce(rule):
            # Pop the rule body off both stacks, build its value, then follow the goto.
            size = len(rule.expansion)
            if size:
                s = value_stack[-size:]
                del state_stack[-size:]
                del value_stack[-size:]
            else:
                s = []
            value = self.callbacks[rule](s)
            _action, new_state = states[state_stack[-1]][rule.origin.name]
            assert _action is Shift
            state_stack.append(new_state)
            value_stack.append(value)
        # Main LALR-parser loop
        try:
            for token in stream:
                while True:
                    action, arg = get_action(token)
                    assert arg != end_state
                    if action is Shift:
                        state_stack.append(arg)
                        value_stack.append(token)
                        if set_state: set_state(arg)
                        break # next token
                    else:
                        reduce(arg)
        except Exception as e:
            if self.debug:
                print("")
                print("STATE STACK DUMP")
                print("----------------")
                for i, s in enumerate(state_stack):
                    print('%d)' % i , s)
                print("")
            raise
        # Feed the synthetic $END token, reducing until the accepting state is reached.
        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
        while True:
            _action, arg = get_action(token)
            assert(_action is Reduce)
            reduce(arg)
            if state_stack[-1] == end_state:
                return value_stack[-1]
  1328. class Action:
  1329. def __init__(self, name):
  1330. self.name = name
  1331. def __str__(self):
  1332. return self.name
  1333. def __repr__(self):
  1334. return str(self)
# The two LALR action singletons; parse tables compare against these by identity.
Shift = Action('Shift')
Reduce = Action('Reduce')
class ParseTable:
    """LALR parse table: state -> {terminal-or-rule-name -> (action, argument)}."""
    def __init__(self, states, start_states, end_states):
        self.states = states
        self.start_states = start_states  # start symbol -> initial state
        self.end_states = end_states      # start symbol -> accepting state
    def serialize(self, memo):
        # Wire format: actions encoded as (1, serialized_rule) for Reduce,
        # (0, state) for Shift; token names replaced by enumerated indices.
        tokens = Enumerator()
        rules = Enumerator()
        states = {
            state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in self.states.items()
        }
        return {
            'tokens': tokens.reversed(),
            'states': states,
            'start_states': self.start_states,
            'end_states': self.end_states,
        }
    @classmethod
    def deserialize(cls, data, memo):
        # Inverse of serialize(): restore token names and action singletons.
        tokens = data['tokens']
        states = {
            state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in data['states'].items()
        }
        return cls(states, data['start_states'], data['end_states'])
  1365. class IntParseTable(ParseTable):
  1366. @classmethod
  1367. def from_ParseTable(cls, parse_table):
  1368. enum = list(parse_table.states)
  1369. state_to_idx = {s:i for i,s in enumerate(enum)}
  1370. int_states = {}
  1371. for s, la in parse_table.states.items():
  1372. la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
  1373. for k,v in la.items()}
  1374. int_states[ state_to_idx[s] ] = la
  1375. start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
  1376. end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
  1377. return cls(int_states, start_states, end_states)
  1378. def get_frontend(parser, lexer):
  1379. if parser=='lalr':
  1380. if lexer is None:
  1381. raise ValueError('The LALR parser requires use of a lexer')
  1382. elif lexer == 'standard':
  1383. return LALR_TraditionalLexer
  1384. elif lexer == 'contextual':
  1385. return LALR_ContextualLexer
  1386. elif issubclass(lexer, Lexer):
  1387. class LALR_CustomLexerWrapper(LALR_CustomLexer):
  1388. def __init__(self, lexer_conf, parser_conf, options=None):
  1389. super(LALR_CustomLexerWrapper, self).__init__(
  1390. lexer, lexer_conf, parser_conf, options=options)
  1391. def init_lexer(self):
  1392. self.lexer = lexer(self.lexer_conf)
  1393. return LALR_CustomLexerWrapper
  1394. else:
  1395. raise ValueError('Unknown lexer: %s' % lexer)
  1396. elif parser=='earley':
  1397. if lexer=='standard':
  1398. return Earley
  1399. elif lexer=='dynamic':
  1400. return XEarley
  1401. elif lexer=='dynamic_complete':
  1402. return XEarley_CompleteLex
  1403. elif lexer=='contextual':
  1404. raise ValueError('The Earley parser does not support the contextual parser')
  1405. else:
  1406. raise ValueError('Unknown lexer: %s' % lexer)
  1407. elif parser == 'cyk':
  1408. if lexer == 'standard':
  1409. return CYK
  1410. else:
  1411. raise ValueError('CYK parser requires using standard parser.')
  1412. else:
  1413. raise ValueError('Unknown parser: %s' % parser)
  1414. class _ParserFrontend(Serialize):
  1415. def _parse(self, input, start, *args):
  1416. if start is None:
  1417. start = self.start
  1418. if len(start) > 1:
  1419. raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
  1420. start ,= start
  1421. return self.parser.parse(input, start, *args)
  1422. def _get_lexer_callbacks(transformer, terminals):
  1423. result = {}
  1424. for terminal in terminals:
  1425. callback = getattr(transformer, terminal.name, None)
  1426. if callback is not None:
  1427. result[terminal.name] = callback
  1428. return result
class WithLexer(_ParserFrontend):
    """Frontend base for parsers that use a (traditional or contextual) lexer."""
    lexer = None
    parser = None
    lexer_conf = None
    start = None
    __serialize_fields__ = 'parser', 'lexer_conf', 'start'
    __serialize_namespace__ = LexerConf,
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.lexer_conf = lexer_conf
        self.start = parser_conf.start
        self.postlex = lexer_conf.postlex
    @classmethod
    def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module):
        # Restore a serialized frontend; re-attach everything that was not
        # serialized (postlex, callbacks, regex module), then build the lexer.
        inst = super(WithLexer, cls).deserialize(data, memo)
        inst.postlex = postlex
        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
        inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals)
        inst.lexer_conf.re_module = re_module
        inst.lexer_conf.skip_validation=True  # already validated when first built
        inst.init_lexer()
        return inst
    def _serialize(self, data, memo):
        data['parser'] = data['parser'].serialize(memo)
    def lex(self, *args):
        # Run the lexer, then the postlex pass if one is configured.
        stream = self.lexer.lex(*args)
        return self.postlex.process(stream) if self.postlex else stream
    def parse(self, text, start=None):
        token_stream = self.lex(text)
        return self._parse(token_stream, start)
    def init_traditional_lexer(self):
        self.lexer = TraditionalLexer(self.lexer_conf)
class LALR_WithLexer(WithLexer):
    """LALR frontend; concrete subclasses choose the lexer via init_lexer()."""
    def __init__(self, lexer_conf, parser_conf, options=None):
        debug = options.debug if options else False
        # Build the parser first: the contextual lexer needs its parse table.
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
        self.init_lexer()
    def init_lexer(self, **kw):
        raise NotImplementedError()
class LALR_TraditionalLexer(LALR_WithLexer):
    """LALR parser driven by the standard (context-free) lexer."""
    def init_lexer(self):
        self.init_traditional_lexer()
class LALR_ContextualLexer(LALR_WithLexer):
    """LALR parser driven by the contextual lexer (terminal set varies per parser state)."""
    def init_lexer(self):
        # Map each parser state to the terminal names it can accept.
        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = self.postlex.always_accept if self.postlex else ()
        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
    def parse(self, text, start=None):
        # The lexer pulls the current parser state through this one-element box,
        # which the parser updates via set_parser_state on every shift.
        parser_state = [None]
        def set_parser_state(s):
            parser_state[0] = s
        token_stream = self.lex(text, lambda: parser_state[0])
        return self._parse(token_stream, start, set_parser_state)
class LarkOptions(Serialize):
    """Specifies the options for Lark

    """
    # User-facing documentation for every recognized option.  Appended to the
    # class docstring below, and to Lark.__init__.__doc__ by the Lark class.
    OPTIONS_DOC = """
# General

    start - The start symbol. Either a string, or a list of strings for
            multiple possible starts (Default: "start")
    debug - Display debug information, such as warnings (default: False)
    transformer - Applies the transformer to every parse tree (equivlent to
                  applying it after the parse, but faster)
    propagate_positions - Propagates (line, column, end_line, end_column)
                          attributes into all tree branches.
    maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
                         When `False`, `[]` behaves like the `?` operator,
                         and returns no value at all.
                         (default=`False`. Recommended to set to `True`)
    regex - When True, uses the `regex` module instead of the stdlib `re`.
    cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
            LALR only for now.
            When `False`, does nothing (default)
            When `True`, caches to a temporary file in the local directory
            When given a string, caches to the path pointed by the string
    g_regex_flags - Flags that are applied to all terminals
                    (both regex and strings)
    keep_all_tokens - Prevent the tree builder from automagically
                      removing "punctuation" tokens (default: False)

# Algorithm

    parser - Decides which parser engine to use
             Accepts "earley" or "lalr". (Default: "earley")
             (there is also a "cyk" option for legacy)
    lexer - Decides whether or not to use a lexer stage
        "auto" (default): Choose for me based on the parser
        "standard": Use a standard lexer
        "contextual": Stronger lexer (only works with parser="lalr")
        "dynamic": Flexible and powerful (only with parser="earley")
        "dynamic_complete": Same as dynamic, but tries *every* variation
                            of tokenizing possible.
    ambiguity - Decides how to handle ambiguity in the parse.
                Only relevant if parser="earley"
        "resolve": The parser will automatically choose the simplest
                   derivation (it chooses consistently: greedy for
                   tokens, non-greedy for rules)
        "explicit": The parser will return all derivations wrapped
                    in "_ambig" tree nodes (i.e. a forest).

# Domain Specific

    postlex - Lexer post-processing (Default: None) Only works with the
              standard and contextual lexers.
    priority - How priorities should be evaluated - auto, none, normal,
               invert (Default: auto)
    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                      tokens during lexing. Use with caution.
    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
    edit_terminals - A callback
    """
    if __doc__:
        __doc__ += OPTIONS_DOC

    # Default value for every recognized option.  This dict also defines the
    # complete set of option names accepted by __init__ and __setattr__.
    _defaults = {
        'debug': False,
        'keep_all_tokens': False,
        'tree_class': None,
        'cache': False,
        'postlex': None,
        'parser': 'earley',
        'lexer': 'auto',
        'transformer': None,
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': False,
        'edit_terminals': None,
        'g_regex_flags': 0,
        'use_bytes': False,
    }

    def __init__(self, options_dict):
        """Validate *options_dict* against _defaults and store the result.

        Raises ValueError for unknown option names or invalid combinations.
        """
        o = dict(options_dict)

        options = {}
        for name, default in self._defaults.items():
            if name in o:
                value = o.pop(name)
                # Coerce boolean-typed options to bool, except the ones that
                # legitimately accept non-bool values ('cache' may be a path
                # string, 'use_bytes' may be the string 'force').
                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                    value = bool(value)
            else:
                value = default

            options[name] = value

        # Normalize 'start' to a list of start symbols.
        if isinstance(options['start'], STRING_TYPE):
            options['start'] = [options['start']]

        # Write through __dict__ directly to bypass our own __setattr__,
        # which only accepts already-known option names.
        self.__dict__['options'] = options

        assert self.parser in ('earley', 'lalr', 'cyk', None)

        if self.parser == 'earley' and self.transformer:
            raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.'
                             'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

        # Anything left in o was not popped above, i.e. is not a known option.
        if o:
            raise ValueError("Unknown options: %s" % o.keys())

    def __getattr__(self, name):
        """Expose stored options as attributes (e.g. ``self.parser``)."""
        try:
            return self.options[name]
        except KeyError as e:
            raise AttributeError(e)

    def __setattr__(self, name, value):
        # Only pre-existing option names may be assigned.
        assert name in self.options
        self.options[name] = value

    def serialize(self, memo):
        # The plain options dict is its own serialized form.
        return self.options

    @classmethod
    def deserialize(cls, data, memo):
        """Rebuild a LarkOptions from its serialized (dict) form."""
        return cls(data)
class Lark(Serialize):
    """Main interface of the generated stand-alone parser.

    Builds (or deserializes) a lexer + parser pair from a grammar according
    to the given options, and exposes ``parse``/``lex``.
    """

    def __init__(self, grammar, **options):
        """
        grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
        options : a dictionary controlling various aspects of Lark.
        """
        self.options = LarkOptions(options)

        # Set regex or re module
        use_regex = self.options.regex
        if use_regex:
            if regex:
                re_module = regex
            else:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
        else:
            re_module = re

        # Some, but not all file-like objects have a 'name' attribute
        try:
            self.source = grammar.name
        except AttributeError:
            self.source = '<string>'

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)
        self.grammar_source = grammar
        if self.options.use_bytes:
            # bytes mode only supports ascii grammars, and is risky on py2.
            if not isascii(grammar):
                raise ValueError("Grammar must be ascii only, when use_bytes=True")
            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
                raise NotImplementedError("`use_bytes=True` may have issues on python2."
                                          "Use `use_bytes='force'` to use it at your own risk.")

        cache_fn = None
        if self.options.cache:
            if self.options.parser != 'lalr':
                raise NotImplementedError("cache only works with parser='lalr' for now")
            if isinstance(self.options.cache, STRING_TYPE):
                # cache=<path string>: cache to that exact file.
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ValueError("cache argument must be bool or str")
                # cache=True: derive a cache filename from a hash of the
                # grammar + the hashable options + the library version, so a
                # stale cache is never reused for a different input.
                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                from . import __version__
                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
                s = grammar + options_str + __version__
                md5 = hashlib.md5(s.encode()).hexdigest()
                cache_fn = '.lark_cache_%s.tmp' % md5

            if FS.exists(cache_fn):
                # Cache hit: load the pre-built parser and skip all analysis.
                logging.debug('Loading grammar from cache: %s', cache_fn)
                with FS.open(cache_fn, 'rb') as f:
                    self._load(f, self.options.transformer, self.options.postlex)
                return

        # Resolve lexer='auto' based on the chosen parser engine.
        if self.options.lexer == 'auto':
            if self.options.parser == 'lalr':
                self.options.lexer = 'contextual'
            elif self.options.parser == 'earley':
                self.options.lexer = 'dynamic'
            elif self.options.parser == 'cyk':
                self.options.lexer = 'standard'
            else:
                assert False, self.options.parser
        lexer = self.options.lexer
        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

        # Resolve ambiguity='auto'; only earley/cyk support disambiguation.
        if self.options.ambiguity == 'auto':
            if self.options.parser == 'earley':
                self.options.ambiguity = 'resolve'
        else:
            disambig_parsers = ['earley', 'cyk']
            assert self.options.parser in disambig_parsers, (
                'Only %s supports disambiguation right now') % ', '.join(disambig_parsers)

        # Resolve priority='auto'; priorities are unsupported under LALR.
        if self.options.priority == 'auto':
            if self.options.parser in ('earley', 'cyk', ):
                self.options.priority = 'normal'
            elif self.options.parser in ('lalr', ):
                self.options.priority = None
        elif self.options.priority in ('invert', 'normal'):
            assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time"

        assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority)
        assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"'
        assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

        # Parse the grammar file and compose the grammars (TODO)
        self.grammar = load_grammar(grammar, self.source, re_module)

        # Compile the EBNF grammar into BNF
        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

        # Give the user a chance to rewrite terminals before table building.
        if self.options.edit_terminals:
            for t in self.terminals:
                self.options.edit_terminals(t)

        self._terminals_dict = {t.name: t for t in self.terminals}

        # If the user asked to invert the priorities, negate them all here.
        # This replaces the old 'resolve__antiscore_sum' option.
        if self.options.priority == 'invert':
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = -rule.options.priority
        # Else, if the user asked to disable priorities, strip them from the
        # rules. This allows the Earley parsers to skip an extra forest walk
        # for improved performance, if you don't need them (or didn't specify any).
        elif self.options.priority == None:
            for rule in self.rules:
                if rule.options.priority is not None:
                    rule.options.priority = None

        # TODO Deprecate lexer_callbacks?
        lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals)
                           if self.options.transformer
                           else {})
        lexer_callbacks.update(self.options.lexer_callbacks)

        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

        if self.options.parser:
            self.parser = self._build_parser()
        elif lexer:
            # Lexer-only mode (parser=None): build just the lexer.
            self.lexer = self._build_lexer()

        if cache_fn:
            # Cache miss earlier: persist the freshly built parser.
            logging.debug('Saving grammar to cache: %s', cache_fn)
            with FS.open(cache_fn, 'wb') as f:
                self.save(f)

    if __init__.__doc__:
        __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC

    # Fields included in memo_serialize()/save() output.
    __serialize_fields__ = 'parser', 'rules', 'options'

    def _build_lexer(self):
        """Construct a standard (non-contextual) lexer from lexer_conf."""
        return TraditionalLexer(self.lexer_conf)

    def _prepare_callbacks(self):
        # Choose the parser frontend and build the tree-construction
        # callbacks used by the parser for every rule reduction.
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
        self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)

    def _build_parser(self):
        """Build the parser frontend from the compiled rules and callbacks."""
        self._prepare_callbacks()
        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

    def save(self, f):
        """Serialize this instance (parser tables, rules, options) to file *f*."""
        data, m = self.memo_serialize([TerminalDef, Rule])
        pickle.dump({'data': data, 'memo': m}, f)

    @classmethod
    def load(cls, f):
        """Recreate an instance previously written with ``save`` from file *f*."""
        inst = cls.__new__(cls)
        return inst._load(f)

    def _load(self, f, transformer=None, postlex=None):
        # *f* may be an already-unpickled dict (see _load_from_dict) or a
        # file object produced by save().
        if isinstance(f, dict):
            d = f
        else:
            d = pickle.load(f)
        memo = d['memo']
        data = d['data']

        assert memo
        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
        options = dict(data['options'])
        # transformer/postlex are not serializable; the caller supplies them.
        if transformer is not None:
            options['transformer'] = transformer
        if postlex is not None:
            options['postlex'] = postlex
        self.options = LarkOptions.deserialize(options, memo)
        re_module = regex if self.options.regex else re
        self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
        self.source = '<deserialized>'
        self._prepare_callbacks()
        self.parser = self.parser_class.deserialize(
            data['parser'],
            memo,
            self._callbacks,
            self.options.postlex,
            self.options.transformer,
            re_module
        )
        return self

    @classmethod
    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
        # Entry point used by the stand-alone generator (Lark_StandAlone).
        inst = cls.__new__(cls)
        return inst._load({'data': data, 'memo': memo}, transformer, postlex)

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
        """Create an instance of Lark with the grammar given by its filename

        If rel_to is provided, the function will find the grammar filename in relation to it.

        Example:

            >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
            Lark(...)

        """
        if rel_to:
            basepath = os.path.dirname(rel_to)
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)

    def __repr__(self):
        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)

    def lex(self, text):
        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
        if not hasattr(self, 'lexer'):
            # Built lazily: parser=None constructions may not have a lexer yet.
            self.lexer = self._build_lexer()
        stream = self.lexer.lex(text)
        if self.options.postlex:
            return self.options.postlex.process(stream)
        return stream

    def get_terminal(self, name):
        "Get information about a terminal"
        return self._terminals_dict[name]

    def parse(self, text, start=None, on_error=None):
        """Parse the given text, according to the options provided.

        Parameters:
            start: str - required if Lark was given multiple possible start symbols (using the start option).
            on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only.

        Returns a tree, unless specified otherwise.
        """
        try:
            return self.parser.parse(text, start=start)
        except UnexpectedToken as e:
            if on_error is None:
                raise

            # Error-recovery loop: keep invoking the user callback and
            # resuming through the parser puppet until parsing completes or
            # the callback declines (returns falsy).
            while True:
                if not on_error(e):
                    raise e
                try:
                    return e.puppet.resume_parse()
                except UnexpectedToken as e2:
                    e = e2
# Serialized parser definition (LALR tables, contextual-lexer config, rule
# references and options) for the embedded JSON grammar.  '@'-keyed dicts are
# references into MEMO, resolved by SerializeMemoizer at load time.
# Auto-generated data: do not edit by hand.
DATA = (
{'parser': {'parser': {'tokens': {0: 'RSQB', 1: 'COMMA', 2: '$END', 3: 'RBRACE', 4: 'ESCAPED_STRING', 5: 'string', 6: 'pair', 7: 'LSQB', 8: 'LBRACE', 9: 'SIGNED_NUMBER', 10: 'NULL', 11: 'FALSE', 12: 'value', 13: 'array', 14: 'object', 15: 'TRUE', 16: '__array_star_0', 17: 'COLON', 18: '__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 2: {1: (0, 25), 0: (0, 19)}, 3: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 4: {4: (0, 31), 5: (0, 13), 6: (0, 26)}, 5: {0: (1, {'@': 15}), 1: (1, {'@': 15}), 2: (1, {'@': 15}), 3: (1, {'@': 15})}, 6: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 7: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 2: (1, {'@': 17}), 3: (1, {'@': 17})}, 8: {1: (0, 14), 3: (0, 28)}, 9: {0: (0, 21), 7: (0, 9), 8: (0, 18), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 12: (0, 10), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24)}, 10: {1: (0, 20), 16: (0, 2), 0: (0, 3)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18})}, 12: {2: (1, {'@': 19})}, 13: {17: (0, 32)}, 14: {5: (0, 13), 4: (0, 31), 6: (0, 23)}, 15: {18: (0, 8), 1: (0, 4), 3: (0, 17)}, 16: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 17: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21})}, 18: {4: (0, 31), 6: (0, 15), 5: (0, 13), 3: (0, 6)}, 19: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 20: {7: (0, 9), 8: (0, 18), 12: (0, 11), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 21: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 22: {1: (1, {'@': 24}), 3: (1, {'@': 24})}, 23: {1: (1, {'@': 25}), 3: (1, {'@': 25})}, 24: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 25: {7: (0, 9), 12: (0, 16), 8: (0, 18), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 26: {1: (1, {'@': 27}), 3: (1, {'@': 27})}, 27: {7: (0, 9), 8: (0, 18), 12: (0, 12), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24), 19: (0, 30)}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {}, 31: {17: (1, {'@': 30}), 0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 32: {7: (0, 9), 8: (0, 18), 12: (0, 22), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'start_states': {'start': 27}, 'end_states': {'start': 30}}, 'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': ['WS'], 'g_regex_flags': 0, 'use_bytes': False, '__type__': 'LexerConf'}, 'start': ['start'], '__type__': 'LALR_ContextualLexer'}, 'rules': [{'@': 19}, {'@': 31}, {'@': 17}, {'@': 15}, {'@': 12}, {'@': 26}, {'@': 29}, {'@': 13}, {'@': 22}, {'@': 14}, {'@': 23}, {'@': 28}, {'@': 21}, {'@': 16}, {'@': 24}, {'@': 30}, {'@': 18}, {'@': 20}, {'@': 27}, {'@': 25}], 'options': {'debug': False, 'keep_all_tokens': False, 'tree_class': None, 'cache': False, 'postlex': None, 'parser': 'lalr', 'lexer': 'contextual', 'transformer': None, 'start': ['start'], 'priority': None, 'ambiguity': 'auto', 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False}, '__type__': 'Lark'}
)
# Memoization table referenced by DATA's {'@': n} entries: keys 0-11 are
# serialized TerminalDef objects, keys 12-31 are serialized Rule objects.
# Auto-generated data: do not edit by hand.
MEMO = (
{0: {'name': 'ESCAPED_STRING', 'pattern': {'value': '".*?(?<!\\\\)(\\\\\\\\)*?"', 'flags': [], '_width': [2, 4294967295], '__type__': 'PatternRE'}, 'priority': 1, '__type__': 'TerminalDef'}, 1: {'name': 'SIGNED_NUMBER', 'pattern': {'value': '(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)', 'flags': [], '_width': [1, 4294967295], '__type__': 'PatternRE'}, 'priority': 1, '__type__': 'TerminalDef'}, 2: {'name': 'WS', 'pattern': {'value': '(?:[ \t\x0c\r\n])+', 'flags': [], '_width': [1, 4294967295], '__type__': 'PatternRE'}, 'priority': 1, '__type__': 'TerminalDef'}, 3: {'name': 'TRUE', 'pattern': {'value': 'true', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 4: {'name': 'FALSE', 'pattern': {'value': 'false', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 5: {'name': 'NULL', 'pattern': {'value': 'null', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 6: {'name': 'COMMA', 'pattern': {'value': ',', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 7: {'name': 'LSQB', 'pattern': {'value': '[', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 8: {'name': 'RSQB', 'pattern': {'value': ']', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 9: {'name': 'LBRACE', 'pattern': {'value': '{', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 10: {'name': 'RBRACE', 'pattern': {'value': '}', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 11: {'name': 'COLON', 'pattern': {'value': ':', 'flags': [], '__type__': 'PatternStr'}, 'priority': 1, '__type__': 'TerminalDef'}, 12: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'SIGNED_NUMBER', 'filter_out': False, '__type__': 'Terminal'}], 'order': 3, 'alias': 'number', 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 13: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'NULL', 'filter_out': True, '__type__': 'Terminal'}], 'order': 6, 'alias': 'null', 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 14: {'origin': {'name': 'array', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LSQB', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'value', '__type__': 'NonTerminal'}, {'name': 'RSQB', 'filter_out': True, '__type__': 'Terminal'}], 'order': 1, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 15: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'string', '__type__': 'NonTerminal'}], 'order': 2, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 16: {'origin': {'name': 'object', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LBRACE', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'RBRACE', 'filter_out': True, '__type__': 'Terminal'}], 'order': 2, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': [False, True, False], '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 17: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'array', '__type__': 'NonTerminal'}], 'order': 1, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 18: {'origin': {'name': '__array_star_0', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'COMMA', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'value', '__type__': 'NonTerminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 19: {'origin': {'name': 'start', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'value', '__type__': 'NonTerminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 20: {'origin': {'name': '__array_star_0', '__type__': 'NonTerminal'}, 'expansion': [{'name': '__array_star_0', '__type__': 'NonTerminal'}, {'name': 'COMMA', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'value', '__type__': 'NonTerminal'}], 'order': 1, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 21: {'origin': {'name': 'object', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LBRACE', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'pair', '__type__': 'NonTerminal'}, {'name': 'RBRACE', 'filter_out': True, '__type__': 'Terminal'}], 'order': 1, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 22: {'origin': {'name': 'array', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LSQB', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'value', '__type__': 'NonTerminal'}, {'name': '__array_star_0', '__type__': 'NonTerminal'}, {'name': 'RSQB', 'filter_out': True, '__type__': 'Terminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 23: {'origin': {'name': 'array', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LSQB', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'RSQB', 'filter_out': True, '__type__': 'Terminal'}], 'order': 2, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': [False, True, False], '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 24: {'origin': {'name': 'pair', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'string', '__type__': 'NonTerminal'}, {'name': 'COLON', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'value', '__type__': 'NonTerminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 25: {'origin': {'name': '__object_star_1', '__type__': 'NonTerminal'}, 'expansion': [{'name': '__object_star_1', '__type__': 'NonTerminal'}, {'name': 'COMMA', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'pair', '__type__': 'NonTerminal'}], 'order': 1, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 26: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'TRUE', 'filter_out': True, '__type__': 'Terminal'}], 'order': 4, 'alias': 'true', 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 27: {'origin': {'name': '__object_star_1', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'COMMA', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'pair', '__type__': 'NonTerminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 28: {'origin': {'name': 'object', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'LBRACE', 'filter_out': True, '__type__': 'Terminal'}, {'name': 'pair', '__type__': 'NonTerminal'}, {'name': '__object_star_1', '__type__': 'NonTerminal'}, {'name': 'RBRACE', 'filter_out': True, '__type__': 'Terminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 29: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'FALSE', 'filter_out': True, '__type__': 'Terminal'}], 'order': 5, 'alias': 'false', 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 30: {'origin': {'name': 'string', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'ESCAPED_STRING', 'filter_out': False, '__type__': 'Terminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': False, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}, 31: {'origin': {'name': 'value', '__type__': 'NonTerminal'}, 'expansion': [{'name': 'object', '__type__': 'NonTerminal'}], 'order': 0, 'alias': None, 'options': {'keep_all_tokens': False, 'expand1': True, 'priority': None, 'template_source': None, 'empty_indices': (), '__type__': 'RuleOptions'}, '__type__': 'Rule'}}
)
# LALR action codes used in DATA's state tables: (0, n) = shift to state n,
# (1, rule) = reduce by the referenced rule.
Shift = 0
Reduce = 1
  1816. def Lark_StandAlone(transformer=None, postlex=None):
  1817. return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)