This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2256 lines
88 KiB

  1. # The file was automatically generated by Lark v0.9.0
  2. #
  3. #
  4. # Lark Stand-alone Generator Tool
  5. # ----------------------------------
  6. # Generates a stand-alone LALR(1) parser with a standard lexer
  7. #
  8. # Git: https://github.com/erezsh/lark
  9. # Author: Erez Shinan (erezshin@gmail.com)
  10. #
  11. #
  12. # >>> LICENSE
  13. #
  14. # This tool and its generated code use a separate license from Lark,
  15. # and are subject to the terms of the Mozilla Public License, v. 2.0.
  16. # If a copy of the MPL was not distributed with this
  17. # file, You can obtain one at https://mozilla.org/MPL/2.0/.
  18. #
  19. # If you wish to purchase a commercial license for this tool and its
  20. # generated code, you may contact me via email or otherwise.
  21. #
  22. # If MPL2 is incompatible with your free or open-source project,
  23. # contact me and we'll work it out.
  24. #
  25. #
  26. import os
  27. from io import open
  28. class LarkError(Exception):
  29. pass
  30. class GrammarError(LarkError):
  31. pass
  32. class ParseError(LarkError):
  33. pass
  34. class LexError(LarkError):
  35. pass
  36. class UnexpectedEOF(ParseError):
  37. def __init__(self, expected):
  38. self.expected = expected
  39. message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
  40. super(UnexpectedEOF, self).__init__(message)
  41. class UnexpectedInput(LarkError):
  42. pos_in_stream = None
  43. def get_context(self, text, span=40):
  44. pos = self.pos_in_stream
  45. start = max(pos - span, 0)
  46. end = pos + span
  47. before = text[start:pos].rsplit('\n', 1)[-1]
  48. after = text[pos:end].split('\n', 1)[0]
  49. return before + after + '\n' + ' ' * len(before) + '^\n'
  50. def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
  51. """ Given a parser instance and a dictionary mapping some label with
  52. some malformed syntax examples, it'll return the label for the
  53. example that bests matches the current error.
  54. """
  55. assert self.state is not None, "Not supported for this exception"
  56. candidate = (None, False)
  57. for label, example in examples.items():
  58. assert not isinstance(example, STRING_TYPE)
  59. for malformed in example:
  60. try:
  61. parse_fn(malformed)
  62. except UnexpectedInput as ut:
  63. if ut.state == self.state:
  64. try:
  65. if ut.token == self.token: # Try exact match first
  66. return label
  67. if token_type_match_fallback:
  68. # Fallback to token types match
  69. if (ut.token.type == self.token.type) and not candidate[-1]:
  70. candidate = label, True
  71. except AttributeError:
  72. pass
  73. if not candidate[0]:
  74. candidate = label, False
  75. return candidate[0]
  76. class UnexpectedCharacters(LexError, UnexpectedInput):
  77. def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
  78. message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
  79. self.line = line
  80. self.column = column
  81. self.allowed = allowed
  82. self.considered_tokens = considered_tokens
  83. self.pos_in_stream = lex_pos
  84. self.state = state
  85. message += '\n\n' + self.get_context(seq)
  86. if allowed:
  87. message += '\nExpecting: %s\n' % allowed
  88. if token_history:
  89. message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
  90. super(UnexpectedCharacters, self).__init__(message)
  91. class UnexpectedToken(ParseError, UnexpectedInput):
  92. def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
  93. self.token = token
  94. self.expected = expected # XXX str shouldn't necessary
  95. self.line = getattr(token, 'line', '?')
  96. self.column = getattr(token, 'column', '?')
  97. self.considered_rules = considered_rules
  98. self.state = state
  99. self.pos_in_stream = getattr(token, 'pos_in_stream', None)
  100. self.puppet = puppet
  101. message = ("Unexpected token %r at line %s, column %s.\n"
  102. "Expected one of: \n\t* %s\n"
  103. % (token, self.line, self.column, '\n\t* '.join(self.expected)))
  104. super(UnexpectedToken, self).__init__(message)
  105. class VisitError(LarkError):
  106. """VisitError is raised when visitors are interrupted by an exception
  107. It provides the following attributes for inspection:
  108. - obj: the tree node or token it was processing when the exception was raised
  109. - orig_exc: the exception that cause it to fail
  110. """
  111. def __init__(self, rule, obj, orig_exc):
  112. self.obj = obj
  113. self.orig_exc = orig_exc
  114. message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
  115. super(VisitError, self).__init__(message)
  116. def classify(seq, key=None, value=None):
  117. d = {}
  118. for item in seq:
  119. k = key(item) if (key is not None) else item
  120. v = value(item) if (value is not None) else item
  121. if k in d:
  122. d[k].append(v)
  123. else:
  124. d[k] = [v]
  125. return d
  126. def _deserialize(data, namespace, memo):
  127. if isinstance(data, dict):
  128. if '__type__' in data: # Object
  129. class_ = namespace[data['__type__']]
  130. return class_.deserialize(data, memo)
  131. elif '@' in data:
  132. return memo[data['@']]
  133. return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
  134. elif isinstance(data, list):
  135. return [_deserialize(value, namespace, memo) for value in data]
  136. return data
  137. class Serialize(object):
  138. def memo_serialize(self, types_to_memoize):
  139. memo = SerializeMemoizer(types_to_memoize)
  140. return self.serialize(memo), memo.serialize()
  141. def serialize(self, memo=None):
  142. if memo and memo.in_types(self):
  143. return {'@': memo.memoized.get(self)}
  144. fields = getattr(self, '__serialize_fields__')
  145. res = {f: _serialize(getattr(self, f), memo) for f in fields}
  146. res['__type__'] = type(self).__name__
  147. postprocess = getattr(self, '_serialize', None)
  148. if postprocess:
  149. postprocess(res, memo)
  150. return res
  151. @classmethod
  152. def deserialize(cls, data, memo):
  153. namespace = getattr(cls, '__serialize_namespace__', {})
  154. namespace = {c.__name__:c for c in namespace}
  155. fields = getattr(cls, '__serialize_fields__')
  156. if '@' in data:
  157. return memo[data['@']]
  158. inst = cls.__new__(cls)
  159. for f in fields:
  160. try:
  161. setattr(inst, f, _deserialize(data[f], namespace, memo))
  162. except KeyError as e:
  163. raise KeyError("Cannot find key for class", cls, e)
  164. postprocess = getattr(inst, '_deserialize', None)
  165. if postprocess:
  166. postprocess()
  167. return inst
  168. class SerializeMemoizer(Serialize):
  169. __serialize_fields__ = 'memoized',
  170. def __init__(self, types_to_memoize):
  171. self.types_to_memoize = tuple(types_to_memoize)
  172. self.memoized = Enumerator()
  173. def in_types(self, value):
  174. return isinstance(value, self.types_to_memoize)
  175. def serialize(self):
  176. return _serialize(self.memoized.reversed(), None)
  177. @classmethod
  178. def deserialize(cls, data, namespace, memo):
  179. return _deserialize(data, namespace, memo)
  180. try:
  181. STRING_TYPE = basestring
  182. except NameError: # Python 3
  183. STRING_TYPE = str
  184. import types
  185. from functools import wraps, partial
  186. from contextlib import contextmanager
  187. Str = type(u'')
  188. try:
  189. classtype = types.ClassType # Python2
  190. except AttributeError:
  191. classtype = type # Python3
  192. def smart_decorator(f, create_decorator):
  193. if isinstance(f, types.FunctionType):
  194. return wraps(f)(create_decorator(f, True))
  195. elif isinstance(f, (classtype, type, types.BuiltinFunctionType)):
  196. return wraps(f)(create_decorator(f, False))
  197. elif isinstance(f, types.MethodType):
  198. return wraps(f)(create_decorator(f.__func__, True))
  199. elif isinstance(f, partial):
  200. # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
  201. return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
  202. else:
  203. return create_decorator(f.__func__.__call__, True)
  204. try:
  205. import regex
  206. except ImportError:
  207. regex = None
  208. import sys, re
  209. Py36 = (sys.version_info[:2] >= (3, 6))
  210. import sre_parse
  211. import sre_constants
  212. categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
  213. def get_regexp_width(expr):
  214. if regex:
  215. # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
  216. # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
  217. # match here below.
  218. regexp_final = re.sub(categ_pattern, 'A', expr)
  219. else:
  220. if re.search(categ_pattern, expr):
  221. raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
  222. regexp_final = expr
  223. try:
  224. return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
  225. except sre_constants.error:
  226. raise ValueError(expr)
  227. class Meta:
  228. def __init__(self):
  229. self.empty = True
  230. class Tree(object):
  231. def __init__(self, data, children, meta=None):
  232. self.data = data
  233. self.children = children
  234. self._meta = meta
  235. @property
  236. def meta(self):
  237. if self._meta is None:
  238. self._meta = Meta()
  239. return self._meta
  240. def __repr__(self):
  241. return 'Tree(%s, %s)' % (self.data, self.children)
  242. def _pretty_label(self):
  243. return self.data
  244. def _pretty(self, level, indent_str):
  245. if len(self.children) == 1 and not isinstance(self.children[0], Tree):
  246. return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n']
  247. l = [ indent_str*level, self._pretty_label(), '\n' ]
  248. for n in self.children:
  249. if isinstance(n, Tree):
  250. l += n._pretty(level+1, indent_str)
  251. else:
  252. l += [ indent_str*(level+1), '%s' % (n,), '\n' ]
  253. return l
  254. def pretty(self, indent_str=' '):
  255. return ''.join(self._pretty(0, indent_str))
  256. def __eq__(self, other):
  257. try:
  258. return self.data == other.data and self.children == other.children
  259. except AttributeError:
  260. return False
  261. def __ne__(self, other):
  262. return not (self == other)
  263. def __hash__(self):
  264. return hash((self.data, tuple(self.children)))
  265. def iter_subtrees(self):
  266. queue = [self]
  267. subtrees = OrderedDict()
  268. for subtree in queue:
  269. subtrees[id(subtree)] = subtree
  270. queue += [c for c in reversed(subtree.children)
  271. if isinstance(c, Tree) and id(c) not in subtrees]
  272. del queue
  273. return reversed(list(subtrees.values()))
  274. def find_pred(self, pred):
  275. "Find all nodes where pred(tree) == True"
  276. return filter(pred, self.iter_subtrees())
  277. def find_data(self, data):
  278. "Find all nodes where tree.data == data"
  279. return self.find_pred(lambda t: t.data == data)
  280. from inspect import getmembers, getmro
  281. class Discard(Exception):
  282. pass
  283. # Transformers
  284. class _Decoratable:
  285. @classmethod
  286. def _apply_decorator(cls, decorator, **kwargs):
  287. mro = getmro(cls)
  288. assert mro[0] is cls
  289. libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
  290. for name, value in getmembers(cls):
  291. # Make sure the function isn't inherited (unless it's overwritten)
  292. if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
  293. continue
  294. if not callable(value):
  295. continue
  296. # Skip if v_args already applied (at the function level)
  297. if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'):
  298. continue
  299. static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
  300. setattr(cls, name, decorator(value, static=static, **kwargs))
  301. return cls
  302. def __class_getitem__(cls, _):
  303. return cls
  304. class Transformer(_Decoratable):
  305. """Visits the tree recursively, starting with the leaves and finally the root (bottom-up)
  306. Calls its methods (provided by user via inheritance) according to tree.data
  307. The returned value replaces the old one in the structure.
  308. Can be used to implement map or reduce.
  309. """
  310. __visit_tokens__ = True # For backwards compatibility
  311. def __init__(self, visit_tokens=True):
  312. self.__visit_tokens__ = visit_tokens
  313. def _call_userfunc(self, tree, new_children=None):
  314. # Assumes tree is already transformed
  315. children = new_children if new_children is not None else tree.children
  316. try:
  317. f = getattr(self, tree.data)
  318. except AttributeError:
  319. return self.__default__(tree.data, children, tree.meta)
  320. else:
  321. try:
  322. wrapper = getattr(f, 'visit_wrapper', None)
  323. if wrapper is not None:
  324. return f.visit_wrapper(f, tree.data, children, tree.meta)
  325. else:
  326. return f(children)
  327. except (GrammarError, Discard):
  328. raise
  329. except Exception as e:
  330. raise VisitError(tree.data, tree, e)
  331. def _call_userfunc_token(self, token):
  332. try:
  333. f = getattr(self, token.type)
  334. except AttributeError:
  335. return self.__default_token__(token)
  336. else:
  337. try:
  338. return f(token)
  339. except (GrammarError, Discard):
  340. raise
  341. except Exception as e:
  342. raise VisitError(token.type, token, e)
  343. def _transform_children(self, children):
  344. for c in children:
  345. try:
  346. if isinstance(c, Tree):
  347. yield self._transform_tree(c)
  348. elif self.__visit_tokens__ and isinstance(c, Token):
  349. yield self._call_userfunc_token(c)
  350. else:
  351. yield c
  352. except Discard:
  353. pass
  354. def _transform_tree(self, tree):
  355. children = list(self._transform_children(tree.children))
  356. return self._call_userfunc(tree, children)
  357. def transform(self, tree):
  358. return self._transform_tree(tree)
  359. def __mul__(self, other):
  360. return TransformerChain(self, other)
  361. def __default__(self, data, children, meta):
  362. "Default operation on tree (for override)"
  363. return Tree(data, children, meta)
  364. def __default_token__(self, token):
  365. "Default operation on token (for override)"
  366. return token
  367. class InlineTransformer(Transformer): # XXX Deprecated
  368. def _call_userfunc(self, tree, new_children=None):
  369. # Assumes tree is already transformed
  370. children = new_children if new_children is not None else tree.children
  371. try:
  372. f = getattr(self, tree.data)
  373. except AttributeError:
  374. return self.__default__(tree.data, children, tree.meta)
  375. else:
  376. return f(*children)
  377. class TransformerChain(object):
  378. def __init__(self, *transformers):
  379. self.transformers = transformers
  380. def transform(self, tree):
  381. for t in self.transformers:
  382. tree = t.transform(tree)
  383. return tree
  384. def __mul__(self, other):
  385. return TransformerChain(*self.transformers + (other,))
  386. class Transformer_InPlace(Transformer):
  387. "Non-recursive. Changes the tree in-place instead of returning new instances"
  388. def _transform_tree(self, tree): # Cancel recursion
  389. return self._call_userfunc(tree)
  390. def transform(self, tree):
  391. for subtree in tree.iter_subtrees():
  392. subtree.children = list(self._transform_children(subtree.children))
  393. return self._transform_tree(tree)
  394. class Transformer_NonRecursive(Transformer):
  395. "Non-recursive. Doesn't change the original tree."
  396. def transform(self, tree):
  397. # Tree to postfix
  398. rev_postfix = []
  399. q = [tree]
  400. while q:
  401. t = q.pop()
  402. rev_postfix.append( t )
  403. if isinstance(t, Tree):
  404. q += t.children
  405. # Postfix to tree
  406. stack = []
  407. for x in reversed(rev_postfix):
  408. if isinstance(x, Tree):
  409. size = len(x.children)
  410. if size:
  411. args = stack[-size:]
  412. del stack[-size:]
  413. else:
  414. args = []
  415. stack.append(self._call_userfunc(x, args))
  416. else:
  417. stack.append(x)
  418. t ,= stack # We should have only one tree remaining
  419. return t
  420. class Transformer_InPlaceRecursive(Transformer):
  421. "Recursive. Changes the tree in-place instead of returning new instances"
  422. def _transform_tree(self, tree):
  423. tree.children = list(self._transform_children(tree.children))
  424. return self._call_userfunc(tree)
  425. # Visitors
  426. class VisitorBase:
  427. def _call_userfunc(self, tree):
  428. return getattr(self, tree.data, self.__default__)(tree)
  429. def __default__(self, tree):
  430. "Default operation on tree (for override)"
  431. return tree
  432. def __class_getitem__(cls, _):
  433. return cls
  434. class Visitor(VisitorBase):
  435. """Bottom-up visitor, non-recursive
  436. Visits the tree, starting with the leaves and finally the root (bottom-up)
  437. Calls its methods (provided by user via inheritance) according to tree.data
  438. """
  439. def visit(self, tree):
  440. for subtree in tree.iter_subtrees():
  441. self._call_userfunc(subtree)
  442. return tree
  443. def visit_topdown(self,tree):
  444. for subtree in tree.iter_subtrees_topdown():
  445. self._call_userfunc(subtree)
  446. return tree
  447. class Visitor_Recursive(VisitorBase):
  448. """Bottom-up visitor, recursive
  449. Visits the tree, starting with the leaves and finally the root (bottom-up)
  450. Calls its methods (provided by user via inheritance) according to tree.data
  451. """
  452. def visit(self, tree):
  453. for child in tree.children:
  454. if isinstance(child, Tree):
  455. self.visit(child)
  456. self._call_userfunc(tree)
  457. return tree
  458. def visit_topdown(self,tree):
  459. self._call_userfunc(tree)
  460. for child in tree.children:
  461. if isinstance(child, Tree):
  462. self.visit_topdown(child)
  463. return tree
  464. def visit_children_decor(func):
  465. "See Interpreter"
  466. @wraps(func)
  467. def inner(cls, tree):
  468. values = cls.visit_children(tree)
  469. return func(cls, values)
  470. return inner
  471. class Interpreter(_Decoratable):
  472. """Top-down visitor, recursive
  473. Visits the tree, starting with the root and finally the leaves (top-down)
  474. Calls its methods (provided by user via inheritance) according to tree.data
  475. Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
  476. The user has to explicitly call visit, visit_children, or use the @visit_children_decor
  477. """
  478. def visit(self, tree):
  479. f = getattr(self, tree.data)
  480. wrapper = getattr(f, 'visit_wrapper', None)
  481. if wrapper is not None:
  482. return f.visit_wrapper(f, tree.data, tree.children, tree.meta)
  483. else:
  484. return f(tree)
  485. def visit_children(self, tree):
  486. return [self.visit(child) if isinstance(child, Tree) else child
  487. for child in tree.children]
  488. def __getattr__(self, name):
  489. return self.__default__
  490. def __default__(self, tree):
  491. return self.visit_children(tree)
  492. # Decorators
  493. def _apply_decorator(obj, decorator, **kwargs):
  494. try:
  495. _apply = obj._apply_decorator
  496. except AttributeError:
  497. return decorator(obj, **kwargs)
  498. else:
  499. return _apply(decorator, **kwargs)
  500. def _inline_args__func(func):
  501. @wraps(func)
  502. def create_decorator(_f, with_self):
  503. if with_self:
  504. def f(self, children):
  505. return _f(self, *children)
  506. else:
  507. def f(self, children):
  508. return _f(*children)
  509. return f
  510. return smart_decorator(func, create_decorator)
  511. def inline_args(obj): # XXX Deprecated
  512. return _apply_decorator(obj, _inline_args__func)
  513. def _visitor_args_func_dec(func, visit_wrapper=None, static=False):
  514. def create_decorator(_f, with_self):
  515. if with_self:
  516. def f(self, *args, **kwargs):
  517. return _f(self, *args, **kwargs)
  518. else:
  519. def f(self, *args, **kwargs):
  520. return _f(*args, **kwargs)
  521. return f
  522. if static:
  523. f = wraps(func)(create_decorator(func, False))
  524. else:
  525. f = smart_decorator(func, create_decorator)
  526. f.vargs_applied = True
  527. f.visit_wrapper = visit_wrapper
  528. return f
  529. def _vargs_inline(f, data, children, meta):
  530. return f(*children)
  531. def _vargs_meta_inline(f, data, children, meta):
  532. return f(meta, *children)
  533. def _vargs_meta(f, data, children, meta):
  534. return f(children, meta) # TODO swap these for consistency? Backwards incompatible!
  535. def _vargs_tree(f, data, children, meta):
  536. return f(Tree(data, children, meta))
  537. def v_args(inline=False, meta=False, tree=False, wrapper=None):
  538. "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
  539. if tree and (meta or inline):
  540. raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")
  541. func = None
  542. if meta:
  543. if inline:
  544. func = _vargs_meta_inline
  545. else:
  546. func = _vargs_meta
  547. elif inline:
  548. func = _vargs_inline
  549. elif tree:
  550. func = _vargs_tree
  551. if wrapper is not None:
  552. if func is not None:
  553. raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
  554. func = wrapper
  555. def _visitor_args_dec(obj):
  556. return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func)
  557. return _visitor_args_dec
  558. class Indenter:
  559. def __init__(self):
  560. self.paren_level = None
  561. self.indent_level = None
  562. assert self.tab_len > 0
  563. def handle_NL(self, token):
  564. if self.paren_level > 0:
  565. return
  566. yield token
  567. indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
  568. indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
  569. if indent > self.indent_level[-1]:
  570. self.indent_level.append(indent)
  571. yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
  572. else:
  573. while indent < self.indent_level[-1]:
  574. self.indent_level.pop()
  575. yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
  576. assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
  577. def _process(self, stream):
  578. for token in stream:
  579. if token.type == self.NL_type:
  580. for t in self.handle_NL(token):
  581. yield t
  582. else:
  583. yield token
  584. if token.type in self.OPEN_PAREN_types:
  585. self.paren_level += 1
  586. elif token.type in self.CLOSE_PAREN_types:
  587. self.paren_level -= 1
  588. assert self.paren_level >= 0
  589. while len(self.indent_level) > 1:
  590. self.indent_level.pop()
  591. yield Token(self.DEDENT_type, '')
  592. assert self.indent_level == [0], self.indent_level
  593. def process(self, stream):
  594. self.paren_level = 0
  595. self.indent_level = [0]
  596. return self._process(stream)
  597. # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
  598. @property
  599. def always_accept(self):
  600. return (self.NL_type,)
  601. class Symbol(Serialize):
  602. __slots__ = ('name',)
  603. is_term = NotImplemented
  604. def __init__(self, name):
  605. self.name = name
  606. def __eq__(self, other):
  607. assert isinstance(other, Symbol), other
  608. return self.is_term == other.is_term and self.name == other.name
  609. def __ne__(self, other):
  610. return not (self == other)
  611. def __hash__(self):
  612. return hash(self.name)
  613. def __repr__(self):
  614. return '%s(%r)' % (type(self).__name__, self.name)
  615. fullrepr = property(__repr__)
  616. class Terminal(Symbol):
  617. __serialize_fields__ = 'name', 'filter_out'
  618. is_term = True
  619. def __init__(self, name, filter_out=False):
  620. self.name = name
  621. self.filter_out = filter_out
  622. @property
  623. def fullrepr(self):
  624. return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
  625. class NonTerminal(Symbol):
  626. __serialize_fields__ = 'name',
  627. is_term = False
  628. class RuleOptions(Serialize):
  629. __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'
  630. def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()):
  631. self.keep_all_tokens = keep_all_tokens
  632. self.expand1 = expand1
  633. self.priority = priority
  634. self.template_source = template_source
  635. self.empty_indices = empty_indices
  636. def __repr__(self):
  637. return 'RuleOptions(%r, %r, %r, %r)' % (
  638. self.keep_all_tokens,
  639. self.expand1,
  640. self.priority,
  641. self.template_source
  642. )
  643. class Rule(Serialize):
  644. """
  645. origin : a symbol
  646. expansion : a list of symbols
  647. order : index of this expansion amongst all rules of the same name
  648. """
  649. __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
  650. __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
  651. __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
  652. def __init__(self, origin, expansion, order=0, alias=None, options=None):
  653. self.origin = origin
  654. self.expansion = expansion
  655. self.alias = alias
  656. self.order = order
  657. self.options = options or RuleOptions()
  658. self._hash = hash((self.origin, tuple(self.expansion)))
  659. def _deserialize(self):
  660. self._hash = hash((self.origin, tuple(self.expansion)))
  661. def __str__(self):
  662. return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
  663. def __repr__(self):
  664. return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
  665. def __hash__(self):
  666. return self._hash
  667. def __eq__(self, other):
  668. if not isinstance(other, Rule):
  669. return False
  670. return self.origin == other.origin and self.expansion == other.expansion
  671. from copy import copy
  672. class Pattern(Serialize):
  673. def __init__(self, value, flags=()):
  674. self.value = value
  675. self.flags = frozenset(flags)
  676. def __repr__(self):
  677. return repr(self.to_regexp())
  678. # Pattern Hashing assumes all subclasses have a different priority!
  679. def __hash__(self):
  680. return hash((type(self), self.value, self.flags))
  681. def __eq__(self, other):
  682. return type(self) == type(other) and self.value == other.value and self.flags == other.flags
  683. def to_regexp(self):
  684. raise NotImplementedError()
  685. if Py36:
  686. # Python 3.6 changed syntax for flags in regular expression
  687. def _get_flags(self, value):
  688. for f in self.flags:
  689. value = ('(?%s:%s)' % (f, value))
  690. return value
  691. else:
  692. def _get_flags(self, value):
  693. for f in self.flags:
  694. value = ('(?%s)' % f) + value
  695. return value
  696. class PatternStr(Pattern):
  697. __serialize_fields__ = 'value', 'flags'
  698. type = "str"
  699. def to_regexp(self):
  700. return self._get_flags(re.escape(self.value))
  701. @property
  702. def min_width(self):
  703. return len(self.value)
  704. max_width = min_width
  705. class PatternRE(Pattern):
  706. __serialize_fields__ = 'value', 'flags', '_width'
  707. type = "re"
  708. def to_regexp(self):
  709. return self._get_flags(self.value)
  710. _width = None
  711. def _get_width(self):
  712. if self._width is None:
  713. self._width = get_regexp_width(self.to_regexp())
  714. return self._width
  715. @property
  716. def min_width(self):
  717. return self._get_width()[0]
  718. @property
  719. def max_width(self):
  720. return self._get_width()[1]
  721. class TerminalDef(Serialize):
  722. __serialize_fields__ = 'name', 'pattern', 'priority'
  723. __serialize_namespace__ = PatternStr, PatternRE
  724. def __init__(self, name, pattern, priority=1):
  725. assert isinstance(pattern, Pattern), pattern
  726. self.name = name
  727. self.pattern = pattern
  728. self.priority = priority
  729. def __repr__(self):
  730. return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
  731. class Token(Str):
  732. __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')
  733. def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
  734. try:
  735. self = super(Token, cls).__new__(cls, value)
  736. except UnicodeDecodeError:
  737. # value = value.decode('latin1')
  738. value = value.decode("ascii", "backslashreplace")
  739. self = super(Token, cls).__new__(cls, value)
  740. self.type = type_
  741. self.pos_in_stream = pos_in_stream
  742. self.value = value
  743. self.line = line
  744. self.column = column
  745. self.end_line = end_line
  746. self.end_column = end_column
  747. self.end_pos = end_pos
  748. return self
  749. def update(self, type_=None, value=None):
  750. return Token.new_borrow_pos(
  751. type_ if type_ is not None else self.type,
  752. value if value is not None else self.value,
  753. self
  754. )
  755. @classmethod
  756. def new_borrow_pos(cls, type_, value, borrow_t):
  757. return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
  758. def __reduce__(self):
  759. return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
  760. def __repr__(self):
  761. return 'Token(%s, %r)' % (self.type, self.value)
  762. def __deepcopy__(self, memo):
  763. return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
  764. def __eq__(self, other):
  765. if isinstance(other, Token) and self.type != other.type:
  766. return False
  767. return Str.__eq__(self, other)
  768. __hash__ = Str.__hash__
  769. class LineCounter:
  770. def __init__(self):
  771. self.newline_char = '\n'
  772. self.char_pos = 0
  773. self.line = 1
  774. self.column = 1
  775. self.line_start_pos = 0
  776. def feed(self, token, test_newline=True):
  777. """Consume a token and calculate the new line & column.
  778. As an optional optimization, set test_newline=False is token doesn't contain a newline.
  779. """
  780. if test_newline:
  781. newlines = token.count(self.newline_char)
  782. if newlines:
  783. self.line += newlines
  784. self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
  785. self.char_pos += len(token)
  786. self.column = self.char_pos - self.line_start_pos + 1
  787. class _Lex:
  788. "Built to serve both Lexer and ContextualLexer"
  789. def __init__(self, lexer, state=None):
  790. self.lexer = lexer
  791. self.state = state
  792. def lex(self, stream, newline_types, ignore_types):
  793. newline_types = frozenset(newline_types)
  794. ignore_types = frozenset(ignore_types)
  795. line_ctr = LineCounter()
  796. last_token = None
  797. while line_ctr.char_pos < len(stream):
  798. lexer = self.lexer
  799. res = lexer.match(stream, line_ctr.char_pos)
  800. if not res:
  801. allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
  802. if not allowed:
  803. allowed = {"<END-OF-FILE>"}
  804. raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
  805. value, type_ = res
  806. if type_ not in ignore_types:
  807. t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
  808. line_ctr.feed(value, type_ in newline_types)
  809. t.end_line = line_ctr.line
  810. t.end_column = line_ctr.column
  811. t.end_pos = line_ctr.char_pos
  812. if t.type in lexer.callback:
  813. t = lexer.callback[t.type](t)
  814. if not isinstance(t, Token):
  815. raise ValueError("Callbacks must return a token (returned %r)" % t)
  816. yield t
  817. last_token = t
  818. else:
  819. if type_ in lexer.callback:
  820. t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
  821. lexer.callback[type_](t2)
  822. line_ctr.feed(value, type_ in newline_types)
  823. class UnlessCallback:
  824. def __init__(self, mres):
  825. self.mres = mres
  826. def __call__(self, t):
  827. for mre, type_from_index in self.mres:
  828. m = mre.match(t.value)
  829. if m:
  830. t.type = type_from_index[m.lastindex]
  831. break
  832. return t
  833. class CallChain:
  834. def __init__(self, callback1, callback2, cond):
  835. self.callback1 = callback1
  836. self.callback2 = callback2
  837. self.cond = cond
  838. def __call__(self, t):
  839. t2 = self.callback1(t)
  840. return self.callback2(t) if self.cond(t2) else t2
  841. def _create_unless(terminals, g_regex_flags, re_):
  842. tokens_by_type = classify(terminals, lambda t: type(t.pattern))
  843. assert len(tokens_by_type) <= 2, tokens_by_type.keys()
  844. embedded_strs = set()
  845. callback = {}
  846. for retok in tokens_by_type.get(PatternRE, []):
  847. unless = [] # {}
  848. for strtok in tokens_by_type.get(PatternStr, []):
  849. if strtok.priority > retok.priority:
  850. continue
  851. s = strtok.pattern.value
  852. m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
  853. if m and m.group(0) == s:
  854. unless.append(strtok)
  855. if strtok.pattern.flags <= retok.pattern.flags:
  856. embedded_strs.add(strtok)
  857. if unless:
  858. callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
  859. terminals = [t for t in terminals if t not in embedded_strs]
  860. return terminals, callback
  861. def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
  862. # Python sets an unreasonable group limit (currently 100) in its re module
  863. # Worse, the only way to know we reached it is by catching an AssertionError!
  864. # This function recursively tries less and less groups until it's successful.
  865. postfix = '$' if match_whole else ''
  866. mres = []
  867. while terminals:
  868. try:
  869. mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
  870. except AssertionError: # Yes, this is what Python provides us.. :/
  871. return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
  872. # terms_from_name = {t.name: t for t in terminals[:max_size]}
  873. mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
  874. terminals = terminals[max_size:]
  875. return mres
  876. def build_mres(terminals, g_regex_flags, re_, match_whole=False):
  877. return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
  878. def _regexp_has_newline(r):
  879. r"""Expressions that may indicate newlines in a regexp:
  880. - newlines (\n)
  881. - escaped newline (\\n)
  882. - anything but ([^...])
  883. - any-char (.) when the flag (?s) exists
  884. - spaces (\s)
  885. """
  886. return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
  887. class Lexer(object):
  888. """Lexer interface
  889. Method Signatures:
  890. lex(self, stream) -> Iterator[Token]
  891. """
  892. lex = NotImplemented
  893. class TraditionalLexer(Lexer):
  894. def __init__(self, conf):
  895. terminals = list(conf.tokens)
  896. assert all(isinstance(t, TerminalDef) for t in terminals), terminals
  897. self.re = conf.re_module
  898. if not conf.skip_validation:
  899. # Sanitization
  900. for t in terminals:
  901. try:
  902. self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
  903. except self.re.error:
  904. raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
  905. if t.pattern.min_width == 0:
  906. raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
  907. assert set(conf.ignore) <= {t.name for t in terminals}
  908. # Init
  909. self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
  910. self.ignore_types = list(conf.ignore)
  911. terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
  912. self.terminals = terminals
  913. self.user_callbacks = conf.callbacks
  914. self.g_regex_flags = conf.g_regex_flags
  915. self._mres = None
  916. # self.build(g_regex_flags)
  917. def _build(self):
  918. terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
  919. assert all(self.callback.values())
  920. for type_, f in self.user_callbacks.items():
  921. if type_ in self.callback:
  922. # Already a callback there, probably UnlessCallback
  923. self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
  924. else:
  925. self.callback[type_] = f
  926. self._mres = build_mres(terminals, self.g_regex_flags, self.re)
  927. @property
  928. def mres(self):
  929. if self._mres is None:
  930. self._build()
  931. return self._mres
  932. def match(self, stream, pos):
  933. for mre, type_from_index in self.mres:
  934. m = mre.match(stream, pos)
  935. if m:
  936. return m.group(0), type_from_index[m.lastindex]
  937. def lex(self, stream):
  938. return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
  939. class ContextualLexer(Lexer):
  940. def __init__(self, conf, states, always_accept=()):
  941. terminals = list(conf.tokens)
  942. tokens_by_name = {}
  943. for t in terminals:
  944. assert t.name not in tokens_by_name, t
  945. tokens_by_name[t.name] = t
  946. trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
  947. lexer_by_tokens = {}
  948. self.lexers = {}
  949. for state, accepts in states.items():
  950. key = frozenset(accepts)
  951. try:
  952. lexer = lexer_by_tokens[key]
  953. except KeyError:
  954. accepts = set(accepts) | set(conf.ignore) | set(always_accept)
  955. state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
  956. lexer_conf = copy(trad_conf)
  957. lexer_conf.tokens = state_tokens
  958. lexer = TraditionalLexer(lexer_conf)
  959. lexer_by_tokens[key] = lexer
  960. self.lexers[state] = lexer
  961. assert trad_conf.tokens is terminals
  962. self.root_lexer = TraditionalLexer(trad_conf)
  963. def lex(self, stream, get_parser_state):
  964. parser_state = get_parser_state()
  965. l = _Lex(self.lexers[parser_state], parser_state)
  966. try:
  967. for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
  968. yield x
  969. parser_state = get_parser_state()
  970. l.lexer = self.lexers[parser_state]
  971. l.state = parser_state # For debug only, no need to worry about multithreading
  972. except UnexpectedCharacters as e:
  973. # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
  974. # but not in the current context.
  975. # This tests the input against the global context, to provide a nicer error.
  976. root_match = self.root_lexer.match(stream, e.pos_in_stream)
  977. if not root_match:
  978. raise
  979. value, type_ = root_match
  980. t = Token(type_, value, e.pos_in_stream, e.line, e.column)
  981. raise UnexpectedToken(t, e.allowed, state=e.state)
  982. class LexerConf(Serialize):
  983. __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
  984. __serialize_namespace__ = TerminalDef,
  985. def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
  986. self.tokens = tokens # TODO should be terminals
  987. self.ignore = ignore
  988. self.postlex = postlex
  989. self.callbacks = callbacks or {}
  990. self.g_regex_flags = g_regex_flags
  991. self.re_module = re_module
  992. self.skip_validation = skip_validation
  993. def _deserialize(self):
  994. self.callbacks = {} # TODO
  995. from functools import partial, wraps
  996. from itertools import repeat, product
  997. class ExpandSingleChild:
  998. def __init__(self, node_builder):
  999. self.node_builder = node_builder
  1000. def __call__(self, children):
  1001. if len(children) == 1:
  1002. return children[0]
  1003. else:
  1004. return self.node_builder(children)
  1005. class PropagatePositions:
  1006. def __init__(self, node_builder):
  1007. self.node_builder = node_builder
  1008. def __call__(self, children):
  1009. res = self.node_builder(children)
  1010. # local reference to Tree.meta reduces number of presence checks
  1011. if isinstance(res, Tree):
  1012. res_meta = res.meta
  1013. for c in children:
  1014. if isinstance(c, Tree):
  1015. child_meta = c.meta
  1016. if not child_meta.empty:
  1017. res_meta.line = child_meta.line
  1018. res_meta.column = child_meta.column
  1019. res_meta.start_pos = child_meta.start_pos
  1020. res_meta.empty = False
  1021. break
  1022. elif isinstance(c, Token):
  1023. res_meta.line = c.line
  1024. res_meta.column = c.column
  1025. res_meta.start_pos = c.pos_in_stream
  1026. res_meta.empty = False
  1027. break
  1028. for c in reversed(children):
  1029. if isinstance(c, Tree):
  1030. child_meta = c.meta
  1031. if not child_meta.empty:
  1032. res_meta.end_line = child_meta.end_line
  1033. res_meta.end_column = child_meta.end_column
  1034. res_meta.end_pos = child_meta.end_pos
  1035. res_meta.empty = False
  1036. break
  1037. elif isinstance(c, Token):
  1038. res_meta.end_line = c.end_line
  1039. res_meta.end_column = c.end_column
  1040. res_meta.end_pos = c.end_pos
  1041. res_meta.empty = False
  1042. break
  1043. return res
  1044. class ChildFilter:
  1045. def __init__(self, to_include, append_none, node_builder):
  1046. self.node_builder = node_builder
  1047. self.to_include = to_include
  1048. self.append_none = append_none
  1049. def __call__(self, children):
  1050. filtered = []
  1051. for i, to_expand, add_none in self.to_include:
  1052. if add_none:
  1053. filtered += [None] * add_none
  1054. if to_expand:
  1055. filtered += children[i].children
  1056. else:
  1057. filtered.append(children[i])
  1058. if self.append_none:
  1059. filtered += [None] * self.append_none
  1060. return self.node_builder(filtered)
  1061. class ChildFilterLALR(ChildFilter):
  1062. "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
  1063. def __call__(self, children):
  1064. filtered = []
  1065. for i, to_expand, add_none in self.to_include:
  1066. if add_none:
  1067. filtered += [None] * add_none
  1068. if to_expand:
  1069. if filtered:
  1070. filtered += children[i].children
  1071. else: # Optimize for left-recursion
  1072. filtered = children[i].children
  1073. else:
  1074. filtered.append(children[i])
  1075. if self.append_none:
  1076. filtered += [None] * self.append_none
  1077. return self.node_builder(filtered)
  1078. class ChildFilterLALR_NoPlaceholders(ChildFilter):
  1079. "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
  1080. def __init__(self, to_include, node_builder):
  1081. self.node_builder = node_builder
  1082. self.to_include = to_include
  1083. def __call__(self, children):
  1084. filtered = []
  1085. for i, to_expand in self.to_include:
  1086. if to_expand:
  1087. if filtered:
  1088. filtered += children[i].children
  1089. else: # Optimize for left-recursion
  1090. filtered = children[i].children
  1091. else:
  1092. filtered.append(children[i])
  1093. return self.node_builder(filtered)
  1094. def _should_expand(sym):
  1095. return not sym.is_term and sym.name.startswith('_')
  1096. def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices):
  1097. # Prepare empty_indices as: How many Nones to insert at each index?
  1098. if _empty_indices:
  1099. assert _empty_indices.count(False) == len(expansion)
  1100. s = ''.join(str(int(b)) for b in _empty_indices)
  1101. empty_indices = [len(ones) for ones in s.split('0')]
  1102. assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion))
  1103. else:
  1104. empty_indices = [0] * (len(expansion)+1)
  1105. to_include = []
  1106. nones_to_add = 0
  1107. for i, sym in enumerate(expansion):
  1108. nones_to_add += empty_indices[i]
  1109. if keep_all_tokens or not (sym.is_term and sym.filter_out):
  1110. to_include.append((i, _should_expand(sym), nones_to_add))
  1111. nones_to_add = 0
  1112. nones_to_add += empty_indices[len(expansion)]
  1113. if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include):
  1114. if _empty_indices or ambiguous:
  1115. return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add)
  1116. else:
  1117. # LALR without placeholders
  1118. return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include])
  1119. class AmbiguousExpander:
  1120. """Deal with the case where we're expanding children ('_rule') into a parent but the children
  1121. are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
  1122. ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children
  1123. into the right parents in the right places, essentially shifting the ambiguiuty up the tree."""
  1124. def __init__(self, to_expand, tree_class, node_builder):
  1125. self.node_builder = node_builder
  1126. self.tree_class = tree_class
  1127. self.to_expand = to_expand
  1128. def __call__(self, children):
  1129. def _is_ambig_tree(child):
  1130. return hasattr(child, 'data') and child.data == '_ambig'
  1131. #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities.
  1132. # All children of an _ambig node should be a derivation of that ambig node, hence
  1133. # it is safe to assume that if we see an _ambig node nested within an ambig node
  1134. # it is safe to simply expand it into the parent _ambig node as an alternative derivation.
  1135. ambiguous = []
  1136. for i, child in enumerate(children):
  1137. if _is_ambig_tree(child):
  1138. if i in self.to_expand:
  1139. ambiguous.append(i)
  1140. to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)]
  1141. child.expand_kids_by_index(*to_expand)
  1142. if not ambiguous:
  1143. return self.node_builder(children)
  1144. expand = [ iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children) ]
  1145. return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))])
  1146. def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
  1147. to_expand = [i for i, sym in enumerate(expansion)
  1148. if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))]
  1149. if to_expand:
  1150. return partial(AmbiguousExpander, to_expand, tree_class)
  1151. def ptb_inline_args(func):
  1152. @wraps(func)
  1153. def f(children):
  1154. return func(*children)
  1155. return f
  1156. def inplace_transformer(func):
  1157. @wraps(func)
  1158. def f(children):
  1159. # function name in a Transformer is a rule name.
  1160. tree = Tree(func.__name__, children)
  1161. return func(tree)
  1162. return f
  1163. def apply_visit_wrapper(func, name, wrapper):
  1164. if wrapper is _vargs_meta or wrapper is _vargs_meta_inline:
  1165. raise NotImplementedError("Meta args not supported for internal transformer")
  1166. @wraps(func)
  1167. def f(children):
  1168. return wrapper(func, name, children, None)
  1169. return f
  1170. class ParseTreeBuilder:
  1171. def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
  1172. self.tree_class = tree_class
  1173. self.propagate_positions = propagate_positions
  1174. self.always_keep_all_tokens = keep_all_tokens
  1175. self.ambiguous = ambiguous
  1176. self.maybe_placeholders = maybe_placeholders
  1177. self.rule_builders = list(self._init_builders(rules))
  1178. def _init_builders(self, rules):
  1179. for rule in rules:
  1180. options = rule.options
  1181. keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
  1182. expand_single_child = options.expand1
  1183. wrapper_chain = list(filter(None, [
  1184. (expand_single_child and not rule.alias) and ExpandSingleChild,
  1185. maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
  1186. self.propagate_positions and PropagatePositions,
  1187. self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
  1188. ]))
  1189. yield rule, wrapper_chain
  1190. def create_callback(self, transformer=None):
  1191. callbacks = {}
  1192. for rule, wrapper_chain in self.rule_builders:
  1193. user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
  1194. try:
  1195. f = getattr(transformer, user_callback_name)
  1196. # XXX InlineTransformer is deprecated!
  1197. wrapper = getattr(f, 'visit_wrapper', None)
  1198. if wrapper is not None:
  1199. f = apply_visit_wrapper(f, user_callback_name, wrapper)
  1200. else:
  1201. if isinstance(transformer, InlineTransformer):
  1202. f = ptb_inline_args(f)
  1203. elif isinstance(transformer, Transformer_InPlace):
  1204. f = inplace_transformer(f)
  1205. except AttributeError:
  1206. f = partial(self.tree_class, user_callback_name)
  1207. for w in wrapper_chain:
  1208. f = w(f)
  1209. if rule in callbacks:
  1210. raise GrammarError("Rule '%s' already exists" % (rule,))
  1211. callbacks[rule] = f
  1212. return callbacks
  1213. class LALR_Parser(object):
  1214. def __init__(self, parser_conf, debug=False):
  1215. assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
  1216. analysis = LALR_Analyzer(parser_conf, debug=debug)
  1217. analysis.compute_lalr()
  1218. callbacks = parser_conf.callbacks
  1219. self._parse_table = analysis.parse_table
  1220. self.parser_conf = parser_conf
  1221. self.parser = _Parser(analysis.parse_table, callbacks, debug)
  1222. @classmethod
  1223. def deserialize(cls, data, memo, callbacks):
  1224. inst = cls.__new__(cls)
  1225. inst._parse_table = IntParseTable.deserialize(data, memo)
  1226. inst.parser = _Parser(inst._parse_table, callbacks)
  1227. return inst
  1228. def serialize(self, memo):
  1229. return self._parse_table.serialize(memo)
  1230. def parse(self, *args):
  1231. return self.parser.parse(*args)
  1232. class _Parser:
  1233. def __init__(self, parse_table, callbacks, debug=False):
  1234. self.parse_table = parse_table
  1235. self.callbacks = callbacks
  1236. self.debug = debug
  1237. def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
  1238. token = None
  1239. stream = iter(seq)
  1240. states = self.parse_table.states
  1241. start_state = self.parse_table.start_states[start]
  1242. end_state = self.parse_table.end_states[start]
  1243. state_stack = state_stack or [start_state]
  1244. value_stack = value_stack or []
  1245. if set_state: set_state(start_state)
  1246. def get_action(token):
  1247. state = state_stack[-1]
  1248. try:
  1249. return states[state][token.type]
  1250. except KeyError:
  1251. expected = [s for s in states[state].keys() if s.isupper()]
  1252. try:
  1253. puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
  1254. except NameError:
  1255. puppet = None
  1256. raise UnexpectedToken(token, expected, state=state, puppet=puppet)
  1257. def reduce(rule):
  1258. size = len(rule.expansion)
  1259. if size:
  1260. s = value_stack[-size:]
  1261. del state_stack[-size:]
  1262. del value_stack[-size:]
  1263. else:
  1264. s = []
  1265. value = self.callbacks[rule](s)
  1266. _action, new_state = states[state_stack[-1]][rule.origin.name]
  1267. assert _action is Shift
  1268. state_stack.append(new_state)
  1269. value_stack.append(value)
  1270. # Main LALR-parser loop
  1271. try:
  1272. for token in stream:
  1273. while True:
  1274. action, arg = get_action(token)
  1275. assert arg != end_state
  1276. if action is Shift:
  1277. state_stack.append(arg)
  1278. value_stack.append(token)
  1279. if set_state: set_state(arg)
  1280. break # next token
  1281. else:
  1282. reduce(arg)
  1283. except Exception as e:
  1284. if self.debug:
  1285. print("")
  1286. print("STATE STACK DUMP")
  1287. print("----------------")
  1288. for i, s in enumerate(state_stack):
  1289. print('%d)' % i , s)
  1290. print("")
  1291. raise
  1292. token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
  1293. while True:
  1294. _action, arg = get_action(token)
  1295. assert(_action is Reduce)
  1296. reduce(arg)
  1297. if state_stack[-1] == end_state:
  1298. return value_stack[-1]
  1299. class Action:
  1300. def __init__(self, name):
  1301. self.name = name
  1302. def __str__(self):
  1303. return self.name
  1304. def __repr__(self):
  1305. return str(self)
  1306. Shift = Action('Shift')
  1307. Reduce = Action('Reduce')
  1308. class ParseTable:
  1309. def __init__(self, states, start_states, end_states):
  1310. self.states = states
  1311. self.start_states = start_states
  1312. self.end_states = end_states
  1313. def serialize(self, memo):
  1314. tokens = Enumerator()
  1315. rules = Enumerator()
  1316. states = {
  1317. state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
  1318. for token, (action, arg) in actions.items()}
  1319. for state, actions in self.states.items()
  1320. }
  1321. return {
  1322. 'tokens': tokens.reversed(),
  1323. 'states': states,
  1324. 'start_states': self.start_states,
  1325. 'end_states': self.end_states,
  1326. }
  1327. @classmethod
  1328. def deserialize(cls, data, memo):
  1329. tokens = data['tokens']
  1330. states = {
  1331. state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
  1332. for token, (action, arg) in actions.items()}
  1333. for state, actions in data['states'].items()
  1334. }
  1335. return cls(states, data['start_states'], data['end_states'])
  1336. class IntParseTable(ParseTable):
  1337. @classmethod
  1338. def from_ParseTable(cls, parse_table):
  1339. enum = list(parse_table.states)
  1340. state_to_idx = {s:i for i,s in enumerate(enum)}
  1341. int_states = {}
  1342. for s, la in parse_table.states.items():
  1343. la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
  1344. for k,v in la.items()}
  1345. int_states[ state_to_idx[s] ] = la
  1346. start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
  1347. end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
  1348. return cls(int_states, start_states, end_states)
  1349. def get_frontend(parser, lexer):
  1350. if parser=='lalr':
  1351. if lexer is None:
  1352. raise ValueError('The LALR parser requires use of a lexer')
  1353. elif lexer == 'standard':
  1354. return LALR_TraditionalLexer
  1355. elif lexer == 'contextual':
  1356. return LALR_ContextualLexer
  1357. elif issubclass(lexer, Lexer):
  1358. return partial(LALR_CustomLexer, lexer)
  1359. else:
  1360. raise ValueError('Unknown lexer: %s' % lexer)
  1361. elif parser=='earley':
  1362. if lexer=='standard':
  1363. return Earley
  1364. elif lexer=='dynamic':
  1365. return XEarley
  1366. elif lexer=='dynamic_complete':
  1367. return XEarley_CompleteLex
  1368. elif lexer=='contextual':
  1369. raise ValueError('The Earley parser does not support the contextual parser')
  1370. else:
  1371. raise ValueError('Unknown lexer: %s' % lexer)
  1372. elif parser == 'cyk':
  1373. if lexer == 'standard':
  1374. return CYK
  1375. else:
  1376. raise ValueError('CYK parser requires using standard parser.')
  1377. else:
  1378. raise ValueError('Unknown parser: %s' % parser)
  1379. class _ParserFrontend(Serialize):
  1380. def _parse(self, input, start, *args):
  1381. if start is None:
  1382. start = self.start
  1383. if len(start) > 1:
  1384. raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
  1385. start ,= start
  1386. return self.parser.parse(input, start, *args)
  1387. class WithLexer(_ParserFrontend):
  1388. lexer = None
  1389. parser = None
  1390. lexer_conf = None
  1391. start = None
  1392. __serialize_fields__ = 'parser', 'lexer_conf', 'start'
  1393. __serialize_namespace__ = LexerConf,
  1394. def __init__(self, lexer_conf, parser_conf, options=None):
  1395. self.lexer_conf = lexer_conf
  1396. self.start = parser_conf.start
  1397. self.postlex = lexer_conf.postlex
  1398. @classmethod
  1399. def deserialize(cls, data, memo, callbacks, postlex, re_module):
  1400. inst = super(WithLexer, cls).deserialize(data, memo)
  1401. inst.postlex = postlex
  1402. inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
  1403. inst.lexer_conf.re_module = re_module
  1404. inst.lexer_conf.skip_validation=True
  1405. inst.init_lexer()
  1406. return inst
  1407. def _serialize(self, data, memo):
  1408. data['parser'] = data['parser'].serialize(memo)
  1409. def lex(self, *args):
  1410. stream = self.lexer.lex(*args)
  1411. return self.postlex.process(stream) if self.postlex else stream
  1412. def parse(self, text, start=None):
  1413. token_stream = self.lex(text)
  1414. return self._parse(token_stream, start)
  1415. def init_traditional_lexer(self):
  1416. self.lexer = TraditionalLexer(self.lexer_conf)
  1417. class LALR_WithLexer(WithLexer):
  1418. def __init__(self, lexer_conf, parser_conf, options=None):
  1419. debug = options.debug if options else False
  1420. self.parser = LALR_Parser(parser_conf, debug=debug)
  1421. WithLexer.__init__(self, lexer_conf, parser_conf, options)
  1422. self.init_lexer()
  1423. def init_lexer(self, **kw):
  1424. raise NotImplementedError()
  1425. class LALR_TraditionalLexer(LALR_WithLexer):
  1426. def init_lexer(self):
  1427. self.init_traditional_lexer()
  1428. class LALR_ContextualLexer(LALR_WithLexer):
  1429. def init_lexer(self):
  1430. states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
  1431. always_accept = self.postlex.always_accept if self.postlex else ()
  1432. self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
  1433. def parse(self, text, start=None):
  1434. parser_state = [None]
  1435. def set_parser_state(s):
  1436. parser_state[0] = s
  1437. token_stream = self.lex(text, lambda: parser_state[0])
  1438. return self._parse(token_stream, start, set_parser_state)
  1439. class LarkOptions(Serialize):
  1440. """Specifies the options for Lark
  1441. """
  1442. OPTIONS_DOC = """
  1443. # General
  1444. start - The start symbol. Either a string, or a list of strings for
  1445. multiple possible starts (Default: "start")
  1446. debug - Display debug information, such as warnings (default: False)
  1447. transformer - Applies the transformer to every parse tree (equivlent to
  1448. applying it after the parse, but faster)
  1449. propagate_positions - Propagates (line, column, end_line, end_column)
  1450. attributes into all tree branches.
  1451. maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
  1452. When `False`, `[]` behaves like the `?` operator,
  1453. and returns no value at all.
  1454. (default=`False`. Recommended to set to `True`)
  1455. regex - When True, uses the `regex` module instead of the stdlib `re`.
  1456. cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
  1457. LALR only for now.
  1458. When `False`, does nothing (default)
  1459. When `True`, caches to a temporary file in the local directory
  1460. When given a string, caches to the path pointed by the string
  1461. g_regex_flags - Flags that are applied to all terminals
  1462. (both regex and strings)
  1463. keep_all_tokens - Prevent the tree builder from automagically
  1464. removing "punctuation" tokens (default: False)
  1465. # Algorithm
  1466. parser - Decides which parser engine to use
  1467. Accepts "earley" or "lalr". (Default: "earley")
  1468. (there is also a "cyk" option for legacy)
  1469. lexer - Decides whether or not to use a lexer stage
  1470. "auto" (default): Choose for me based on the parser
  1471. "standard": Use a standard lexer
  1472. "contextual": Stronger lexer (only works with parser="lalr")
  1473. "dynamic": Flexible and powerful (only with parser="earley")
  1474. "dynamic_complete": Same as dynamic, but tries *every* variation
  1475. of tokenizing possible.
  1476. ambiguity - Decides how to handle ambiguity in the parse.
  1477. Only relevant if parser="earley"
  1478. "resolve": The parser will automatically choose the simplest
  1479. derivation (it chooses consistently: greedy for
  1480. tokens, non-greedy for rules)
  1481. "explicit": The parser will return all derivations wrapped
  1482. in "_ambig" tree nodes (i.e. a forest).
  1483. # Domain Specific
  1484. postlex - Lexer post-processing (Default: None) Only works with the
  1485. standard and contextual lexers.
  1486. priority - How priorities should be evaluated - auto, none, normal,
  1487. invert (Default: auto)
  1488. lexer_callbacks - Dictionary of callbacks for the lexer. May alter
  1489. tokens during lexing. Use with caution.
  1490. edit_terminals - A callback
  1491. """
  1492. if __doc__:
  1493. __doc__ += OPTIONS_DOC
  1494. _defaults = {
  1495. 'debug': False,
  1496. 'keep_all_tokens': False,
  1497. 'tree_class': None,
  1498. 'cache': False,
  1499. 'postlex': None,
  1500. 'parser': 'earley',
  1501. 'lexer': 'auto',
  1502. 'transformer': None,
  1503. 'start': 'start',
  1504. 'priority': 'auto',
  1505. 'ambiguity': 'auto',
  1506. 'regex': False,
  1507. 'propagate_positions': False,
  1508. 'lexer_callbacks': {},
  1509. 'maybe_placeholders': False,
  1510. 'edit_terminals': None,
  1511. 'g_regex_flags': 0,
  1512. }
  1513. def __init__(self, options_dict):
  1514. o = dict(options_dict)
  1515. options = {}
  1516. for name, default in self._defaults.items():
  1517. if name in o:
  1518. value = o.pop(name)
  1519. if isinstance(default, bool) and name != 'cache':
  1520. value = bool(value)
  1521. else:
  1522. value = default
  1523. options[name] = value
  1524. if isinstance(options['start'], STRING_TYPE):
  1525. options['start'] = [options['start']]
  1526. self.__dict__['options'] = options
  1527. assert self.parser in ('earley', 'lalr', 'cyk', None)
  1528. if self.parser == 'earley' and self.transformer:
  1529. raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.'
  1530. 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
  1531. if o:
  1532. raise ValueError("Unknown options: %s" % o.keys())
  1533. def __getattr__(self, name):
  1534. try:
  1535. return self.options[name]
  1536. except KeyError as e:
  1537. raise AttributeError(e)
  1538. def __setattr__(self, name, value):
  1539. assert name in self.options
  1540. self.options[name] = value
  1541. def serialize(self, memo):
  1542. return self.options
  1543. @classmethod
  1544. def deserialize(cls, data, memo):
  1545. return cls(data)
  1546. class Lark(Serialize):
  1547. def __init__(self, grammar, **options):
  1548. """
  1549. grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
  1550. options : a dictionary controlling various aspects of Lark.
  1551. """
  1552. self.options = LarkOptions(options)
  1553. # Set regex or re module
  1554. use_regex = self.options.regex
  1555. if use_regex:
  1556. if regex:
  1557. re_module = regex
  1558. else:
  1559. raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
  1560. else:
  1561. re_module = re
  1562. # Some, but not all file-like objects have a 'name' attribute
  1563. try:
  1564. self.source = grammar.name
  1565. except AttributeError:
  1566. self.source = '<string>'
  1567. # Drain file-like objects to get their contents
  1568. try:
  1569. read = grammar.read
  1570. except AttributeError:
  1571. pass
  1572. else:
  1573. grammar = read()
  1574. assert isinstance(grammar, STRING_TYPE)
  1575. cache_fn = None
  1576. if self.options.cache:
  1577. if self.options.parser != 'lalr':
  1578. raise NotImplementedError("cache only works with parser='lalr' for now")
  1579. if isinstance(self.options.cache, STRING_TYPE):
  1580. cache_fn = self.options.cache
  1581. else:
  1582. if self.options.cache is not True:
  1583. raise ValueError("cache must be bool or str")
  1584. unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
  1585. from . import __version__
  1586. options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
  1587. s = grammar + options_str + __version__
  1588. md5 = hashlib.md5(s.encode()).hexdigest()
  1589. cache_fn = '.lark_cache_%s.tmp' % md5
  1590. if FS.exists(cache_fn):
  1591. logging.debug('Loading grammar from cache: %s', cache_fn)
  1592. with FS.open(cache_fn, 'rb') as f:
  1593. self._load(f, self.options.transformer, self.options.postlex)
  1594. return
  1595. if self.options.lexer == 'auto':
  1596. if self.options.parser == 'lalr':
  1597. self.options.lexer = 'contextual'
  1598. elif self.options.parser == 'earley':
  1599. self.options.lexer = 'dynamic'
  1600. elif self.options.parser == 'cyk':
  1601. self.options.lexer = 'standard'
  1602. else:
  1603. assert False, self.options.parser
  1604. lexer = self.options.lexer
  1605. assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)
  1606. if self.options.ambiguity == 'auto':
  1607. if self.options.parser == 'earley':
  1608. self.options.ambiguity = 'resolve'
  1609. else:
  1610. disambig_parsers = ['earley', 'cyk']
  1611. assert self.options.parser in disambig_parsers, (
  1612. 'Only %s supports disambiguation right now') % ', '.join(disambig_parsers)
  1613. if self.options.priority == 'auto':
  1614. if self.options.parser in ('earley', 'cyk', ):
  1615. self.options.priority = 'normal'
  1616. elif self.options.parser in ('lalr', ):
  1617. self.options.priority = None
  1618. elif self.options.priority in ('invert', 'normal'):
  1619. assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time"
  1620. assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority)
  1621. assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"'
  1622. assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
  1623. # Parse the grammar file and compose the grammars (TODO)
  1624. self.grammar = load_grammar(grammar, self.source, re_module)
  1625. # Compile the EBNF grammar into BNF
  1626. self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
  1627. if self.options.edit_terminals:
  1628. for t in self.terminals:
  1629. self.options.edit_terminals(t)
  1630. self._terminals_dict = {t.name:t for t in self.terminals}
  1631. # If the user asked to invert the priorities, negate them all here.
  1632. # This replaces the old 'resolve__antiscore_sum' option.
  1633. if self.options.priority == 'invert':
  1634. for rule in self.rules:
  1635. if rule.options.priority is not None:
  1636. rule.options.priority = -rule.options.priority
  1637. # Else, if the user asked to disable priorities, strip them from the
  1638. # rules. This allows the Earley parsers to skip an extra forest walk
  1639. # for improved performance, if you don't need them (or didn't specify any).
  1640. elif self.options.priority == None:
  1641. for rule in self.rules:
  1642. if rule.options.priority is not None:
  1643. rule.options.priority = None
  1644. # TODO Deprecate lexer_callbacks?
  1645. lexer_callbacks = dict(self.options.lexer_callbacks)
  1646. if self.options.transformer:
  1647. t = self.options.transformer
  1648. for term in self.terminals:
  1649. if hasattr(t, term.name):
  1650. lexer_callbacks[term.name] = getattr(t, term.name)
  1651. self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
  1652. if self.options.parser:
  1653. self.parser = self._build_parser()
  1654. elif lexer:
  1655. self.lexer = self._build_lexer()
  1656. if cache_fn:
  1657. logging.debug('Saving grammar to cache: %s', cache_fn)
  1658. with FS.open(cache_fn, 'wb') as f:
  1659. self.save(f)
  1660. if __init__.__doc__:
  1661. __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC
  1662. __serialize_fields__ = 'parser', 'rules', 'options'
  1663. def _build_lexer(self):
  1664. return TraditionalLexer(self.lexer_conf)
  1665. def _prepare_callbacks(self):
  1666. self.parser_class = get_frontend(self.options.parser, self.options.lexer)
  1667. self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
  1668. self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
  1669. def _build_parser(self):
  1670. self._prepare_callbacks()
  1671. parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
  1672. return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
  1673. def save(self, f):
  1674. data, m = self.memo_serialize([TerminalDef, Rule])
  1675. pickle.dump({'data': data, 'memo': m}, f)
  1676. @classmethod
  1677. def load(cls, f):
  1678. inst = cls.__new__(cls)
  1679. return inst._load(f)
  1680. def _load(self, f, transformer=None, postlex=None):
  1681. if isinstance(f, dict):
  1682. d = f
  1683. else:
  1684. d = pickle.load(f)
  1685. memo = d['memo']
  1686. data = d['data']
  1687. assert memo
  1688. memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
  1689. options = dict(data['options'])
  1690. if transformer is not None:
  1691. options['transformer'] = transformer
  1692. if postlex is not None:
  1693. options['postlex'] = postlex
  1694. self.options = LarkOptions.deserialize(options, memo)
  1695. re_module = regex if self.options.regex else re
  1696. self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
  1697. self.source = '<deserialized>'
  1698. self._prepare_callbacks()
  1699. self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
  1700. return self
  1701. @classmethod
  1702. def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
  1703. inst = cls.__new__(cls)
  1704. return inst._load({'data': data, 'memo': memo}, transformer, postlex)
  1705. @classmethod
  1706. def open(cls, grammar_filename, rel_to=None, **options):
  1707. """Create an instance of Lark with the grammar given by its filename
  1708. If rel_to is provided, the function will find the grammar filename in relation to it.
  1709. Example:
  1710. >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
  1711. Lark(...)
  1712. """
  1713. if rel_to:
  1714. basepath = os.path.dirname(rel_to)
  1715. grammar_filename = os.path.join(basepath, grammar_filename)
  1716. with open(grammar_filename, encoding='utf8') as f:
  1717. return cls(f, **options)
  1718. def __repr__(self):
  1719. return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
  1720. def lex(self, text):
  1721. "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
  1722. if not hasattr(self, 'lexer'):
  1723. self.lexer = self._build_lexer()
  1724. stream = self.lexer.lex(text)
  1725. if self.options.postlex:
  1726. return self.options.postlex.process(stream)
  1727. return stream
  1728. def get_terminal(self, name):
  1729. "Get information about a terminal"
  1730. return self._terminals_dict[name]
  1731. def parse(self, text, start=None, on_error=None):
  1732. """Parse the given text, according to the options provided.
  1733. Parameters:
  1734. start: str - required if Lark was given multiple possible start symbols (using the start option).
  1735. on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only.
  1736. Returns a tree, unless specified otherwise.
  1737. """
  1738. try:
  1739. return self.parser.parse(text, start=start)
  1740. except UnexpectedToken as e:
  1741. if on_error is None:
  1742. raise
  1743. while True:
  1744. if not on_error(e):
  1745. raise e
  1746. try:
  1747. return e.puppet.resume_parse()
  1748. except UnexpectedToken as e2:
  1749. e = e2
  1750. DATA = (
  1751. {'rules': [{'@': 23}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 19}, {'@': 14}, {'@': 27}, {'@': 28}, {'@': 16}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 21}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], 'g_regex_flags': 0, '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 23), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 11), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {0: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 
2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 21}), 1: (1, {'@': 21})}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22}), 15: (1, {'@': 22})}, 22: {3: (1, {'@': 23})}, 23: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 22), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'regex': False, 'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'start': ['start'], 'debug': False, 'postlex': None, 'parser': 'lalr', 'tree_class': None, 'priority': None, 'cache': False, 'g_regex_flags': 0, 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}}
  1752. )
  1753. MEMO = (
  1754. {0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(?<!\\\\)(\\\\\\\\)*?\\"'}, '__type__': 'TerminalDef', 'name': u'ESCAPED_STRING'}, 1: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [1, 4294967295], 'flags': [], 'value': u'(?:[ \t\x0c\r\n])+'}, '__type__': 'TerminalDef', 'name': u'WS'}, 2: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [1, 4294967295], 'flags': [], 'value': u'(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)'}, '__type__': 'TerminalDef', 'name': u'SIGNED_NUMBER'}, 3: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'true'}, '__type__': 'TerminalDef', 'name': u'TRUE'}, 4: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'false'}, '__type__': 'TerminalDef', 'name': u'FALSE'}, 5: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'null'}, '__type__': 'TerminalDef', 'name': u'NULL'}, 6: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u','}, '__type__': 'TerminalDef', 'name': 'COMMA'}, 7: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'['}, '__type__': 'TerminalDef', 'name': 'LSQB'}, 8: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u']'}, '__type__': 'TerminalDef', 'name': 'RSQB'}, 9: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'{'}, '__type__': 'TerminalDef', 'name': 'LBRACE'}, 10: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u'}'}, '__type__': 'TerminalDef', 'name': 'RBRACE'}, 11: {'priority': 1, 'pattern': {'__type__': 'PatternStr', 'flags': [], 'value': u':'}, '__type__': 'TerminalDef', 'name': 'COLON'}, 12: {'origin': {'__type__': 'NonTerminal', 'name': u'object'}, '__type__': 'Rule', 'expansion': 
[{'filter_out': True, '__type__': 'Terminal', 'name': 'LBRACE'}, {'__type__': 'NonTerminal', 'name': u'pair'}, {'__type__': 'NonTerminal', 'name': u'__object_star_1'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RBRACE'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 13: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'string'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': None, 'order': 2}, 14: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': u'FALSE'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': u'false', 'order': 5}, 15: {'origin': {'__type__': 'NonTerminal', 'name': u'__array_star_0'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'COMMA'}, {'__type__': 'NonTerminal', 'name': u'value'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 16: {'origin': {'__type__': 'NonTerminal', 'name': u'array'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'LSQB'}, {'__type__': 'NonTerminal', 'name': u'value'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RSQB'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 1}, 17: {'origin': {'__type__': 'NonTerminal', 'name': u'__object_star_1'}, '__type__': 'Rule', 
'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'COMMA'}, {'__type__': 'NonTerminal', 'name': u'pair'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 18: {'origin': {'__type__': 'NonTerminal', 'name': u'__object_star_1'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'__object_star_1'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'COMMA'}, {'__type__': 'NonTerminal', 'name': u'pair'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 1}, 19: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': u'TRUE'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': u'true', 'order': 4}, 20: {'origin': {'__type__': 'NonTerminal', 'name': u'pair'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'string'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'COLON'}, {'__type__': 'NonTerminal', 'name': u'value'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 21: {'origin': {'__type__': 'NonTerminal', 'name': u'__array_star_0'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'__array_star_0'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'COMMA'}, {'__type__': 'NonTerminal', 'name': u'value'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 1}, 22: {'origin': {'__type__': 
'NonTerminal', 'name': u'string'}, '__type__': 'Rule', 'expansion': [{'filter_out': False, '__type__': 'Terminal', 'name': u'ESCAPED_STRING'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 23: {'origin': {'__type__': 'NonTerminal', 'name': u'start'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'value'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': None, 'order': 0}, 24: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'filter_out': False, '__type__': 'Terminal', 'name': u'SIGNED_NUMBER'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': u'number', 'order': 3}, 25: {'origin': {'__type__': 'NonTerminal', 'name': u'object'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'LBRACE'}, {'__type__': 'NonTerminal', 'name': u'pair'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RBRACE'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 1}, 26: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'array'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': None, 'order': 1}, 27: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': u'NULL'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 
'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': u'null', 'order': 6}, 28: {'origin': {'__type__': 'NonTerminal', 'name': u'array'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'LSQB'}, {'__type__': 'NonTerminal', 'name': u'value'}, {'__type__': 'NonTerminal', 'name': u'__array_star_0'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RSQB'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': False}, 'alias': None, 'order': 0}, 29: {'origin': {'__type__': 'NonTerminal', 'name': u'array'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'LSQB'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RSQB'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': [False, True, False], 'expand1': False}, 'alias': None, 'order': 2}, 30: {'origin': {'__type__': 'NonTerminal', 'name': u'object'}, '__type__': 'Rule', 'expansion': [{'filter_out': True, '__type__': 'Terminal', 'name': 'LBRACE'}, {'filter_out': True, '__type__': 'Terminal', 'name': 'RBRACE'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': [False, True, False], 'expand1': False}, 'alias': None, 'order': 2}, 31: {'origin': {'__type__': 'NonTerminal', 'name': u'value'}, '__type__': 'Rule', 'expansion': [{'__type__': 'NonTerminal', 'name': u'object'}], 'options': {'template_source': None, '__type__': 'RuleOptions', 'priority': None, 'keep_all_tokens': False, 'empty_indices': (), 'expand1': True}, 'alias': None, 'order': 0}}
  1755. )
  1756. Shift = 0
  1757. Reduce = 1
  1758. def Lark_StandAlone(transformer=None, postlex=None):
  1759. return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)