This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2531 lines
87 KiB

  1. # -*- coding: utf-8 -*-
  2. from __future__ import absolute_import
  3. import re
  4. import unittest
  5. import logging
  6. import os
  7. import sys
  8. from copy import copy, deepcopy
  9. from lark.utils import Py36, isascii
  10. from lark import Token, Transformer_NonRecursive, LexError
  11. try:
  12. from cStringIO import StringIO as cStringIO
  13. except ImportError:
  14. # Available only in Python 2.x, 3.x only has io.StringIO from below
  15. cStringIO = None
  16. from io import (
  17. StringIO as uStringIO,
  18. BytesIO,
  19. open,
  20. )
  21. try:
  22. import regex
  23. except ImportError:
  24. regex = None
  25. import lark
  26. from lark import logger
  27. from lark.lark import Lark
  28. from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
  29. from lark.tree import Tree
  30. from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive
  31. from lark.grammar import Rule
  32. from lark.lexer import TerminalDef, Lexer, TraditionalLexer
  33. from lark.indenter import Indenter
  34. __all__ = ['TestParsers']
  35. __path__ = os.path.dirname(__file__)
  36. def _read(n, *args):
  37. with open(os.path.join(__path__, n), *args) as f:
  38. return f.read()
  39. class TestParsers(unittest.TestCase):
  40. def test_big_list(self):
  41. Lark(r"""
  42. start: {}
  43. """.format(
  44. "|".join(['"%s"'%i for i in range(250)])
  45. ))
  46. def test_same_ast(self):
  47. "Tests that Earley and LALR parsers produce equal trees"
  48. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  49. name_list: NAME | name_list "," NAME
  50. NAME: /\w+/ """, parser='lalr')
  51. l = g.parse('(a,b,c,*x)')
  52. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  53. name_list: NAME | name_list "," NAME
  54. NAME: /\w/+ """)
  55. l2 = g.parse('(a,b,c,*x)')
  56. assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
  57. def test_infinite_recurse(self):
  58. g = """start: a
  59. a: a | "a"
  60. """
  61. self.assertRaises(GrammarError, Lark, g, parser='lalr')
  62. # TODO: should it? shouldn't it?
  63. # l = Lark(g, parser='earley', lexer='dynamic')
  64. # self.assertRaises(ParseError, l.parse, 'a')
  65. def test_propagate_positions(self):
  66. g = Lark("""start: a
  67. a: "a"
  68. """, propagate_positions=True)
  69. r = g.parse('a')
  70. self.assertEqual( r.children[0].meta.line, 1 )
  71. g = Lark("""start: x
  72. x: a
  73. a: "a"
  74. """, propagate_positions=True)
  75. r = g.parse('a')
  76. self.assertEqual( r.children[0].meta.line, 1 )
  77. def test_expand1(self):
  78. g = Lark("""start: a
  79. ?a: b
  80. b: "x"
  81. """)
  82. r = g.parse('x')
  83. self.assertEqual( r.children[0].data, "b" )
  84. g = Lark("""start: a
  85. ?a: b -> c
  86. b: "x"
  87. """)
  88. r = g.parse('x')
  89. self.assertEqual( r.children[0].data, "c" )
  90. g = Lark("""start: a
  91. ?a: B -> c
  92. B: "x"
  93. """)
  94. self.assertEqual( r.children[0].data, "c" )
  95. g = Lark("""start: a
  96. ?a: b b -> c
  97. b: "x"
  98. """)
  99. r = g.parse('xx')
  100. self.assertEqual( r.children[0].data, "c" )
  101. def test_comment_in_rule_definition(self):
  102. g = Lark("""start: a
  103. a: "a"
  104. // A comment
  105. // Another comment
  106. | "b"
  107. // Still more
  108. c: "unrelated"
  109. """)
  110. r = g.parse('b')
  111. self.assertEqual( r.children[0].data, "a" )
  112. def test_visit_tokens(self):
  113. class T(Transformer):
  114. def a(self, children):
  115. return children[0] + "!"
  116. def A(self, tok):
  117. return tok.update(value=tok.upper())
  118. # Test regular
  119. g = """start: a
  120. a : A
  121. A: "x"
  122. """
  123. p = Lark(g, parser='lalr')
  124. r = T(False).transform(p.parse("x"))
  125. self.assertEqual( r.children, ["x!"] )
  126. r = T().transform(p.parse("x"))
  127. self.assertEqual( r.children, ["X!"] )
  128. # Test internal transformer
  129. p = Lark(g, parser='lalr', transformer=T())
  130. r = p.parse("x")
  131. self.assertEqual( r.children, ["X!"] )
  132. def test_visit_tokens2(self):
  133. g = """
  134. start: add+
  135. add: NUM "+" NUM
  136. NUM: /\d+/
  137. %ignore " "
  138. """
  139. text = "1+2 3+4"
  140. expected = Tree('start', [3, 7])
  141. for base in (Transformer, Transformer_InPlace, Transformer_NonRecursive, Transformer_InPlaceRecursive):
  142. class T(base):
  143. def add(self, children):
  144. return sum(children if isinstance(children, list) else children.children)
  145. def NUM(self, token):
  146. return int(token)
  147. parser = Lark(g, parser='lalr', transformer=T())
  148. result = parser.parse(text)
  149. self.assertEqual(result, expected)
  150. def test_vargs_meta(self):
  151. @v_args(meta=True)
  152. class T1(Transformer):
  153. def a(self, children, meta):
  154. assert not children
  155. return meta.line
  156. def start(self, children, meta):
  157. return children
  158. @v_args(meta=True, inline=True)
  159. class T2(Transformer):
  160. def a(self, meta):
  161. return meta.line
  162. def start(self, meta, *res):
  163. return list(res)
  164. for T in (T1, T2):
  165. for internal in [False, True]:
  166. try:
  167. g = Lark(r"""start: a+
  168. a : "x" _NL?
  169. _NL: /\n/+
  170. """, parser='lalr', transformer=T() if internal else None, propagate_positions=True)
  171. except NotImplementedError:
  172. assert internal
  173. continue
  174. res = g.parse("xx\nx\nxxx\n\n\nxx")
  175. assert not internal
  176. res = T().transform(res)
  177. self.assertEqual(res, [1, 1, 2, 3, 3, 3, 6, 6])
  178. def test_vargs_tree(self):
  179. tree = Lark('''
  180. start: a a a
  181. !a: "A"
  182. ''').parse('AAA')
  183. tree_copy = deepcopy(tree)
  184. @v_args(tree=True)
  185. class T(Transformer):
  186. def a(self, tree):
  187. return 1
  188. def start(self, tree):
  189. return tree.children
  190. res = T().transform(tree)
  191. self.assertEqual(res, [1, 1, 1])
  192. self.assertEqual(tree, tree_copy)
  193. def test_embedded_transformer(self):
  194. class T(Transformer):
  195. def a(self, children):
  196. return "<a>"
  197. def b(self, children):
  198. return "<b>"
  199. def c(self, children):
  200. return "<c>"
  201. # Test regular
  202. g = Lark("""start: a
  203. a : "x"
  204. """, parser='lalr')
  205. r = T().transform(g.parse("x"))
  206. self.assertEqual( r.children, ["<a>"] )
  207. g = Lark("""start: a
  208. a : "x"
  209. """, parser='lalr', transformer=T())
  210. r = g.parse("x")
  211. self.assertEqual( r.children, ["<a>"] )
  212. # Test Expand1
  213. g = Lark("""start: a
  214. ?a : b
  215. b : "x"
  216. """, parser='lalr')
  217. r = T().transform(g.parse("x"))
  218. self.assertEqual( r.children, ["<b>"] )
  219. g = Lark("""start: a
  220. ?a : b
  221. b : "x"
  222. """, parser='lalr', transformer=T())
  223. r = g.parse("x")
  224. self.assertEqual( r.children, ["<b>"] )
  225. # Test Expand1 -> Alias
  226. g = Lark("""start: a
  227. ?a : b b -> c
  228. b : "x"
  229. """, parser='lalr')
  230. r = T().transform(g.parse("xx"))
  231. self.assertEqual( r.children, ["<c>"] )
  232. g = Lark("""start: a
  233. ?a : b b -> c
  234. b : "x"
  235. """, parser='lalr', transformer=T())
  236. r = g.parse("xx")
  237. self.assertEqual( r.children, ["<c>"] )
  238. def test_embedded_transformer_inplace(self):
  239. @v_args(tree=True)
  240. class T1(Transformer_InPlace):
  241. def a(self, tree):
  242. assert isinstance(tree, Tree), tree
  243. tree.children.append("tested")
  244. return tree
  245. def b(self, tree):
  246. return Tree(tree.data, tree.children + ['tested2'])
  247. @v_args(tree=True)
  248. class T2(Transformer):
  249. def a(self, tree):
  250. assert isinstance(tree, Tree), tree
  251. tree.children.append("tested")
  252. return tree
  253. def b(self, tree):
  254. return Tree(tree.data, tree.children + ['tested2'])
  255. class T3(Transformer):
  256. @v_args(tree=True)
  257. def a(self, tree):
  258. assert isinstance(tree, Tree)
  259. tree.children.append("tested")
  260. return tree
  261. @v_args(tree=True)
  262. def b(self, tree):
  263. return Tree(tree.data, tree.children + ['tested2'])
  264. for t in [T1(), T2(), T3()]:
  265. for internal in [False, True]:
  266. g = Lark("""start: a b
  267. a : "x"
  268. b : "y"
  269. """, parser='lalr', transformer=t if internal else None)
  270. r = g.parse("xy")
  271. if not internal:
  272. r = t.transform(r)
  273. a, b = r.children
  274. self.assertEqual(a.children, ["tested"])
  275. self.assertEqual(b.children, ["tested2"])
  276. def test_alias(self):
  277. Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
  278. def test_backwards_custom_lexer(self):
  279. class OldCustomLexer(Lexer):
  280. def __init__(self, lexer_conf):
  281. pass
  282. def lex(self, text):
  283. yield Token('A', 'A')
  284. p = Lark("""
  285. start: A
  286. %declare A
  287. """, parser='lalr', lexer=OldCustomLexer)
  288. r = p.parse('')
  289. self.assertEqual(r, Tree('start', [Token('A', 'A')]))
  290. def _make_full_earley_test(LEXER):
  291. def _Lark(grammar, **kwargs):
  292. return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)
  293. class _TestFullEarley(unittest.TestCase):
  294. def test_anon(self):
  295. # Fails an Earley implementation without special handling for empty rules,
  296. # or re-processing of already completed rules.
  297. g = Lark(r"""start: B
  298. B: ("ab"|/[^b]/)+
  299. """, lexer=LEXER)
  300. self.assertEqual( g.parse('abc').children[0], 'abc')
  301. def test_earley(self):
  302. g = Lark("""start: A "b" c
  303. A: "a"+
  304. c: "abc"
  305. """, parser="earley", lexer=LEXER)
  306. x = g.parse('aaaababc')
  307. def test_earley2(self):
  308. grammar = """
  309. start: statement+
  310. statement: "r"
  311. | "c" /[a-z]/+
  312. %ignore " "
  313. """
  314. program = """c b r"""
  315. l = Lark(grammar, parser='earley', lexer=LEXER)
  316. l.parse(program)
  317. @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
  318. def test_earley3(self):
  319. """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)
  320. By default, `+` should immitate regexp greedy-matching
  321. """
  322. grammar = """
  323. start: A A
  324. A: "a"+
  325. """
  326. l = Lark(grammar, parser='earley', lexer=LEXER)
  327. res = l.parse("aaa")
  328. self.assertEqual(set(res.children), {'aa', 'a'})
  329. # XXX TODO fix Earley to maintain correct order
  330. # i.e. terminals it imitate greedy search for terminals, but lazy search for rules
  331. # self.assertEqual(res.children, ['aa', 'a'])
  332. def test_earley4(self):
  333. grammar = """
  334. start: A A?
  335. A: "a"+
  336. """
  337. l = Lark(grammar, parser='earley', lexer=LEXER)
  338. res = l.parse("aaa")
  339. assert set(res.children) == {'aa', 'a'} or res.children == ['aaa']
  340. # XXX TODO fix Earley to maintain correct order
  341. # i.e. terminals it imitate greedy search for terminals, but lazy search for rules
  342. # self.assertEqual(res.children, ['aaa'])
  343. def test_earley_repeating_empty(self):
  344. # This was a sneaky bug!
  345. grammar = """
  346. !start: "a" empty empty "b"
  347. empty: empty2
  348. empty2:
  349. """
  350. parser = Lark(grammar, parser='earley', lexer=LEXER)
  351. res = parser.parse('ab')
  352. empty_tree = Tree('empty', [Tree('empty2', [])])
  353. self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
  354. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  355. def test_earley_explicit_ambiguity(self):
  356. # This was a sneaky bug!
  357. grammar = """
  358. start: a b | ab
  359. a: "a"
  360. b: "b"
  361. ab: "ab"
  362. """
  363. parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
  364. ambig_tree = parser.parse('ab')
  365. self.assertEqual( ambig_tree.data, '_ambig')
  366. self.assertEqual( len(ambig_tree.children), 2)
  367. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  368. def test_ambiguity1(self):
  369. grammar = """
  370. start: cd+ "e"
  371. !cd: "c"
  372. | "d"
  373. | "cd"
  374. """
  375. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  376. ambig_tree = l.parse('cde')
  377. assert ambig_tree.data == '_ambig', ambig_tree
  378. assert len(ambig_tree.children) == 2
  379. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  380. def test_ambiguity2(self):
  381. grammar = """
  382. ANY: /[a-zA-Z0-9 ]+/
  383. a.2: "A" b+
  384. b.2: "B"
  385. c: ANY
  386. start: (a|c)*
  387. """
  388. l = Lark(grammar, parser='earley', lexer=LEXER)
  389. res = l.parse('ABX')
  390. expected = Tree('start', [
  391. Tree('a', [
  392. Tree('b', [])
  393. ]),
  394. Tree('c', [
  395. 'X'
  396. ])
  397. ])
  398. self.assertEqual(res, expected)
  399. def test_ambiguous_intermediate_node(self):
  400. grammar = """
  401. start: ab bc d?
  402. !ab: "A" "B"?
  403. !bc: "B"? "C"
  404. !d: "D"
  405. """
  406. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  407. ambig_tree = l.parse("ABCD")
  408. expected = {
  409. Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]),
  410. Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C']), Tree('d', ['D'])])
  411. }
  412. self.assertEqual(ambig_tree.data, '_ambig')
  413. self.assertEqual(set(ambig_tree.children), expected)
  414. def test_ambiguous_symbol_and_intermediate_nodes(self):
  415. grammar = """
  416. start: ab bc cd
  417. !ab: "A" "B"?
  418. !bc: "B"? "C"?
  419. !cd: "C"? "D"
  420. """
  421. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  422. ambig_tree = l.parse("ABCD")
  423. expected = {
  424. Tree('start', [
  425. Tree('ab', ['A', 'B']),
  426. Tree('bc', ['C']),
  427. Tree('cd', ['D'])
  428. ]),
  429. Tree('start', [
  430. Tree('ab', ['A', 'B']),
  431. Tree('bc', []),
  432. Tree('cd', ['C', 'D'])
  433. ]),
  434. Tree('start', [
  435. Tree('ab', ['A']),
  436. Tree('bc', ['B', 'C']),
  437. Tree('cd', ['D'])
  438. ]),
  439. Tree('start', [
  440. Tree('ab', ['A']),
  441. Tree('bc', ['B']),
  442. Tree('cd', ['C', 'D'])
  443. ]),
  444. }
  445. self.assertEqual(ambig_tree.data, '_ambig')
  446. self.assertEqual(set(ambig_tree.children), expected)
  447. def test_nested_ambiguous_intermediate_nodes(self):
  448. grammar = """
  449. start: ab bc cd e?
  450. !ab: "A" "B"?
  451. !bc: "B"? "C"?
  452. !cd: "C"? "D"
  453. !e: "E"
  454. """
  455. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  456. ambig_tree = l.parse("ABCDE")
  457. expected = {
  458. Tree('start', [
  459. Tree('ab', ['A', 'B']),
  460. Tree('bc', ['C']),
  461. Tree('cd', ['D']),
  462. Tree('e', ['E'])
  463. ]),
  464. Tree('start', [
  465. Tree('ab', ['A']),
  466. Tree('bc', ['B', 'C']),
  467. Tree('cd', ['D']),
  468. Tree('e', ['E'])
  469. ]),
  470. Tree('start', [
  471. Tree('ab', ['A']),
  472. Tree('bc', ['B']),
  473. Tree('cd', ['C', 'D']),
  474. Tree('e', ['E'])
  475. ]),
  476. Tree('start', [
  477. Tree('ab', ['A', 'B']),
  478. Tree('bc', []),
  479. Tree('cd', ['C', 'D']),
  480. Tree('e', ['E'])
  481. ]),
  482. }
  483. self.assertEqual(ambig_tree.data, '_ambig')
  484. self.assertEqual(set(ambig_tree.children), expected)
  485. def test_nested_ambiguous_intermediate_nodes2(self):
  486. grammar = """
  487. start: ab bc cd de f
  488. !ab: "A" "B"?
  489. !bc: "B"? "C"?
  490. !cd: "C"? "D"?
  491. !de: "D"? "E"
  492. !f: "F"
  493. """
  494. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  495. ambig_tree = l.parse("ABCDEF")
  496. expected = {
  497. Tree('start', [
  498. Tree('ab', ['A', 'B']),
  499. Tree('bc', ['C']),
  500. Tree('cd', ['D']),
  501. Tree('de', ['E']),
  502. Tree('f', ['F']),
  503. ]),
  504. Tree('start', [
  505. Tree('ab', ['A']),
  506. Tree('bc', ['B', 'C']),
  507. Tree('cd', ['D']),
  508. Tree('de', ['E']),
  509. Tree('f', ['F']),
  510. ]),
  511. Tree('start', [
  512. Tree('ab', ['A']),
  513. Tree('bc', ['B']),
  514. Tree('cd', ['C', 'D']),
  515. Tree('de', ['E']),
  516. Tree('f', ['F']),
  517. ]),
  518. Tree('start', [
  519. Tree('ab', ['A']),
  520. Tree('bc', ['B']),
  521. Tree('cd', ['C']),
  522. Tree('de', ['D', 'E']),
  523. Tree('f', ['F']),
  524. ]),
  525. Tree('start', [
  526. Tree('ab', ['A', "B"]),
  527. Tree('bc', []),
  528. Tree('cd', ['C']),
  529. Tree('de', ['D', 'E']),
  530. Tree('f', ['F']),
  531. ]),
  532. Tree('start', [
  533. Tree('ab', ['A']),
  534. Tree('bc', ['B', 'C']),
  535. Tree('cd', []),
  536. Tree('de', ['D', 'E']),
  537. Tree('f', ['F']),
  538. ]),
  539. Tree('start', [
  540. Tree('ab', ['A', 'B']),
  541. Tree('bc', []),
  542. Tree('cd', ['C', 'D']),
  543. Tree('de', ['E']),
  544. Tree('f', ['F']),
  545. ]),
  546. Tree('start', [
  547. Tree('ab', ['A', 'B']),
  548. Tree('bc', ['C']),
  549. Tree('cd', []),
  550. Tree('de', ['D', 'E']),
  551. Tree('f', ['F']),
  552. ]),
  553. }
  554. self.assertEqual(ambig_tree.data, '_ambig')
  555. self.assertEqual(set(ambig_tree.children), expected)
  556. def test_ambiguous_intermediate_node_unnamed_token(self):
  557. grammar = """
  558. start: ab bc "D"
  559. !ab: "A" "B"?
  560. !bc: "B"? "C"
  561. """
  562. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  563. ambig_tree = l.parse("ABCD")
  564. expected = {
  565. Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C'])]),
  566. Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C'])])
  567. }
  568. self.assertEqual(ambig_tree.data, '_ambig')
  569. self.assertEqual(set(ambig_tree.children), expected)
  570. def test_ambiguous_intermediate_node_inlined_rule(self):
  571. grammar = """
  572. start: ab _bc d?
  573. !ab: "A" "B"?
  574. _bc: "B"? "C"
  575. !d: "D"
  576. """
  577. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  578. ambig_tree = l.parse("ABCD")
  579. expected = {
  580. Tree('start', [Tree('ab', ['A']), Tree('d', ['D'])]),
  581. Tree('start', [Tree('ab', ['A', 'B']), Tree('d', ['D'])])
  582. }
  583. self.assertEqual(ambig_tree.data, '_ambig')
  584. self.assertEqual(set(ambig_tree.children), expected)
  585. def test_ambiguous_intermediate_node_conditionally_inlined_rule(self):
  586. grammar = """
  587. start: ab bc d?
  588. !ab: "A" "B"?
  589. !?bc: "B"? "C"
  590. !d: "D"
  591. """
  592. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  593. ambig_tree = l.parse("ABCD")
  594. expected = {
  595. Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]),
  596. Tree('start', [Tree('ab', ['A', 'B']), 'C', Tree('d', ['D'])])
  597. }
  598. self.assertEqual(ambig_tree.data, '_ambig')
  599. self.assertEqual(set(ambig_tree.children), expected)
  600. def test_fruitflies_ambig(self):
  601. grammar = """
  602. start: noun verb noun -> simple
  603. | noun verb "like" noun -> comparative
  604. noun: adj? NOUN
  605. verb: VERB
  606. adj: ADJ
  607. NOUN: "flies" | "bananas" | "fruit"
  608. VERB: "like" | "flies"
  609. ADJ: "fruit"
  610. %import common.WS
  611. %ignore WS
  612. """
  613. parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
  614. tree = parser.parse('fruit flies like bananas')
  615. expected = Tree('_ambig', [
  616. Tree('comparative', [
  617. Tree('noun', ['fruit']),
  618. Tree('verb', ['flies']),
  619. Tree('noun', ['bananas'])
  620. ]),
  621. Tree('simple', [
  622. Tree('noun', [Tree('adj', ['fruit']), 'flies']),
  623. Tree('verb', ['like']),
  624. Tree('noun', ['bananas'])
  625. ])
  626. ])
  627. # self.assertEqual(tree, expected)
  628. self.assertEqual(tree.data, expected.data)
  629. self.assertEqual(set(tree.children), set(expected.children))
  630. @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
  631. def test_explicit_ambiguity2(self):
  632. grammar = r"""
  633. start: NAME+
  634. NAME: /\w+/
  635. %ignore " "
  636. """
  637. text = """cat"""
  638. parser = _Lark(grammar, start='start', ambiguity='explicit')
  639. tree = parser.parse(text)
  640. self.assertEqual(tree.data, '_ambig')
  641. combinations = {tuple(str(s) for s in t.children) for t in tree.children}
  642. self.assertEqual(combinations, {
  643. ('cat',),
  644. ('ca', 't'),
  645. ('c', 'at'),
  646. ('c', 'a' ,'t')
  647. })
  648. def test_term_ambig_resolve(self):
  649. grammar = r"""
  650. !start: NAME+
  651. NAME: /\w+/
  652. %ignore " "
  653. """
  654. text = """foo bar"""
  655. parser = Lark(grammar)
  656. tree = parser.parse(text)
  657. self.assertEqual(tree.children, ['foo', 'bar'])
  658. def test_cycle(self):
  659. grammar = """
  660. start: start?
  661. """
  662. l = Lark(grammar, ambiguity='resolve', lexer=LEXER)
  663. tree = l.parse('')
  664. self.assertEqual(tree, Tree('start', []))
  665. l = Lark(grammar, ambiguity='explicit', lexer=LEXER)
  666. tree = l.parse('')
  667. self.assertEqual(tree, Tree('start', []))
  668. def test_cycle2(self):
  669. grammar = """
  670. start: _operation
  671. _operation: value
  672. value: "b"
  673. | "a" value
  674. | _operation
  675. """
  676. l = Lark(grammar, ambiguity="explicit", lexer=LEXER)
  677. tree = l.parse("ab")
  678. self.assertEqual(tree, Tree('start', [Tree('value', [Tree('value', [])])]))
  679. def test_cycles(self):
  680. grammar = """
  681. a: b
  682. b: c*
  683. c: a
  684. """
  685. l = Lark(grammar, start='a', ambiguity='resolve', lexer=LEXER)
  686. tree = l.parse('')
  687. self.assertEqual(tree, Tree('a', [Tree('b', [])]))
  688. l = Lark(grammar, start='a', ambiguity='explicit', lexer=LEXER)
  689. tree = l.parse('')
  690. self.assertEqual(tree, Tree('a', [Tree('b', [])]))
  691. def test_many_cycles(self):
  692. grammar = """
  693. start: a? | start start
  694. !a: "a"
  695. """
  696. l = Lark(grammar, ambiguity='resolve', lexer=LEXER)
  697. tree = l.parse('a')
  698. self.assertEqual(tree, Tree('start', [Tree('a', ['a'])]))
  699. l = Lark(grammar, ambiguity='explicit', lexer=LEXER)
  700. tree = l.parse('a')
  701. self.assertEqual(tree, Tree('start', [Tree('a', ['a'])]))
  702. def test_cycles_with_child_filter(self):
  703. grammar = """
  704. a: _x
  705. _x: _x? b
  706. b:
  707. """
  708. grammar2 = """
  709. a: x
  710. x: x? b
  711. b:
  712. """
  713. l = Lark(grammar, start='a', ambiguity='resolve', lexer=LEXER)
  714. tree = l.parse('')
  715. self.assertEqual(tree, Tree('a', [Tree('b', [])]))
  716. l = Lark(grammar, start='a', ambiguity='explicit', lexer=LEXER)
  717. tree = l.parse('');
  718. self.assertEqual(tree, Tree('a', [Tree('b', [])]))
  719. l = Lark(grammar2, start='a', ambiguity='resolve', lexer=LEXER)
  720. tree = l.parse('');
  721. self.assertEqual(tree, Tree('a', [Tree('x', [Tree('b', [])])]))
  722. l = Lark(grammar2, start='a', ambiguity='explicit', lexer=LEXER)
  723. tree = l.parse('');
  724. self.assertEqual(tree, Tree('a', [Tree('x', [Tree('b', [])])]))
  725. # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
  726. # def test_not_all_derivations(self):
  727. # grammar = """
  728. # start: cd+ "e"
  729. # !cd: "c"
  730. # | "d"
  731. # | "cd"
  732. # """
  733. # l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
  734. # x = l.parse('cde')
  735. # assert x.data != '_ambig', x
  736. # assert len(x.children) == 1
  737. _NAME = "TestFullEarley" + LEXER.capitalize()
  738. _TestFullEarley.__name__ = _NAME
  739. globals()[_NAME] = _TestFullEarley
  740. __all__.append(_NAME)
  741. class CustomLexerNew(Lexer):
  742. """
  743. Purpose of this custom lexer is to test the integration,
  744. so it uses the traditionalparser as implementation without custom lexing behaviour.
  745. """
  746. def __init__(self, lexer_conf):
  747. self.lexer = TraditionalLexer(copy(lexer_conf))
  748. def lex(self, lexer_state, parser_state):
  749. return self.lexer.lex(lexer_state, parser_state)
  750. __future_interface__ = True
  751. class CustomLexerOld(Lexer):
  752. """
  753. Purpose of this custom lexer is to test the integration,
  754. so it uses the traditionalparser as implementation without custom lexing behaviour.
  755. """
  756. def __init__(self, lexer_conf):
  757. self.lexer = TraditionalLexer(copy(lexer_conf))
  758. def lex(self, text):
  759. ls = self.lexer.make_lexer_state(text)
  760. return self.lexer.lex(ls, None)
  761. __future_interface__ = False
  762. def _tree_structure_check(a, b):
  763. """
  764. Checks that both Tree objects have the same structure, without checking their values.
  765. """
  766. assert a.data == b.data and len(a.children) == len(b.children)
  767. for ca,cb in zip(a.children, b.children):
  768. assert type(ca) == type(cb)
  769. if isinstance(ca, Tree):
  770. _tree_structure_check(ca, cb)
  771. elif isinstance(ca, Token):
  772. assert ca.type == cb.type
  773. else:
  774. assert ca == cb
  775. class DualBytesLark:
  776. """
  777. A helper class that wraps both a normal parser, and a parser for bytes.
  778. It automatically transforms `.parse` calls for both lexer, returning the value from the text lexer
  779. It always checks that both produce the same output/error
  780. NOTE: Not currently used, but left here for future debugging.
  781. """
  782. def __init__(self, g, *args, **kwargs):
  783. self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
  784. g = self.text_lexer.grammar_source.lower()
  785. if '\\u' in g or not isascii(g):
  786. # Bytes re can't deal with uniode escapes
  787. self.bytes_lark = None
  788. else:
  789. # Everything here should work, so use `use_bytes='force'`
  790. self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)
  791. def parse(self, text, start=None):
  792. # TODO: Easy workaround, more complex checks would be beneficial
  793. if not isascii(text) or self.bytes_lark is None:
  794. return self.text_lexer.parse(text, start)
  795. try:
  796. rv = self.text_lexer.parse(text, start)
  797. except Exception as e:
  798. try:
  799. self.bytes_lark.parse(text.encode(), start)
  800. except Exception as be:
  801. assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
  802. raise e
  803. assert False, "Parser without `use_bytes` raises exception, with doesn't"
  804. try:
  805. bv = self.bytes_lark.parse(text.encode(), start)
  806. except Exception as be:
  807. assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
  808. _tree_structure_check(rv, bv)
  809. return rv
  810. @classmethod
  811. def open(cls, grammar_filename, rel_to=None, **options):
  812. if rel_to:
  813. basepath = os.path.dirname(rel_to)
  814. grammar_filename = os.path.join(basepath, grammar_filename)
  815. with open(grammar_filename, encoding='utf8') as f:
  816. return cls(f, **options)
  817. def save(self,f):
  818. self.text_lexer.save(f)
  819. if self.bytes_lark is not None:
  820. self.bytes_lark.save(f)
  821. def load(self,f):
  822. self.text_lexer = self.text_lexer.load(f)
  823. if self.bytes_lark is not None:
  824. self.bytes_lark.load(f)
  825. def _make_parser_test(LEXER, PARSER):
  826. lexer_class_or_name = {
  827. 'custom_new': CustomLexerNew,
  828. 'custom_old': CustomLexerOld,
  829. }.get(LEXER, LEXER)
  830. def _Lark(grammar, **kwargs):
  831. return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
  832. def _Lark_open(gfilename, **kwargs):
  833. return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
  834. if (LEXER, PARSER) == ('standard', 'earley'):
  835. # Check that the `lark.lark` grammar represents can parse every example used in these tests.
  836. # Standard-Earley was an arbitrary choice, to make sure it only ran once.
  837. lalr_parser = Lark.open(os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark'), parser='lalr')
  838. def wrap_with_test_grammar(f):
  839. def _f(x, **kwargs):
  840. inst = f(x, **kwargs)
  841. lalr_parser.parse(inst.source_grammar) # Test after instance creation. When the grammar should fail, don't test it.
  842. return inst
  843. return _f
  844. _Lark = wrap_with_test_grammar(_Lark)
  845. _Lark_open = wrap_with_test_grammar(_Lark_open)
  846. class _TestParser(unittest.TestCase):
  847. def test_basic1(self):
  848. g = _Lark("""start: a+ b a* "b" a*
  849. b: "b"
  850. a: "a"
  851. """)
  852. r = g.parse('aaabaab')
  853. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
  854. r = g.parse('aaabaaba')
  855. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )
  856. self.assertRaises(ParseError, g.parse, 'aaabaa')
  857. def test_basic2(self):
  858. # Multiple parsers and colliding tokens
  859. g = _Lark("""start: B A
  860. B: "12"
  861. A: "1" """)
  862. g2 = _Lark("""start: B A
  863. B: "12"
  864. A: "2" """)
  865. x = g.parse('121')
  866. assert x.data == 'start' and x.children == ['12', '1'], x
  867. x = g2.parse('122')
  868. assert x.data == 'start' and x.children == ['12', '2'], x
  869. @unittest.skipIf(cStringIO is None, "cStringIO not available")
  870. def test_stringio_bytes(self):
  871. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  872. _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  873. def test_stringio_unicode(self):
  874. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  875. _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  876. def test_unicode(self):
  877. g = _Lark(u"""start: UNIA UNIB UNIA
  878. UNIA: /\xa3/
  879. UNIB: /\u0101/
  880. """)
  881. g.parse(u'\xa3\u0101\u00a3')
  882. def test_unicode2(self):
  883. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  884. UNIA: /\xa3/
  885. UNIB: "a\u0101b\ "
  886. UNIC: /a?\u0101c\n/
  887. """)
  888. g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')
  889. def test_unicode3(self):
  890. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  891. UNIA: /\xa3/
  892. UNIB: "\u0101"
  893. UNIC: /\u0203/ /\n/
  894. """)
  895. g.parse(u'\xa3\u0101\u00a3\u0203\n')
  896. def test_unicode4(self):
  897. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  898. UNIA: /\xa3/
  899. UNIB: "\U0010FFFF"
  900. UNIC: /\U00100000/ /\n/
  901. """)
  902. g.parse(u'\xa3\U0010FFFF\u00a3\U00100000\n')
  903. def test_hex_escape(self):
  904. g = _Lark(r"""start: A B C
  905. A: "\x01"
  906. B: /\x02/
  907. C: "\xABCD"
  908. """)
  909. g.parse('\x01\x02\xABCD')
  910. def test_unicode_literal_range_escape(self):
  911. g = _Lark(r"""start: A+
  912. A: "\u0061".."\u0063"
  913. """)
  914. g.parse('abc')
  915. @unittest.skipIf(sys.version_info < (3, 3), "re package did not support 32bit unicode escape sequence before Python 3.3")
  916. def test_unicode_literal_range_escape2(self):
  917. g = _Lark(r"""start: A+
  918. A: "\U0000FFFF".."\U00010002"
  919. """)
  920. g.parse('\U0000FFFF\U00010000\U00010001\U00010002')
  921. def test_hex_literal_range_escape(self):
  922. g = _Lark(r"""start: A+
  923. A: "\x01".."\x03"
  924. """)
  925. g.parse('\x01\x02\x03')
  926. @unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4),
  927. "bytes parser isn't perfect in Python2, exceptions don't work correctly")
  928. def test_bytes_utf8(self):
  929. g = r"""
  930. start: BOM? char+
  931. BOM: "\xef\xbb\xbf"
  932. char: CHAR1 | CHAR2 | CHAR3 | CHAR4
  933. CONTINUATION_BYTE: "\x80" .. "\xbf"
  934. CHAR1: "\x00" .. "\x7f"
  935. CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
  936. CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
  937. CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
  938. """
  939. g = _Lark(g, use_bytes=True)
  940. s = u"🔣 地? gurīn".encode('utf-8')
  941. self.assertEqual(len(g.parse(s).children), 10)
  942. for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
  943. ("sjis", u"売春婦"),
  944. ("euc-jp", u"乂鵬鵠")]:
  945. s = j.encode(enc)
  946. self.assertRaises(UnexpectedCharacters, g.parse, s)
  947. @unittest.skipIf(PARSER == 'cyk', "Takes forever")
  948. def test_stack_for_ebnf(self):
  949. """Verify that stack depth isn't an issue for EBNF grammars"""
  950. g = _Lark(r"""start: a+
  951. a : "a" """)
  952. g.parse("a" * (sys.getrecursionlimit()*2 ))
  953. def test_expand1_lists_with_one_item(self):
  954. g = _Lark(r"""start: list
  955. ?list: item+
  956. item : A
  957. A: "a"
  958. """)
  959. r = g.parse("a")
  960. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  961. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  962. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  963. self.assertEqual(len(r.children), 1)
  964. def test_expand1_lists_with_one_item_2(self):
  965. g = _Lark(r"""start: list
  966. ?list: item+ "!"
  967. item : A
  968. A: "a"
  969. """)
  970. r = g.parse("a!")
  971. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  972. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  973. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  974. self.assertEqual(len(r.children), 1)
  975. def test_dont_expand1_lists_with_multiple_items(self):
  976. g = _Lark(r"""start: list
  977. ?list: item+
  978. item : A
  979. A: "a"
  980. """)
  981. r = g.parse("aa")
  982. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  983. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  984. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  985. self.assertEqual(len(r.children), 1)
  986. # Sanity check: verify that 'list' contains the two 'item's we've given it
  987. [list] = r.children
  988. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  989. def test_dont_expand1_lists_with_multiple_items_2(self):
  990. g = _Lark(r"""start: list
  991. ?list: item+ "!"
  992. item : A
  993. A: "a"
  994. """)
  995. r = g.parse("aa!")
  996. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  997. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  998. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  999. self.assertEqual(len(r.children), 1)
  1000. # Sanity check: verify that 'list' contains the two 'item's we've given it
  1001. [list] = r.children
  1002. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  1003. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1004. def test_empty_expand1_list(self):
  1005. g = _Lark(r"""start: list
  1006. ?list: item*
  1007. item : A
  1008. A: "a"
  1009. """)
  1010. r = g.parse("")
  1011. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  1012. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  1013. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  1014. self.assertEqual(len(r.children), 1)
  1015. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  1016. [list] = r.children
  1017. self.assertSequenceEqual([item.data for item in list.children], ())
  1018. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1019. def test_empty_expand1_list_2(self):
  1020. g = _Lark(r"""start: list
  1021. ?list: item* "!"?
  1022. item : A
  1023. A: "a"
  1024. """)
  1025. r = g.parse("")
  1026. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  1027. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  1028. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  1029. self.assertEqual(len(r.children), 1)
  1030. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  1031. [list] = r.children
  1032. self.assertSequenceEqual([item.data for item in list.children], ())
  1033. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1034. def test_empty_flatten_list(self):
  1035. g = _Lark(r"""start: list
  1036. list: | item "," list
  1037. item : A
  1038. A: "a"
  1039. """)
  1040. r = g.parse("")
  1041. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  1042. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  1043. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  1044. [list] = r.children
  1045. self.assertSequenceEqual([item.data for item in list.children], ())
  1046. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  1047. def test_single_item_flatten_list(self):
  1048. g = _Lark(r"""start: list
  1049. list: | item "," list
  1050. item : A
  1051. A: "a"
  1052. """)
  1053. r = g.parse("a,")
  1054. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  1055. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  1056. # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
  1057. [list] = r.children
  1058. self.assertSequenceEqual([item.data for item in list.children], ('item',))
  1059. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  1060. def test_multiple_item_flatten_list(self):
  1061. g = _Lark(r"""start: list
  1062. #list: | item "," list
  1063. item : A
  1064. A: "a"
  1065. """)
  1066. r = g.parse("a,a,")
  1067. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  1068. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  1069. # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
  1070. [list] = r.children
  1071. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  1072. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  1073. def test_recurse_flatten(self):
  1074. """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
  1075. g = _Lark(r"""start: a | start a
  1076. a : A
  1077. A : "a" """)
  1078. # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
  1079. # STree data structures, which uses recursion).
  1080. g.parse("a" * (sys.getrecursionlimit() // 4))
  1081. def test_token_collision(self):
  1082. g = _Lark(r"""start: "Hello" NAME
  1083. NAME: /\w/+
  1084. %ignore " "
  1085. """)
  1086. x = g.parse('Hello World')
  1087. self.assertSequenceEqual(x.children, ['World'])
  1088. x = g.parse('Hello HelloWorld')
  1089. self.assertSequenceEqual(x.children, ['HelloWorld'])
  1090. def test_token_collision_WS(self):
  1091. g = _Lark(r"""start: "Hello" NAME
  1092. NAME: /\w/+
  1093. %import common.WS
  1094. %ignore WS
  1095. """)
  1096. x = g.parse('Hello World')
  1097. self.assertSequenceEqual(x.children, ['World'])
  1098. x = g.parse('Hello HelloWorld')
  1099. self.assertSequenceEqual(x.children, ['HelloWorld'])
  1100. def test_token_collision2(self):
  1101. g = _Lark("""
  1102. !start: "starts"
  1103. %import common.LCASE_LETTER
  1104. """)
  1105. x = g.parse("starts")
  1106. self.assertSequenceEqual(x.children, ['starts'])
  1107. def test_templates(self):
  1108. g = _Lark(r"""
  1109. start: "[" sep{NUMBER, ","} "]"
  1110. sep{item, delim}: item (delim item)*
  1111. NUMBER: /\d+/
  1112. %ignore " "
  1113. """)
  1114. x = g.parse("[1, 2, 3, 4]")
  1115. self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
  1116. x = g.parse("[1]")
  1117. self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])
  1118. def test_templates_recursion(self):
  1119. g = _Lark(r"""
  1120. start: "[" _sep{NUMBER, ","} "]"
  1121. _sep{item, delim}: item | _sep{item, delim} delim item
  1122. NUMBER: /\d+/
  1123. %ignore " "
  1124. """)
  1125. x = g.parse("[1, 2, 3, 4]")
  1126. self.assertSequenceEqual(x.children, ['1', '2', '3', '4'])
  1127. x = g.parse("[1]")
  1128. self.assertSequenceEqual(x.children, ['1'])
  1129. def test_templates_import(self):
  1130. g = _Lark_open("test_templates_import.lark", rel_to=__file__)
  1131. x = g.parse("[1, 2, 3, 4]")
  1132. self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
  1133. x = g.parse("[1]")
  1134. self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])
  1135. def test_templates_alias(self):
  1136. g = _Lark(r"""
  1137. start: expr{"C"}
  1138. expr{t}: "A" t
  1139. | "B" t -> b
  1140. """)
  1141. x = g.parse("AC")
  1142. self.assertSequenceEqual(x.children, [Tree('expr', [])])
  1143. x = g.parse("BC")
  1144. self.assertSequenceEqual(x.children, [Tree('b', [])])
  1145. def test_templates_modifiers(self):
  1146. g = _Lark(r"""
  1147. start: expr{"B"}
  1148. !expr{t}: "A" t
  1149. """)
  1150. x = g.parse("AB")
  1151. self.assertSequenceEqual(x.children, [Tree('expr', ["A", "B"])])
  1152. g = _Lark(r"""
  1153. start: _expr{"B"}
  1154. !_expr{t}: "A" t
  1155. """)
  1156. x = g.parse("AB")
  1157. self.assertSequenceEqual(x.children, ["A", "B"])
  1158. g = _Lark(r"""
  1159. start: expr{b}
  1160. b: "B"
  1161. ?expr{t}: "A" t
  1162. """)
  1163. x = g.parse("AB")
  1164. self.assertSequenceEqual(x.children, [Tree('b',[])])
  1165. def test_templates_templates(self):
  1166. g = _Lark('''start: a{b}
  1167. a{t}: t{"a"}
  1168. b{x}: x''')
  1169. x = g.parse('a')
  1170. self.assertSequenceEqual(x.children, [Tree('a', [Tree('b',[])])])
  1171. def test_g_regex_flags(self):
  1172. g = _Lark("""
  1173. start: "a" /b+/ C
  1174. C: "C" | D
  1175. D: "D" E
  1176. E: "e"
  1177. """, g_regex_flags=re.I)
  1178. x1 = g.parse("ABBc")
  1179. x2 = g.parse("abdE")
  1180. # def test_string_priority(self):
  1181. # g = _Lark("""start: (A | /a?bb/)+
  1182. # A: "a" """)
  1183. # x = g.parse('abb')
  1184. # self.assertEqual(len(x.children), 2)
  1185. # # This parse raises an exception because the lexer will always try to consume
  1186. # # "a" first and will never match the regular expression
  1187. # # This behavior is subject to change!!
  1188. # # Thie won't happen with ambiguity handling.
  1189. # g = _Lark("""start: (A | /a?ab/)+
  1190. # A: "a" """)
  1191. # self.assertRaises(LexError, g.parse, 'aab')
  1192. def test_rule_collision(self):
  1193. g = _Lark("""start: "a"+ "b"
  1194. | "a"+ """)
  1195. x = g.parse('aaaa')
  1196. x = g.parse('aaaab')
  1197. def test_rule_collision2(self):
  1198. g = _Lark("""start: "a"* "b"
  1199. | "a"+ """)
  1200. x = g.parse('aaaa')
  1201. x = g.parse('aaaab')
  1202. x = g.parse('b')
  1203. def test_token_not_anon(self):
  1204. """Tests that "a" is matched as an anonymous token, and not A.
  1205. """
  1206. g = _Lark("""start: "a"
  1207. A: "a" """)
  1208. x = g.parse('a')
  1209. self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')
  1210. g = _Lark("""start: "a" A
  1211. A: "a" """)
  1212. x = g.parse('aa')
  1213. self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
  1214. self.assertEqual(x.children[0].type, "A")
  1215. g = _Lark("""start: /a/
  1216. A: /a/ """)
  1217. x = g.parse('a')
  1218. self.assertEqual(len(x.children), 1)
  1219. self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
  1220. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1221. def test_maybe(self):
  1222. g = _Lark("""start: ["a"] """)
  1223. x = g.parse('a')
  1224. x = g.parse('')
  1225. def test_start(self):
  1226. g = _Lark("""a: "a" a? """, start='a')
  1227. x = g.parse('a')
  1228. x = g.parse('aa')
  1229. x = g.parse('aaa')
  1230. def test_alias(self):
  1231. g = _Lark("""start: "a" -> b """)
  1232. x = g.parse('a')
  1233. self.assertEqual(x.data, "b")
  1234. def test_token_ebnf(self):
  1235. g = _Lark("""start: A
  1236. A: "a"* ("b"? "c".."e")+
  1237. """)
  1238. x = g.parse('abcde')
  1239. x = g.parse('dd')
  1240. def test_backslash(self):
  1241. g = _Lark(r"""start: "\\" "a"
  1242. """)
  1243. x = g.parse(r'\a')
  1244. g = _Lark(r"""start: /\\/ /a/
  1245. """)
  1246. x = g.parse(r'\a')
  1247. def test_backslash2(self):
  1248. g = _Lark(r"""start: "\"" "-"
  1249. """)
  1250. x = g.parse('"-')
  1251. g = _Lark(r"""start: /\// /-/
  1252. """)
  1253. x = g.parse('/-')
  1254. def test_special_chars(self):
  1255. g = _Lark(r"""start: "\n"
  1256. """)
  1257. x = g.parse('\n')
  1258. g = _Lark(r"""start: /\n/
  1259. """)
  1260. x = g.parse('\n')
  1261. # def test_token_recurse(self):
  1262. # g = _Lark("""start: A
  1263. # A: B
  1264. # B: A
  1265. # """)
  1266. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1267. def test_empty(self):
  1268. # Fails an Earley implementation without special handling for empty rules,
  1269. # or re-processing of already completed rules.
  1270. g = _Lark(r"""start: _empty a "B"
  1271. a: _empty "A"
  1272. _empty:
  1273. """)
  1274. x = g.parse('AB')
  1275. def test_regex_quote(self):
  1276. g = r"""
  1277. start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
  1278. SINGLE_QUOTED_STRING : /'[^']*'/
  1279. DOUBLE_QUOTED_STRING : /"[^"]*"/
  1280. """
  1281. g = _Lark(g)
  1282. self.assertEqual( g.parse('"hello"').children, ['"hello"'])
  1283. self.assertEqual( g.parse("'hello'").children, ["'hello'"])
  1284. @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
  1285. def test_join_regex_flags(self):
  1286. g = r"""
  1287. start: A
  1288. A: B C
  1289. B: /./s
  1290. C: /./
  1291. """
  1292. g = _Lark(g)
  1293. self.assertEqual(g.parse(" ").children,[" "])
  1294. self.assertEqual(g.parse("\n ").children,["\n "])
  1295. self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")
  1296. g = r"""
  1297. start: A
  1298. A: B | C
  1299. B: "b"i
  1300. C: "c"
  1301. """
  1302. g = _Lark(g)
  1303. self.assertEqual(g.parse("b").children,["b"])
  1304. self.assertEqual(g.parse("B").children,["B"])
  1305. self.assertEqual(g.parse("c").children,["c"])
  1306. self.assertRaises(UnexpectedCharacters, g.parse, "C")
  1307. def test_lexer_token_limit(self):
  1308. "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
  1309. tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
  1310. g = _Lark("""start: %s
  1311. %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
  1312. def test_float_without_lexer(self):
  1313. expected_error = UnexpectedCharacters if 'dynamic' in LEXER else UnexpectedToken
  1314. if PARSER == 'cyk':
  1315. expected_error = ParseError
  1316. g = _Lark("""start: ["+"|"-"] float
  1317. float: digit* "." digit+ exp?
  1318. | digit+ exp
  1319. exp: ("e"|"E") ["+"|"-"] digit+
  1320. digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
  1321. """)
  1322. g.parse("1.2")
  1323. g.parse("-.2e9")
  1324. g.parse("+2e-9")
  1325. self.assertRaises( expected_error, g.parse, "+2e-9e")
  1326. def test_keep_all_tokens(self):
  1327. l = _Lark("""start: "a"+ """, keep_all_tokens=True)
  1328. tree = l.parse('aaa')
  1329. self.assertEqual(tree.children, ['a', 'a', 'a'])
  1330. def test_token_flags(self):
  1331. l = _Lark("""!start: "a"i+
  1332. """
  1333. )
  1334. tree = l.parse('aA')
  1335. self.assertEqual(tree.children, ['a', 'A'])
  1336. l = _Lark("""!start: /a/i+
  1337. """
  1338. )
  1339. tree = l.parse('aA')
  1340. self.assertEqual(tree.children, ['a', 'A'])
  1341. # g = """!start: "a"i "a"
  1342. # """
  1343. # self.assertRaises(GrammarError, _Lark, g)
  1344. # g = """!start: /a/i /a/
  1345. # """
  1346. # self.assertRaises(GrammarError, _Lark, g)
  1347. g = """start: NAME "," "a"
  1348. NAME: /[a-z_]/i /[a-z0-9_]/i*
  1349. """
  1350. l = _Lark(g)
  1351. tree = l.parse('ab,a')
  1352. self.assertEqual(tree.children, ['ab'])
  1353. tree = l.parse('AB,a')
  1354. self.assertEqual(tree.children, ['AB'])
  1355. def test_token_flags3(self):
  1356. l = _Lark("""!start: ABC+
  1357. ABC: "abc"i
  1358. """
  1359. )
  1360. tree = l.parse('aBcAbC')
  1361. self.assertEqual(tree.children, ['aBc', 'AbC'])
  1362. def test_token_flags2(self):
  1363. g = """!start: ("a"i | /a/ /b/?)+
  1364. """
  1365. l = _Lark(g)
  1366. tree = l.parse('aA')
  1367. self.assertEqual(tree.children, ['a', 'A'])
  1368. def test_token_flags_verbose(self):
  1369. g = _Lark(r"""start: NL | ABC
  1370. ABC: / [a-z] /x
  1371. NL: /\n/
  1372. """)
  1373. x = g.parse('a')
  1374. self.assertEqual(x.children, ['a'])
  1375. def test_token_flags_verbose_multiline(self):
  1376. g = _Lark(r"""start: ABC
  1377. ABC: / a b c
  1378. d
  1379. e f
  1380. /x
  1381. """)
  1382. x = g.parse('abcdef')
  1383. self.assertEqual(x.children, ['abcdef'])
  1384. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1385. def test_twice_empty(self):
  1386. g = """!start: ("A"?)?
  1387. """
  1388. l = _Lark(g)
  1389. tree = l.parse('A')
  1390. self.assertEqual(tree.children, ['A'])
  1391. tree = l.parse('')
  1392. self.assertEqual(tree.children, [])
  1393. def test_line_and_column(self):
  1394. g = r"""!start: "A" bc "D"
  1395. !bc: "B\nC"
  1396. """
  1397. l = _Lark(g)
  1398. a, bc, d = l.parse("AB\nCD").children
  1399. self.assertEqual(a.line, 1)
  1400. self.assertEqual(a.column, 1)
  1401. bc ,= bc.children
  1402. self.assertEqual(bc.line, 1)
  1403. self.assertEqual(bc.column, 2)
  1404. self.assertEqual(d.line, 2)
  1405. self.assertEqual(d.column, 2)
  1406. # if LEXER != 'dynamic':
  1407. self.assertEqual(a.end_line, 1)
  1408. self.assertEqual(a.end_column, 2)
  1409. self.assertEqual(bc.end_line, 2)
  1410. self.assertEqual(bc.end_column, 2)
  1411. self.assertEqual(d.end_line, 2)
  1412. self.assertEqual(d.end_column, 3)
  1413. def test_reduce_cycle(self):
  1414. """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
  1415. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
  1416. """
  1417. l = _Lark("""
  1418. term: A
  1419. | term term
  1420. A: "a"
  1421. """, start='term')
  1422. tree = l.parse("aa")
  1423. self.assertEqual(len(tree.children), 2)
  1424. @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
  1425. def test_lexer_prioritization(self):
  1426. "Tests effect of priority on result"
  1427. grammar = """
  1428. start: A B | AB
  1429. A.2: "a"
  1430. B: "b"
  1431. AB: "ab"
  1432. """
  1433. l = _Lark(grammar)
  1434. res = l.parse("ab")
  1435. self.assertEqual(res.children, ['a', 'b'])
  1436. self.assertNotEqual(res.children, ['ab'])
  1437. grammar = """
  1438. start: A B | AB
  1439. A: "a"
  1440. B: "b"
  1441. AB.3: "ab"
  1442. """
  1443. l = _Lark(grammar)
  1444. res = l.parse("ab")
  1445. self.assertNotEqual(res.children, ['a', 'b'])
  1446. self.assertEqual(res.children, ['ab'])
  1447. grammar = """
  1448. start: A B | AB
  1449. A: "a"
  1450. B.-20: "b"
  1451. AB.-10: "ab"
  1452. """
  1453. l = _Lark(grammar)
  1454. res = l.parse("ab")
  1455. self.assertEqual(res.children, ['a', 'b'])
  1456. grammar = """
  1457. start: A B | AB
  1458. A.-99999999999999999999999: "a"
  1459. B: "b"
  1460. AB: "ab"
  1461. """
  1462. l = _Lark(grammar)
  1463. res = l.parse("ab")
  1464. self.assertEqual(res.children, ['ab'])
  1465. def test_import(self):
  1466. grammar = """
  1467. start: NUMBER WORD
  1468. %import common.NUMBER
  1469. %import common.WORD
  1470. %import common.WS
  1471. %ignore WS
  1472. """
  1473. l = _Lark(grammar)
  1474. x = l.parse('12 elephants')
  1475. self.assertEqual(x.children, ['12', 'elephants'])
  1476. def test_import_rename(self):
  1477. grammar = """
  1478. start: N W
  1479. %import common.NUMBER -> N
  1480. %import common.WORD -> W
  1481. %import common.WS
  1482. %ignore WS
  1483. """
  1484. l = _Lark(grammar)
  1485. x = l.parse('12 elephants')
  1486. self.assertEqual(x.children, ['12', 'elephants'])
  1487. def test_relative_import(self):
  1488. l = _Lark_open('test_relative_import.lark', rel_to=__file__)
  1489. x = l.parse('12 lions')
  1490. self.assertEqual(x.children, ['12', 'lions'])
  1491. def test_relative_import_unicode(self):
  1492. l = _Lark_open('test_relative_import_unicode.lark', rel_to=__file__)
  1493. x = l.parse(u'Ø')
  1494. self.assertEqual(x.children, [u'Ø'])
  1495. def test_relative_import_rename(self):
  1496. l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__)
  1497. x = l.parse('12 lions')
  1498. self.assertEqual(x.children, ['12', 'lions'])
  1499. def test_relative_rule_import(self):
  1500. l = _Lark_open('test_relative_rule_import.lark', rel_to=__file__)
  1501. x = l.parse('xaabby')
  1502. self.assertEqual(x.children, [
  1503. 'x',
  1504. Tree('expr', ['a', Tree('expr', ['a', 'b']), 'b']),
  1505. 'y'])
  1506. def test_relative_rule_import_drop_ignore(self):
  1507. # %ignore rules are dropped on import
  1508. l = _Lark_open('test_relative_rule_import_drop_ignore.lark',
  1509. rel_to=__file__)
  1510. self.assertRaises((ParseError, UnexpectedInput),
  1511. l.parse, 'xa abby')
  1512. def test_relative_rule_import_subrule(self):
  1513. l = _Lark_open('test_relative_rule_import_subrule.lark',
  1514. rel_to=__file__)
  1515. x = l.parse('xaabby')
  1516. self.assertEqual(x.children, [
  1517. 'x',
  1518. Tree('startab', [
  1519. Tree('grammars__ab__expr', [
  1520. 'a', Tree('grammars__ab__expr', ['a', 'b']), 'b',
  1521. ]),
  1522. ]),
  1523. 'y'])
  1524. def test_relative_rule_import_subrule_no_conflict(self):
  1525. l = _Lark_open(
  1526. 'test_relative_rule_import_subrule_no_conflict.lark',
  1527. rel_to=__file__)
  1528. x = l.parse('xaby')
  1529. self.assertEqual(x.children, [Tree('expr', [
  1530. 'x',
  1531. Tree('startab', [
  1532. Tree('grammars__ab__expr', ['a', 'b']),
  1533. ]),
  1534. 'y'])])
  1535. self.assertRaises((ParseError, UnexpectedInput),
  1536. l.parse, 'xaxabyby')
  1537. def test_relative_rule_import_rename(self):
  1538. l = _Lark_open('test_relative_rule_import_rename.lark',
  1539. rel_to=__file__)
  1540. x = l.parse('xaabby')
  1541. self.assertEqual(x.children, [
  1542. 'x',
  1543. Tree('ab', ['a', Tree('ab', ['a', 'b']), 'b']),
  1544. 'y'])
  1545. def test_multi_import(self):
  1546. grammar = """
  1547. start: NUMBER WORD
  1548. %import common (NUMBER, WORD, WS)
  1549. %ignore WS
  1550. """
  1551. l = _Lark(grammar)
  1552. x = l.parse('12 toucans')
  1553. self.assertEqual(x.children, ['12', 'toucans'])
  1554. def test_relative_multi_import(self):
  1555. l = _Lark_open("test_relative_multi_import.lark", rel_to=__file__)
  1556. x = l.parse('12 capybaras')
  1557. self.assertEqual(x.children, ['12', 'capybaras'])
  1558. def test_relative_import_preserves_leading_underscore(self):
  1559. l = _Lark_open("test_relative_import_preserves_leading_underscore.lark", rel_to=__file__)
  1560. x = l.parse('Ax')
  1561. self.assertEqual(next(x.find_data('c')).children, ['A'])
  1562. def test_relative_import_of_nested_grammar(self):
  1563. l = _Lark_open("grammars/test_relative_import_of_nested_grammar.lark", rel_to=__file__)
  1564. x = l.parse('N')
  1565. self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
  1566. def test_relative_import_rules_dependencies_imported_only_once(self):
  1567. l = _Lark_open("test_relative_import_rules_dependencies_imported_only_once.lark", rel_to=__file__)
  1568. x = l.parse('AAA')
  1569. self.assertEqual(next(x.find_data('a')).children, ['A'])
  1570. self.assertEqual(next(x.find_data('b')).children, ['A'])
  1571. self.assertEqual(next(x.find_data('d')).children, ['A'])
  1572. def test_import_errors(self):
  1573. grammar = """
  1574. start: NUMBER WORD
  1575. %import .grammars.bad_test.NUMBER
  1576. """
  1577. self.assertRaises(IOError, _Lark, grammar)
  1578. grammar = """
  1579. start: NUMBER WORD
  1580. %import bad_test.NUMBER
  1581. """
  1582. self.assertRaises(IOError, _Lark, grammar)
  1583. @unittest.skipIf('dynamic' in LEXER, "%declare/postlex doesn't work with dynamic")
  1584. def test_postlex_declare(self): # Note: this test does a lot. maybe split it up?
  1585. class TestPostLexer:
  1586. def process(self, stream):
  1587. for t in stream:
  1588. if t.type == 'A':
  1589. t.type = 'B'
  1590. yield t
  1591. else:
  1592. yield t
  1593. always_accept = ('A',)
  1594. parser = _Lark("""
  1595. start: B
  1596. A: "A"
  1597. %declare B
  1598. """, postlex=TestPostLexer())
  1599. test_file = "A"
  1600. tree = parser.parse(test_file)
  1601. self.assertEqual(tree.children, [Token('B', 'A')])
  1602. @unittest.skipIf('dynamic' in LEXER, "%declare/postlex doesn't work with dynamic")
  1603. def test_postlex_indenter(self):
  1604. class CustomIndenter(Indenter):
  1605. NL_type = 'NEWLINE'
  1606. OPEN_PAREN_types = []
  1607. CLOSE_PAREN_types = []
  1608. INDENT_type = 'INDENT'
  1609. DEDENT_type = 'DEDENT'
  1610. tab_len = 8
  1611. grammar = r"""
  1612. start: "a" NEWLINE INDENT "b" NEWLINE DEDENT
  1613. NEWLINE: ( /\r?\n */ )+
  1614. %ignore " "+
  1615. %declare INDENT DEDENT
  1616. """
  1617. parser = _Lark(grammar, postlex=CustomIndenter())
  1618. parser.parse("a\n b\n")
  1619. @unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
  1620. def test_prioritization(self):
  1621. "Tests effect of priority on result"
  1622. grammar = """
  1623. start: a | b
  1624. a.1: "a"
  1625. b.2: "a"
  1626. """
  1627. l = _Lark(grammar)
  1628. res = l.parse("a")
  1629. self.assertEqual(res.children[0].data, 'b')
  1630. grammar = """
  1631. start: a | b
  1632. a.2: "a"
  1633. b.1: "a"
  1634. """
  1635. l = _Lark(grammar)
  1636. res = l.parse("a")
  1637. self.assertEqual(res.children[0].data, 'a')
  1638. grammar = """
  1639. start: a | b
  1640. a.2: "A"+
  1641. b.1: "A"+ "B"?
  1642. """
  1643. l = _Lark(grammar)
  1644. res = l.parse("AAAA")
  1645. self.assertEqual(res.children[0].data, 'a')
  1646. l = _Lark(grammar)
  1647. res = l.parse("AAAB")
  1648. self.assertEqual(res.children[0].data, 'b')
  1649. l = _Lark(grammar, priority="invert")
  1650. res = l.parse("AAAA")
  1651. self.assertEqual(res.children[0].data, 'b')
  1652. @unittest.skipIf(PARSER != 'earley' or 'dynamic' not in LEXER, "Currently only Earley supports priority sum in rules")
  1653. def test_prioritization_sum(self):
  1654. "Tests effect of priority on result"
  1655. grammar = """
  1656. start: ab_ b_ a_ | indirection
  1657. indirection: a_ bb_ a_
  1658. a_: "a"
  1659. b_: "b"
  1660. ab_: "ab"
  1661. bb_.1: "bb"
  1662. """
  1663. l = _Lark(grammar, priority="invert")
  1664. res = l.parse('abba')
  1665. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  1666. grammar = """
  1667. start: ab_ b_ a_ | indirection
  1668. indirection: a_ bb_ a_
  1669. a_: "a"
  1670. b_: "b"
  1671. ab_.1: "ab"
  1672. bb_: "bb"
  1673. """
  1674. l = _Lark(grammar, priority="invert")
  1675. res = l.parse('abba')
  1676. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  1677. grammar = """
  1678. start: ab_ b_ a_ | indirection
  1679. indirection: a_ bb_ a_
  1680. a_.2: "a"
  1681. b_.1: "b"
  1682. ab_.3: "ab"
  1683. bb_.3: "bb"
  1684. """
  1685. l = _Lark(grammar, priority="invert")
  1686. res = l.parse('abba')
  1687. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  1688. grammar = """
  1689. start: ab_ b_ a_ | indirection
  1690. indirection: a_ bb_ a_
  1691. a_.1: "a"
  1692. b_.1: "b"
  1693. ab_.4: "ab"
  1694. bb_.3: "bb"
  1695. """
  1696. l = _Lark(grammar, priority="invert")
  1697. res = l.parse('abba')
  1698. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  1699. def test_utf8(self):
  1700. g = u"""start: a
  1701. a: "±a"
  1702. """
  1703. l = _Lark(g)
  1704. self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))
  1705. g = u"""start: A
  1706. A: "±a"
  1707. """
  1708. l = _Lark(g)
  1709. self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))
  1710. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1711. def test_ignore(self):
  1712. grammar = r"""
  1713. COMMENT: /(!|(\/\/))[^\n]*/
  1714. %ignore COMMENT
  1715. %import common.WS -> _WS
  1716. %import common.INT
  1717. start: "INT"i _WS+ INT _WS*
  1718. """
  1719. parser = _Lark(grammar)
  1720. tree = parser.parse("int 1 ! This is a comment\n")
  1721. self.assertEqual(tree.children, ['1'])
  1722. tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
  1723. self.assertEqual(tree.children, ['1'])
  1724. parser = _Lark(r"""
  1725. start : "a"*
  1726. %ignore "b"
  1727. """)
  1728. tree = parser.parse("bb")
  1729. self.assertEqual(tree.children, [])
  1730. def test_regex_escaping(self):
  1731. g = _Lark("start: /[ab]/")
  1732. g.parse('a')
  1733. g.parse('b')
  1734. self.assertRaises( UnexpectedInput, g.parse, 'c')
  1735. _Lark(r'start: /\w/').parse('a')
  1736. g = _Lark(r'start: /\\w/')
  1737. self.assertRaises( UnexpectedInput, g.parse, 'a')
  1738. g.parse(r'\w')
  1739. _Lark(r'start: /\[/').parse('[')
  1740. _Lark(r'start: /\//').parse('/')
  1741. _Lark(r'start: /\\/').parse('\\')
  1742. _Lark(r'start: /\[ab]/').parse('[ab]')
  1743. _Lark(r'start: /\\[ab]/').parse('\\a')
  1744. _Lark(r'start: /\t/').parse('\t')
  1745. _Lark(r'start: /\\t/').parse('\\t')
  1746. _Lark(r'start: /\\\t/').parse('\\\t')
  1747. _Lark(r'start: "\t"').parse('\t')
  1748. _Lark(r'start: "\\t"').parse('\\t')
  1749. _Lark(r'start: "\\\t"').parse('\\\t')
  1750. def test_ranged_repeat_rules(self):
  1751. g = u"""!start: "A"~3
  1752. """
  1753. l = _Lark(g)
  1754. self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
  1755. self.assertRaises(ParseError, l.parse, u'AA')
  1756. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  1757. g = u"""!start: "A"~0..2
  1758. """
  1759. if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
  1760. l = _Lark(g)
  1761. self.assertEqual(l.parse(u''), Tree('start', []))
  1762. self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
  1763. self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
  1764. self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')
  1765. g = u"""!start: "A"~3..2
  1766. """
  1767. self.assertRaises(GrammarError, _Lark, g)
  1768. g = u"""!start: "A"~2..3 "B"~2
  1769. """
  1770. l = _Lark(g)
  1771. self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
  1772. self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
  1773. self.assertRaises(ParseError, l.parse, u'AAAB')
  1774. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  1775. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  1776. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  1777. def test_ranged_repeat_terms(self):
  1778. g = u"""!start: AAA
  1779. AAA: "A"~3
  1780. """
  1781. l = _Lark(g)
  1782. self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
  1783. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
  1784. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  1785. g = u"""!start: AABB CC
  1786. AABB: "A"~0..2 "B"~2
  1787. CC: "C"~1..2
  1788. """
  1789. l = _Lark(g)
  1790. self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
  1791. self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
  1792. self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
  1793. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
  1794. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  1795. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  1796. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  1797. @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
  1798. def test_priority_vs_embedded(self):
  1799. g = """
  1800. A.2: "a"
  1801. WORD: ("a".."z")+
  1802. start: (A | WORD)+
  1803. """
  1804. l = _Lark(g)
  1805. t = l.parse('abc')
  1806. self.assertEqual(t.children, ['a', 'bc'])
  1807. self.assertEqual(t.children[0].type, 'A')
  1808. def test_line_counting(self):
  1809. p = _Lark("start: /[^x]+/")
  1810. text = 'hello\nworld'
  1811. t = p.parse(text)
  1812. tok = t.children[0]
  1813. self.assertEqual(tok, text)
  1814. self.assertEqual(tok.line, 1)
  1815. self.assertEqual(tok.column, 1)
  1816. # if _LEXER != 'dynamic':
  1817. self.assertEqual(tok.end_line, 2)
  1818. self.assertEqual(tok.end_column, 6)
  1819. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  1820. def test_empty_end(self):
  1821. p = _Lark("""
  1822. start: b c d
  1823. b: "B"
  1824. c: | "C"
  1825. d: | "D"
  1826. """)
  1827. res = p.parse('B')
  1828. self.assertEqual(len(res.children), 3)
  1829. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  1830. def test_maybe_placeholders(self):
  1831. # Anonymous tokens shouldn't count
  1832. p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
  1833. self.assertEqual(p.parse("").children, [])
  1834. # Unless keep_all_tokens=True
  1835. p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True, keep_all_tokens=True)
  1836. self.assertEqual(p.parse("").children, [None, None, None])
  1837. # All invisible constructs shouldn't count
  1838. p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c]
  1839. A: "a"
  1840. _c: "c" """, maybe_placeholders=True)
  1841. self.assertEqual(p.parse("").children, [None])
  1842. self.assertEqual(p.parse("c").children, [None])
  1843. self.assertEqual(p.parse("aefc").children, ['a'])
  1844. # ? shouldn't apply
  1845. p = _Lark("""!start: ["a"] "b"? ["c"] """, maybe_placeholders=True)
  1846. self.assertEqual(p.parse("").children, [None, None])
  1847. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1848. p = _Lark("""!start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
  1849. self.assertEqual(p.parse("").children, [None, None, None])
  1850. self.assertEqual(p.parse("a").children, ['a', None, None])
  1851. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1852. self.assertEqual(p.parse("c").children, [None, None, 'c'])
  1853. self.assertEqual(p.parse("ab").children, ['a', 'b', None])
  1854. self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
  1855. self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
  1856. self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])
  1857. p = _Lark("""!start: (["a"] "b" ["c"])+ """, maybe_placeholders=True)
  1858. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1859. self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
  1860. self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
  1861. self.assertEqual(p.parse("babbcabcb").children,
  1862. [None, 'b', None,
  1863. 'a', 'b', None,
  1864. None, 'b', 'c',
  1865. 'a', 'b', 'c',
  1866. None, 'b', None])
  1867. p = _Lark("""!start: ["a"] ["c"] "b"+ ["a"] ["d"] """, maybe_placeholders=True)
  1868. self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
  1869. self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
  1870. self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
  1871. self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])
  1872. def test_escaped_string(self):
  1873. "Tests common.ESCAPED_STRING"
  1874. grammar = r"""
  1875. start: ESCAPED_STRING+
  1876. %import common (WS_INLINE, ESCAPED_STRING)
  1877. %ignore WS_INLINE
  1878. """
  1879. parser = _Lark(grammar)
  1880. parser.parse(r'"\\" "b" "c"')
  1881. parser.parse(r'"That" "And a \"b"')
  1882. def test_meddling_unused(self):
  1883. "Unless 'unused' is removed, LALR analysis will fail on reduce-reduce collision"
  1884. grammar = """
  1885. start: EKS* x
  1886. x: EKS
  1887. unused: x*
  1888. EKS: "x"
  1889. """
  1890. parser = _Lark(grammar)
  1891. @unittest.skipIf(PARSER!='lalr' or 'custom' in LEXER, "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
  1892. def test_serialize(self):
  1893. grammar = """
  1894. start: _ANY b "C"
  1895. _ANY: /./
  1896. b: "B"
  1897. """
  1898. parser = _Lark(grammar)
  1899. s = BytesIO()
  1900. parser.save(s)
  1901. s.seek(0)
  1902. parser2 = Lark.load(s)
  1903. self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) )
  1904. def test_multi_start(self):
  1905. parser = _Lark('''
  1906. a: "x" "a"?
  1907. b: "x" "b"?
  1908. ''', start=['a', 'b'])
  1909. self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
  1910. self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))
  1911. def test_lexer_detect_newline_tokens(self):
  1912. # Detect newlines in regular tokens
  1913. g = _Lark(r"""start: "go" tail*
  1914. !tail : SA "@" | SB "@" | SC "@" | SD "@"
  1915. SA : "a" /\n/
  1916. SB : /b./s
  1917. SC : "c" /[^a-z]/
  1918. SD : "d" /\s/
  1919. """)
  1920. a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
  1921. self.assertEqual(a.line, 2)
  1922. self.assertEqual(b.line, 3)
  1923. self.assertEqual(c.line, 4)
  1924. self.assertEqual(d.line, 5)
  1925. # Detect newlines in ignored tokens
  1926. for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
  1927. g = _Lark('''!start: "a" "a"
  1928. %ignore {}'''.format(re))
  1929. a, b = g.parse('a\na').children
  1930. self.assertEqual(a.line, 1)
  1931. self.assertEqual(b.line, 2)
  1932. @unittest.skipIf(PARSER=='cyk' or LEXER=='custom_old', "match_examples() not supported for CYK/old custom lexer")
  1933. def test_match_examples(self):
  1934. p = _Lark(r"""
  1935. start: "a" "b" "c"
  1936. """)
  1937. def match_error(s):
  1938. try:
  1939. _ = p.parse(s)
  1940. except UnexpectedInput as u:
  1941. return u.match_examples(p.parse, {
  1942. 0: ['abe'],
  1943. 1: ['ab'],
  1944. 2: ['cbc', 'dbc'],
  1945. })
  1946. assert False
  1947. assert match_error("abe") == 0
  1948. assert match_error("ab") == 1
  1949. assert match_error("bbc") == 2
  1950. assert match_error("cbc") == 2
  1951. self.assertEqual( match_error("dbc"), 2 )
  1952. self.assertEqual( match_error("ebc"), 2 )
  1953. @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
  1954. def test_unicode_class(self):
  1955. "Tests that character classes from the `regex` module work correctly."
  1956. g = _Lark(r"""?start: NAME
  1957. NAME: ID_START ID_CONTINUE*
  1958. ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
  1959. ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
  1960. self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
  1961. @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
  1962. def test_unicode_word(self):
  1963. "Tests that a persistent bug in the `re` module works when `regex` is enabled."
  1964. g = _Lark(r"""?start: NAME
  1965. NAME: /[\w]+/
  1966. """, regex=True)
  1967. self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
  1968. @unittest.skipIf(not regex, "regex not installed")
  1969. def test_regex_width_fallback(self):
  1970. g = r"""
  1971. start: NAME NAME?
  1972. NAME: /(?(?=\d)\d+|\w+)/
  1973. """
  1974. self.assertRaises((GrammarError, LexError, re.error), _Lark, g)
  1975. p = _Lark(g, regex=True)
  1976. self.assertEqual(p.parse("123abc"), Tree('start', ['123', 'abc']))
  1977. g = r"""
  1978. start: NAME NAME?
  1979. NAME: /(?(?=\d)\d+|\w*)/
  1980. """
  1981. self.assertRaises((GrammarError, LexError, re.error), _Lark, g, regex=True)
  1982. @unittest.skipIf(PARSER!='lalr', "interactive_parser is only implemented for LALR at the moment")
  1983. def test_parser_interactive_parser(self):
  1984. g = _Lark(r'''
  1985. start: A+ B*
  1986. A: "a"
  1987. B: "b"
  1988. ''')
  1989. ip = g.parse_interactive()
  1990. self.assertRaises(UnexpectedToken, ip.feed_eof)
  1991. self.assertRaises(TypeError, ip.exhaust_lexer)
  1992. ip.feed_token(Token('A', 'a'))
  1993. res = ip.feed_eof()
  1994. self.assertEqual(res, Tree('start', ['a']))
  1995. ip = g.parse_interactive("ab")
  1996. ip.exhaust_lexer()
  1997. ip_copy = ip.copy()
  1998. self.assertEqual(ip_copy.parser_state, ip.parser_state)
  1999. self.assertEqual(ip_copy.lexer_state.state, ip.lexer_state.state)
  2000. self.assertIsNot(ip_copy.parser_state, ip.parser_state)
  2001. self.assertIsNot(ip_copy.lexer_state.state, ip.lexer_state.state)
  2002. self.assertIsNot(ip_copy.lexer_state.state.line_ctr, ip.lexer_state.state.line_ctr)
  2003. res = ip.feed_eof(ip.lexer_state.state.last_token)
  2004. self.assertEqual(res, Tree('start', ['a', 'b']))
  2005. self.assertRaises(UnexpectedToken ,ip.feed_eof)
  2006. self.assertRaises(UnexpectedToken, ip_copy.feed_token, Token('A', 'a'))
  2007. ip_copy.feed_token(Token('B', 'b'))
  2008. res = ip_copy.feed_eof()
  2009. self.assertEqual(res, Tree('start', ['a', 'b', 'b']))
  2010. @unittest.skipIf(PARSER!='lalr', "interactive_parser error handling only works with LALR for now")
  2011. def test_error_with_interactive_parser(self):
  2012. def ignore_errors(e):
  2013. if isinstance(e, UnexpectedCharacters):
  2014. # Skip bad character
  2015. return True
  2016. # Must be UnexpectedToken
  2017. if e.token.type == 'COMMA':
  2018. # Skip comma
  2019. return True
  2020. elif e.token.type == 'SIGNED_NUMBER':
  2021. # Try to feed a comma and retry the number
  2022. e.interactive_parser.feed_token(Token('COMMA', ','))
  2023. e.interactive_parser.feed_token(e.token)
  2024. return True
  2025. # Unhandled error. Will stop parse and raise exception
  2026. return False
  2027. g = _Lark(r'''
  2028. start: "[" num ("," num)* "]"
  2029. ?num: SIGNED_NUMBER
  2030. %import common.SIGNED_NUMBER
  2031. %ignore " "
  2032. ''')
  2033. s = "[0 1, 2,, 3,,, 4, 5 6 ]"
  2034. tree = g.parse(s, on_error=ignore_errors)
  2035. res = [int(x) for x in tree.children]
  2036. assert res == list(range(7))
  2037. s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
  2038. tree = g.parse(s, on_error=ignore_errors)
  2039. _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
  2040. _TestParser.__name__ = _NAME
  2041. _TestParser.__qualname__ = "tests.test_parser." + _NAME
  2042. globals()[_NAME] = _TestParser
  2043. __all__.append(_NAME)
  2044. _TO_TEST = [
  2045. ('standard', 'earley'),
  2046. ('standard', 'cyk'),
  2047. ('standard', 'lalr'),
  2048. ('dynamic', 'earley'),
  2049. ('dynamic_complete', 'earley'),
  2050. ('contextual', 'lalr'),
  2051. ('custom_new', 'lalr'),
  2052. ('custom_new', 'cyk'),
  2053. ('custom_old', 'earley'),
  2054. ]
  2055. for _LEXER, _PARSER in _TO_TEST:
  2056. _make_parser_test(_LEXER, _PARSER)
  2057. for _LEXER in ('dynamic', 'dynamic_complete'):
  2058. _make_full_earley_test(_LEXER)
  2059. if __name__ == '__main__':
  2060. unittest.main()