# -*- coding: utf-8 -*-
from __future__ import absolute_import

import re
import unittest
import logging
import os
import sys
from copy import copy, deepcopy

from lark.utils import Py36

try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    BytesIO,
    open,
)

logging.basicConfig(level=logging.INFO)

try:
    import regex
except ImportError:
    regex = None

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef, Lexer, TraditionalLexer

__path__ = os.path.dirname(__file__)

def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()


class TestParsers(unittest.TestCase):
    def test_big_list(self):
        Lark(r"""
            start: {}
        """.format(
            "|".join(['"%s"' % i for i in range(250)])
        ))

    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """
        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        # TODO: should it? shouldn't it?
        # l = Lark(g, parser='earley', lexer='dynamic')
        # self.assertRaises(ParseError, l.parse, 'a')
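
    # propagate_positions=True makes Lark attach position metadata (line, column,
    # etc.) to each tree node's `meta` attribute, which the tests below read back.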
    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual(r.children[0].meta.line, 1)

        g = Lark("""start: x
                    x: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual(r.children[0].meta.line, 1)
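
    # A rule prefixed with '?' is inlined into its parent when it matches exactly
    # one child, unless the matched alternative carries an alias (-> name),
    # in which case a node named after the alias is kept.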
    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual(r.children[0].data, "b")

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual(r.children[0].data, "c")

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual(r.children[0].data, "c")

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual(r.children[0].data, "c")

    def test_comment_in_rule_definition(self):
        g = Lark("""start: a
                    a: "a"
                        // A comment
                        // Another comment
                        | "b"
                        // Still more
                    c: "unrelated"
                 """)
        r = g.parse('b')
        self.assertEqual(r.children[0].data, "a")

    def test_visit_tokens(self):
        class T(Transformer):
            def a(self, children):
                return children[0] + "!"

            def A(self, tok):
                return tok.update(value=tok.upper())

        # Test regular
        g = """start: a
               a : A
               A: "x"
            """
        p = Lark(g, parser='lalr')
        r = T(False).transform(p.parse("x"))
        self.assertEqual(r.children, ["x!"])
        r = T().transform(p.parse("x"))
        self.assertEqual(r.children, ["X!"])

        # Test internal transformer
        p = Lark(g, parser='lalr', transformer=T())
        r = p.parse("x")
        self.assertEqual(r.children, ["X!"])

    def test_vargs_meta(self):
        @v_args(meta=True)
        class T1(Transformer):
            def a(self, children, meta):
                assert not children
                return meta.line

            def start(self, children, meta):
                return children

        @v_args(meta=True, inline=True)
        class T2(Transformer):
            def a(self, meta):
                return meta.line

            def start(self, meta, *res):
                return list(res)

        for T in (T1, T2):
            for internal in [False, True]:
                try:
                    g = Lark(r"""start: a+
                                 a : "x" _NL?
                                 _NL: /\n/+
                              """, parser='lalr', transformer=T() if internal else None, propagate_positions=True)
                except NotImplementedError:
                    assert internal
                    continue
                res = g.parse("xx\nx\nxxx\n\n\nxx")
                assert not internal
                res = T().transform(res)

                self.assertEqual(res, [1, 1, 2, 3, 3, 3, 6, 6])

    def test_vargs_tree(self):
        tree = Lark('''
            start: a a a
            !a: "A"
        ''').parse('AAA')
        tree_copy = deepcopy(tree)

        @v_args(tree=True)
        class T(Transformer):
            def a(self, tree):
                return 1

            def start(self, tree):
                return tree.children

        res = T().transform(tree)
        self.assertEqual(res, [1, 1, 1])
        self.assertEqual(tree, tree_copy)

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"

            def b(self, children):
                return "<b>"

            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual(r.children, ["<a>"])

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual(r.children, ["<a>"])

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual(r.children, ["<b>"])

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual(r.children, ["<b>"])

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual(r.children, ["<c>"])

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual(r.children, ["<c>"])

    def test_embedded_transformer_inplace(self):
        @v_args(tree=True)
        class T1(Transformer_InPlace):
            def a(self, tree):
                assert isinstance(tree, Tree), tree
                tree.children.append("tested")
                return tree

            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        @v_args(tree=True)
        class T2(Transformer):
            def a(self, tree):
                assert isinstance(tree, Tree), tree
                tree.children.append("tested")
                return tree

            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        class T3(Transformer):
            @v_args(tree=True)
            def a(self, tree):
                assert isinstance(tree, Tree)
                tree.children.append("tested")
                return tree

            @v_args(tree=True)
            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        for t in [T1(), T2(), T3()]:
            for internal in [False, True]:
                g = Lark("""start: a b
                            a : "x"
                            b : "y"
                         """, parser='lalr', transformer=t if internal else None)
                r = g.parse("xy")
                if not internal:
                    r = t.transform(r)

                a, b = r.children
                self.assertEqual(a.children, ["tested"])
                self.assertEqual(b.children, ["tested2"])

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)


def _make_full_earley_test(LEXER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)

    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual(g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+
            statement: "r"
                     | "c" /[a-z]/+
            %ignore " "
            """
            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
            """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)

            By default, `+` should imitate regexp greedy-matching
            """
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(set(res.children), {'aa', 'a'})
            # XXX TODO fix Earley to maintain correct order
            # i.e. it should imitate greedy search for terminals, but lazy search for rules
            # self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            assert set(res.children) == {'aa', 'a'} or res.children == ['aaa']
            # XXX TODO fix Earley to maintain correct order
            # i.e. it should imitate greedy search for terminals, but lazy search for rules
            # self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            ambig_tree = parser.parse('ab')
            self.assertEqual(ambig_tree.data, '_ambig')
            self.assertEqual(len(ambig_tree.children), 2)

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"
            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            ambig_tree = l.parse('cde')

            assert ambig_tree.data == '_ambig', ambig_tree
            assert len(ambig_tree.children) == 2

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity2(self):
            grammar = """
            ANY: /[a-zA-Z0-9 ]+/
            a.2: "A" b+
            b.2: "B"
            c: ANY
            start: (a|c)*
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse('ABX')
            expected = Tree('start', [
                Tree('a', [
                    Tree('b', [])
                ]),
                Tree('c', [
                    'X'
                ])
            ])
            self.assertEqual(res, expected)

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative
                noun: adj? NOUN
                verb: VERB
                adj: ADJ
                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            tree = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                Tree('comparative', [
                    Tree('noun', ['fruit']),
                    Tree('verb', ['flies']),
                    Tree('noun', ['bananas'])
                ]),
                Tree('simple', [
                    Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                    Tree('verb', ['like']),
                    Tree('noun', ['bananas'])
                ])
            ])

            # self.assertEqual(tree, expected)
            self.assertEqual(tree.data, expected.data)
            self.assertEqual(set(tree.children), set(expected.children))

        @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = _Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley


class CustomLexer(Lexer):
    """
    Purpose of this custom lexer is to test the integration,
    so it uses the TraditionalLexer as its implementation, without custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(copy(lexer_conf))

    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)
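
# A custom lexer is plugged in by passing the class itself as the `lexer`
# argument, e.g. Lark(grammar, parser='lalr', lexer=CustomLexer), which is
# what _make_parser_test below does when LEXER == 'custom'.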


def _make_parser_test(LEXER, PARSER):
    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER

    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

    def _Lark_open(gfilename, **kwargs):
        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual(''.join(x.data for x in r.children), 'aaabaa')
            r = g.parse('aaabaaba')
            self.assertEqual(''.join(x.data for x in r.children), 'aaabaaa')

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        def test_hex_escape(self):
            g = _Lark(r"""start: A B C
                          A: "\x01"
                          B: /\x02/
                          C: "\xABCD"
                       """)
            g.parse('\x01\x02\xABCD')

        def test_unicode_literal_range_escape(self):
            g = _Lark(r"""start: A+
                          A: "\u0061".."\u0063"
                       """)
            g.parse('abc')

        def test_hex_literal_range_escape(self):
            g = _Lark(r"""start: A+
                          A: "\x01".."\x03"
                       """)
            g.parse('\x01\x02\x03')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit() * 2))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"
                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])
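
        # Templates: `name{param1, param2}` declares a parameterized rule, which is
        # instantiated once for every combination of arguments it is used with.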
        def test_templates(self):
            g = _Lark(r"""
                       start: "[" sep{NUMBER, ","} "]"
                       sep{item, delim}: item (delim item)*
                       NUMBER: /\d+/
                       %ignore " "
                       """)
            x = g.parse("[1, 2, 3, 4]")
            self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
            x = g.parse("[1]")
            self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])

        def test_templates_recursion(self):
            g = _Lark(r"""
                       start: "[" _sep{NUMBER, ","} "]"
                       _sep{item, delim}: item | _sep{item, delim} delim item
                       NUMBER: /\d+/
                       %ignore " "
                       """)
            x = g.parse("[1, 2, 3, 4]")
            self.assertSequenceEqual(x.children, ['1', '2', '3', '4'])
            x = g.parse("[1]")
            self.assertSequenceEqual(x.children, ['1'])

        def test_templates_import(self):
            g = _Lark_open("test_templates_import.lark", rel_to=__file__)
            x = g.parse("[1, 2, 3, 4]")
            self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
            x = g.parse("[1]")
            self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])

        def test_templates_alias(self):
            g = _Lark(r"""
                       start: expr{"C"}
                       expr{t}: "A" t
                              | "B" t -> b
                       """)
            x = g.parse("AC")
            self.assertSequenceEqual(x.children, [Tree('expr', [])])
            x = g.parse("BC")
            self.assertSequenceEqual(x.children, [Tree('b', [])])

        def test_templates_modifiers(self):
            g = _Lark(r"""
                       start: expr{"B"}
                       !expr{t}: "A" t
                       """)
            x = g.parse("AB")
            self.assertSequenceEqual(x.children, [Tree('expr', ["A", "B"])])
            g = _Lark(r"""
                       start: _expr{"B"}
                       !_expr{t}: "A" t
                       """)
            x = g.parse("AB")
            self.assertSequenceEqual(x.children, ["A", "B"])
            g = _Lark(r"""
                       start: expr{b}
                       b: "B"
                       ?expr{t}: "A" t
                       """)
            x = g.parse("AB")
            self.assertSequenceEqual(x.children, [Tree('b', [])])

        def test_templates_templates(self):
            g = _Lark('''start: a{b}
                         a{t}: t{"a"}
                         b{x}: x''')
            x = g.parse('a')
            self.assertSequenceEqual(x.children, [Tree('a', [Tree('b', [])])])

        def test_g_regex_flags(self):
            g = _Lark("""
                    start: "a" /b+/ C
                    C: "C" | D
                    D: "D" E
                    E: "e"
                    """, g_regex_flags=re.I)
            x1 = g.parse("ABBc")
            x2 = g.parse("abdE")

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual(g.parse('"hello"').children, ['"hello"'])
            self.assertEqual(g.parse("'hello'").children, ["'hello'"])

        @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
        def test_join_regex_flags(self):
            g = r"""
                start: A
                A: B C
                B: /./s
                C: /./
            """
            g = _Lark(g)
            self.assertEqual(g.parse("  ").children, ["  "])
            self.assertEqual(g.parse("\n ").children, ["\n "])
            self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d' % i: '"%d"' % i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedCharacters if LEXER.startswith('dynamic') else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises(expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: ("A"?)?
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])
            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"
                   %ignore B
                """
            self.assertRaises(GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises(GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term
                A: "a"
                """, start='term')

            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B.-20: "b"
            AB.-10: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")
            self.assertEqual(res.children, ['a', 'b'])

            grammar = """
            start: A B | AB
            A.-99999999999999999999999: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['ab'])

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_import_rename(self):
            grammar = """
            start: N W

            %import common.NUMBER -> N
            %import common.WORD -> W
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_relative_import(self):
            l = _Lark_open('test_relative_import.lark', rel_to=__file__)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_relative_import_unicode(self):
            l = _Lark_open('test_relative_import_unicode.lark', rel_to=__file__)
            x = l.parse(u'Ø')
            self.assertEqual(x.children, [u'Ø'])

        def test_relative_import_rename(self):
            l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_relative_rule_import(self):
            l = _Lark_open('test_relative_rule_import.lark', rel_to=__file__)
            x = l.parse('xaabby')
            self.assertEqual(x.children, [
                'x',
                Tree('expr', ['a', Tree('expr', ['a', 'b']), 'b']),
                'y'])

        def test_relative_rule_import_drop_ignore(self):
            # %ignore rules are dropped on import
            l = _Lark_open('test_relative_rule_import_drop_ignore.lark',
                           rel_to=__file__)
            self.assertRaises((ParseError, UnexpectedInput),
                              l.parse, 'xa abby')

        def test_relative_rule_import_subrule(self):
            l = _Lark_open('test_relative_rule_import_subrule.lark',
                           rel_to=__file__)
            x = l.parse('xaabby')
            self.assertEqual(x.children, [
                'x',
                Tree('startab', [
                    Tree('grammars__ab__expr', [
                        'a', Tree('grammars__ab__expr', ['a', 'b']), 'b',
                    ]),
                ]),
                'y'])

        def test_relative_rule_import_subrule_no_conflict(self):
            l = _Lark_open(
                'test_relative_rule_import_subrule_no_conflict.lark',
                rel_to=__file__)
            x = l.parse('xaby')
            self.assertEqual(x.children, [Tree('expr', [
                'x',
                Tree('startab', [
                    Tree('grammars__ab__expr', ['a', 'b']),
                ]),
                'y'])])
            self.assertRaises((ParseError, UnexpectedInput),
                              l.parse, 'xaxabyby')

        def test_relative_rule_import_rename(self):
            l = _Lark_open('test_relative_rule_import_rename.lark',
                           rel_to=__file__)
            x = l.parse('xaabby')
            self.assertEqual(x.children, [
                'x',
                Tree('ab', ['a', Tree('ab', ['a', 'b']), 'b']),
                'y'])

        def test_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import common (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 toucans')
            self.assertEqual(x.children, ['12', 'toucans'])

        def test_relative_multi_import(self):
            l = _Lark_open("test_relative_multi_import.lark", rel_to=__file__)
            x = l.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        def test_relative_import_preserves_leading_underscore(self):
            l = _Lark_open("test_relative_import_preserves_leading_underscore.lark", rel_to=__file__)
            x = l.parse('Ax')
            self.assertEqual(next(x.find_data('c')).children, ['A'])

        def test_relative_import_of_nested_grammar(self):
            l = _Lark_open("grammars/test_relative_import_of_nested_grammar.lark", rel_to=__file__)
            x = l.parse('N')
            self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

        def test_relative_import_rules_dependencies_imported_only_once(self):
            l = _Lark_open("test_relative_import_rules_dependencies_imported_only_once.lark", rel_to=__file__)
            x = l.parse('AAA')
            self.assertEqual(next(x.find_data('a')).children, ['A'])
            self.assertEqual(next(x.find_data('b')).children, ['A'])
            self.assertEqual(next(x.find_data('d')).children, ['A'])

        def test_import_errors(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

            grammar = """
            start: NUMBER WORD

            %import bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = Lark(grammar, priority="invert")
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, priority="invert")
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, priority="invert")
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, priority="invert")
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises(UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises(UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                 """
            if PARSER != 'cyk':  # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                 """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER == 'earley', "Priority not handled correctly right now")  # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+
            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

        def test_line_counting(self):
            p = _Lark("start: /[^x]+/")

            text = 'hello\nworld'
            t = p.parse(text)
            tok = t.children[0]
            self.assertEqual(tok, text)
            self.assertEqual(tok.line, 1)
            self.assertEqual(tok.column, 1)
            if LEXER != 'dynamic':
                self.assertEqual(tok.end_line, 2)
                self.assertEqual(tok.end_column, 6)

        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_empty_end(self):
            p = _Lark("""
                start: b c d
                b: "B"
                c: | "C"
                d: | "D"
            """)
            res = p.parse('B')
            self.assertEqual(len(res.children), 3)
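
        # With maybe_placeholders=True, an optional [x] that doesn't match produces
        # None in the tree, so children keep stable positions. As the assertions
        # below check, anonymous tokens and other invisible constructs inside []
        # don't get placeholders.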
        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_maybe_placeholders(self):
            # Anonymous tokens shouldn't count
            p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [])

            # All invisible constructs shouldn't count
            p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c]
                         A: "a"
                         _c: "c" """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None])
            self.assertEqual(p.parse("c").children, [None])
            self.assertEqual(p.parse("aefc").children, ['a'])

            # ? shouldn't apply
            p = _Lark("""!start: ["a"] "b"? ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None, None])
            self.assertEqual(p.parse("b").children, [None, 'b', None])

            p = _Lark("""!start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None, None, None])
            self.assertEqual(p.parse("a").children, ['a', None, None])
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("c").children, [None, None, 'c'])
            self.assertEqual(p.parse("ab").children, ['a', 'b', None])
            self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
            self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
            self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])

            p = _Lark("""!start: (["a"] "b" ["c"])+ """, maybe_placeholders=True)
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
            self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
            self.assertEqual(p.parse("babbcabcb").children,
                             [None, 'b', None,
                              'a', 'b', None,
                              None, 'b', 'c',
                              'a', 'b', 'c',
                              None, 'b', None])

            p = _Lark("""!start: ["a"] ["c"] "b"+ ["a"] ["d"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
            self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
            self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
            self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])

        def test_escaped_string(self):
            "Tests common.ESCAPED_STRING"
            grammar = r"""
            start: ESCAPED_STRING+

            %import common (WS_INLINE, ESCAPED_STRING)
            %ignore WS_INLINE
            """

            parser = _Lark(grammar)
            parser.parse(r'"\\" "b" "c"')
            parser.parse(r'"That" "And a \"b"')

        def test_meddling_unused(self):
            "Unless 'unused' is removed, LALR analysis will fail on reduce-reduce collision"

            grammar = """
                start: EKS* x
                x: EKS
                unused: x*
                EKS: "x"
            """
            parser = _Lark(grammar)

        @unittest.skipIf(PARSER != 'lalr' or LEXER == 'custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
        def test_serialize(self):
            grammar = """
                start: _ANY b "C"
                _ANY: /./
                b: "B"
            """
            parser = _Lark(grammar)
            s = BytesIO()
            parser.save(s)
            s.seek(0)
            parser2 = Lark.load(s)
            self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]))

        def test_multi_start(self):
            parser = _Lark('''
                a: "x" "a"?
                b: "x" "b"?
            ''', start=['a', 'b'])

            self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
            self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

        def test_lexer_detect_newline_tokens(self):
            # Detect newlines in regular tokens
            g = _Lark(r"""start: "go" tail*
                          !tail : SA "@" | SB "@" | SC "@" | SD "@"
                          SA : "a" /\n/
                          SB : /b./s
                          SC : "c" /[^a-z]/
                          SD : "d" /\s/
                       """)
            a, b, c, d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
            self.assertEqual(a.line, 2)
            self.assertEqual(b.line, 3)
            self.assertEqual(c.line, 4)
            self.assertEqual(d.line, 5)

            # Detect newlines in ignored tokens
            for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
                g = _Lark('''!start: "a" "a"
                             %ignore {}'''.format(re))
                a, b = g.parse('a\na').children
                self.assertEqual(a.line, 1)
                self.assertEqual(b.line, 2)

        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
        def test_unicode_class(self):
            "Tests that character classes from the `regex` module work correctly."
            g = _Lark(r"""?start: NAME
                          NAME: ID_START ID_CONTINUE*
                          ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                          ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
        def test_unicode_word(self):
            "Tests that a persistent bug in the `re` module is avoided when `regex` is enabled."
            g = _Lark(r"""?start: NAME
                          NAME: /[\w]+/
                       """, regex=True)
            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    _TestParser.__qualname__ = "tests.test_parser." + _NAME
    globals()[_NAME] = _TestParser


# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
    ('standard', 'earley'),
    ('standard', 'cyk'),
    ('dynamic', 'earley'),
    ('dynamic_complete', 'earley'),
    ('standard', 'lalr'),
    ('contextual', 'lalr'),
    ('custom', 'lalr'),
    # (None, 'earley'),
]
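
# Each (lexer, parser) pair above generates one concrete test class through the
# factory functions, e.g. TestEarleyStandard or TestLalrCustom.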
for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic', 'dynamic_complete'):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()