# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys

try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef, Lexer, TraditionalLexer

__path__ = os.path.dirname(__file__)
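
# Reads a fixture file (e.g. a .lark grammar) relative to this test file's directory.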
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()


class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """
        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        # TODO: should it? shouldn't it?
        # l = Lark(g, parser='earley', lexer='dynamic')
        # self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_visit_tokens(self):
        class T(Transformer):
            def a(self, children):
                return children[0] + "!"

            def A(self, tok):
                return tok.update(value=tok.upper())

        # Test regular
        g = """start: a
               a : A
               A: "x"
            """
        p = Lark(g, parser='lalr')
        r = T(False).transform(p.parse("x"))
        self.assertEqual( r.children, ["x!"] )
        r = T().transform(p.parse("x"))
        self.assertEqual( r.children, ["X!"] )

        # Test internal transformer
        p = Lark(g, parser='lalr', transformer=T())
        r = p.parse("x")
        self.assertEqual( r.children, ["X!"] )

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )

    def test_embedded_transformer_inplace(self):
        @v_args(tree=True)
        class T1(Transformer_InPlace):
            def a(self, tree):
                assert isinstance(tree, Tree), tree
                tree.children.append("tested")
                return tree

            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        @v_args(tree=True)
        class T2(Transformer):
            def a(self, tree):
                assert isinstance(tree, Tree)
                tree.children.append("tested")
                return tree

            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        class T3(Transformer):
            @v_args(tree=True)
            def a(self, tree):
                assert isinstance(tree, Tree)
                tree.children.append("tested")
                return tree

            @v_args(tree=True)
            def b(self, tree):
                return Tree(tree.data, tree.children + ['tested2'])

        for t in [T1(), T2(), T3()]:
            for internal in [False, True]:
                g = Lark("""start: a b
                            a : "x"
                            b : "y"
                         """, parser='lalr', transformer=t if internal else None)
                r = g.parse("xy")
                if not internal:
                    r = t.transform(r)

                a, b = r.children
                self.assertEqual(a.children, ["tested"])
                self.assertEqual(b.children, ["tested2"])

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)


def _make_full_earley_test(LEXER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)

    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+
            statement: "r"
                     | "c" /[a-z]/+
            %ignore " "
            """
            program = """c b r"""
            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
  211. """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)
  212. By default, `+` should immitate regexp greedy-matching
  213. """
  214. grammar = """
  215. start: A A
  216. A: "a"+
  217. """
  218. l = Lark(grammar, parser='earley', lexer=LEXER)
  219. res = l.parse("aaa")
  220. self.assertEqual(set(res.children), {'aa', 'a'})
            # XXX TODO fix Earley to maintain correct order
            # i.e. it should imitate greedy search for terminals, but lazy search for rules
            # self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            assert set(res.children) == {'aa', 'a'} or res.children == ['aaa']
            # XXX TODO fix Earley to maintain correct order
            # i.e. it should imitate greedy search for terminals, but lazy search for rules
            # self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            ambig_tree = parser.parse('ab')
            self.assertEqual( ambig_tree.data, '_ambig')
            self.assertEqual( len(ambig_tree.children), 2)

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"
            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            ambig_tree = l.parse('cde')

            assert ambig_tree.data == '_ambig', ambig_tree
            assert len(ambig_tree.children) == 2

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity2(self):
            grammar = """
            ANY: /[a-zA-Z0-9 ]+/
            a.2: "A" b+
            b.2: "B"
            c: ANY
            start: (a|c)*
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse('ABX')
            expected = Tree('start', [
                Tree('a', [
                    Tree('b', [])
                ]),
                Tree('c', [
                    'X'
                ])
            ])
            self.assertEqual(res, expected)

        def test_fruitflies_ambig(self):
            grammar = """
            start: noun verb noun        -> simple
                 | noun verb "like" noun -> comparative
            noun: adj? NOUN
            verb: VERB
            adj: ADJ
            NOUN: "flies" | "bananas" | "fruit"
            VERB: "like" | "flies"
            ADJ: "fruit"
            %import common.WS
            %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            tree = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                Tree('comparative', [
                    Tree('noun', ['fruit']),
                    Tree('verb', ['flies']),
                    Tree('noun', ['bananas'])
                ]),
                Tree('simple', [
                    Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                    Tree('verb', ['like']),
                    Tree('noun', ['bananas'])
                ])
            ])

            # self.assertEqual(tree, expected)
            self.assertEqual(tree.data, expected.data)
            self.assertEqual(set(tree.children), set(expected.children))

        @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = _Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a' ,'t')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1
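
    # Register the generated class under a lexer-specific name, so unittest
    # discovers a separate full-Earley suite for each lexer configuration.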
  361. _NAME = "TestFullEarley" + LEXER.capitalize()
  362. _TestFullEarley.__name__ = _NAME
  363. globals()[_NAME] = _TestFullEarley
  364. class CustomLexer(Lexer):
  365. """
    The purpose of this custom lexer is to test integration with Lark;
    it uses TraditionalLexer as its implementation, without any custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)

    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)
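

# Builds a TestCase class for a single (lexer, parser) combination; the _Lark
# and _Lark_open helpers pin that configuration, so every generated suite runs
# the same tests against a different parsing engine.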
def _make_parser_test(LEXER, PARSER):
    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER

    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

    def _Lark_open(gfilename, **kwargs):
        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)
            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        def test_hex_escape(self):
            g = _Lark(r"""start: A B C
                          A: "\x01"
                          B: /\x02/
                          C: "\xABCD"
                       """)
            g.parse('\x01\x02\xABCD')

        def test_unicode_literal_range_escape(self):
            g = _Lark(r"""start: A+
                          A: "\u0061".."\u0063"
                       """)
            g.parse('abc')

        def test_hex_literal_range_escape(self):
            g = _Lark(r"""start: A+
                          A: "\x01".."\x03"
                       """)
            g.parse('\x01\x02\x03')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)
            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"
                    %import common.LCASE_LETTER
                    """)
            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                         %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedCharacters if LEXER.startswith('dynamic') else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """
                      )
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: ("A"?)?
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])
            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"
                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
  815. """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
  816. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
  817. """
  818. l = _Lark("""
  819. term: A
  820. | term term
  821. A: "a"
  822. """, start='term')
  823. tree = l.parse("aa")
  824. self.assertEqual(len(tree.children), 2)
  825. @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
  826. def test_lexer_prioritization(self):
  827. "Tests effect of priority on result"
  828. grammar = """
  829. start: A B | AB
  830. A.2: "a"
  831. B: "b"
  832. AB: "ab"
  833. """
  834. l = _Lark(grammar)
  835. res = l.parse("ab")
  836. self.assertEqual(res.children, ['a', 'b'])
  837. self.assertNotEqual(res.children, ['ab'])
  838. grammar = """
  839. start: A B | AB
  840. A: "a"
  841. B: "b"
  842. AB.3: "ab"
  843. """
  844. l = _Lark(grammar)
  845. res = l.parse("ab")
  846. self.assertNotEqual(res.children, ['a', 'b'])
  847. self.assertEqual(res.children, ['ab'])
  848. grammar = """
  849. start: A B | AB
  850. A: "a"
  851. B.-20: "b"
  852. AB.-10: "ab"
  853. """
  854. l = _Lark(grammar)
  855. res = l.parse("ab")
  856. self.assertEqual(res.children, ['a', 'b'])
  857. grammar = """
  858. start: A B | AB
  859. A.-99999999999999999999999: "a"
  860. B: "b"
  861. AB: "ab"
  862. """
  863. l = _Lark(grammar)
  864. res = l.parse("ab")
  865. self.assertEqual(res.children, ['ab'])
  866. def test_import(self):
  867. grammar = """
  868. start: NUMBER WORD
  869. %import common.NUMBER
  870. %import common.WORD
  871. %import common.WS
  872. %ignore WS
  873. """
  874. l = _Lark(grammar)
  875. x = l.parse('12 elephants')
  876. self.assertEqual(x.children, ['12', 'elephants'])
  877. def test_import_rename(self):
  878. grammar = """
  879. start: N W
  880. %import common.NUMBER -> N
  881. %import common.WORD -> W
  882. %import common.WS
  883. %ignore WS
  884. """
  885. l = _Lark(grammar)
  886. x = l.parse('12 elephants')
  887. self.assertEqual(x.children, ['12', 'elephants'])
  888. def test_relative_import(self):
  889. l = _Lark_open('test_relative_import.lark', rel_to=__file__)
  890. x = l.parse('12 lions')
  891. self.assertEqual(x.children, ['12', 'lions'])
  892. def test_relative_import_unicode(self):
  893. l = _Lark_open('test_relative_import_unicode.lark', rel_to=__file__)
  894. x = l.parse(u'Ø')
  895. self.assertEqual(x.children, [u'Ø'])
  896. def test_relative_import_rename(self):
  897. l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__)
  898. x = l.parse('12 lions')
  899. self.assertEqual(x.children, ['12', 'lions'])
  900. def test_relative_rule_import(self):
  901. l = _Lark_open('test_relative_rule_import.lark', rel_to=__file__)
  902. x = l.parse('xaabby')
  903. self.assertEqual(x.children, [
  904. 'x',
  905. Tree('expr', ['a', Tree('expr', ['a', 'b']), 'b']),
  906. 'y'])
  907. def test_relative_rule_import_drop_ignore(self):
  908. # %ignore rules are dropped on import
  909. l = _Lark_open('test_relative_rule_import_drop_ignore.lark',
  910. rel_to=__file__)
  911. self.assertRaises((ParseError, UnexpectedInput),
  912. l.parse, 'xa abby')
  913. def test_relative_rule_import_subrule(self):
  914. l = _Lark_open('test_relative_rule_import_subrule.lark',
  915. rel_to=__file__)
  916. x = l.parse('xaabby')
  917. self.assertEqual(x.children, [
  918. 'x',
  919. Tree('startab', [
  920. Tree('grammars__ab__expr', [
  921. 'a', Tree('grammars__ab__expr', ['a', 'b']), 'b',
  922. ]),
  923. ]),
  924. 'y'])
  925. def test_relative_rule_import_subrule_no_conflict(self):
  926. l = _Lark_open(
  927. 'test_relative_rule_import_subrule_no_conflict.lark',
  928. rel_to=__file__)
  929. x = l.parse('xaby')
  930. self.assertEqual(x.children, [Tree('expr', [
  931. 'x',
  932. Tree('startab', [
  933. Tree('grammars__ab__expr', ['a', 'b']),
  934. ]),
  935. 'y'])])
  936. self.assertRaises((ParseError, UnexpectedInput),
  937. l.parse, 'xaxabyby')
  938. def test_relative_rule_import_rename(self):
  939. l = _Lark_open('test_relative_rule_import_rename.lark',
  940. rel_to=__file__)
  941. x = l.parse('xaabby')
  942. self.assertEqual(x.children, [
  943. 'x',
  944. Tree('ab', ['a', Tree('ab', ['a', 'b']), 'b']),
  945. 'y'])
  946. def test_multi_import(self):
  947. grammar = """
  948. start: NUMBER WORD
  949. %import common (NUMBER, WORD, WS)
  950. %ignore WS
  951. """
  952. l = _Lark(grammar)
  953. x = l.parse('12 toucans')
  954. self.assertEqual(x.children, ['12', 'toucans'])
  955. def test_relative_multi_import(self):
  956. l = _Lark_open("test_relative_multi_import.lark", rel_to=__file__)
  957. x = l.parse('12 capybaras')
  958. self.assertEqual(x.children, ['12', 'capybaras'])
  959. def test_relative_import_preserves_leading_underscore(self):
  960. l = _Lark_open("test_relative_import_preserves_leading_underscore.lark", rel_to=__file__)
  961. x = l.parse('Ax')
  962. self.assertEqual(next(x.find_data('c')).children, ['A'])
  963. def test_relative_import_of_nested_grammar(self):
  964. l = _Lark_open("grammars/test_relative_import_of_nested_grammar.lark", rel_to=__file__)
  965. x = l.parse('N')
  966. self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
  967. def test_relative_import_rules_dependencies_imported_only_once(self):
  968. l = _Lark_open("test_relative_import_rules_dependencies_imported_only_once.lark", rel_to=__file__)
  969. x = l.parse('AAA')
  970. self.assertEqual(next(x.find_data('a')).children, ['A'])
  971. self.assertEqual(next(x.find_data('b')).children, ['A'])
  972. self.assertEqual(next(x.find_data('d')).children, ['A'])
  973. def test_import_errors(self):
  974. grammar = """
  975. start: NUMBER WORD
  976. %import .grammars.bad_test.NUMBER
  977. """
  978. self.assertRaises(IOError, _Lark, grammar)
  979. grammar = """
  980. start: NUMBER WORD
  981. %import bad_test.NUMBER
  982. """
  983. self.assertRaises(IOError, _Lark, grammar)
  984. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  985. def test_earley_prioritization(self):
  986. "Tests effect of priority on result"
  987. grammar = """
  988. start: a | b
  989. a.1: "a"
  990. b.2: "a"
  991. """
  992. # l = Lark(grammar, parser='earley', lexer='standard')
  993. l = _Lark(grammar)
  994. res = l.parse("a")
  995. self.assertEqual(res.children[0].data, 'b')
  996. grammar = """
  997. start: a | b
  998. a.2: "a"
  999. b.1: "a"
  1000. """
  1001. l = _Lark(grammar)
  1002. # l = Lark(grammar, parser='earley', lexer='standard')
  1003. res = l.parse("a")
  1004. self.assertEqual(res.children[0].data, 'a')
  1005. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  1006. def test_earley_prioritization_sum(self):
  1007. "Tests effect of priority on result"
  1008. grammar = """
  1009. start: ab_ b_ a_ | indirection
  1010. indirection: a_ bb_ a_
  1011. a_: "a"
  1012. b_: "b"
  1013. ab_: "ab"
  1014. bb_.1: "bb"
  1015. """
  1016. l = Lark(grammar, priority="invert")
  1017. res = l.parse('abba')
  1018. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  1019. grammar = """
  1020. start: ab_ b_ a_ | indirection
  1021. indirection: a_ bb_ a_
  1022. a_: "a"
  1023. b_: "b"
  1024. ab_.1: "ab"
  1025. bb_: "bb"
  1026. """
  1027. l = Lark(grammar, priority="invert")
  1028. res = l.parse('abba')
  1029. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  1030. grammar = """
  1031. start: ab_ b_ a_ | indirection
  1032. indirection: a_ bb_ a_
  1033. a_.2: "a"
  1034. b_.1: "b"
  1035. ab_.3: "ab"
  1036. bb_.3: "bb"
  1037. """
  1038. l = Lark(grammar, priority="invert")
  1039. res = l.parse('abba')
  1040. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  1041. grammar = """
  1042. start: ab_ b_ a_ | indirection
  1043. indirection: a_ bb_ a_
  1044. a_.1: "a"
  1045. b_.1: "b"
  1046. ab_.4: "ab"
  1047. bb_.3: "bb"
  1048. """
  1049. l = Lark(grammar, priority="invert")
  1050. res = l.parse('abba')
  1051. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  1052. def test_utf8(self):
  1053. g = u"""start: a
  1054. a: "±a"
  1055. """
  1056. l = _Lark(g)
  1057. self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))
  1058. g = u"""start: A
  1059. A: "±a"
  1060. """
  1061. l = _Lark(g)
  1062. self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))
  1063. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  1064. def test_ignore(self):
  1065. grammar = r"""
  1066. COMMENT: /(!|(\/\/))[^\n]*/
  1067. %ignore COMMENT
  1068. %import common.WS -> _WS
  1069. %import common.INT
  1070. start: "INT"i _WS+ INT _WS*
  1071. """
  1072. parser = _Lark(grammar)
  1073. tree = parser.parse("int 1 ! This is a comment\n")
  1074. self.assertEqual(tree.children, ['1'])
  1075. tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
  1076. self.assertEqual(tree.children, ['1'])
  1077. parser = _Lark(r"""
  1078. start : "a"*
  1079. %ignore "b"
  1080. """)
  1081. tree = parser.parse("bb")
  1082. self.assertEqual(tree.children, [])
  1083. def test_regex_escaping(self):
  1084. g = _Lark("start: /[ab]/")
  1085. g.parse('a')
  1086. g.parse('b')
  1087. self.assertRaises( UnexpectedInput, g.parse, 'c')
  1088. _Lark(r'start: /\w/').parse('a')
  1089. g = _Lark(r'start: /\\w/')
  1090. self.assertRaises( UnexpectedInput, g.parse, 'a')
  1091. g.parse(r'\w')
  1092. _Lark(r'start: /\[/').parse('[')
  1093. _Lark(r'start: /\//').parse('/')
  1094. _Lark(r'start: /\\/').parse('\\')
  1095. _Lark(r'start: /\[ab]/').parse('[ab]')
  1096. _Lark(r'start: /\\[ab]/').parse('\\a')
  1097. _Lark(r'start: /\t/').parse('\t')
  1098. _Lark(r'start: /\\t/').parse('\\t')
  1099. _Lark(r'start: /\\\t/').parse('\\\t')
  1100. _Lark(r'start: "\t"').parse('\t')
  1101. _Lark(r'start: "\\t"').parse('\\t')
  1102. _Lark(r'start: "\\\t"').parse('\\\t')
  1103. def test_ranged_repeat_rules(self):
  1104. g = u"""!start: "A"~3
  1105. """
  1106. l = _Lark(g)
  1107. self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
  1108. self.assertRaises(ParseError, l.parse, u'AA')
  1109. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  1110. g = u"""!start: "A"~0..2
  1111. """
  1112. if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
  1113. l = _Lark(g)
  1114. self.assertEqual(l.parse(u''), Tree('start', []))
  1115. self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
  1116. self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
  1117. self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')
  1118. g = u"""!start: "A"~3..2
  1119. """
  1120. self.assertRaises(GrammarError, _Lark, g)
  1121. g = u"""!start: "A"~2..3 "B"~2
  1122. """
  1123. l = _Lark(g)
  1124. self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
  1125. self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
  1126. self.assertRaises(ParseError, l.parse, u'AAAB')
  1127. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  1128. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  1129. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  1130. def test_ranged_repeat_terms(self):
  1131. g = u"""!start: AAA
  1132. AAA: "A"~3
  1133. """
  1134. l = _Lark(g)
  1135. self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
  1136. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
  1137. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  1138. g = u"""!start: AABB CC
  1139. AABB: "A"~0..2 "B"~2
  1140. CC: "C"~1..2
  1141. """
  1142. l = _Lark(g)
  1143. self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
  1144. self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
  1145. self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
  1146. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
  1147. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  1148. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  1149. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  1150. @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
  1151. def test_priority_vs_embedded(self):
  1152. g = """
  1153. A.2: "a"
  1154. WORD: ("a".."z")+
  1155. start: (A | WORD)+
  1156. """
  1157. l = _Lark(g)
  1158. t = l.parse('abc')
  1159. self.assertEqual(t.children, ['a', 'bc'])
  1160. self.assertEqual(t.children[0].type, 'A')
  1161. def test_line_counting(self):
  1162. p = _Lark("start: /[^x]+/")
  1163. text = 'hello\nworld'
  1164. t = p.parse(text)
  1165. tok = t.children[0]
  1166. self.assertEqual(tok, text)
  1167. self.assertEqual(tok.line, 1)
  1168. self.assertEqual(tok.column, 1)
            if LEXER != 'dynamic':
                self.assertEqual(tok.end_line, 2)
                self.assertEqual(tok.end_column, 6)

        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_empty_end(self):
            p = _Lark("""
                start: b c d
                b: "B"
                c: | "C"
                d: | "D"
            """)
            res = p.parse('B')
            self.assertEqual(len(res.children), 3)

        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_maybe_placeholders(self):
            # Anonymous tokens shouldn't count
            p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [])

            # All invisible constructs shouldn't count
            p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c]
                         A: "a"
                         _c: "c" """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None])
            self.assertEqual(p.parse("c").children, [None])
            self.assertEqual(p.parse("aefc").children, ['a'])

            # ? shouldn't apply
            p = _Lark("""!start: ["a"] "b"? ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None, None])
            self.assertEqual(p.parse("b").children, [None, 'b', None])

            p = _Lark("""!start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None, None, None])
            self.assertEqual(p.parse("a").children, ['a', None, None])
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("c").children, [None, None, 'c'])
            self.assertEqual(p.parse("ab").children, ['a', 'b', None])
            self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
            self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
            self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])

            p = _Lark("""!start: (["a"] "b" ["c"])+ """, maybe_placeholders=True)
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
            self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
            self.assertEqual(p.parse("babbcabcb").children,
                [None, 'b', None,
                 'a', 'b', None,
                 None, 'b', 'c',
                 'a', 'b', 'c',
                 None, 'b', None])

            p = _Lark("""!start: ["a"] ["c"] "b"+ ["a"] ["d"] """, maybe_placeholders=True)
            self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
            self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
            self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
            self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])

        def test_escaped_string(self):
            "Tests common.ESCAPED_STRING"
            grammar = r"""
            start: ESCAPED_STRING+

            %import common (WS_INLINE, ESCAPED_STRING)
            %ignore WS_INLINE
            """
            parser = _Lark(grammar)
            parser.parse(r'"\\" "b" "c"')
            parser.parse(r'"That" "And a \"b"')

        def test_meddling_unused(self):
            "Unless 'unused' is removed, LALR analysis will fail on reduce-reduce collision"
            grammar = """
                start: EKS* x
                x: EKS
                unused: x*
                EKS: "x"
            """
            parser = _Lark(grammar)

        @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
        def test_serialize(self):
            grammar = """
                start: _ANY b "C"
                _ANY: /./
                b: "B"
            """
            parser = _Lark(grammar)

            d = parser.serialize()
            parser2 = Lark.deserialize(d, {}, {})
            self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) )
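
            # Round-trip once more via memo_serialize, which also returns a memo (m)
            # of the shared Rule/TerminalDef instances for deserialize to reuse.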
            namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
            d, m = parser.memo_serialize(namespace.values())
            parser3 = Lark.deserialize(d, namespace, m)
            self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )

        def test_multi_start(self):
            parser = _Lark('''
                a: "x" "a"?
                b: "x" "b"?
            ''', start=['a', 'b'])

            self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
            self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

        def test_lexer_detect_newline_tokens(self):
            # Detect newlines in regular tokens
            g = _Lark(r"""start: "go" tail*
                          !tail : SA "@" | SB "@" | SC "@" | SD "@"
                          SA : "a" /\n/
                          SB : /b./s
                          SC : "c" /[^a-z]/
                          SD : "d" /\s/
                       """)
            a, b, c, d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
            self.assertEqual(a.line, 2)
            self.assertEqual(b.line, 3)
            self.assertEqual(c.line, 4)
            self.assertEqual(d.line, 5)

            # Detect newlines in ignored tokens
            for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
                g = _Lark('''!start: "a" "a"
                             %ignore {}'''.format(re))
                a, b = g.parse('a\na').children
                self.assertEqual(a.line, 1)
                self.assertEqual(b.line, 2)

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser


# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
    ('standard', 'earley'),
    ('standard', 'cyk'),
    ('dynamic', 'earley'),
    ('dynamic_complete', 'earley'),
    ('standard', 'lalr'),
    ('contextual', 'lalr'),
    ('custom', 'lalr'),
    # (None, 'earley'),
]
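
# Generate a concrete TestCase class for every (lexer, parser) pair above, plus a
# full-Earley suite for each dynamic lexer; each registers itself in globals().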
for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic', 'dynamic_complete'):
    _make_full_earley_test(_LEXER)


if __name__ == '__main__':
    unittest.main()