This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1351 lines
45 KiB

  1. # -*- coding: utf-8 -*-
  2. from __future__ import absolute_import
  3. import unittest
  4. import logging
  5. import os
  6. import sys
  7. try:
  8. from cStringIO import StringIO as cStringIO
  9. except ImportError:
  10. # Available only in Python 2.x, 3.x only has io.StringIO from below
  11. cStringIO = None
  12. from io import (
  13. StringIO as uStringIO,
  14. open,
  15. )
  16. logging.basicConfig(level=logging.INFO)
  17. from lark.lark import Lark
  18. from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
  19. from lark.tree import Tree
  20. from lark.visitors import Transformer
  21. from lark.parsers.earley import ApplyCallbacks
  22. __path__ = os.path.dirname(__file__)
  23. def _read(n, *args):
  24. with open(os.path.join(__path__, n), *args) as f:
  25. return f.read()
  26. class TestParsers(unittest.TestCase):
  27. def test_same_ast(self):
  28. "Tests that Earley and LALR parsers produce equal trees"
  29. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  30. name_list: NAME | name_list "," NAME
  31. NAME: /\w+/ """, parser='lalr')
  32. l = g.parse('(a,b,c,*x)')
  33. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  34. name_list: NAME | name_list "," NAME
  35. NAME: /\w/+ """)
  36. l2 = g.parse('(a,b,c,*x)')
  37. assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
  38. def test_infinite_recurse(self):
  39. g = """start: a
  40. a: a | "a"
  41. """
  42. self.assertRaises(GrammarError, Lark, g, parser='lalr')
  43. # TODO: should it? shouldn't it?
  44. # l = Lark(g, parser='earley', lexer='dynamic')
  45. # self.assertRaises(ParseError, l.parse, 'a')
  46. def test_propagate_positions(self):
  47. g = Lark("""start: a
  48. a: "a"
  49. """, propagate_positions=True)
  50. r = g.parse('a')
  51. self.assertEqual( r.children[0].meta.line, 1 )
  52. def test_expand1(self):
  53. g = Lark("""start: a
  54. ?a: b
  55. b: "x"
  56. """)
  57. r = g.parse('x')
  58. self.assertEqual( r.children[0].data, "b" )
  59. g = Lark("""start: a
  60. ?a: b -> c
  61. b: "x"
  62. """)
  63. r = g.parse('x')
  64. self.assertEqual( r.children[0].data, "c" )
  65. g = Lark("""start: a
  66. ?a: B -> c
  67. B: "x"
  68. """)
  69. self.assertEqual( r.children[0].data, "c" )
  70. g = Lark("""start: a
  71. ?a: b b -> c
  72. b: "x"
  73. """)
  74. r = g.parse('xx')
  75. self.assertEqual( r.children[0].data, "c" )
  76. def test_embedded_transformer(self):
  77. class T(Transformer):
  78. def a(self, children):
  79. return "<a>"
  80. def b(self, children):
  81. return "<b>"
  82. def c(self, children):
  83. return "<c>"
  84. # Test regular
  85. g = Lark("""start: a
  86. a : "x"
  87. """, parser='lalr')
  88. r = T().transform(g.parse("x"))
  89. self.assertEqual( r.children, ["<a>"] )
  90. g = Lark("""start: a
  91. a : "x"
  92. """, parser='lalr', transformer=T())
  93. r = g.parse("x")
  94. self.assertEqual( r.children, ["<a>"] )
  95. # Test Expand1
  96. g = Lark("""start: a
  97. ?a : b
  98. b : "x"
  99. """, parser='lalr')
  100. r = T().transform(g.parse("x"))
  101. self.assertEqual( r.children, ["<b>"] )
  102. g = Lark("""start: a
  103. ?a : b
  104. b : "x"
  105. """, parser='lalr', transformer=T())
  106. r = g.parse("x")
  107. self.assertEqual( r.children, ["<b>"] )
  108. # Test Expand1 -> Alias
  109. g = Lark("""start: a
  110. ?a : b b -> c
  111. b : "x"
  112. """, parser='lalr')
  113. r = T().transform(g.parse("xx"))
  114. self.assertEqual( r.children, ["<c>"] )
  115. g = Lark("""start: a
  116. ?a : b b -> c
  117. b : "x"
  118. """, parser='lalr', transformer=T())
  119. r = g.parse("xx")
  120. self.assertEqual( r.children, ["<c>"] )
  121. def test_alias(self):
  122. Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
  123. def _make_full_earley_test(LEXER):
  124. def _Lark(grammar, **kwargs):
  125. return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)
  126. class _TestFullEarley(unittest.TestCase):
  127. def test_anon(self):
  128. # Fails an Earley implementation without special handling for empty rules,
  129. # or re-processing of already completed rules.
  130. g = Lark(r"""start: B
  131. B: ("ab"|/[^b]/)+
  132. """, lexer=LEXER)
  133. self.assertEqual( g.parse('abc').children[0], 'abc')
  134. def test_earley(self):
  135. g = Lark("""start: A "b" c
  136. A: "a"+
  137. c: "abc"
  138. """, parser="earley", lexer=LEXER)
  139. x = g.parse('aaaababc')
  140. def test_earley2(self):
  141. grammar = """
  142. start: statement+
  143. statement: "r"
  144. | "c" /[a-z]/+
  145. %ignore " "
  146. """
  147. program = """c b r"""
  148. l = Lark(grammar, parser='earley', lexer=LEXER)
  149. l.parse(program)
  150. @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
  151. def test_earley3(self):
  152. """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)
  153. By default, `+` should immitate regexp greedy-matching
  154. """
  155. grammar = """
  156. start: A A
  157. A: "a"+
  158. """
  159. l = Lark(grammar, parser='earley', lexer=LEXER)
  160. res = l.parse("aaa")
  161. self.assertEqual(set(res.children), {'aa', 'a'})
  162. # XXX TODO fix Earley to maintain correct order
  163. # i.e. terminals it imitate greedy search for terminals, but lazy search for rules
  164. # self.assertEqual(res.children, ['aa', 'a'])
  165. def test_earley4(self):
  166. grammar = """
  167. start: A A?
  168. A: "a"+
  169. """
  170. l = Lark(grammar, parser='earley', lexer=LEXER)
  171. res = l.parse("aaa")
  172. assert set(res.children) == {'aa', 'a'} or res.children == ['aaa']
  173. # XXX TODO fix Earley to maintain correct order
  174. # i.e. terminals it imitate greedy search for terminals, but lazy search for rules
  175. # self.assertEqual(res.children, ['aaa'])
  176. def test_earley_repeating_empty(self):
  177. # This was a sneaky bug!
  178. grammar = """
  179. !start: "a" empty empty "b"
  180. empty: empty2
  181. empty2:
  182. """
  183. parser = Lark(grammar, parser='earley', lexer=LEXER)
  184. res = parser.parse('ab')
  185. empty_tree = Tree('empty', [Tree('empty2', [])])
  186. self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
  187. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  188. def test_earley_explicit_ambiguity(self):
  189. # This was a sneaky bug!
  190. grammar = """
  191. start: a b | ab
  192. a: "a"
  193. b: "b"
  194. ab: "ab"
  195. """
  196. parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
  197. ambig_tree = parser.parse('ab')
  198. self.assertEqual( ambig_tree.data, '_ambig')
  199. self.assertEqual( len(ambig_tree.children), 2)
  200. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  201. def test_ambiguity1(self):
  202. grammar = """
  203. start: cd+ "e"
  204. !cd: "c"
  205. | "d"
  206. | "cd"
  207. """
  208. l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
  209. ambig_tree = l.parse('cde')
  210. assert ambig_tree.data == '_ambig', ambig_tree
  211. assert len(ambig_tree.children) == 2
  212. @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
  213. def test_ambiguity2(self):
  214. grammar = """
  215. ANY: /[a-zA-Z0-9 ]+/
  216. a.2: "A" b+
  217. b.2: "B"
  218. c: ANY
  219. start: (a|c)*
  220. """
  221. l = Lark(grammar, parser='earley', lexer=LEXER)
  222. res = l.parse('ABX')
  223. expected = Tree('start', [
  224. Tree('a', [
  225. Tree('b', [])
  226. ]),
  227. Tree('c', [
  228. 'X'
  229. ])
  230. ])
  231. self.assertEqual(res, expected)
  232. def test_fruitflies_ambig(self):
  233. grammar = """
  234. start: noun verb noun -> simple
  235. | noun verb "like" noun -> comparative
  236. noun: adj? NOUN
  237. verb: VERB
  238. adj: ADJ
  239. NOUN: "flies" | "bananas" | "fruit"
  240. VERB: "like" | "flies"
  241. ADJ: "fruit"
  242. %import common.WS
  243. %ignore WS
  244. """
  245. parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
  246. tree = parser.parse('fruit flies like bananas')
  247. expected = Tree('_ambig', [
  248. Tree('comparative', [
  249. Tree('noun', ['fruit']),
  250. Tree('verb', ['flies']),
  251. Tree('noun', ['bananas'])
  252. ]),
  253. Tree('simple', [
  254. Tree('noun', [Tree('adj', ['fruit']), 'flies']),
  255. Tree('verb', ['like']),
  256. Tree('noun', ['bananas'])
  257. ])
  258. ])
  259. # self.assertEqual(tree, expected)
  260. self.assertEqual(tree.data, expected.data)
  261. self.assertEqual(set(tree.children), set(expected.children))
  262. @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
  263. def test_explicit_ambiguity2(self):
  264. grammar = r"""
  265. start: NAME+
  266. NAME: /\w+/
  267. %ignore " "
  268. """
  269. text = """cat"""
  270. parser = _Lark(grammar, start='start', ambiguity='explicit')
  271. tree = parser.parse(text)
  272. self.assertEqual(tree.data, '_ambig')
  273. combinations = {tuple(str(s) for s in t.children) for t in tree.children}
  274. self.assertEqual(combinations, {
  275. ('cat',),
  276. ('ca', 't'),
  277. ('c', 'at'),
  278. ('c', 'a' ,'t')
  279. })
  280. def test_term_ambig_resolve(self):
  281. grammar = r"""
  282. !start: NAME+
  283. NAME: /\w+/
  284. %ignore " "
  285. """
  286. text = """foo bar"""
  287. parser = Lark(grammar)
  288. tree = parser.parse(text)
  289. self.assertEqual(tree.children, ['foo', 'bar'])
  290. # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
  291. # def test_not_all_derivations(self):
  292. # grammar = """
  293. # start: cd+ "e"
  294. # !cd: "c"
  295. # | "d"
  296. # | "cd"
  297. # """
  298. # l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
  299. # x = l.parse('cde')
  300. # assert x.data != '_ambig', x
  301. # assert len(x.children) == 1
  302. _NAME = "TestFullEarley" + LEXER.capitalize()
  303. _TestFullEarley.__name__ = _NAME
  304. globals()[_NAME] = _TestFullEarley
  305. def _make_parser_test(LEXER, PARSER):
  306. def _Lark(grammar, **kwargs):
  307. return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
  308. class _TestParser(unittest.TestCase):
  309. def test_basic1(self):
  310. g = _Lark("""start: a+ b a* "b" a*
  311. b: "b"
  312. a: "a"
  313. """)
  314. r = g.parse('aaabaab')
  315. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
  316. r = g.parse('aaabaaba')
  317. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )
  318. self.assertRaises(ParseError, g.parse, 'aaabaa')
  319. def test_basic2(self):
  320. # Multiple parsers and colliding tokens
  321. g = _Lark("""start: B A
  322. B: "12"
  323. A: "1" """)
  324. g2 = _Lark("""start: B A
  325. B: "12"
  326. A: "2" """)
  327. x = g.parse('121')
  328. assert x.data == 'start' and x.children == ['12', '1'], x
  329. x = g2.parse('122')
  330. assert x.data == 'start' and x.children == ['12', '2'], x
  331. @unittest.skipIf(cStringIO is None, "cStringIO not available")
  332. def test_stringio_bytes(self):
  333. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  334. _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  335. def test_stringio_unicode(self):
  336. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  337. _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  338. def test_unicode(self):
  339. g = _Lark(u"""start: UNIA UNIB UNIA
  340. UNIA: /\xa3/
  341. UNIB: /\u0101/
  342. """)
  343. g.parse(u'\xa3\u0101\u00a3')
  344. def test_unicode2(self):
  345. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  346. UNIA: /\xa3/
  347. UNIB: "a\u0101b\ "
  348. UNIC: /a?\u0101c\n/
  349. """)
  350. g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')
  351. def test_unicode3(self):
  352. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  353. UNIA: /\xa3/
  354. UNIB: "\u0101"
  355. UNIC: /\u0203/ /\n/
  356. """)
  357. g.parse(u'\xa3\u0101\u00a3\u0203\n')
  358. @unittest.skipIf(PARSER == 'cyk', "Takes forever")
  359. def test_stack_for_ebnf(self):
  360. """Verify that stack depth isn't an issue for EBNF grammars"""
  361. g = _Lark(r"""start: a+
  362. a : "a" """)
  363. g.parse("a" * (sys.getrecursionlimit()*2 ))
  364. def test_expand1_lists_with_one_item(self):
  365. g = _Lark(r"""start: list
  366. ?list: item+
  367. item : A
  368. A: "a"
  369. """)
  370. r = g.parse("a")
  371. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  372. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  373. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  374. self.assertEqual(len(r.children), 1)
  375. def test_expand1_lists_with_one_item_2(self):
  376. g = _Lark(r"""start: list
  377. ?list: item+ "!"
  378. item : A
  379. A: "a"
  380. """)
  381. r = g.parse("a!")
  382. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  383. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  384. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  385. self.assertEqual(len(r.children), 1)
  386. def test_dont_expand1_lists_with_multiple_items(self):
  387. g = _Lark(r"""start: list
  388. ?list: item+
  389. item : A
  390. A: "a"
  391. """)
  392. r = g.parse("aa")
  393. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  394. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  395. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  396. self.assertEqual(len(r.children), 1)
  397. # Sanity check: verify that 'list' contains the two 'item's we've given it
  398. [list] = r.children
  399. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  400. def test_dont_expand1_lists_with_multiple_items_2(self):
  401. g = _Lark(r"""start: list
  402. ?list: item+ "!"
  403. item : A
  404. A: "a"
  405. """)
  406. r = g.parse("aa!")
  407. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  408. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  409. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  410. self.assertEqual(len(r.children), 1)
  411. # Sanity check: verify that 'list' contains the two 'item's we've given it
  412. [list] = r.children
  413. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  414. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  415. def test_empty_expand1_list(self):
  416. g = _Lark(r"""start: list
  417. ?list: item*
  418. item : A
  419. A: "a"
  420. """)
  421. r = g.parse("")
  422. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  423. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  424. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  425. self.assertEqual(len(r.children), 1)
  426. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  427. [list] = r.children
  428. self.assertSequenceEqual([item.data for item in list.children], ())
  429. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  430. def test_empty_expand1_list_2(self):
  431. g = _Lark(r"""start: list
  432. ?list: item* "!"?
  433. item : A
  434. A: "a"
  435. """)
  436. r = g.parse("")
  437. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  438. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  439. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  440. self.assertEqual(len(r.children), 1)
  441. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  442. [list] = r.children
  443. self.assertSequenceEqual([item.data for item in list.children], ())
  444. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  445. def test_empty_flatten_list(self):
  446. g = _Lark(r"""start: list
  447. list: | item "," list
  448. item : A
  449. A: "a"
  450. """)
  451. r = g.parse("")
  452. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  453. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  454. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  455. [list] = r.children
  456. self.assertSequenceEqual([item.data for item in list.children], ())
  457. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  458. def test_single_item_flatten_list(self):
  459. g = _Lark(r"""start: list
  460. list: | item "," list
  461. item : A
  462. A: "a"
  463. """)
  464. r = g.parse("a,")
  465. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  466. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  467. # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
  468. [list] = r.children
  469. self.assertSequenceEqual([item.data for item in list.children], ('item',))
  470. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  471. def test_multiple_item_flatten_list(self):
  472. g = _Lark(r"""start: list
  473. #list: | item "," list
  474. item : A
  475. A: "a"
  476. """)
  477. r = g.parse("a,a,")
  478. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  479. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  480. # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
  481. [list] = r.children
  482. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  483. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  484. def test_recurse_flatten(self):
  485. """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
  486. g = _Lark(r"""start: a | start a
  487. a : A
  488. A : "a" """)
  489. # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
  490. # STree data structures, which uses recursion).
  491. g.parse("a" * (sys.getrecursionlimit() // 4))
  492. def test_token_collision(self):
  493. g = _Lark(r"""start: "Hello" NAME
  494. NAME: /\w/+
  495. %ignore " "
  496. """)
  497. x = g.parse('Hello World')
  498. self.assertSequenceEqual(x.children, ['World'])
  499. x = g.parse('Hello HelloWorld')
  500. self.assertSequenceEqual(x.children, ['HelloWorld'])
  501. def test_token_collision_WS(self):
  502. g = _Lark(r"""start: "Hello" NAME
  503. NAME: /\w/+
  504. %import common.WS
  505. %ignore WS
  506. """)
  507. x = g.parse('Hello World')
  508. self.assertSequenceEqual(x.children, ['World'])
  509. x = g.parse('Hello HelloWorld')
  510. self.assertSequenceEqual(x.children, ['HelloWorld'])
  511. def test_token_collision2(self):
  512. g = _Lark("""
  513. !start: "starts"
  514. %import common.LCASE_LETTER
  515. """)
  516. x = g.parse("starts")
  517. self.assertSequenceEqual(x.children, ['starts'])
  518. # def test_string_priority(self):
  519. # g = _Lark("""start: (A | /a?bb/)+
  520. # A: "a" """)
  521. # x = g.parse('abb')
  522. # self.assertEqual(len(x.children), 2)
  523. # # This parse raises an exception because the lexer will always try to consume
  524. # # "a" first and will never match the regular expression
  525. # # This behavior is subject to change!!
  526. # # Thie won't happen with ambiguity handling.
  527. # g = _Lark("""start: (A | /a?ab/)+
  528. # A: "a" """)
  529. # self.assertRaises(LexError, g.parse, 'aab')
  530. def test_undefined_rule(self):
  531. self.assertRaises(GrammarError, _Lark, """start: a""")
  532. def test_undefined_token(self):
  533. self.assertRaises(GrammarError, _Lark, """start: A""")
  534. def test_rule_collision(self):
  535. g = _Lark("""start: "a"+ "b"
  536. | "a"+ """)
  537. x = g.parse('aaaa')
  538. x = g.parse('aaaab')
  539. def test_rule_collision2(self):
  540. g = _Lark("""start: "a"* "b"
  541. | "a"+ """)
  542. x = g.parse('aaaa')
  543. x = g.parse('aaaab')
  544. x = g.parse('b')
  545. def test_token_not_anon(self):
  546. """Tests that "a" is matched as an anonymous token, and not A.
  547. """
  548. g = _Lark("""start: "a"
  549. A: "a" """)
  550. x = g.parse('a')
  551. self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')
  552. g = _Lark("""start: "a" A
  553. A: "a" """)
  554. x = g.parse('aa')
  555. self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
  556. self.assertEqual(x.children[0].type, "A")
  557. g = _Lark("""start: /a/
  558. A: /a/ """)
  559. x = g.parse('a')
  560. self.assertEqual(len(x.children), 1)
  561. self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
  562. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  563. def test_maybe(self):
  564. g = _Lark("""start: ["a"] """)
  565. x = g.parse('a')
  566. x = g.parse('')
  567. def test_start(self):
  568. g = _Lark("""a: "a" a? """, start='a')
  569. x = g.parse('a')
  570. x = g.parse('aa')
  571. x = g.parse('aaa')
  572. def test_alias(self):
  573. g = _Lark("""start: "a" -> b """)
  574. x = g.parse('a')
  575. self.assertEqual(x.data, "b")
  576. def test_token_ebnf(self):
  577. g = _Lark("""start: A
  578. A: "a"* ("b"? "c".."e")+
  579. """)
  580. x = g.parse('abcde')
  581. x = g.parse('dd')
  582. def test_backslash(self):
  583. g = _Lark(r"""start: "\\" "a"
  584. """)
  585. x = g.parse(r'\a')
  586. g = _Lark(r"""start: /\\/ /a/
  587. """)
  588. x = g.parse(r'\a')
  589. def test_special_chars(self):
  590. g = _Lark(r"""start: "\n"
  591. """)
  592. x = g.parse('\n')
  593. g = _Lark(r"""start: /\n/
  594. """)
  595. x = g.parse('\n')
  596. def test_backslash2(self):
  597. g = _Lark(r"""start: "\"" "-"
  598. """)
  599. x = g.parse('"-')
  600. g = _Lark(r"""start: /\// /-/
  601. """)
  602. x = g.parse('/-')
  603. # def test_token_recurse(self):
  604. # g = _Lark("""start: A
  605. # A: B
  606. # B: A
  607. # """)
  608. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  609. def test_empty(self):
  610. # Fails an Earley implementation without special handling for empty rules,
  611. # or re-processing of already completed rules.
  612. g = _Lark(r"""start: _empty a "B"
  613. a: _empty "A"
  614. _empty:
  615. """)
  616. x = g.parse('AB')
  617. def test_regex_quote(self):
  618. g = r"""
  619. start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
  620. SINGLE_QUOTED_STRING : /'[^']*'/
  621. DOUBLE_QUOTED_STRING : /"[^"]*"/
  622. """
  623. g = _Lark(g)
  624. self.assertEqual( g.parse('"hello"').children, ['"hello"'])
  625. self.assertEqual( g.parse("'hello'").children, ["'hello'"])
  626. def test_lexer_token_limit(self):
  627. "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
  628. tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
  629. g = _Lark("""start: %s
  630. %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
  631. def test_float_without_lexer(self):
  632. expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
  633. if PARSER == 'cyk':
  634. expected_error = ParseError
  635. g = _Lark("""start: ["+"|"-"] float
  636. float: digit* "." digit+ exp?
  637. | digit+ exp
  638. exp: ("e"|"E") ["+"|"-"] digit+
  639. digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
  640. """)
  641. g.parse("1.2")
  642. g.parse("-.2e9")
  643. g.parse("+2e-9")
  644. self.assertRaises( expected_error, g.parse, "+2e-9e")
  645. def test_keep_all_tokens(self):
  646. l = _Lark("""start: "a"+ """, keep_all_tokens=True)
  647. tree = l.parse('aaa')
  648. self.assertEqual(tree.children, ['a', 'a', 'a'])
  649. def test_token_flags(self):
  650. l = _Lark("""!start: "a"i+
  651. """
  652. )
  653. tree = l.parse('aA')
  654. self.assertEqual(tree.children, ['a', 'A'])
  655. l = _Lark("""!start: /a/i+
  656. """
  657. )
  658. tree = l.parse('aA')
  659. self.assertEqual(tree.children, ['a', 'A'])
  660. # g = """!start: "a"i "a"
  661. # """
  662. # self.assertRaises(GrammarError, _Lark, g)
  663. # g = """!start: /a/i /a/
  664. # """
  665. # self.assertRaises(GrammarError, _Lark, g)
  666. g = """start: NAME "," "a"
  667. NAME: /[a-z_]/i /[a-z0-9_]/i*
  668. """
  669. l = _Lark(g)
  670. tree = l.parse('ab,a')
  671. self.assertEqual(tree.children, ['ab'])
  672. tree = l.parse('AB,a')
  673. self.assertEqual(tree.children, ['AB'])
  674. def test_token_flags3(self):
  675. l = _Lark("""!start: ABC+
  676. ABC: "abc"i
  677. """
  678. )
  679. tree = l.parse('aBcAbC')
  680. self.assertEqual(tree.children, ['aBc', 'AbC'])
  681. def test_token_flags2(self):
  682. g = """!start: ("a"i | /a/ /b/?)+
  683. """
  684. l = _Lark(g)
  685. tree = l.parse('aA')
  686. self.assertEqual(tree.children, ['a', 'A'])
  687. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  688. def test_twice_empty(self):
  689. g = """!start: [["A"]]
  690. """
  691. l = _Lark(g)
  692. tree = l.parse('A')
  693. self.assertEqual(tree.children, ['A'])
  694. tree = l.parse('')
  695. self.assertEqual(tree.children, [])
  696. def test_undefined_ignore(self):
  697. g = """!start: "A"
  698. %ignore B
  699. """
  700. self.assertRaises( GrammarError, _Lark, g)
  701. def test_alias_in_terminal(self):
  702. g = """start: TERM
  703. TERM: "a" -> alias
  704. """
  705. self.assertRaises( GrammarError, _Lark, g)
  706. def test_line_and_column(self):
  707. g = r"""!start: "A" bc "D"
  708. !bc: "B\nC"
  709. """
  710. l = _Lark(g)
  711. a, bc, d = l.parse("AB\nCD").children
  712. self.assertEqual(a.line, 1)
  713. self.assertEqual(a.column, 1)
  714. bc ,= bc.children
  715. self.assertEqual(bc.line, 1)
  716. self.assertEqual(bc.column, 2)
  717. self.assertEqual(d.line, 2)
  718. self.assertEqual(d.column, 2)
  719. if LEXER != 'dynamic':
  720. self.assertEqual(a.end_line, 1)
  721. self.assertEqual(a.end_column, 2)
  722. self.assertEqual(bc.end_line, 2)
  723. self.assertEqual(bc.end_column, 2)
  724. self.assertEqual(d.end_line, 2)
  725. self.assertEqual(d.end_column, 3)
  726. def test_reduce_cycle(self):
  727. """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
  728. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
  729. """
  730. l = _Lark("""
  731. term: A
  732. | term term
  733. A: "a"
  734. """, start='term')
  735. tree = l.parse("aa")
  736. self.assertEqual(len(tree.children), 2)
  737. @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
  738. def test_lexer_prioritization(self):
  739. "Tests effect of priority on result"
  740. grammar = """
  741. start: A B | AB
  742. A.2: "a"
  743. B: "b"
  744. AB: "ab"
  745. """
  746. l = _Lark(grammar)
  747. res = l.parse("ab")
  748. self.assertEqual(res.children, ['a', 'b'])
  749. self.assertNotEqual(res.children, ['ab'])
  750. grammar = """
  751. start: A B | AB
  752. A: "a"
  753. B: "b"
  754. AB.3: "ab"
  755. """
  756. l = _Lark(grammar)
  757. res = l.parse("ab")
  758. self.assertNotEqual(res.children, ['a', 'b'])
  759. self.assertEqual(res.children, ['ab'])
  760. def test_import(self):
  761. grammar = """
  762. start: NUMBER WORD
  763. %import common.NUMBER
  764. %import common.WORD
  765. %import common.WS
  766. %ignore WS
  767. """
  768. l = _Lark(grammar)
  769. x = l.parse('12 elephants')
  770. self.assertEqual(x.children, ['12', 'elephants'])
  771. def test_relative_import(self):
  772. grammar = """
  773. start: NUMBER WORD
  774. %import .grammars.test.NUMBER
  775. %import common.WORD
  776. %import common.WS
  777. %ignore WS
  778. """
  779. l = _Lark(grammar)
  780. x = l.parse('12 lions')
  781. self.assertEqual(x.children, ['12', 'lions'])
  782. def test_multi_import(self):
  783. grammar = """
  784. start: NUMBER WORD
  785. %import common (NUMBER, WORD, WS)
  786. %ignore WS
  787. """
  788. l = _Lark(grammar)
  789. x = l.parse('12 toucans')
  790. self.assertEqual(x.children, ['12', 'toucans'])
  791. def test_relative_multi_import(self):
  792. grammar = """
  793. start: NUMBER WORD
  794. %import .grammars.test (NUMBER, WORD, WS)
  795. %ignore WS
  796. """
  797. l = _Lark(grammar)
  798. x = l.parse('12 capybaras')
  799. self.assertEqual(x.children, ['12', 'capybaras'])
  800. def test_import_errors(self):
  801. grammar = """
  802. start: NUMBER WORD
  803. %import .grammars.bad_test.NUMBER
  804. """
  805. self.assertRaises(IOError, _Lark, grammar)
  806. grammar = """
  807. start: NUMBER WORD
  808. %import bad_test.NUMBER
  809. """
  810. self.assertRaises(IOError, _Lark, grammar)
  811. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  812. def test_earley_prioritization(self):
  813. "Tests effect of priority on result"
  814. grammar = """
  815. start: a | b
  816. a.1: "a"
  817. b.2: "a"
  818. """
  819. # l = Lark(grammar, parser='earley', lexer='standard')
  820. l = _Lark(grammar)
  821. res = l.parse("a")
  822. self.assertEqual(res.children[0].data, 'b')
  823. grammar = """
  824. start: a | b
  825. a.2: "a"
  826. b.1: "a"
  827. """
  828. l = _Lark(grammar)
  829. # l = Lark(grammar, parser='earley', lexer='standard')
  830. res = l.parse("a")
  831. self.assertEqual(res.children[0].data, 'a')
  832. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  833. def test_earley_prioritization_sum(self):
  834. "Tests effect of priority on result"
  835. grammar = """
  836. start: ab_ b_ a_ | indirection
  837. indirection: a_ bb_ a_
  838. a_: "a"
  839. b_: "b"
  840. ab_: "ab"
  841. bb_.1: "bb"
  842. """
  843. l = Lark(grammar, priority="invert")
  844. res = l.parse('abba')
  845. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  846. grammar = """
  847. start: ab_ b_ a_ | indirection
  848. indirection: a_ bb_ a_
  849. a_: "a"
  850. b_: "b"
  851. ab_.1: "ab"
  852. bb_: "bb"
  853. """
  854. l = Lark(grammar, priority="invert")
  855. res = l.parse('abba')
  856. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  857. grammar = """
  858. start: ab_ b_ a_ | indirection
  859. indirection: a_ bb_ a_
  860. a_.2: "a"
  861. b_.1: "b"
  862. ab_.3: "ab"
  863. bb_.3: "bb"
  864. """
  865. l = Lark(grammar, priority="invert")
  866. res = l.parse('abba')
  867. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  868. grammar = """
  869. start: ab_ b_ a_ | indirection
  870. indirection: a_ bb_ a_
  871. a_.1: "a"
  872. b_.1: "b"
  873. ab_.4: "ab"
  874. bb_.3: "bb"
  875. """
  876. l = Lark(grammar, priority="invert")
  877. res = l.parse('abba')
  878. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  879. def test_utf8(self):
  880. g = u"""start: a
  881. a: "±a"
  882. """
  883. l = _Lark(g)
  884. self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))
  885. g = u"""start: A
  886. A: "±a"
  887. """
  888. l = _Lark(g)
  889. self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))
  890. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  891. def test_ignore(self):
  892. grammar = r"""
  893. COMMENT: /(!|(\/\/))[^\n]*/
  894. %ignore COMMENT
  895. %import common.WS -> _WS
  896. %import common.INT
  897. start: "INT"i _WS+ INT _WS*
  898. """
  899. parser = _Lark(grammar)
  900. tree = parser.parse("int 1 ! This is a comment\n")
  901. self.assertEqual(tree.children, ['1'])
  902. tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
  903. self.assertEqual(tree.children, ['1'])
  904. parser = _Lark(r"""
  905. start : "a"*
  906. %ignore "b"
  907. """)
  908. tree = parser.parse("bb")
  909. self.assertEqual(tree.children, [])
  910. def test_regex_escaping(self):
  911. g = _Lark("start: /[ab]/")
  912. g.parse('a')
  913. g.parse('b')
  914. self.assertRaises( UnexpectedInput, g.parse, 'c')
  915. _Lark(r'start: /\w/').parse('a')
  916. g = _Lark(r'start: /\\w/')
  917. self.assertRaises( UnexpectedInput, g.parse, 'a')
  918. g.parse(r'\w')
  919. _Lark(r'start: /\[/').parse('[')
  920. _Lark(r'start: /\//').parse('/')
  921. _Lark(r'start: /\\/').parse('\\')
  922. _Lark(r'start: /\[ab]/').parse('[ab]')
  923. _Lark(r'start: /\\[ab]/').parse('\\a')
  924. _Lark(r'start: /\t/').parse('\t')
  925. _Lark(r'start: /\\t/').parse('\\t')
  926. _Lark(r'start: /\\\t/').parse('\\\t')
  927. _Lark(r'start: "\t"').parse('\t')
  928. _Lark(r'start: "\\t"').parse('\\t')
  929. _Lark(r'start: "\\\t"').parse('\\\t')
  930. def test_ranged_repeat_rules(self):
  931. g = u"""!start: "A"~3
  932. """
  933. l = _Lark(g)
  934. self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
  935. self.assertRaises(ParseError, l.parse, u'AA')
  936. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  937. g = u"""!start: "A"~0..2
  938. """
  939. if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
  940. l = _Lark(g)
  941. self.assertEqual(l.parse(u''), Tree('start', []))
  942. self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
  943. self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
  944. self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')
  945. g = u"""!start: "A"~3..2
  946. """
  947. self.assertRaises(GrammarError, _Lark, g)
  948. g = u"""!start: "A"~2..3 "B"~2
  949. """
  950. l = _Lark(g)
  951. self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
  952. self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
  953. self.assertRaises(ParseError, l.parse, u'AAAB')
  954. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  955. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  956. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  957. def test_ranged_repeat_terms(self):
  958. g = u"""!start: AAA
  959. AAA: "A"~3
  960. """
  961. l = _Lark(g)
  962. self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
  963. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
  964. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  965. g = u"""!start: AABB CC
  966. AABB: "A"~0..2 "B"~2
  967. CC: "C"~1..2
  968. """
  969. l = _Lark(g)
  970. self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
  971. self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
  972. self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
  973. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
  974. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  975. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  976. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  977. @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
  978. def test_priority_vs_embedded(self):
  979. g = """
  980. A.2: "a"
  981. WORD: ("a".."z")+
  982. start: (A | WORD)+
  983. """
  984. l = _Lark(g)
  985. t = l.parse('abc')
  986. self.assertEqual(t.children, ['a', 'bc'])
  987. self.assertEqual(t.children[0].type, 'A')
  988. def test_line_counting(self):
  989. p = _Lark("start: /[^x]+/")
  990. text = 'hello\nworld'
  991. t = p.parse(text)
  992. tok = t.children[0]
  993. self.assertEqual(tok, text)
  994. self.assertEqual(tok.line, 1)
  995. self.assertEqual(tok.column, 1)
  996. if _LEXER != 'dynamic':
  997. self.assertEqual(tok.end_line, 2)
  998. self.assertEqual(tok.end_column, 6)
  999. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  1000. def test_empty_end(self):
  1001. p = _Lark("""
  1002. start: b c d
  1003. b: "B"
  1004. c: | "C"
  1005. d: | "D"
  1006. """)
  1007. res = p.parse('B')
  1008. self.assertEqual(len(res.children), 3)
  1009. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  1010. def test_maybe_placeholders(self):
  1011. # Anonymous tokens shouldn't count
  1012. p = _Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True)
  1013. self.assertEqual(p.parse("").children, [])
  1014. # Anonymous tokens shouldn't count, other constructs should
  1015. p = _Lark("""start: A? "b"? _c?
  1016. A: "a"
  1017. _c: "c" """, maybe_placeholders=True)
  1018. self.assertEqual(p.parse("").children, [None])
  1019. p = _Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
  1020. self.assertEqual(p.parse("").children, [None, None, None])
  1021. self.assertEqual(p.parse("a").children, ['a', None, None])
  1022. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1023. self.assertEqual(p.parse("c").children, [None, None, 'c'])
  1024. self.assertEqual(p.parse("ab").children, ['a', 'b', None])
  1025. self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
  1026. self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
  1027. self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])
  1028. p = _Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
  1029. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1030. self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
  1031. self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
  1032. self.assertEqual(p.parse("babbcabcb").children,
  1033. [None, 'b', None,
  1034. 'a', 'b', None,
  1035. None, 'b', 'c',
  1036. 'a', 'b', 'c',
  1037. None, 'b', None])
  1038. p = _Lark("""!start: "a"? "c"? "b"+ "a"? "d"? """, maybe_placeholders=True)
  1039. self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
  1040. self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
  1041. self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
  1042. self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])
  1043. _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
  1044. _TestParser.__name__ = _NAME
  1045. globals()[_NAME] = _TestParser
  1046. # Note: You still have to import them in __main__ for the tests to run
  1047. _TO_TEST = [
  1048. ('standard', 'earley'),
  1049. ('standard', 'cyk'),
  1050. ('dynamic', 'earley'),
  1051. ('dynamic_complete', 'earley'),
  1052. ('standard', 'lalr'),
  1053. ('contextual', 'lalr'),
  1054. # (None, 'earley'),
  1055. ]
  1056. for _LEXER, _PARSER in _TO_TEST:
  1057. _make_parser_test(_LEXER, _PARSER)
  1058. for _LEXER in ('dynamic', 'dynamic_complete'):
  1059. _make_full_earley_test(_LEXER)
  1060. if __name__ == '__main__':
  1061. unittest.main()