# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # cStringIO is available only in Python 2.x; on 3.x we fall back to io.StringIO below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
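# (_read is a small helper for loading files, e.g. grammar fixtures, relative
# to this test module's directory.)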
class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')
    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )
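    # A transformer passed as Lark(..., transformer=T()) is applied inline while
    # the LALR parser reduces; calling T().transform() on a finished tree must
    # produce the same result. Both paths are checked below.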
    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
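# The two factories below stamp out a fresh TestCase subclass for each
# lexer/parser configuration, so every test they define runs once per
# configuration (see the _TO_TEST loop at the bottom of the file).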
def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                        """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+

            statement: "r"
                     | "c" /[a-z]/+

            %ignore " "
            """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete lexer")
        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"

            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')
            assert x.data == '_ambig', x
            assert len(x.children) == 2

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete lexer")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a' ,'t')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
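    # e.g. LEXER='dynamic' registers a class named TestFullEarleyDynamic in
    # this module's globals, where unittest discovery will find it.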
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x
        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')
        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule, its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))
        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"

                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')
        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')
        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """

            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python's re module limits a regular expression to 100 groups. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")
        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"

                %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)
        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """

            l = _Lark("""
                term: A
                    | term term

                A: "a"
                """, start='term')

            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)
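        # A terminal declared as "A.2: ..." carries lexer priority 2; when several
        # terminals could match at the same position, the standard lexer prefers
        # the one with the higher priority, as the next test demonstrates.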
        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])
        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_relative_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import common (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 toucans')
            self.assertEqual(x.children, ['12', 'toucans'])

        def test_relative_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        def test_import_errors(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

            grammar = """
            start: NUMBER WORD

            %import bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)
        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')
        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """

            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')
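        # Grammar note: "A"~3 repeats a symbol exactly three times, and "A"~0..2
        # between zero and two times. The next two tests cover both forms, for
        # rules and for terminals.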
        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                 """
            if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                 """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

        def test_line_counting(self):
            p = _Lark("start: /[^x]+/")

            text = 'hello\nworld'
            t = p.parse(text)
            tok = t.children[0]
            self.assertEqual(tok, text)
            self.assertEqual(tok.line, 1)
            self.assertEqual(tok.column, 1)
            if LEXER != 'dynamic':
                self.assertEqual(tok.end_line, 2)
                self.assertEqual(tok.end_column, 6)

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser
# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('dynamic_complete', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]
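# Each (lexer, parser) pair above becomes its own generated TestCase class,
# e.g. ('contextual', 'lalr') -> TestLalrContextual; adding a new pair here is
# enough to run the whole _TestParser suite against that configuration.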
for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()