This repo contains code to mirror other repos. It also contains the code that is getting mirrored.

# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()

class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
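
    # The two grammars above differ only in where the repetition sits:
    # /\w+/ is a single regex terminal, while /\w/+ repeats a one-character
    # terminal. Both should tokenize NAME identically, which is what makes
    # the resulting trees comparable across Earley and LALR.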
    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)

        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )
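
    # The two styles above differ in when T runs: T().transform(tree) walks a
    # fully-built parse tree after the fact, while passing transformer=T() to
    # Lark with the LALR parser applies the callbacks as rules are matched
    # during parsing, avoiding a separate tree-walking pass. Both should
    # produce the same result, which is what this test pins down.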

def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
                start: statement+
                statement: "r"
                         | "c" /[a-z]/+
                %ignore " "
            """
            program = """c b r"""
            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
                start: A A
                A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
                start: A A?
                A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
                !start: "a" empty empty "b"
                empty: empty2
                empty2:
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
                start: a b | ab
                a: "a"
                b: "b"
                ab: "ab"
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
                start: cd+ "e"
                !cd: "c"
                   | "d"
                   | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')
            assert x.data == '_ambig', x
            assert len(x.children) == 2

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative
                noun: adj? NOUN
                verb: VERB
                adj: ADJ
                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"
                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
                start: NAME+
                NAME: /\w+/
                %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })
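
        # With ambiguity='explicit', an ambiguous input yields a synthetic
        # '_ambig' node whose children are the alternative derivations, as the
        # tests above check; the default, ambiguity='resolve', picks a single
        # tree instead.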
        def test_term_ambig_resolve(self):
            grammar = r"""
                !start: NAME+
                NAME: /\w+/
                %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #         start: cd+ "e"
        #         !cd: "c"
        #            | "d"
        #            | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
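
# _make_full_earley_test above and _make_parser_test below share one pattern:
# build a TestCase subclass per lexer/parser combination, rename it, and inject
# it into the module namespace so unittest discovery picks up each combination
# as a separate test class.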

def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())
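
        # Taken together, the tests above pin down the '?rule' modifier:
        # a '?list' node is inlined into its parent only when it ends up with
        # exactly one child; with zero or several children it remains a
        # 'list' node.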
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"
                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING  : /'[^']*'/
            DOUBLE_QUOTED_STRING  : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                      ABC: "abc"i
                      """
                      )
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"
                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term
                A: "a"
                """, start='term')

            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"
            grammar = """
                start: A B | AB
                A.2: "a"
                B: "b"
                AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
                start: A B | AB
                A: "a"
                B: "b"
                AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])
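
        # The '.2'/'.3' suffixes above set terminal priority: the standard
        # lexer prefers the higher-priority terminal, which is why A.2 lets
        # 'a' + 'b' beat the longer 'ab' match in the first grammar, and
        # AB.3 forces the opposite choice in the second.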
        def test_import(self):
            grammar = """
                start: NUMBER WORD

                %import common.NUMBER
                %import common.WORD
                %import common.WS
                %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_relative_import(self):
            grammar = """
                start: NUMBER WORD

                %import .grammars.test.NUMBER
                %import common.WORD
                %import common.WS
                %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_multi_import(self):
            grammar = """
                start: NUMBER WORD

                %import common (NUMBER, WORD, WS)
                %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 toucans')
            self.assertEqual(x.children, ['12', 'toucans'])

        def test_relative_multi_import(self):
            grammar = """
                start: NUMBER WORD

                %import .grammars.test (NUMBER, WORD, WS)
                %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        def test_import_errors(self):
            grammar = """
                start: NUMBER WORD

                %import .grammars.bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

            grammar = """
                start: NUMBER WORD

                %import bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"
            grammar = """
                start: a | b
                a.1: "a"
                b.2: "a"
            """
            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
                start: a | b
                a.2: "a"
                b.1: "a"
            """
            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"
            grammar = """
                start: ab_ b_ a_ | indirection
                indirection: a_ bb_ a_
                a_: "a"
                b_: "b"
                ab_: "ab"
                bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
                start: ab_ b_ a_ | indirection
                indirection: a_ bb_ a_
                a_: "a"
                b_: "b"
                ab_.1: "ab"
                bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
                start: ab_ b_ a_ | indirection
                indirection: a_ bb_ a_
                a_.2: "a"
                b_.1: "b"
                ab_.3: "ab"
                bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
                start: ab_ b_ a_ | indirection
                indirection: a_ bb_ a_
                a_.1: "a"
                b_.1: "b"
                ab_.4: "ab"
                bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')
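
        # Under ambiguity='resolve__antiscore_sum', rule priorities count
        # *against* a derivation: of the competing parses, the one whose rule
        # priorities sum to the lowest total is kept, which is what the four
        # grammars above work through case by case.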
        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
                COMMENT: /(!|(\/\/))[^\n]*/
                %ignore COMMENT
                %import common.WS -> _WS
                %import common.INT
                start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')
            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                 """
            if PARSER != 'cyk':    # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                 """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
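
        # The '~' operator expresses ranged repetition: "A"~3 means exactly
        # three repetitions and "A"~n..m means between n and m (inclusive).
        # The two tests above exercise it for rules and terminals respectively.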
        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")    # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
                A.2: "a"
                WORD: ("a".."z")+

                start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('dynamic_complete', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)
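
# After the loops above run, this module exposes one generated class per
# combination, named "Test" + parser + lexer (e.g. TestEarleyStandard,
# TestLalrContextual), plus TestFullEarleyDynamic, alongside TestParsers.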

if __name__ == '__main__':
    unittest.main()