# -*- coding: utf-8 -*-
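"""Tests for the Lark parser.

Covers tree construction, ambiguity handling, and grammar features across
the supported lexer/parser combinations.
"""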
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor
from lark.parsers.earley import ApplyCallbacks

__path__ = os.path.dirname(__file__)
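# Helper: read a data file (e.g. a test grammar) relative to this module.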
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
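
# Tests that target specific parser configurations directly, rather than
# running once per (lexer, parser) combination like the generated suites below.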
class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
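
# Factory: builds a TestCase exercising the full Earley parser (including
# explicit ambiguity handling) for the given lexer, and registers it in
# globals() so unittest can discover it.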
def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+

            statement: "r"
                     | "c" /[a-z]/+

            %ignore " "
            """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!

            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!

            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            root_symbol = parser.parse('ab')
            ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
            print(ambig_tree.pretty())
            self.assertEqual( ambig_tree.data, '_ambig')
            self.assertEqual( len(ambig_tree.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"

            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            root_symbol = l.parse('cde')
            ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go()
            print(ambig_tree.pretty())
            # tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree)

            assert ambig_tree.data == '_ambig', ambig_tree
            assert len(ambig_tree.children) == 2

        @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
        def test_ambiguity2(self):
            grammar = """
            ANY: /[a-zA-Z0-9 ]+/
            a.2: "A" b+
            b.2: "B"
            c: ANY

            start: (a|c)*
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse('ABX')
            expected = Tree('start', [
                    Tree('a', [
                        Tree('b', [])
                    ]),
                    Tree('c', [
                        'X'
                    ])
                ])
            self.assertEqual(res, expected)

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            root_symbol = parser.parse('fruit flies like bananas')
            tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
            # tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)
            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(tree, expected)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            root_symbol = parser.parse(text)
            ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol).go()
            tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)

            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
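
# Factory: builds a TestCase for one (lexer, parser) combination. The _Lark
# helper pins that combination so every test in the class shares it.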
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit() * 2))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"

                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                               | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                               | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])
            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"

                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term

                A: "a"
                """, start='term')
            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_relative_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import common (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 toucans')
            self.assertEqual(x.children, ['12', 'toucans'])

        def test_relative_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        def test_import_errors(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

            grammar = """
            start: NUMBER WORD

            %import bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
                """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                 """
            if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                 """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

        def test_line_counting(self):
            p = _Lark("start: /[^x]+/")

            text = 'hello\nworld'
            t = p.parse(text)
            tok = t.children[0]
            self.assertEqual(tok, text)
            self.assertEqual(tok.line, 1)
            self.assertEqual(tok.column, 1)
            if LEXER != 'dynamic':
                self.assertEqual(tok.end_line, 2)
                self.assertEqual(tok.end_column, 6)

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
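# The (lexer, parser) combinations for which test classes are generated.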
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('dynamic_complete', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]
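
# Generate and register a TestCase subclass for each combination above.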
for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()