# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.common import GrammarError, ParseError, UnexpectedToken
from lark.lexer import LexError, UnexpectedInput
from lark.tree import Tree, Transformer

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
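
# Editor's roadmap: this module defines a few standalone tests (TestParsers
# below) plus two factories, _make_full_earley_test and _make_parser_test,
# which generate a test class for each lexer/parser combination (see _TO_TEST
# at the bottom) and register it in globals() so unittest can discover it.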

class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer=None)
        self.assertRaises(ParseError, l.parse, 'a')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )
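
    # Note on test_expand1: a '?rule' is inlined when it ends up with a single
    # child, which is why node 'a' disappears in the first case; a branch that
    # carries an alias ('-> c') keeps its node under the alias name even when
    # it has only one child.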

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )
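
# The factory below builds the full-Earley suite once per lexer mode; LEXER is
# closed over by every test method, and the generated class is renamed and
# published in globals() at the end so unittest discovers one suite per mode.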

def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon_in_scanless(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley_scanless(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley_scanless2(self):
            grammar = """
            start: statement+
            statement: "r"
                     | "c" /[a-z]/+

            %ignore " "
            """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        # XXX Fails for scanless mode
        # XXX Decided not to fix, because
        #   a) It's a subtle bug
        #   b) Scanless is intended for deprecation
        #
        # def test_earley_scanless3(self):
        #     "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
        #     grammar = """
        #     start: A A
        #     A: "a"+
        #     """
        #     l = Lark(grammar, parser='earley', lexer=LEXER)
        #     res = l.parse("aaa")
        #     self.assertEqual(res.children, ['aa', 'a'])

        def test_earley_scanless4(self):
            grammar = """
            start: A A?
            A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"
            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')
            assert x.data == '_ambig', x
            assert len(x.children) == 2

        @unittest.skipIf(LEXER is None, "BUG in scanless parsing!")  # TODO fix bug!
        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)

        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet")  # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
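
# For example, _make_full_earley_test('dynamic') registers TestFullEarleyDynamic,
# and _make_full_earley_test(None) registers TestFullEarleyScanless.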

def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit() * 2))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        @unittest.skipIf(LEXER is None, "Known bug with scanless parsing")  # TODO
        def test_token_collision2(self):
            # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
            # I probably need to re-write grammar transformation
            g = _Lark("""
                    !start: "starts"

                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                             | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                             | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing")  # TODO
        def test_token_not_anon(self):
            """Tests that "a" is matched as A, rather than an anonymous token.

            That means that "a" is not filtered out, despite being an 'immediate string'.
            Whether or not this is the intuitive behavior, I'm not sure yet.
            Perhaps the right thing to do is report a collision (if such is relevant)

            -Erez
            """

            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        @unittest.skipIf(LEXER is None, "Scanless can't handle regexps")
        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING  : /'[^']*'/
            DOUBLE_QUOTED_STRING  : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d' % i: '"%d"' % i for i in range(300)}
            g = _Lark("""start: %s
                         %s""" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"

                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        @unittest.skipIf(LEXER is None, "TODO: Fix scanless parsing or get rid of it")  # TODO
        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 0)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 1)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 1)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 1)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 1)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 2)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term

                A: "a"
                """, start='term')
            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])
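
        # Note on the test above: 'A.2' declares terminal A with priority 2.
        # When several terminals can match at the same position, the standard
        # lexer prefers the higher-priority one, which is what lets A + B beat
        # the longer single token AB in the first grammar.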

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')
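
        # Editor's reading of the expected results above (not documented API
        # behavior): with ambiguity='resolve__antiscore_sum', rule priorities
        # act as penalties, and the derivation whose priorities sum to the
        # *lowest* total wins. All four grammars are consistent with that.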

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(LEXER is None, "Scanless doesn't support regular expressions")
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")  # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
                """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        @unittest.skipIf(LEXER is None, "Scanless doesn't support regular expressions")
        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                """
            if PARSER != 'cyk':  # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER == 'earley', "Priority not handled correctly right now")  # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

    _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        (None, 'earley'),
]
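
# For example, ('standard', 'lalr') registers a class named TestLalrStandard,
# and (None, 'earley') registers TestEarleyScanless. A minimal sketch
# (hypothetical module path) of pulling the generated classes into another
# runner's namespace:
#
#     from tests.test_parsers import TestLalrStandard, TestEarleyScanless
#     import unittest
#     unittest.main()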

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in (None, 'dynamic'):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()