Browse Source

Merge branch 'regex' of https://github.com/julienmalard/lark into julienmalard-regex

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.9.0
Erez Sh 5 years ago
parent
commit
1ef0e1832e
6 changed files with 27 additions and 50 deletions
  1. +1
    -0
      lark-stubs/lexer.pyi
  2. +7
    -7
      lark/lexer.py
  3. +1
    -1
      tests/__main__.py
  4. +17
    -0
      tests/test_parser.py
  5. +0
    -37
      tests/test_regex.py
  6. +1
    -5
      tox.ini

+ 1
- 0
lark-stubs/lexer.pyi View File

@@ -107,6 +107,7 @@ class TraditionalLexer(Lexer):
user_callbacks: Dict[str, _Callback]
callback: Dict[str, _Callback]
mres: List[Tuple[REPattern, Dict[int, str]]]
re: ModuleType

def __init__(
self,


+ 7
- 7
lark/lexer.py View File

@@ -247,13 +247,13 @@ def _create_unless(terminals, g_regex_flags, re_):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole):
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole):
mres = []
while terminals:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres

def build_mres(terminals, g_regex_flags, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
def build_mres(terminals, g_regex_flags, re_, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)

def _regexp_has_newline(r):
r"""Expressions that may indicate newlines in a regexp:
@@ -332,7 +332,7 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self.mres = build_mres(terminals, g_regex_flags)
self.mres = build_mres(terminals, g_regex_flags, self.re)

def match(self, stream, pos):
for mre, type_from_index in self.mres:


+ 1
- 1
tests/__main__.py View File

@@ -7,7 +7,7 @@ from .test_trees import TestTrees
from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_reconstructor import TestReconstructor
from .test_regex import TestRegex
try:
from .test_nearley.test_nearley import TestNearley
except ImportError:


+ 17
- 0
tests/test_parser.py View File

@@ -1787,6 +1787,23 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)

@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_class(self):
"Tests that character classes from the `regex` module work correctly."
g = _Lark(r"""?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)

self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_word(self):
"Tests that a persistent bug in the `re` module works when `regex` is enabled."
g = _Lark(r"""?start: NAME
NAME: /[\w]+/
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME


+ 0
- 37
tests/test_regex.py View File

@@ -1,37 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

import logging
import sys
import unittest

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark


class TestRegex(unittest.TestCase):
@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_class(self):
"Tests that character classes from the `regex` module work correctly."
g = Lark(r"""
?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
""", regex=True)

self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_word(self):
"Tests that a persistent bug in the `re` module works when `regex` is enabled."
g = Lark(r"""
?start: NAME
NAME: /[\w]+/
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


if __name__ == '__main__':
unittest.main()

+ 1
- 5
tox.ini View File

@@ -14,11 +14,7 @@ pypy3 = pypy3
[testenv]
whitelist_externals = git
deps =
-rnearley-requirements.txt
-rregex-requirements.txt

# For regex testing
extras = regex
-rtest-requirements.txt

# to always force recreation and avoid unexpected side effects
recreate=True


Loading…
Cancel
Save