
Merge branch 'regex'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.9.0
Erez Sh 4 years ago
commit e5db096b9f
13 changed files with 153 additions and 51 deletions
  1. .github/workflows/tests.yml    +1   -1
  2. docs/classes.md                +25  -1
  3. lark-stubs/lark.pyi            +2   -0
  4. lark-stubs/lexer.pyi           +4   -1
  5. lark/lark.py                   +22  -3
  6. lark/lexer.py                  +18  -16
  7. lark/load_grammar.py           +8   -6
  8. lark/parser_frontends.py       +20  -15
  9. lark/utils.py                  +19  -4
 10. setup.py                       +8   -1
 11. test-requirements.txt          +1   -0
 12. tests/test_parser.py           +24  -2
 13. tox.ini                        +1   -1

+1 -1    .github/workflows/tests.yml

@@ -22,7 +22,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r nearley-requirements.txt
+        pip install -r test-requirements.txt
     - name: Run tests
       run: |
         python -m tests

+25 -1   docs/classes.md

@@ -70,6 +70,8 @@ Useful for caching and multiprocessing.
 
 **g_regex_flags** - Flags that are applied to all terminals (both regex and strings)
 
+**regex** - Use the `regex` library instead of the built-in `re` module (see below)
+
 **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
 
 **cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
@@ -94,13 +96,35 @@ Useful for caching and multiprocessing.
 - "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
 - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
 
-#### Domain Specific
+#### Misc.
 
 - **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
 
+
+#### Using Unicode character classes with `regex`
+Python's built-in `re` module has a few persistent known bugs and also doesn't support
+advanced regex features such as Unicode character classes.
+With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
+and can act as a drop-in replacement for `re`.
+
+Any instance of `Lark` instantiated with `regex=True` will use the `regex` module
+instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers:
+```python
+>>> from lark import Lark
+>>> g = Lark(r"""
+                    ?start: NAME
+                    NAME: ID_START ID_CONTINUE*
+                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+                """, regex=True)
+
+>>> g.parse('வணக்கம்')
+'வணக்கம்'
+
+```
 ----
 
 ## Tree
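If the optional dependency is missing, `Lark(regex=True)` fails at construction time rather than silently falling back (see the change to `lark/lark.py` further down). A minimal sketch of that failure path, assuming `regex` is *not* installed; the grammar here is illustrative:

```python
from lark import Lark

try:
    Lark("start: /a+/", regex=True)
except ImportError as err:
    # Raised by Lark itself when `regex=True` but the `regex` package cannot be imported.
    print(err)  # `regex` module must be installed if calling `Lark(regex=True)`.
```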


+2 -0    lark-stubs/lark.pyi

@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,


+4 -1    lark-stubs/lexer.pyi

@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
+from types import ModuleType
 from typing import (
     TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional,
     Pattern as REPattern,
@@ -107,10 +107,12 @@ class TraditionalLexer(Lexer):
     user_callbacks: Dict[str, _Callback]
     callback: Dict[str, _Callback]
     mres: List[Tuple[REPattern, Dict[int, str]]]
+    re: ModuleType
 
     def __init__(
         self,
         terminals: Collection[TerminalDef],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,
         g_regex_flags: int = ...
@@ -135,6 +137,7 @@ class ContextualLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         states: Dict[str, Collection[str]],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,


+22 -3   lark/lark.py

@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule
 
+import re
+try:
+    import regex
+except ImportError:
+    regex = None
+
 ###{standalone
 
 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator,
                 and returns no value at all.
             (default=`False`. Recommended to set to `True`)
+        regex - When True, uses the `regex` module instead of the stdlib `re`.
         cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
             LALR only for now.
             When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):
 
         self.options = LarkOptions(options)
 
+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
+
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name
@@ -225,7 +243,7 @@ class Lark(Serialize):
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -286,7 +304,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
 
     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -313,10 +331,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = '<deserialized>'
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self
 
     @classmethod
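The selection logic above exposes the chosen module as `Lark.re`, which `load_grammar`, the parser frontends and the lexers then use for all pattern compilation. A short usage sketch (not part of the commit; assumes the `regex` package is installed, and the grammar is illustrative):

```python
import regex
from lark import Lark

parser = Lark(r"""
    ?start: WORD
    WORD: /\p{Lu}\p{Ll}+/
""", regex=True)

assert parser.re is regex      # terminals are compiled with `regex`, not the stdlib `re`
print(parser.parse("Hello"))   # prints: Hello (a Token of type WORD)
```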


+18 -16  lark/lexer.py

@@ -230,7 +230,7 @@ class CallChain:
 
 
 
-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals, g_regex_flags):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
 
-def _build_mres(terminals, max_size, g_regex_flags, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole):
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
 
         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, g_regex_flags, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
+def build_mres(terminals, g_regex_flags, re_, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -294,16 +294,17 @@ class Lexer(object):
 
 class TraditionalLexer(Lexer):
 
-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
         terminals = list(terminals)
 
+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
             if t.pattern.min_width == 0:
@@ -321,7 +322,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)
 
     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():
@@ -331,7 +332,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f
 
-        self.mres = build_mres(terminals, g_regex_flags)
+        self.mres = build_mres(terminals, g_regex_flags, self.re)
 
     def match(self, stream, pos):
         for mre, type_from_index in self.mres:
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
 
 class ContextualLexer(Lexer):
 
-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -362,12 +364,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
 
     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
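The comment in `_build_mres` above describes the workaround it implements for the stdlib group limit: compile the terminal alternation in batches and halve the batch size whenever compilation fails. A standalone sketch of the same strategy, with hypothetical names (this is not lark's API, only the idea):

```python
import re

def compile_in_batches(named_patterns, max_size):
    """Compile '(?P<name>exp)|...' alternations, shrinking the batch size
    whenever the interpreter rejects the number of named groups."""
    compiled = []
    patterns = list(named_patterns)
    while patterns:
        batch = patterns[:max_size]
        source = '|'.join('(?P<%s>%s)' % (name, exp) for name, exp in batch)
        try:
            compiled.append(re.compile(source))
        except AssertionError:
            # Older CPython signalled the ~100-group limit this way (see the hunk above).
            return compile_in_batches(patterns, max_size // 2)
        patterns = patterns[max_size:]
    return compiled

# 200 trivial terminals would exceed the historical 100-group limit in a single pattern.
terminals = [('T%d' % i, 'a{%d}' % (i + 1)) for i in range(200)]
mres = compile_in_batches(terminals, len(terminals))
print(len(mres))  # 1 on interpreters without a group limit; several batches on older ones
```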


+8 -6    lark/load_grammar.py

@@ -616,7 +616,7 @@ class Grammar:
 
 
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
                 joined_path = os.path.join(import_path, grammar_path)
                 with open(joined_path, encoding='utf8') as f:
                     text = f.read()
-                grammar = load_grammar(text, joined_path)
+                grammar = load_grammar(text, joined_path, re_)
                 _imported_grammars[grammar_path] = grammar
                 break
         else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
 
 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
 
         self.canonize_tree = CanonizeTree()
 
@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:
 
 
 
-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)

+20 -15  lark/parser_frontends.py

@@ -1,4 +1,3 @@
-import re
 from functools import partial
 
 from .utils import get_regexp_width, Serialize
@@ -63,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()
@@ -88,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
 
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 
 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
         self.init_lexer()
 
@@ -110,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,
@@ -126,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
 
 def tokenize_text(text):
@@ -143,8 +146,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 
 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'
@@ -156,7 +159,9 @@ class Earley(WithLexer):
 
 
 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start
 
@@ -188,7 +193,7 @@ class XEarley(_ParserFrontend):
             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
 
-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
         return self._parse(text, start)
@@ -201,8 +206,8 @@ class XEarley_CompleteLex(XEarley):
 
 class CYK(WithLexer):
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)


+19 -4   lark/utils.py

@@ -165,16 +165,31 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))
 
 import sre_parse
 import sre_constants
-def get_regexp_width(regexp):
+categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
+def get_regexp_width(expr):
+    if regex:
+        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
+        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
+        # match here below.
+        regexp_final = re.sub(categ_pattern, 'A', expr)
+    else:
+        if re.search(categ_pattern, expr):
+            raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
+        regexp_final = expr
     try:
-        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
+        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
     except sre_constants.error:
-        raise ValueError(regexp)
+        raise ValueError(expr)
 
 ###}
 
@@ -182,7 +197,7 @@ def get_regexp_width(regexp):
 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
        preserving the original order of the list. Assumes that
-       the list entrie are hashable."""
+       the list entries are hashable."""
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]
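The new `get_regexp_width` relies on the small trick spelled out in its comment: `sre_parse` cannot handle `\p{...}` Unicode categories, so each category escape is replaced by a plain letter before the width is measured, which leaves the possible match lengths unchanged. A stdlib-only sketch of that substitution (illustrative, not lark's exact code path):

```python
import re
import sre_parse

# Same shape as the `categ_pattern` added above.
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')

def width_of(expr):
    # Swap each \p{...} category for a single ordinary character, then let
    # sre_parse report the (min, max) width of a possible match.
    simplified = categ_pattern.sub('A', expr)
    return tuple(int(x) for x in sre_parse.parse(simplified).getwidth())

print(width_of(r'[\p{Lu}\p{Ll}]\p{Nd}?'))  # (1, 2): one letter plus an optional digit
```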




+8 -1    setup.py

@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from setuptools import find_packages, setup
 
 __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
@@ -11,6 +14,10 @@ setup(
     requires = [],
     install_requires = [],
 
+    extras_require = {
+        "regex": ["regex"]
+    },
+
     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},
 
     test_suite = 'tests.__main__',


+1 -0    nearley-requirements.txt → test-requirements.txt

@@ -1 +1,2 @@
 Js2Py==0.68
+regex

+24 -2   tests/test_parser.py

@@ -20,6 +20,11 @@ from io import (
 
 logging.basicConfig(level=logging.INFO)
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 from lark.lark import Lark
 from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
 from lark.tree import Tree
@@ -548,8 +553,8 @@ class CustomLexer(Lexer):
     Purpose of this custom lexer is to test the integration,
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf, re_):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
 
@@ -1784,6 +1789,23 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(a.line, 1)
             self.assertEqual(b.line, 2)
 
+        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
+        def test_unicode_class(self):
+            "Tests that character classes from the `regex` module work correctly."
+            g = _Lark(r"""?start: NAME
+                          NAME: ID_START ID_CONTINUE*
+                          ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                          ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
+
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
+        def test_unicode_word(self):
+            "Tests that a pattern affected by a persistent bug in the `re` module works when `regex` is enabled."
+            g = _Lark(r"""?start: NAME
+                          NAME: /[\w]+/
+                          """, regex=True)
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
 
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME


+1 -1    tox.ini

@@ -14,7 +14,7 @@ pypy3 = pypy3
 [testenv]
 whitelist_externals = git
 deps =
-    -rnearley-requirements.txt
+    -rtest-requirements.txt
 
 # to always force recreation and avoid unexpected side effects
 recreate=True

