@@ -22,7 +22,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r nearley-requirements.txt
+          pip install -r test-requirements.txt
       - name: Run tests
         run: |
           python -m tests
@@ -70,6 +70,8 @@ Useful for caching and multiprocessing.
 **g_regex_flags** - Flags that are applied to all terminals (both regex and strings)
 
+**regex** - Use the `regex` library instead of the built-in `re` module (See below)
+
 **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
 
 **cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
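Editor's note, not part of the diff: a minimal sketch of how two of the options above combine, using a toy grammar invented for illustration. `g_regex_flags` is applied to every terminal, and `keep_all_tokens` keeps the anonymous `";"` token that the tree builder would otherwise drop.

```python
import re
from lark import Lark

# Toy grammar, invented for this illustration only.
parser = Lark(r"""
    start: NAME ";"
    NAME: /[a-z]+/
    %ignore " "
""", parser='lalr', g_regex_flags=re.IGNORECASE, keep_all_tokens=True)

# IGNORECASE lets NAME match "Hello"; keep_all_tokens keeps the ";" token in the tree.
print(parser.parse("Hello ;").pretty())
```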
@@ -94,13 +96,35 @@ Useful for caching and multiprocessing.
 - "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
 - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
-#### Domain Specific
+#### Misc.
 
 - **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
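Since `lexer_callbacks` is flagged as easy to misuse, here is a hedged sketch (editor's illustration, not from the diff): the callback receives every NAME token and hands back a replacement built with `Token.new_borrow_pos`, so position information is preserved.

```python
from lark import Lark, Token

def upper_name(tok):
    # Return a replacement token with the same position info but upper-cased text.
    return Token.new_borrow_pos(tok.type, tok.value.upper(), tok)

parser = Lark(r"""
    start: NAME+
    NAME: /[a-z]+/
    %ignore " "
""", parser='lalr', lexer_callbacks={'NAME': upper_name})

print(parser.parse("ab cd").children)   # expect the upper-cased values 'AB' and 'CD'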
+#### Using Unicode character classes with `regex`
+
+Python's builtin `re` module has a few persistent known bugs, and it doesn't support
+advanced regex features such as Unicode character classes (`\p{...}`).
+With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
+and can act as a drop-in replacement for `re`.
+
+Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module
+instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers:
+
+```python
+>>> from lark import Lark
+>>> g = Lark(r"""
+                ?start: NAME
+                NAME: ID_START ID_CONTINUE*
+                ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+             """, regex=True)
+>>> g.parse('வணக்கம்')
+'வணக்கம்'
+```
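One behaviour worth noting, grounded in the `lark.py` hunk further down this diff: if `regex=True` is requested but the `regex` package is not installed, `Lark` raises `ImportError` rather than silently falling back to the stdlib `re`. A small sketch of what that looks like:

```python
from lark import Lark

# Sketch only: in an environment without the `regex` package, this raises
# instead of quietly using the stdlib `re` (see the `raise ImportError(...)` below).
try:
    Lark(r"start: /[a-z]+/", regex=True)
except ImportError as err:
    print(err)
```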
 ----
 
 ## Tree
@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool

@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+from types import ModuleType
 from typing import (
     TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional,
     Pattern as REPattern,

@@ -107,10 +107,12 @@ class TraditionalLexer(Lexer):
     user_callbacks: Dict[str, _Callback]
     callback: Dict[str, _Callback]
     mres: List[Tuple[REPattern, Dict[int, str]]]
+    re: ModuleType
 
     def __init__(
         self,
         terminals: Collection[TerminalDef],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,
         g_regex_flags: int = ...

@@ -135,6 +137,7 @@ class ContextualLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         states: Dict[str, Collection[str]],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule
 
+import re
+try:
+    import regex
+except ImportError:
+    regex = None
+
 ###{standalone
 
 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator,
                 and returns no value at all.
             (default=`False`. Recommended to set to `True`)
+    regex - When True, uses the `regex` module instead of the stdlib `re`.
     cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
             LALR only for now.
         When `False`, does nothing (default)
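Editor's sketch, not part of the diff, of the `maybe_placeholders` behaviour described in the docstring above, using an invented three-terminal grammar: with `True` the unmatched `[B]` shows up as `None`, with `False` the slot is simply omitted.

```python
from lark import Lark

grammar = r"""
start: A [B] C
A: "a"
B: "b"
C: "c"
"""

print(Lark(grammar, maybe_placeholders=True).parse("ac").children)
# roughly: [Token('A', 'a'), None, Token('C', 'c')]
print(Lark(grammar, maybe_placeholders=False).parse("ac").children)
# roughly: [Token('A', 'a'), Token('C', 'c')]
```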
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,

@@ -154,6 +162,16 @@ class Lark(Serialize):
         self.options = LarkOptions(options)
 
+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
+
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name

@@ -225,7 +243,7 @@
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

@@ -286,7 +304,7 @@
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
 
     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])

@@ -313,10 +331,11 @@
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = '<deserialized>'
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self
 
     @classmethod
@@ -230,7 +230,7 @@ class CallChain:
 
-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()

@@ -241,19 +241,19 @@ def _create_unless(terminals, g_regex_flags):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
-def _build_mres(terminals, max_size, g_regex_flags, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.

@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole):
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
 
         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, g_regex_flags, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
+def build_mres(terminals, g_regex_flags, re_, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:

@@ -294,16 +294,17 @@ class Lexer(object):
 class TraditionalLexer(Lexer):
 
-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
         terminals = list(terminals)
 
+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
             if t.pattern.min_width == 0:

@@ -321,7 +322,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)
 
     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():

@@ -331,7 +332,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f
 
-        self.mres = build_mres(terminals, g_regex_flags)
+        self.mres = build_mres(terminals, g_regex_flags, self.re)
 
     def match(self, stream, pos):
         for mre, type_from_index in self.mres:

@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
 class ContextualLexer(Lexer):
 
-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t

@@ -362,12 +364,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
 
     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
@@ -616,7 +616,7 @@ class Grammar:
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:

@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
             joined_path = os.path.join(import_path, grammar_path)
             with open(joined_path, encoding='utf8') as f:
                 text = f.read()
-            grammar = load_grammar(text, joined_path)
+            grammar = load_grammar(text, joined_path, re_)
             _imported_grammars[grammar_path] = grammar
             break
         else:

@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
 
 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]

@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
 
         self.canonize_tree = CanonizeTree()

@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td

@@ -942,4 +943,5 @@ class GrammarLoader:
 
-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)
@@ -1,4 +1,3 @@
-import re
 from functools import partial
 
 from .utils import get_regexp_width, Serialize

@@ -63,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()

@@ -88,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
 
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 
 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
         self.init_lexer()

@@ -110,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,

@@ -126,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
 def tokenize_text(text):

@@ -143,8 +146,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 
 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'

@@ -156,7 +159,9 @@
 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start

@@ -188,7 +193,7 @@ class XEarley(_ParserFrontend):
             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
         return self._parse(text, start)

@@ -201,8 +206,8 @@ class XEarley_CompleteLex(XEarley):
 class CYK(WithLexer):
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)
@@ -165,16 +165,31 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))
 
 import sre_parse
 import sre_constants
-def get_regexp_width(regexp):
+categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
+def get_regexp_width(expr):
+    if regex:
+        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
+        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
+        # match here below.
+        regexp_final = re.sub(categ_pattern, 'A', expr)
+    else:
+        if re.search(categ_pattern, expr):
+            raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
+        regexp_final = expr
     try:
-        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
+        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
     except sre_constants.error:
-        raise ValueError(regexp)
+        raise ValueError(expr)
 
 ###}

@@ -182,7 +197,7 @@ def get_regexp_width(regexp):
 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
     preserving the original order of the list. Assumes that
-    the list entrie are hashable."""
+    the list entries are hashable."""
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]
@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from setuptools import find_packages, setup
 
 __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())

@@ -11,6 +14,10 @@ setup(
     requires = [],
     install_requires = [],
 
+    extras_require = {
+        "regex": ["regex"]
+    },
+
     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},
 
     test_suite = 'tests.__main__',
@@ -1 +1,2 @@
 Js2Py==0.68
+regex
@@ -20,6 +20,11 @@ from io import (
 logging.basicConfig(level=logging.INFO)
 
+try:
+    import regex
+except ImportError:
+    regex = None
+
 from lark.lark import Lark
 from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
 from lark.tree import Tree

@@ -548,8 +553,8 @@ class CustomLexer(Lexer):
     Purpose of this custom lexer is to test the integration,
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf, re_):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
@@ -1784,6 +1789,23 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(a.line, 1)
             self.assertEqual(b.line, 2)
 
+        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
+        def test_unicode_class(self):
+            "Tests that character classes from the `regex` module work correctly."
+            g = _Lark(r"""?start: NAME
+                           NAME: ID_START ID_CONTINUE*
+                           ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                           ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
+        def test_unicode_word(self):
+            "Tests that a pattern affected by a persistent bug in the `re` module works when `regex` is enabled."
+            g = _Lark(r"""?start: NAME
+                           NAME: /[\w]+/
+                           """, regex=True)
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME
@@ -14,7 +14,7 @@ pypy3 = pypy3
 [testenv]
 whitelist_externals = git
 deps =
-    -rnearley-requirements.txt
+    -rtest-requirements.txt
 
 # to always force recreation and avoid unexpected side effects
 recreate=True