@@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist
 | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
 | Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
 | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
-| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
-| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
-| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |
+| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
+| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M |
 I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).
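For readers of the table: "tree-less" means LALR(1) with an embedded transformer, so no parse tree is ever materialized. A minimal sketch of that setup, assuming the `json_grammar` string and `TreeToJson` transformer defined earlier in the tutorial:

```python
from lark import Lark

# Hedged sketch: json_grammar and TreeToJson are the tutorial's earlier definitions.
# Passing transformer= makes the parse "tree-less": callbacks run during parsing.
json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
result = json_parser.parse('{"key": ["item0", 3.14, true]}')
```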
@@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective.
 This is the end of the tutorial. I hope you liked it and learned a little about Lark.

-To see what else you can do with Lark, check out the [examples](examples).
+To see what else you can do with Lark, check out the [examples](/examples).

 For questions or any other subject, feel free to email me at erezshin at gmail dot com.
@@ -33,7 +33,7 @@ class LarkOptions:
     regex: bool
     debug: bool
     keep_all_tokens: bool
-    propagate_positions: Union[bool, str]
+    propagate_positions: Union[bool, Callable]
     maybe_placeholders: bool
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]
@@ -77,7 +77,7 @@ class Lark:
         regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
-        propagate_positions: Union[bool, str] = False,
+        propagate_positions: Union[bool, Callable] = False,
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
         cache: Union[bool, str] = False,
@@ -1,4 +1,5 @@
 from warnings import warn
+from copy import deepcopy

 from .utils import Serialize
 from .lexer import TerminalDef

@@ -31,6 +32,17 @@ class LexerConf(Serialize):
     def _deserialize(self):
         self.terminals_by_name = {t.name: t for t in self.terminals}

+    def __deepcopy__(self, memo=None):
+        return type(self)(
+            deepcopy(self.terminals, memo),
+            self.re_module,
+            deepcopy(self.ignore, memo),
+            deepcopy(self.postlex, memo),
+            deepcopy(self.callbacks, memo),
+            deepcopy(self.g_regex_flags, memo),
+            deepcopy(self.skip_validation, memo),
+            deepcopy(self.use_bytes, memo),
+        )

 class ParserConf(Serialize):
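The custom `__deepcopy__` copies every field except `re_module`, which is a module object and must be shared rather than duplicated. A minimal sketch of the same pattern, using an illustrative `Conf` class rather than the real `LexerConf`:

```python
import re
from copy import deepcopy

class Conf:
    def __init__(self, terminals, re_module):
        self.terminals = terminals
        self.re_module = re_module  # a module: never deep-copy it

    def __deepcopy__(self, memo=None):
        # copy the data, share the module
        return type(self)(deepcopy(self.terminals, memo), self.re_module)

c = Conf(['NUMBER', 'WORD'], re)
c2 = deepcopy(c)
assert c2.terminals is not c.terminals and c2.re_module is re
```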
@@ -129,6 +129,8 @@ class UnexpectedInput(LarkError):
 class UnexpectedEOF(ParseError, UnexpectedInput):
     def __init__(self, expected, state=None, terminals_by_name=None):
+        super(UnexpectedEOF, self).__init__()
+
         self.expected = expected
         self.state = state
         from .lexer import Token

@@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
         self.column = -1
         self._terminals_by_name = terminals_by_name

-        super(UnexpectedEOF, self).__init__()

     def __str__(self):
         message = "Unexpected end-of-input. "
@@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                  terminals_by_name=None, considered_rules=None):
+        super(UnexpectedCharacters, self).__init__()
+
         # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column

@@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
             self.char = seq[lex_pos]
         self._context = self.get_context(seq)

-        super(UnexpectedCharacters, self).__init__()

     def __str__(self):
         message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)
@@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """
     def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
+        super(UnexpectedToken, self).__init__()
+
         # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')

@@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self._terminals_by_name = terminals_by_name
         self.token_history = token_history

-        super(UnexpectedToken, self).__init__()

     @property
     def accepts(self):
@@ -236,10 +239,10 @@ class VisitError(LarkError): | |||||
""" | """ | ||||
def __init__(self, rule, obj, orig_exc): | def __init__(self, rule, obj, orig_exc): | ||||
self.obj = obj | |||||
self.orig_exc = orig_exc | |||||
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | ||||
super(VisitError, self).__init__(message) | super(VisitError, self).__init__(message) | ||||
self.obj = obj | |||||
self.orig_exc = orig_exc | |||||
###} | ###} |
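These hunks all make the same change: `super().__init__()` now runs first (or, for `VisitError`, the attributes are set before it), so the attributes an error handler relies on are in place regardless of what the base constructor does. From the caller's side nothing changes; a usage sketch with an illustrative grammar:

```python
from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark('start: "a"+', parser='lalr')
try:
    parser.parse("aab")
except UnexpectedInput as e:
    print(e.line, e.column)      # position attributes set in __init__
    print(e.get_context("aab"))  # a snippet of the input around the error
```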
@@ -44,7 +44,7 @@ class LarkOptions(Serialize):
             Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
     propagate_positions
             Propagates (line, column, end_line, end_column) attributes into all tree branches.
-            Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees.
+            Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
     maybe_placeholders
             When ``True``, the ``[]`` operator returns ``None`` when not matched.
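To make the docstring concrete, here is what position propagation looks like from the user's side; the grammar is illustrative:

```python
from lark import Lark

grammar = r"""
start: "(" name ")"
name: WORD
%import common.WORD
"""
tree = Lark(grammar, propagate_positions=True).parse("(hi)")
name_node = tree.children[0]
print(name_node.meta.column, name_node.meta.end_column)  # 2 4
```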
@@ -162,7 +162,7 @@ class LarkOptions(Serialize): | |||||
assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) | assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) | ||||
if self.parser == 'earley' and self.transformer: | if self.parser == 'earley' and self.transformer: | ||||
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' | |||||
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' | |||||
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') | 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') | ||||
if o: | if o: | ||||
@@ -451,11 +451,11 @@ class Lark(Serialize):
             d = f
         else:
             d = pickle.load(f)
-        memo = d['memo']
+        memo_json = d['memo']
         data = d['data']

-        assert memo
-        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
+        assert memo_json
+        memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
         options = dict(data['options'])
         if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
             raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
@@ -512,11 +512,11 @@ class Lark(Serialize):
             Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
         """
-        package = FromPackageLoader(package, search_paths)
-        full_path, text = package(None, grammar_path)
+        package_loader = FromPackageLoader(package, search_paths)
+        full_path, text = package_loader(None, grammar_path)
         options.setdefault('source_path', full_path)
         options.setdefault('import_paths', [])
-        options['import_paths'].append(package)
+        options['import_paths'].append(package_loader)
         return cls(text, **options)

     def __repr__(self):
@@ -120,33 +120,33 @@ class Token(Str):
     Attributes:
         type: Name of the token (as specified in grammar)
         value: Value of the token (redundant, as ``token.value == token`` will always be true)
-        pos_in_stream: The index of the token in the text
+        start_pos: The index of the token in the text
         line: The line of the token in the text (starting with 1)
         column: The column of the token in the text (starting with 1)
         end_line: The line where the token ends
         end_column: The next column after the end of the token. For example,
             if the token is a single character with a column value of 4,
             end_column will be 5.
-        end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
+        end_pos: the index where the token ends (basically ``start_pos + len(token)``)
     """
     __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

     def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None):
         try:
-            self = super(Token, cls).__new__(cls, value)
+            inst = super(Token, cls).__new__(cls, value)
         except UnicodeDecodeError:
             value = value.decode('latin1')
-            self = super(Token, cls).__new__(cls, value)
-
-        self.type = type_
-        self.start_pos = start_pos if start_pos is not None else pos_in_stream
-        self.value = value
-        self.line = line
-        self.column = column
-        self.end_line = end_line
-        self.end_column = end_column
-        self.end_pos = end_pos
-        return self
+            inst = super(Token, cls).__new__(cls, value)
+
+        inst.type = type_
+        inst.start_pos = start_pos if start_pos is not None else pos_in_stream
+        inst.value = value
+        inst.line = line
+        inst.column = column
+        inst.end_line = end_line
+        inst.end_column = end_column
+        inst.end_pos = end_pos
+        return inst

     @property
     def pos_in_stream(self):
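With `start_pos`/`end_pos` now documented as the primary attributes (`pos_in_stream` survives as the compatibility property below), reading them looks like this; a sketch assuming the standard lexer:

```python
from lark import Lark

parser = Lark(r"""
start: WORD+
%import common.WORD
%ignore " "
""", parser='lalr', lexer='standard')

for tok in parser.lex("hello world"):
    print(tok.type, repr(tok.value), tok.start_pos, tok.end_pos)
# WORD 'hello' 0 5
# WORD 'world' 6 11
```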
@@ -214,15 +214,13 @@ class LineCounter:

 class UnlessCallback:
-    def __init__(self, mres):
-        self.mres = mres
+    def __init__(self, scanner):
+        self.scanner = scanner

     def __call__(self, t):
-        for mre, type_from_index in self.mres:
-            m = mre.match(t.value)
-            if m:
-                t.type = type_from_index[m.lastindex]
-                break
+        res = self.scanner.match(t.value, 0)
+        if res:
+            _value, t.type = res
         return t
@@ -237,6 +235,11 @@ class CallChain:
             return self.callback2(t) if self.cond(t2) else t2


+def _get_match(re_, regexp, s, flags):
+    m = re_.match(regexp, s, flags)
+    if m:
+        return m.group(0)
+
 def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
@@ -248,40 +251,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
-            if m and m.group(0) == s:
+            if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

-    terminals = [t for t in terminals if t not in embedded_strs]
-    return terminals, callback
+    new_terminals = [t for t in terminals if t not in embedded_strs]
+    return new_terminals, callback


-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
-    # Python sets an unreasonable group limit (currently 100) in its re module
-    # Worse, the only way to know we reached it is by catching an AssertionError!
-    # This function recursively tries less and less groups until it's successful.
-    postfix = '$' if match_whole else ''
-    mres = []
-    while terminals:
-        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
-        if use_bytes:
-            pattern = pattern.encode('latin-1')
-        try:
-            mre = re_.compile(pattern, g_regex_flags)
-        except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
-        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
-        terminals = terminals[max_size:]
-    return mres
-
-def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
+class Scanner:
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+        self.terminals = terminals
+        self.g_regex_flags = g_regex_flags
+        self.re_ = re_
+        self.use_bytes = use_bytes
+        self.match_whole = match_whole
+
+        self.allowed_types = {t.name for t in self.terminals}
+
+        self._mres = self._build_mres(terminals, len(terminals))
+
+    def _build_mres(self, terminals, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        postfix = '$' if self.match_whole else ''
+        mres = []
+        while terminals:
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            if self.use_bytes:
+                pattern = pattern.encode('latin-1')
+            try:
+                mre = self.re_.compile(pattern, self.g_regex_flags)
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(terminals, max_size//2)
+            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
+            terminals = terminals[max_size:]
+        return mres
+
+    def match(self, text, pos):
+        for mre, type_from_index in self._mres:
+            m = mre.match(text, pos)
+            if m:
+                return m.group(0), type_from_index[m.lastindex]


 def _regexp_has_newline(r):
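`Scanner` packages the old `build_mres` trick behind an object: all terminals are compiled into one alternation of named groups, and `m.lastindex` plus the inverted `groupindex` map recovers which terminal matched. The batching (the `max_size//2` retry) exists only because `re` fails once a pattern has too many groups, as the comment in the diff notes. A standalone sketch of the core trick, with illustrative terminal names:

```python
import re

# One alternation of named groups, like Scanner._build_mres builds
patterns = {'NUMBER': r'\d+', 'WORD': r'[a-z]+'}
mre = re.compile('|'.join('(?P<%s>%s)' % kv for kv in patterns.items()))
index_to_name = {i: n for n, i in mre.groupindex.items()}

m = mre.match("hello123", 0)
if m:
    print(m.group(0), index_to_name[m.lastindex])  # hello WORD
```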
@@ -341,9 +358,9 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name

-        self._mres = None
+        self._scanner = None

-    def _build(self):
+    def _build_scanner(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
@@ -354,19 +371,16 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

     @property
-    def mres(self):
-        if self._mres is None:
-            self._build()
-        return self._mres
+    def scanner(self):
+        if self._scanner is None:
+            self._build_scanner()
+        return self._scanner

     def match(self, text, pos):
-        for mre, type_from_index in self.mres:
-            m = mre.match(text, pos)
-            if m:
-                return m.group(0), type_from_index[m.lastindex]
+        return self.scanner.match(text, pos)

     def lex(self, state, parser_state):
         with suppress(EOFError):
@@ -378,7 +392,7 @@ class TraditionalLexer(Lexer):
         while line_ctr.char_pos < len(lex_state.text):
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
-                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
@@ -23,54 +23,59 @@ class ExpandSingleChild:

 class PropagatePositions:
-    def __init__(self, node_builder):
+    def __init__(self, node_builder, node_filter=None):
         self.node_builder = node_builder
+        self.node_filter = node_filter

     def __call__(self, children):
         res = self.node_builder(children)

-        # local reference to Tree.meta reduces number of presence checks
         if isinstance(res, Tree):
-            res_meta = res.meta
+            # Calculate positions while the tree is streaming, according to the rule:
+            # - nodes start at the start of their first child's container,
+            #   and end at the end of their last child's container.
+            # Containers are nodes that take up space in text, but have been inlined in the tree.

-            src_meta = self._pp_get_meta(children)
-            if src_meta is not None:
-                res_meta.line = src_meta.line
-                res_meta.column = src_meta.column
-                res_meta.start_pos = src_meta.start_pos
-                res_meta.empty = False
+            res_meta = res.meta

-            src_meta = self._pp_get_meta(reversed(children))
-            if src_meta is not None:
-                res_meta.end_line = src_meta.end_line
-                res_meta.end_column = src_meta.end_column
-                res_meta.end_pos = src_meta.end_pos
-                res_meta.empty = False
+            first_meta = self._pp_get_meta(children)
+            if first_meta is not None:
+                if not hasattr(res_meta, 'line'):
+                    # meta was already set, probably because the rule has been inlined (e.g. `?rule`)
+                    res_meta.line = getattr(first_meta, 'container_line', first_meta.line)
+                    res_meta.column = getattr(first_meta, 'container_column', first_meta.column)
+                    res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
+                    res_meta.empty = False
+
+                res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line)
+                res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column)
+
+            last_meta = self._pp_get_meta(reversed(children))
+            if last_meta is not None:
+                if not hasattr(res_meta, 'end_line'):
+                    res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+                    res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
+                    res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
+                    res_meta.empty = False
+
+                res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+                res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)

         return res

     def _pp_get_meta(self, children):
         for c in children:
+            if self.node_filter is not None and not self.node_filter(c):
+                continue
             if isinstance(c, Tree):
                 if not c.meta.empty:
                     return c.meta
             elif isinstance(c, Token):
                 return c

-class PropagatePositions_IgnoreWs(PropagatePositions):
-    def _pp_get_meta(self, children):
-        for c in children:
-            if isinstance(c, Tree):
-                if not c.meta.empty:
-                    return c.meta
-            elif isinstance(c, Token):
-                if c and not c.isspace():  # Disregard whitespace-only tokens
-                    return c
-

 def make_propagate_positions(option):
-    if option == "ignore_ws":
-        return PropagatePositions_IgnoreWs
+    if callable(option):
+        return partial(PropagatePositions, node_filter=option)
     elif option is True:
         return PropagatePositions
     elif option is False:
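`PropagatePositions_IgnoreWs` is gone, but its behavior is recoverable through the new `node_filter` callable, which `_pp_get_meta` now consults for every child. A sketch of the old `ignore_ws` semantics expressed as a filter (`grammar` is illustrative):

```python
from lark import Lark, Token

def not_ws(node):
    # Disregard whitespace-only tokens, as the removed "ignore_ws" mode did
    return not (isinstance(node, Token) and node.isspace())

parser = Lark(grammar, propagate_positions=not_ws)
```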
@@ -39,8 +39,7 @@ class MakeParsingFrontend:
         lexer_conf.lexer_type = self.lexer_type
         return ParsingFrontend(lexer_conf, parser_conf, options)

-    @classmethod
-    def deserialize(cls, data, memo, lexer_conf, callbacks, options):
+    def deserialize(self, data, memo, lexer_conf, callbacks, options):
         parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
         parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
         parser_conf.callbacks = callbacks
@@ -92,26 +91,26 @@ class ParsingFrontend(Serialize):

     def _verify_start(self, start=None):
         if start is None:
-            start = self.parser_conf.start
-            if len(start) > 1:
-                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
-            start ,= start
+            start_decls = self.parser_conf.start
+            if len(start_decls) > 1:
+                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
+            start ,= start_decls
         elif start not in self.parser_conf.start:
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start

     def parse(self, text, start=None, on_error=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
         kw = {} if on_error is None else {'on_error': on_error}
-        return self.parser.parse(stream, start, **kw)
+        return self.parser.parse(stream, chosen_start, **kw)

     def parse_interactive(self, text=None, start=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
-        return self.parser.parse_interactive(stream, start)
+        return self.parser.parse_interactive(stream, chosen_start)


 def get_frontend(parser, lexer):
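The renames (`start_decls`, `chosen_start`) only keep the incoming argument apart from the resolved rule; the behavior being guarded is the multiple-start-rules feature, which is used like this (rule names illustrative):

```python
from lark import Lark

parser = Lark(r"""
a: "x"
b: "y"
""", parser='lalr', start=['a', 'b'])

print(parser.parse("x", start='a'))  # Tree('a', [])
# parser.parse("x")  # raises ConfigurationError: must specify which start rule to parse
```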
@@ -178,8 +178,8 @@ class _Parser(object):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-            return state.feed_token(token, True)
+            end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(end_token, True)
         except UnexpectedInput as e:
             try:
                 e.interactive_parser = InteractiveParser(self, state, state.lexer)
@@ -73,14 +73,13 @@ class Serialize(object):
         fields = getattr(self, '__serialize_fields__')
         res = {f: _serialize(getattr(self, f), memo) for f in fields}
         res['__type__'] = type(self).__name__
-        postprocess = getattr(self, '_serialize', None)
-        if postprocess:
-            postprocess(res, memo)
+        if hasattr(self, '_serialize'):
+            self._serialize(res, memo)
         return res

     @classmethod
     def deserialize(cls, data, memo):
-        namespace = getattr(cls, '__serialize_namespace__', {})
+        namespace = getattr(cls, '__serialize_namespace__', [])
         namespace = {c.__name__:c for c in namespace}

         fields = getattr(cls, '__serialize_fields__')
@@ -94,9 +93,10 @@ class Serialize(object):
                 setattr(inst, f, _deserialize(data[f], namespace, memo))
             except KeyError as e:
                 raise KeyError("Cannot find key for class", cls, e)
-        postprocess = getattr(inst, '_deserialize', None)
-        if postprocess:
-            postprocess()
+
+        if hasattr(inst, '_deserialize'):
+            inst._deserialize()
+
         return inst
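Both `Serialize` hunks replace the fetch-then-call dance with a plain `hasattr` check on the optional hook. A minimal sketch of this optional-hook pattern, with illustrative class names:

```python
class Base:
    def serialize(self):
        res = {'__type__': type(self).__name__}
        if hasattr(self, '_serialize'):  # optional post-processing hook
            self._serialize(res)
        return res

class MyConf(Base):
    def __init__(self, flags):
        self.flags = flags

    def _serialize(self, res):  # subclasses opt in by defining the hook
        res['flags'] = sorted(self.flags)

print(MyConf({'b', 'a'}).serialize())  # {'__type__': 'MyConf', 'flags': ['a', 'b']}
```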
@@ -241,17 +241,6 @@ except ImportError: | |||||
pass | pass | ||||
try: | |||||
compare = cmp | |||||
except NameError: | |||||
def compare(a, b): | |||||
if a == b: | |||||
return 0 | |||||
elif a > b: | |||||
return 1 | |||||
return -1 | |||||
class Enumerator(Serialize): | class Enumerator(Serialize): | ||||
def __init__(self): | def __init__(self): | ||||
self.enums = {} | self.enums = {} | ||||
@@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase):
         r = g.parse('a')
         self.assertEqual( r.children[0].meta.line, 1 )

+    def test_propagate_positions2(self):
+        g = Lark("""start: a
+                    a: b
+                    ?b: "(" t ")"
+                    !t: "t"
+                 """, propagate_positions=True)
+
+        start = g.parse("(t)")
+        a ,= start.children
+        t ,= a.children
+        assert t.children[0] == "t"
+
+        assert t.meta.column == 2
+        assert t.meta.end_column == 3
+
+        assert start.meta.column == a.meta.column == 1
+        assert start.meta.end_column == a.meta.end_column == 4
+
     def test_expand1(self):
         g = Lark("""start: a