@@ -411,6 +411,8 @@ I measured memory consumption using a little script called [memusg](https://gist | |||||
I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1). | I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1). | ||||
These benchmarks are for Lark's alpha version. I already have several optimizations planned that will significantly improve run-time speed. | |||||
Once again, shout-out to PyPy for being so effective. | Once again, shout-out to PyPy for being so effective. | ||||
## Afterword | ## Afterword | ||||
@@ -101,7 +101,7 @@ Tokens that *will* appear in the tree are: | |||||
## Shaping the tree | ## Shaping the tree | ||||
1. Rules whose name begins with an underscore will be inlined into their containing rule. | |||||
a. Rules whose name begins with an underscore will be inlined into their containing rule. | |||||
Example: | Example: | ||||
@@ -115,7 +115,7 @@ Lark will parse "(hello world)" as: | |||||
"world" | "world" | ||||
2. Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child. | |||||
b. Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child. | |||||
Example: | Example: | ||||
@@ -131,7 +131,7 @@ Lark will parse "hello world (planet)" as: | |||||
"world" | "world" | ||||
"planet" | "planet" | ||||
3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option. | |||||
c. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option. | |||||
Example: | Example: | ||||
@@ -20,7 +20,7 @@ json_grammar = r""" | |||||
number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | ||||
string : /".*?(?<!\\)"/ | string : /".*?(?<!\\)"/ | ||||
WS.ignore.newline: /[ \t\n]+/ | |||||
WS.ignore: /[ \t\n]+/ | |||||
""" | """ | ||||
class TreeToJson(Transformer): | class TreeToJson(Transformer): | ||||
@@ -11,6 +11,8 @@ class Indenter: | |||||
if (self.paren_level > 0): | if (self.paren_level > 0): | ||||
return | return | ||||
yield token | |||||
indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | ||||
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | ||||
@@ -20,18 +22,18 @@ class Indenter: | |||||
else: | else: | ||||
while indent < self.indent_level[-1]: | while indent < self.indent_level[-1]: | ||||
self.indent_level.pop() | self.indent_level.pop() | ||||
yield Token(self.DEDENT_type, indent_str) | |||||
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) | |||||
assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | ||||
def process(self, stream): | def process(self, stream): | ||||
for token in stream: | for token in stream: | ||||
yield token | |||||
if token.type == self.NL_type: | if token.type == self.NL_type: | ||||
for t in self.handle_NL(token): | for t in self.handle_NL(token): | ||||
yield t | yield t | ||||
else: | |||||
yield token | |||||
if token.type in self.OPEN_PAREN_types: | if token.type in self.OPEN_PAREN_types: | ||||
self.paren_level += 1 | self.paren_level += 1 | ||||
@@ -15,6 +15,13 @@ class Token(Str): | |||||
inst.value = value | inst.value = value | ||||
return inst | return inst | ||||
@classmethod | |||||
def new_borrow_pos(cls, type, value, borrow_t): | |||||
inst = cls(type, value, borrow_t.pos_in_stream) | |||||
inst.line = borrow_t.line | |||||
inst.column = borrow_t.column | |||||
return inst | |||||
def __repr__(self): | def __repr__(self): | ||||
return 'Token(%s, %s)' % (self.type, self.value) | return 'Token(%s, %s)' % (self.type, self.value) | ||||
@@ -46,7 +53,7 @@ class Lexer(object): | |||||
self.token_types = list(token_names) | self.token_types = list(token_names) | ||||
self.type_index = {name:i for i,name in enumerate(self.token_types)} | self.type_index = {name:i for i,name in enumerate(self.token_types)} | ||||
self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]] | |||||
self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]] | |||||
self.ignore_types = [self.type_index[t] for t in ignore] | self.ignore_types = [self.type_index[t] for t in ignore] | ||||
self.mres = self._build_mres(tokens, len(tokens)) | self.mres = self._build_mres(tokens, len(tokens)) | ||||
@@ -26,9 +26,12 @@ class Parser(object): | |||||
except IndexError: | except IndexError: | ||||
assert key == '$end' | assert key == '$end' | ||||
token = seq[-1] | token = seq[-1] | ||||
raise ParseError("Unexpected input %r at line %d, column %d.\n" | |||||
raise ParseError("Unexpected input %r at line %s, column %s.\n" | |||||
"Expected: %s\n" | "Expected: %s\n" | ||||
"Context: %s" % (token.value, token.line, token.column, expected, context)) | |||||
"Context: %s" % (token.value, | |||||
getattr(token, 'line', '?'), | |||||
getattr(token, 'column', '?'), | |||||
expected, context)) | |||||
def reduce(rule): | def reduce(rule): | ||||
if rule.expansion: | if rule.expansion: | ||||