| @@ -411,6 +411,8 @@ I measured memory consumption using a little script called [memusg](https://gist | |||||
| I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1). | I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1). | ||||
| These benchmarks are for Lark's alpha version. I already have several optimizations planned that will significantly improve run-time speed. | |||||
| Once again, shout-out to PyPy for being so effective. | Once again, shout-out to PyPy for being so effective. | ||||
| ## Afterword | ## Afterword | ||||
| @@ -101,7 +101,7 @@ Tokens that *will* appear in the tree are: | |||||
| ## Shaping the tree | ## Shaping the tree | ||||
| 1. Rules whose name begins with an underscore will be inlined into their containing rule. | |||||
| a. Rules whose name begins with an underscore will be inlined into their containing rule. | |||||
| Example: | Example: | ||||
| @@ -115,7 +115,7 @@ Lark will parse "(hello world)" as: | |||||
| "world" | "world" | ||||
| 2. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child. |||||
| b. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child. |||||
| Example: | Example: | ||||
| @@ -131,7 +131,7 @@ Lark will parse "hello world (planet)" as: | |||||
| "world" | "world" | ||||
| "planet" | "planet" | ||||
| 3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option. |||||
| c. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option. |||||
| Example: | Example: | ||||
| @@ -20,7 +20,7 @@ json_grammar = r""" | |||||
| number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | ||||
| string : /".*?(?<!\\)"/ | string : /".*?(?<!\\)"/ | ||||
| WS.ignore.newline: /[ \t\n]+/ | |||||
| WS.ignore: /[ \t\n]+/ | |||||
| """ | """ | ||||
| class TreeToJson(Transformer): | class TreeToJson(Transformer): | ||||
| @@ -11,6 +11,8 @@ class Indenter: | |||||
| if (self.paren_level > 0): | if (self.paren_level > 0): | ||||
| return | return | ||||
| yield token | |||||
| indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | ||||
| indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | ||||
| @@ -20,18 +22,18 @@ class Indenter: | |||||
| else: | else: | ||||
| while indent < self.indent_level[-1]: | while indent < self.indent_level[-1]: | ||||
| self.indent_level.pop() | self.indent_level.pop() | ||||
| yield Token(self.DEDENT_type, indent_str) | |||||
| yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) | |||||
| assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | ||||
| def process(self, stream): | def process(self, stream): | ||||
| for token in stream: | for token in stream: | ||||
| yield token | |||||
| if token.type == self.NL_type: | if token.type == self.NL_type: | ||||
| for t in self.handle_NL(token): | for t in self.handle_NL(token): | ||||
| yield t | yield t | ||||
| else: | |||||
| yield token | |||||
| if token.type in self.OPEN_PAREN_types: | if token.type in self.OPEN_PAREN_types: | ||||
| self.paren_level += 1 | self.paren_level += 1 | ||||
| @@ -15,6 +15,13 @@ class Token(Str): | |||||
| inst.value = value | inst.value = value | ||||
| return inst | return inst | ||||
| @classmethod | |||||
| def new_borrow_pos(cls, type, value, borrow_t): | |||||
| inst = cls(type, value, borrow_t.pos_in_stream) | |||||
| inst.line = borrow_t.line | |||||
| inst.column = borrow_t.column | |||||
| return inst | |||||
| def __repr__(self): | def __repr__(self): | ||||
| return 'Token(%s, %s)' % (self.type, self.value) | return 'Token(%s, %s)' % (self.type, self.value) | ||||
| @@ -46,7 +53,7 @@ class Lexer(object): | |||||
| self.token_types = list(token_names) | self.token_types = list(token_names) | ||||
| self.type_index = {name:i for i,name in enumerate(self.token_types)} | self.type_index = {name:i for i,name in enumerate(self.token_types)} | ||||
| self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]] | |||||
| self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]] | |||||
| self.ignore_types = [self.type_index[t] for t in ignore] | self.ignore_types = [self.type_index[t] for t in ignore] | ||||
| self.mres = self._build_mres(tokens, len(tokens)) | self.mres = self._build_mres(tokens, len(tokens)) | ||||
| @@ -26,9 +26,12 @@ class Parser(object): | |||||
| except IndexError: | except IndexError: | ||||
| assert key == '$end' | assert key == '$end' | ||||
| token = seq[-1] | token = seq[-1] | ||||
| raise ParseError("Unexpected input %r at line %d, column %d.\n" | |||||
| raise ParseError("Unexpected input %r at line %s, column %s.\n" | |||||
| "Expected: %s\n" | "Expected: %s\n" | ||||
| "Context: %s" % (token.value, token.line, token.column, expected, context)) | |||||
| "Context: %s" % (token.value, | |||||
| getattr(token, 'line', '?'), | |||||
| getattr(token, 'column', '?'), | |||||
| expected, context)) | |||||
| def reduce(rule): | def reduce(rule): | ||||
| if rule.expansion: | if rule.expansion: | ||||