From e201beab5fbdd0730c80cfda20c6afbda1df0a3a Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Sat, 11 Feb 2017 00:50:48 +0200
Subject: [PATCH] Fixed lexer newline counter and other stuff

---
 docs/json_tutorial.md       | 2 ++
 docs/reference.md           | 6 +++---
 examples/json_parser.py     | 2 +-
 lark/indenter.py            | 8 +++++---
 lark/lexer.py               | 9 ++++++++-
 lark/parsers/lalr_parser.py | 7 +++++--
 6 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md
index bf740cc..734a70b 100644
--- a/docs/json_tutorial.md
+++ b/docs/json_tutorial.md
@@ -411,6 +411,8 @@ I measured memory consumption using a little script called [memusg](https://gist
 I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1).
 
+These benchmarks are for Lark's alpha version. I already have several optimizations planned that will significantly improve run-time speed.
+
 Once again, shout-out to PyPy for being so effective.
 
 ## Afterword
diff --git a/docs/reference.md b/docs/reference.md
index fc2b7af..1c57766 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -101,7 +101,7 @@ Tokens that *will* appear in the tree are:
 
 ## Shaping the tree
 
-1. Rules whose name begins with an underscore will be inlined into their containing rule.
+a. Rules whose name begins with an underscore will be inlined into their containing rule.
 
 Example:
 
@@ -115,7 +115,7 @@ Lark will parse "(hello world)" as:
         "world"
 
-2. Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child.
+b. Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child.
 
 Example:
 
@@ -131,7 +131,7 @@ Lark will parse "hello world (planet)" as:
         "world"
         "planet"
 
-3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option.
+c. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option.
 
 Example:
 
diff --git a/examples/json_parser.py b/examples/json_parser.py
index 485ea52..9e4b952 100644
--- a/examples/json_parser.py
+++ b/examples/json_parser.py
@@ -20,7 +20,7 @@ json_grammar = r"""
     number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
 
     string : /".*?(?<!\\)"/
diff --git a/lark/indenter.py b/lark/indenter.py
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -9,6 +9,8 @@ class Indenter:
     def handle_NL(self, token):
         if self.paren_level > 0:
             return
 
+        yield token
+
         indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
         indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
 
@@ -20,18 +22,18 @@ class Indenter:
         else:
             while indent < self.indent_level[-1]:
                 self.indent_level.pop()
-                yield Token(self.DEDENT_type, indent_str)
+                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
 
         assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
     def process(self, stream):
         for token in stream:
-            yield token
-
             if token.type == self.NL_type:
                 for t in self.handle_NL(token):
                     yield t
+            else:
+                yield token
 
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
 
diff --git a/lark/lexer.py b/lark/lexer.py
index 1fc6a1c..cd32117 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -15,6 +15,13 @@ class Token(Str):
         inst.value = value
         return inst
 
+    @classmethod
+    def new_borrow_pos(cls, type, value, borrow_t):
+        inst = cls(type, value, borrow_t.pos_in_stream)
+        inst.line = borrow_t.line
+        inst.column = borrow_t.column
+        return inst
+
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
 
@@ -46,7 +53,7 @@ class Lexer(object):
         self.token_types = list(token_names)
         self.type_index = {name:i for i,name in enumerate(self.token_types)}
 
-        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
         self.ignore_types = [self.type_index[t] for t in ignore]
 
         self.mres = self._build_mres(tokens, len(tokens))
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 57bc5ae..2827925 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -26,9 +26,12 @@ class Parser(object):
             except IndexError:
                 assert key == '$end'
                 token = seq[-1]
-            raise ParseError("Unexpected input %r at line %d, column %d.\n"
+            raise ParseError("Unexpected input %r at line %s, column %s.\n"
                              "Expected: %s\n"
-                             "Context: %s" % (token.value, token.line, token.column, expected, context))
+                             "Context: %s" % (token.value,
+                                              getattr(token, 'line', '?'),
+                                              getattr(token, 'column', '?'),
+                                              expected, context))
 
         def reduce(rule):
             if rule.expansion:
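
A note on the lexer fix, with a minimal sketch (an illustration, not part
of the patch): the lexer advances its line counter only for token types
whose pattern can match a newline, previously detected by looking for
'\n' or '\\n' in the pattern string. A pattern using Python's inline
DOTALL flag '(?s)' lets '.' consume newlines without containing either
literal, so such tokens silently broke the counter; the newline_types
check now looks for '(?s)' as well. Using only the standard re module:

    import re

    text = '"a\n b"'

    # By default '.' does not match a newline, so this pattern cannot
    # span lines:
    assert re.match(r'".*?(?<!\\)"', text) is None

    # With the inline DOTALL flag the same pattern swallows the newline:
    m = re.match(r'(?s)".*?(?<!\\)"', text)
    assert m is not None

    # If the lexer doesn't count the newlines inside such a token, every
    # token after it is reported on the wrong line:
    line = 1 + m.group().count('\n')
    assert line == 2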
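
On Token.new_borrow_pos: DEDENT tokens are synthesized by the indenter
rather than matched from the source text, so they carry no position of
their own. Borrowing line/column from the NEWLINE token that triggered
them gives error messages a real location, and the getattr(..., '?')
fallback in lalr_parser.py covers tokens, like $end, that still have
none. A self-contained sketch; the Token class here is a simplified
stand-in, not lark's actual implementation:

    class Token(str):
        def __new__(cls, type, value, pos_in_stream=None):
            inst = str.__new__(cls, value)
            inst.type = type
            inst.pos_in_stream = pos_in_stream
            return inst

        @classmethod
        def new_borrow_pos(cls, type, value, borrow_t):
            # A synthesized token never appears in the text; reuse the
            # position of the real token that caused it to be emitted.
            inst = cls(type, value, borrow_t.pos_in_stream)
            inst.line = borrow_t.line
            inst.column = borrow_t.column
            return inst

    nl = Token('_NEWLINE', '\n', pos_in_stream=42)
    nl.line, nl.column = 7, 0

    dedent = Token.new_borrow_pos('_DEDENT', '', nl)
    assert (dedent.line, dedent.column) == (7, 0)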
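
The indenter change is also behavioral, not just a refactor: process()
used to yield every token before calling handle_NL, so a NEWLINE inside
parentheses still reached the parser. Now the NEWLINE is yielded from
inside handle_NL, after the paren_level check, so newlines between open
parentheses are swallowed, giving implicit line joining. A rough sketch
of the new control flow (simplified; the real handle_NL also emits
INDENT/DEDENT tokens):

    def process(tokens, NL='_NEWLINE'):
        paren_level = 0
        for type_, value in tokens:
            if type_ == NL:
                if paren_level == 0:
                    yield (type_, value)  # handle_NL yields the NEWLINE itself
                # inside parens the NEWLINE is dropped entirely
            else:
                yield (type_, value)
            if value == '(':
                paren_level += 1
            elif value == ')':
                paren_level -= 1

    toks = [('LPAR', '('), ('_NEWLINE', '\n'), ('RPAR', ')'),
            ('_NEWLINE', '\n')]
    assert [t for t, _ in process(toks)].count('_NEWLINE') == 1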