
Fixed lexer newline counter and other stuff

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Parent
Commit
e201beab5f
6 changed files with 24 additions and 10 deletions
  1. docs/json_tutorial.md (+2 -0)
  2. docs/reference.md (+3 -3)
  3. examples/json_parser.py (+1 -1)
  4. lark/indenter.py (+5 -3)
  5. lark/lexer.py (+8 -1)
  6. lark/parsers/lalr_parser.py (+5 -2)

docs/json_tutorial.md (+2 -0)

@@ -411,6 +411,8 @@ I measured memory consumption using a little script called [memusg](https://gist
 
 I added PyParsing for comparison. It fairs pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1).
 
+These benchmarks are for Lark's alpha version. I already have several optimizations planned that will significantly improve run-time speed.
+
 Once again, shout-out to PyPy for being so effective.
 
 ## Afterword


docs/reference.md (+3 -3)

@@ -101,7 +101,7 @@ Tokens that *will* appear in the tree are:
 
 ## Shaping the tree
 
-1. Rules whose name begins with an underscore will be inlined into their containing rule.
+a. Rules whose name begins with an underscore will be inlined into their containing rule.
 
 Example:

@@ -115,7 +115,7 @@ Lark will parse "(hello world)" as:
     "world"
 
 
-2. Rules that recieve a question mark (?) at the beginning of their definition, will be inlined if they have a single child.
+b. Rules that recieve a question mark (?) at the beginning of their definition, will be inlined if they have a single child.
 
 Example:

@@ -131,7 +131,7 @@ Lark will parse "hello world (planet)" as:
     "world"
     "planet"
 
-3. Aliases - options in a rule can receive an alias. It will be then used as the branch name for the option.
+c. Aliases - options in a rule can receive an alias. It will be then used as the branch name for the option.
 
 Example:



examples/json_parser.py (+1 -1)

@@ -20,7 +20,7 @@ json_grammar = r"""
     number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
     string : /".*?(?<!\\)"/
 
-    WS.ignore.newline: /[ \t\n]+/
+    WS.ignore: /[ \t\n]+/
 """
 
 class TreeToJson(Transformer):


lark/indenter.py (+5 -3)

@@ -11,6 +11,8 @@ class Indenter:
         if (self.paren_level > 0):
             return
 
+        yield token
+
         indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
         indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

@@ -20,18 +22,18 @@ class Indenter:
         else:
             while indent < self.indent_level[-1]:
                 self.indent_level.pop()
-                yield Token(self.DEDENT_type, indent_str)
+                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
 
         assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
 
     def process(self, stream):
         for token in stream:
-            yield token
-
             if token.type == self.NL_type:
                 for t in self.handle_NL(token):
                     yield t
+            else:
+                yield token
 
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
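To make the reordered control flow concrete, here is a toy offside-rule postlexer in the spirit of the `lark/indenter.py` change above. This is not Lark's actual API — the `(type, value)` tuples and the `NL`/`INDENT`/`DEDENT` names are made up for illustration — but it shows the same fix: the newline token is now yielded from inside the NL handler, so every token is emitted exactly once.

```python
# Toy offside-rule postlexer sketching the logic of the indenter change above.
# Token tuples and type names are illustrative, not Lark's classes.

def indent_tokens(stream, tab_len=8):
    indent_levels = [0]
    for ttype, value in stream:
        if ttype != 'NL':
            yield (ttype, value)          # non-newline tokens pass through once
            continue
        yield (ttype, value)              # yield the NL itself first (the fix)
        indent_str = value.rsplit('\n', 1)[1]
        indent = indent_str.count(' ') + indent_str.count('\t') * tab_len
        if indent > indent_levels[-1]:
            indent_levels.append(indent)
            yield ('INDENT', indent_str)
        else:
            while indent < indent_levels[-1]:
                indent_levels.pop()
                yield ('DEDENT', indent_str)
```

A dedent from two nested levels emits two `DEDENT` tokens, mirroring the `while` loop in the real indenter.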


lark/lexer.py (+8 -1)

@@ -15,6 +15,13 @@ class Token(Str):
         inst.value = value
         return inst
 
+    @classmethod
+    def new_borrow_pos(cls, type, value, borrow_t):
+        inst = cls(type, value, borrow_t.pos_in_stream)
+        inst.line = borrow_t.line
+        inst.column = borrow_t.column
+        return inst
+
     def __repr__(self):
        return 'Token(%s, %s)' % (self.type, self.value)
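The new `new_borrow_pos` classmethod copies position info from an existing token onto a synthetic one (such as a `DEDENT`), so error messages still point somewhere meaningful. A standalone sketch of the same pattern, using a plain Python class rather than Lark's `Token(Str)` subclass:

```python
# Standalone sketch of the "borrow position" pattern added above: a synthetic
# token takes its line/column from the real token that triggered it.
# Plain class for illustration -- not Lark's Token(Str) subclass.

class Tok:
    def __init__(self, type, value, pos=None):
        self.type = type
        self.value = value
        self.pos = pos
        self.line = None
        self.column = None

    @classmethod
    def new_borrow_pos(cls, type, value, borrow_t):
        inst = cls(type, value, borrow_t.pos)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst
```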

@@ -46,7 +53,7 @@ class Lexer(object):
         self.token_types = list(token_names)
         self.type_index = {name:i for i,name in enumerate(self.token_types)}
 
-        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]
 
        self.mres = self._build_mres(tokens, len(tokens))
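The `'(?s)'` addition is the "lexer newline counter" fix from the commit title: the lexer only recounts line numbers for token types whose pattern can match a newline, and a pattern under the DOTALL flag can match `\n` without containing a literal `\n` or an escaped `\\n`. A toy illustration of the idea (not Lark's lexer; `newline_capable` and `count_lines` are made-up names):

```python
import re

# Toy line counter mirroring the fix above: a pattern is "newline-capable"
# if it contains a literal newline, an escaped \n, or the DOTALL flag (?s)
# -- the case this commit adds. Only such tokens update the line number.

def newline_capable(pattern):
    return '\n' in pattern or '\\n' in pattern or '(?s)' in pattern

def count_lines(text, token_patterns):
    line = 1
    pos = 0
    while pos < len(text):
        for name, pat in token_patterns:
            m = re.match(pat, text[pos:])
            if m:
                if newline_capable(pat):
                    line += m.group(0).count('\n')
                pos += len(m.group(0))
                break
        else:
            pos += 1  # no token matched; skip one character
    return line
```

Without the `'(?s)'` check, a multi-line comment matched by `(?s)/\*.*?\*/` would silently leave the line counter stale.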


lark/parsers/lalr_parser.py (+5 -2)

@@ -26,9 +26,12 @@ class Parser(object):
             except IndexError:
                 assert key == '$end'
             token = seq[-1]
-            raise ParseError("Unexpected input %r at line %d, column %d.\n"
+            raise ParseError("Unexpected input %r at line %s, column %s.\n"
                              "Expected: %s\n"
-                             "Context: %s" % (token.value, token.line, token.column, expected, context))
+                             "Context: %s" % (token.value,
+                                              getattr(token, 'line', '?'),
+                                              getattr(token, 'column', '?'),
+                                              expected, context))
 
         def reduce(rule):
             if rule.expansion:
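The switch from `%d` to `%s` with a `getattr` fallback guards against tokens that never received position attributes (for instance an end-of-input sentinel). A minimal sketch of the pattern, assuming nothing about Lark's classes — `format_parse_error` is a hypothetical helper, not part of Lark:

```python
# Sketch of the defensive error formatting introduced above: getattr with a
# '?' default keeps the message renderable even when the offending token has
# no line/column attributes (e.g. an end-of-input sentinel).

def format_parse_error(token_value, token, expected, context):
    return ("Unexpected input %r at line %s, column %s.\n"
            "Expected: %s\n"
            "Context: %s" % (token_value,
                             getattr(token, 'line', '?'),
                             getattr(token, 'column', '?'),
                             expected, context))
```

Note that `%s` is required here: `%d` would raise a `TypeError` when the fallback string `'?'` is substituted.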

