Browse Source

Fixed lexer newline counter and other stuff

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
e201beab5f
6 changed files with 24 additions and 10 deletions
  1. +2
    -0
      docs/json_tutorial.md
  2. +3
    -3
      docs/reference.md
  3. +1
    -1
      examples/json_parser.py
  4. +5
    -3
      lark/indenter.py
  5. +8
    -1
      lark/lexer.py
  6. +5
    -2
      lark/parsers/lalr_parser.py

+ 2
- 0
docs/json_tutorial.md View File

@@ -411,6 +411,8 @@ I measured memory consumption using a little script called [memusg](https://gist


I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1). I added PyParsing for comparison. It fares pretty well in its memory usage, but it can't compete with the run-time speed of LALR(1).


These benchmarks are for Lark's alpha version. I already have several optimizations planned that will significantly improve run-time speed.

Once again, shout-out to PyPy for being so effective. Once again, shout-out to PyPy for being so effective.


## Afterword ## Afterword


+ 3
- 3
docs/reference.md View File

@@ -101,7 +101,7 @@ Tokens that *will* appear in the tree are:


## Shaping the tree ## Shaping the tree


1. Rules whose name begins with an underscore will be inlined into their containing rule.
a. Rules whose name begins with an underscore will be inlined into their containing rule.


Example: Example:


@@ -115,7 +115,7 @@ Lark will parse "(hello world)" as:
"world" "world"




2. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child.
b. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child.


Example: Example:


@@ -131,7 +131,7 @@ Lark will parse "hello world (planet)" as:
"world" "world"
"planet" "planet"


3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option.
c. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option.


Example: Example:




+ 1
- 1
examples/json_parser.py View File

@@ -20,7 +20,7 @@ json_grammar = r"""
number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
string : /".*?(?<!\\)"/ string : /".*?(?<!\\)"/


WS.ignore.newline: /[ \t\n]+/
WS.ignore: /[ \t\n]+/
""" """


class TreeToJson(Transformer): class TreeToJson(Transformer):


+ 5
- 3
lark/indenter.py View File

@@ -11,6 +11,8 @@ class Indenter:
if (self.paren_level > 0): if (self.paren_level > 0):
return return


yield token

indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len


@@ -20,18 +22,18 @@ class Indenter:
else: else:
while indent < self.indent_level[-1]: while indent < self.indent_level[-1]:
self.indent_level.pop() self.indent_level.pop()
yield Token(self.DEDENT_type, indent_str)
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)


assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])




def process(self, stream): def process(self, stream):
for token in stream: for token in stream:
yield token

if token.type == self.NL_type: if token.type == self.NL_type:
for t in self.handle_NL(token): for t in self.handle_NL(token):
yield t yield t
else:
yield token


if token.type in self.OPEN_PAREN_types: if token.type in self.OPEN_PAREN_types:
self.paren_level += 1 self.paren_level += 1


+ 8
- 1
lark/lexer.py View File

@@ -15,6 +15,13 @@ class Token(Str):
inst.value = value inst.value = value
return inst return inst


@classmethod
def new_borrow_pos(cls, type, value, borrow_t):
inst = cls(type, value, borrow_t.pos_in_stream)
inst.line = borrow_t.line
inst.column = borrow_t.column
return inst

def __repr__(self): def __repr__(self):
return 'Token(%s, %s)' % (self.type, self.value) return 'Token(%s, %s)' % (self.type, self.value)


@@ -46,7 +53,7 @@ class Lexer(object):
self.token_types = list(token_names) self.token_types = list(token_names)
self.type_index = {name:i for i,name in enumerate(self.token_types)} self.type_index = {name:i for i,name in enumerate(self.token_types)}


self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
self.ignore_types = [self.type_index[t] for t in ignore] self.ignore_types = [self.type_index[t] for t in ignore]


self.mres = self._build_mres(tokens, len(tokens)) self.mres = self._build_mres(tokens, len(tokens))


+ 5
- 2
lark/parsers/lalr_parser.py View File

@@ -26,9 +26,12 @@ class Parser(object):
except IndexError: except IndexError:
assert key == '$end' assert key == '$end'
token = seq[-1] token = seq[-1]
raise ParseError("Unexpected input %r at line %d, column %d.\n"
raise ParseError("Unexpected input %r at line %s, column %s.\n"
"Expected: %s\n" "Expected: %s\n"
"Context: %s" % (token.value, token.line, token.column, expected, context))
"Context: %s" % (token.value,
getattr(token, 'line', '?'),
getattr(token, 'column', '?'),
expected, context))


def reduce(rule): def reduce(rule):
if rule.expansion: if rule.expansion:


Loading…
Cancel
Save