Browse source

Token import now working

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
a0bb661c61
5 changed files with 155 additions and 59 deletions
  1. examples/calc.py (+5, -4)
  2. examples/json_parser.py (+7, -3)
  3. lark/grammars/common.g (+36, -0)
  4. lark/load_grammar.py (+102, -47)
  5. lark/tree.py (+5, -5)

examples/calc.py  (+5, -4)

@@ -22,15 +22,16 @@ calc_grammar = """
| product "*" atom -> mul
| product "/" atom -> div

?atom: NUMBER -> number
?atom: DECIMAL -> number
| "-" atom -> neg
| NAME -> var
| "(" sum ")"

NAME: /[a-zA-Z]\w+/ // Regexp form
NUMBER: ("0".."9"|".")+ // EBNF form (compiles to regexp)
%import common.CNAME -> NAME
%import common.DECIMAL
%import common.WS_INLINE

%ignore " "|"\t"
%ignore WS_INLINE
"""

class CalculateTree(InlineTransformer):
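
The change here swaps the inline NUMBER terminal and the literal whitespace ignore for imports from the new bundled common.g. A minimal sketch of the same pattern, assuming a released Lark version where %import common.* is available (the start rule and input below are illustrative, not part of the example):

from lark import Lark

# Illustrative grammar, not the full calc grammar: DECIMAL and WS_INLINE are
# pulled in from the bundled common grammar instead of being defined inline.
grammar = r"""
    start: DECIMAL ("+" DECIMAL)*

    %import common.DECIMAL
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

parser = Lark(grammar)
print(parser.parse("1.5 + 2.25").pretty())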


examples/json_parser.py  (+7, -3)

@@ -24,10 +24,14 @@ json_grammar = r"""
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
string : /".*?(?<!\\)"/
number: FLOAT
string : ESCAPED_STRING

%ignore /[ \t\n]+/
%import common.ESCAPED_STRING
%import common.FLOAT
%import common.WS

%ignore WS
"""

class TreeToJson(Transformer):
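
With the hand-written number and string regexps replaced by common.FLOAT and common.ESCAPED_STRING, a Transformer (like the TreeToJson class above) still receives the matched tokens and can convert them to Python values. A minimal sketch under that assumption, using illustrative rule and class names rather than the example's own:

from lark import Lark, Transformer

grammar = r"""
    start: value*
    value: FLOAT          -> number
         | ESCAPED_STRING -> string

    %import common.FLOAT
    %import common.ESCAPED_STRING
    %import common.WS
    %ignore WS
"""

class ToPython(Transformer):
    # Each method receives the list of children of the matched rule.
    def number(self, children):
        return float(children[0])
    def string(self, children):
        return children[0][1:-1]  # strip the surrounding quotes
    def start(self, children):
        return list(children)

print(ToPython().transform(Lark(grammar).parse('3.14 "hello" 2.0')))
# [3.14, 'hello', 2.0]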


lark/grammars/common.g  (+36, -0)

@@ -0,0 +1,36 @@
//
// Numbers
//

DIGIT: "0".."9"

INT: DIGIT+
DECIMAL: INT ("." INT)?

// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
FLOAT: "-"? DECIMAL (("e"|"E")("+"|"-")? INT)?


//
// Strings
//
ESCAPED_STRING: /".*?(?<!\\)"/


//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"

LETTER: UCASE_LETTER | LCASE_LETTER

CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*


//
// Whitespace
//
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+
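
The commented-out regexp above FLOAT is the pattern that the EBNF definition replaces. A quick, purely illustrative check with Python's re module; the sample inputs are assumptions, chosen to be accepted by both the old regexp and the EBNF form:

import re

# Old regexp form of FLOAT, as quoted in the comment above its EBNF definition.
FLOAT_RE = re.compile(r'-?\d+(\.\d+)?([eE][+-]?\d+)?')

for sample in ['1', '-3.14', '6.02e23', '1E-9']:
    assert FLOAT_RE.fullmatch(sample), sample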


lark/load_grammar.py  (+102, -47)

@@ -1,3 +1,4 @@
import os.path
from itertools import chain
import re
import codecs
@@ -13,6 +14,10 @@ from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]


_TOKEN_NAMES = {
'.' : 'DOT',
',' : 'COMMA',
@@ -62,7 +67,6 @@ TOKENS = {
'_COLON': ':',
'_OR': r'\|',
'_DOT': r'\.',
'_PERCENT': r'%',
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TOKEN': '_?[A-Z][_A-Z0-9]*',
'STRING': r'".*?[^\\]"',
@@ -70,7 +74,9 @@ TOKENS = {
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*',
'_TO': '->'
'_TO': '->',
'_IGNORE': r'%ignore',
'_IMPORT': r'%import',
}

RULES = {
@@ -93,16 +99,22 @@ RULES = {

'?atom': ['_LPAR expansions _RPAR',
'maybe',
'RULE',
'TOKEN',
'name',
'tokenvalue',
'range'],

'?name': ['RULE', 'TOKEN'],

'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],

'token': ['TOKEN _COLON expansions _NL'],
'statement': ['_PERCENT RULE expansions _NL'],
'statement': ['ignore', 'import'],
'ignore': ['_IGNORE expansions _NL'],
'import': ['_IMPORT import_args _NL',
'_IMPORT import_args _TO TOKEN'],
'import_args': ['_import_args'],
'_import_args': ['name', '_import_args _DOT name'],

'tokenvalue': ['REGEXP', 'STRING'],
}
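
The new 'statement', 'ignore' and 'import' rules let the grammar-of-grammars parse %ignore lines and both %import forms, including the aliasing one ('_IMPORT import_args _TO TOKEN'). A minimal sketch of the aliased form, assuming a released Lark version:

from lark import Lark

# The imported terminal is bound under a new local name (STRING here), which is
# what the `_IMPORT import_args _TO TOKEN` alternative parses.
grammar = r"""
    start: STRING+

    %import common.ESCAPED_STRING -> STRING
    %import common.WS
    %ignore WS
"""

print(Lark(grammar).parse('"a" "b"'))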
@@ -301,7 +313,7 @@ class TokenTreeToRegexp(Transformer):
def expansions(self, exps):
if len(exps) == 1:
return exps[0]
return TokenValue__Regexp('%s' % ('|'.join(i.to_regexp() for i in exps)))
return TokenValue__Regexp('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
def range(self, items):
assert all(i.type=='STRING' for i in items)
items = [i[1:-1] for i in items]
@@ -314,43 +326,20 @@ class TokenTreeToRegexp(Transformer):
return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))

class Grammar:
def __init__(self, ruledefs, tokendefs, extra):
self.tokendefs = tokendefs
self.ruledefs = ruledefs
def __init__(self, rule_defs, token_defs, extra):
self.token_defs = token_defs
self.rule_defs = rule_defs
self.extra = extra

def compile(self, lexer=False):
assert lexer

tokendefs = [(name.value, t) for name, t in self.tokendefs]

ignore = []
for i, t in enumerate(self.extra['ignore']):
name = '__IGNORE_%d'%i
tokendefs.append((name, t))
ignore.append(name)
self.extra['ignore'] = ignore
tokendefs = list(self.token_defs)

# =================
# Compile Tokens
# =================
token_to_regexp = TokenTreeToRegexp()
token_dict = dict(tokendefs)
assert len(token_dict) == len(tokendefs), "Same name defined twice?"

# Resolve token assignments
while True:
changed = False
for name, token_tree in tokendefs:
for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
for i, item in enumerate(exp.children):
if isinstance(item, Token):
assert item.type != 'RULE', "Rules aren't allowed inside tokens"
if item.type == 'TOKEN':
exp.children[i] = token_dict[item]
changed = True
if not changed:
break

# Convert tokens to strings/regexps
tokens = []
@@ -362,9 +351,9 @@ class Grammar:
tokendef = TokenDef__Regexp(name, regexp.to_regexp())
tokens.append(tokendef)

# Resolve regexp assignments of the form /..${X}../
# Not sure this is even important, since you can express most regexps with EBNF
# TODO a nicer implementation of this
# Resolve regexp assignments of the form /..${X}../
# XXX This is deprecated, since you can express most regexps with EBNF
# XXX Also, since this happens after import, it can be a source of bugs
token_dict = {td.name: td.to_regexp() for td in tokens}
while True:
changed = False
@@ -389,7 +378,7 @@ class Grammar:
rule_tree_to_text = RuleTreeToText()
rules = {}

for name, rule_tree in self.ruledefs:
for name, rule_tree in self.rule_defs:
assert name not in rules
tree = extract_anon.transform(rule_tree) # Adds to tokens
rules[name] = ebnf_to_bnf.transform(tree)
@@ -417,6 +406,36 @@ class GrammarRule:



_imported_grammars = {}
def import_grammar(grammar_path):
if grammar_path not in _imported_grammars:
for import_path in IMPORT_PATHS:
with open(os.path.join(import_path, grammar_path)) as f:
text = f.read()
grammar = load_grammar(text)
_imported_grammars[grammar_path] = grammar

return _imported_grammars[grammar_path]


def resolve_token_references(token_defs):
token_dict = dict(token_defs)
assert len(token_dict) == len(token_defs), "Same name defined twice?"

while True:
changed = False
for name, token_tree in token_defs:
for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
for i, item in enumerate(exp.children):
if isinstance(item, Token):
if item.type == 'RULE':
raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
if item.type == 'TOKEN':
exp.children[i] = token_dict[item]
changed = True
if not changed:
break
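
resolve_token_references keeps substituting token names inside other token definitions until a pass makes no change, which is what allows a terminal to be built from other terminals. An illustrative grammar that relies on that behaviour, assuming a released Lark version (the terminal names are made up for the sketch):

from lark import Lark

# HEXDIGIT is referenced from inside COLOR; the loader expands the reference
# when compiling the terminals, via the fixed-point step sketched above.
grammar = r"""
    start: COLOR

    HEXDIGIT: "0".."9" | "a".."f" | "A".."F"
    COLOR: "#" HEXDIGIT+
"""

print(Lark(grammar).parse("#1a2b3c"))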


class GrammarLoader:
def __init__(self):
@@ -451,11 +470,51 @@ class GrammarLoader:
statements = [c.children for c in tree.children if c.data=='statement']
assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

# Verify correctness
token_names = set()
token_defs = [(name.value, t) for name, t in token_defs]

# Execute statements
ignore = []
for (stmt,) in statements:
if stmt.data == 'ignore':
expansions ,= stmt.children
ignore.append(expansions)
elif stmt.data == 'import':
dotted_path = stmt.children[0].children
name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
g = import_grammar(grammar_path)
token_tree = dict(g.token_defs)[dotted_path[-1]]
token_defs.append([name.value, token_tree])
else:
assert False, command


# Verify correctness 1
for name, _ in token_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
# Handle ignore tokens
ignore_names = []
for i, t in enumerate(ignore):
if t.data == 'expansions' and len(t.children) == 1:
x ,= t.children
if x.data == 'expansion' and len(x.children) == 1:
item ,= x.children
if isinstance(item, Token) and item.type == 'TOKEN':
# XXX is this really a wise solution? -- Erez
ignore_names.append(item.value)
continue

name = '__IGNORE_%d'%i
token_defs.append((name, t))
ignore_names.append(name)

# Resolve token references
resolve_token_references(token_defs)

# Verify correctness 2
token_names = set()
for name, _ in token_defs:
if name in token_names:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)
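
The ignore handling above keeps the terminal's own name when %ignore is given a single named token, and only invents a synthetic __IGNORE_n terminal for anonymous patterns. Both forms side by side, as an illustrative sketch assuming a released Lark version where %ignore also accepts an inline pattern:

from lark import Lark

grammar = r"""
    start: NAME+

    NAME: /[a-z]+/

    %import common.WS
    %ignore WS          // named token: kept under its own name
    %ignore /#[^\n]*/   // anonymous pattern: gets a generated __IGNORE_n name
"""

print(Lark(grammar).parse("foo  # a comment\nbar"))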
@@ -467,7 +526,7 @@ class GrammarLoader:
if r.name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if r.name in rule_names:
raise GrammarError("Token '%s' defined more than once" % name)
raise GrammarError("Rule '%s' defined more than once" % r.name)
rule_names.add(r.name)

for r in rules:
@@ -481,14 +540,10 @@ class GrammarLoader:
if sym not in rule_names:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))

ignore = []
for command, expansions in statements:
if command == 'ignore':
ignore.append(expansions)
else:
assert False, command
# TODO don't include unused tokens, they can only cause trouble!

return Grammar(rule_defs, token_defs, {'ignore': ignore_names})

return Grammar(rule_defs, token_defs, {'ignore': ignore})


load_grammar = GrammarLoader().load_grammar

lark/tree.py  (+5, -5)

@@ -39,11 +39,11 @@ class Tree(object):
def find_pred(self, pred):
if pred(self):
yield self
else:
for c in self.children:
if isinstance(c, Tree):
for t in c.find_pred(pred):
yield t
for c in self.children:
if isinstance(c, Tree):
for t in c.find_pred(pred):
yield t

def find_data(self, data):
return self.find_pred(lambda t: t.data == data)
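
Dropping the else branch means find_pred (and therefore find_data) keeps descending into a subtree even when that subtree itself matched, so nested matches are reported as well. A minimal illustration, assuming a released Lark version:

from lark import Tree

inner = Tree('expr', [])
outer = Tree('expr', [Tree('atom', []), inner])

# Previously only the outer tree was yielded; now the nested 'expr' is found too.
print(list(outer.find_data('expr')))  # two matches: outer and the nested inner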

