diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py
index 6db2dd9..4adac3a 100644
--- a/lark/tools/nearley.py
+++ b/lark/tools/nearley.py
@@ -2,6 +2,7 @@
 
 import os.path
 import sys
+import codecs
 
 
 from lark import Lark, InlineTransformer, Transformer
@@ -113,7 +114,7 @@ def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
                 path = os.path.join(folder, arg[1:-1])
                 if path not in includes:
                     includes.add(path)
-                    with open(path) as f:
+                    with codecs.open(path, encoding='utf8') as f:
                         text = f.read()
                     rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
             else:
@@ -168,17 +169,18 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
 
     return ''.join(emit_code)
 
-def main():
-    if len(sys.argv) < 3:
-        print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
-        print("Usage: %s " % sys.argv[0])
-        return
-
-    fn, start, nearley_lib = sys.argv[1:]
-    with open(fn) as f:
+def main(fn, start, nearley_lib):
+    with codecs.open(fn, encoding='utf8') as f:
         grammar = f.read()
     print(create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn))))
 
 
 if __name__ == '__main__':
-    main()
+    if len(sys.argv) < 4:
+        print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
+        print("Usage: %s " % sys.argv[0])
+        sys.exit(1)
+
+    fn, start, nearley_lib = sys.argv[1:]
+
+    main(fn, start, nearley_lib)
diff --git a/tests/test_nearley/grammars/include_unicode.ne b/tests/test_nearley/grammars/include_unicode.ne
new file mode 100644
index 0000000..b04c2a9
--- /dev/null
+++ b/tests/test_nearley/grammars/include_unicode.ne
@@ -0,0 +1,3 @@
+@include "unicode.ne"
+
+main -> x
diff --git a/tests/test_nearley/grammars/unicode.ne b/tests/test_nearley/grammars/unicode.ne
new file mode 100644
index 0000000..c930830
--- /dev/null
+++ b/tests/test_nearley/grammars/unicode.ne
@@ -0,0 +1 @@
+x -> "±a"
diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py
index 0fbe239..e980f9f 100644
--- a/tests/test_nearley/test_nearley.py
+++ b/tests/test_nearley/test_nearley.py
@@ -1,14 +1,17 @@
+# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
 import unittest
 import logging
 import os
+import codecs
 
 logging.basicConfig(level=logging.INFO)
 
-from lark.tools.nearley import create_code_for_nearley_grammar
+from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main
 
-NEARLEY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'nearley'))
+TEST_PATH = os.path.abspath(os.path.dirname(__file__))
+NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
 BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')
 
 class TestNearley(unittest.TestCase):
@@ -59,6 +62,23 @@ class TestNearley(unittest.TestCase):
         parse('b')
         parse('c')
 
+    def test_utf8(self):
+        grammar = u'main -> "±a"'
+        code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './')
+        d = {}
+        exec (code, d)
+        parse = d['parse']
+
+        parse(u'±a')
+
+    def test_utf8_2(self):
+        fn = os.path.join(TEST_PATH, 'grammars/unicode.ne')
+        nearley_tool_main(fn, 'x', NEARLEY_PATH)
+
+    def test_include_utf8(self):
+        fn = os.path.join(TEST_PATH, 'grammars/include_unicode.ne')
+        nearley_tool_main(fn, 'main', NEARLEY_PATH)
+
 
 if __name__ == '__main__':
     unittest.main()