diff --git a/.gitignore b/.gitignore index af066ef..bbdf54d 100755 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ src/codetext.egg-info/* *.pyc *.so *.whl - +.idea +.vscode +*.iml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..2840782 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,6 @@ +# Default ignored files +/shelf/ +/workspace.xml +.idea +.vscode +*.iml \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d8bb24d..9db3f28 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codetext" -version = "0.0.8" +version = "0.0.9" authors = [ { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, ] @@ -17,11 +17,12 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "tree-sitter>=0.20", + "tree-sitter==0.20.4", "Levenshtein>=0.20", "langdetect>=1.0.0", "bs4>=0.0.1", - "tabulate>=0.9.0" + "tabulate>=0.9.0", + "tree_sitter_languages>=1.10.0" ] [project.urls] diff --git a/requirements.txt b/requirements.txt index d438040..4bc4c06 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # for preprocessing -tree-sitter +tree-sitter==0.20.4 tabulate Levenshtein langdetect bs4 +tree_sitter_languages==1.10.2 diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index d330ecb..5975897 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -92,13 +92,19 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename) load_path = str(calling_script_path.parent) - ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') - if not os.path.exists(ts_lang_path): - logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") - build_language(language, load_path) - + # Get parser from languages parser = Parser() - language = Language(load_path + f"/tree-sitter/{language}.so", language) + try: + from tree_sitter_languages import get_language, get_parser + language = get_language(language) + except ImportError: + # Work-around when pre-built binaries wheels for tree-sitter-languages are not available + logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") + ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') + if not os.path.exists(ts_lang_path): + logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") + build_language(language, load_path) + language = Language(load_path + f"/tree-sitter/{language}.so", language) parser.set_language(language) if isinstance(raw_code, str): diff --git a/tests/setup.py b/tests/setup.py index 4a7f6aa..d9516a6 100755 --- a/tests/setup.py +++ b/tests/setup.py @@ -1,8 +1,12 @@ from ..src.codetext.utils import build_language - +from tree_sitter_languages import get_language, get_parser if __name__ == '__main__': lang_list = ['python', 'cpp', 'java', 'c-sharp', 'ruby', 'rust', 'javascript', 'php', 'go'] for lang in lang_list: - build_language(lang) + # build_language(lang) + try: + get_parser(get_language(lang)) + except: + build_language(lang) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index d4a4ba4..af7288c 100755 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -14,8 +14,6 @@ def test_parse_code(self): def sum_2_num(a, b): return a + b """ - - build_language(language='python') parse_code(sample, 'python')