From 68f3b190029bd6aa9dc6a214fdc69061d7590b81 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 20 May 2024 09:41:52 +0700 Subject: [PATCH 1/6] Update .gitignore --- .gitignore | 3 ++- .idea/.gitignore | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 .idea/.gitignore diff --git a/.gitignore b/.gitignore index af066ef..6ea0d8c 100755 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ src/codetext.egg-info/* *.pyc *.so *.whl - +.idea +.vscode*.iml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..2840782 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,6 @@ +# Default ignored files +/shelf/ +/workspace.xml +.idea +.vscode +*.iml \ No newline at end of file From 31b5863e2b66720534ed9621464490d6386c1c04 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 20 May 2024 09:42:05 +0700 Subject: [PATCH 2/6] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6ea0d8c..bbdf54d 100755 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ src/codetext.egg-info/* *.so *.whl .idea -.vscode*.iml +.vscode +*.iml From 3876ce2e42d34d2661261f219379da7562c4a52d Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 20 May 2024 09:42:50 +0700 Subject: [PATCH 3/6] skip parser build (Attempt #1) --- pyproject.toml | 5 +++-- requirements.txt | 1 + src/codetext/utils/utils.py | 22 ++++++++++++++-------- tests/setup.py | 8 ++++++-- tests/test_utils/test_utils.py | 2 -- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8bb24d..b6a68f6 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codetext" -version = "0.0.8" +version = "0.0.9" authors = [ { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, ] @@ -21,7 +21,8 @@ dependencies = [ "Levenshtein>=0.20", "langdetect>=1.0.0", "bs4>=0.0.1", - "tabulate>=0.9.0" + "tabulate>=0.9.0", + "tree_sitter_languages>=1.10.0" ] [project.urls] diff --git a/requirements.txt b/requirements.txt index d438040..9eb91b2 100755 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ tabulate Levenshtein langdetect bs4 +tree-sitter-languages \ No newline at end of file diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index d330ecb..16c94f8 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -92,15 +92,21 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename) load_path = str(calling_script_path.parent) - ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') - if not os.path.exists(ts_lang_path): - logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") - build_language(language, load_path) - + # Get parser from languages parser = Parser() - language = Language(load_path + f"/tree-sitter/{language}.so", language) - parser.set_language(language) - + try: + from tree_sitter_languages import get_language, get_parser + parser = get_parser(get_language(language)) + except ImportError: + # Work-around when pre-built binaries wheels for tree-sitter-languages are not available + logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") + ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') + if not os.path.exists(ts_lang_path): + logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") + build_language(language, load_path) + language = Language(load_path + f"/tree-sitter/{language}.so", language) + parser.set_language(language) + if isinstance(raw_code, str): raw_code = bytes(raw_code, 'utf8') elif isinstance(raw_code, bytes): diff --git a/tests/setup.py b/tests/setup.py index 4a7f6aa..d9516a6 100755 --- a/tests/setup.py +++ b/tests/setup.py @@ -1,8 +1,12 @@ from ..src.codetext.utils import build_language - +from tree_sitter_languages import get_language, get_parser if __name__ == '__main__': lang_list = ['python', 'cpp', 'java', 'c-sharp', 'ruby', 'rust', 'javascript', 'php', 'go'] for lang in lang_list: - build_language(lang) + # build_language(lang) + try: + get_parser(get_language(lang)) + except: + build_language(lang) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index d4a4ba4..af7288c 100755 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -14,8 +14,6 @@ def test_parse_code(self): def sum_2_num(a, b): return a + b """ - - build_language(language='python') parse_code(sample, 'python') From ced3bf6e83d0975a2cc31a5d0f0fc13cc0c62ead Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 10:46:45 +0700 Subject: [PATCH 4/6] # Please enter the commit message for your changes. Lines starting # with '#' will be ignored, and an empty message aborts the commit. # # On branch dev/v0.0.9 # Changes to be committed: # modified: requirements.txt # Update `requirements` - Current version of `codetext` can only work with `tree-sitter==0.20.4` - Replace current language builders with `tree_sitter_languages==1.10.2` --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d438040..4bc4c06 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # for preprocessing -tree-sitter +tree-sitter==0.20.4 tabulate Levenshtein langdetect bs4 +tree_sitter_languages==1.10.2 From 44fafa1694a11a5bc36ade7db7bc242f6b78a4c7 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 11:09:56 +0700 Subject: [PATCH 5/6] Skip language build, use pre-built `get_language` --- src/codetext/utils/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index 16c94f8..98d57e5 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -96,7 +96,7 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) parser = Parser() try: from tree_sitter_languages import get_language, get_parser - parser = get_parser(get_language(language)) + language = get_language(language) except ImportError: # Work-around when pre-built binaries wheels for tree-sitter-languages are not available logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") @@ -104,9 +104,9 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) if not os.path.exists(ts_lang_path): logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") build_language(language, load_path) - language = Language(load_path + f"/tree-sitter/{language}.so", language) - parser.set_language(language) - + language = Language(load_path + f"/tree-sitter/{language}.so", language) + parser.set_language(language) + if isinstance(raw_code, str): raw_code = bytes(raw_code, 'utf8') elif isinstance(raw_code, bytes): From 3daaa8ac83d468888d7467bd04b9a6ade7fb1135 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 11:11:23 +0700 Subject: [PATCH 6/6] Fix indentation --- src/codetext/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index 98d57e5..5975897 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -104,7 +104,7 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) if not os.path.exists(ts_lang_path): logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") build_language(language, load_path) - language = Language(load_path + f"/tree-sitter/{language}.so", language) + language = Language(load_path + f"/tree-sitter/{language}.so", language) parser.set_language(language) if isinstance(raw_code, str):