# =============================================================================
# Reconstructed from a whitespace-collapsed 8-patch series. Each section below
# is the post-series state of the code the patches add to the named file.
# =============================================================================

# -----------------------------------------------------------------------------
# file: evadb/functions/pytesseract_function.py
# -----------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ast
from typing import Union

import numpy as np
import pandas as pd

from evadb.catalog.catalog_type import NdArrayType
from evadb.functions.abstract.abstract_function import AbstractFunction
from evadb.functions.decorators.decorators import forward, setup
from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
from evadb.utils.generic_utils import try_to_import_pytesseract


def _as_bool(flag: Union[bool, str]) -> bool:
    """Return the truth value of a setup() flag given as a bool or a string.

    Strings are accepted because the series switched the flag parameters to
    ``str`` (presumably because they arrive as text from the query layer --
    TODO confirm). ``"true"``/``"false"`` are matched case-insensitively; any
    other string is parsed with ``ast.literal_eval`` as the original code did.

    Raises:
        ValueError, SyntaxError: if a string is not a valid Python literal.
    """
    if isinstance(flag, str):
        text = flag.strip()
        lowered = text.lower()
        if lowered in ("true", "false"):
            return lowered == "true"
        return bool(ast.literal_eval(text))
    return bool(flag)


class PyTesseractOCRFunction(AbstractFunction):
    """Extract text from images with pytesseract, with optional preprocessing.

    Each image can be converted to grayscale and median-blurred (both via
    OpenCV) before it is handed to ``pytesseract.image_to_string``.
    """

    @property
    def name(self) -> str:
        return "PyTesseractOCRFunction"

    @setup(cacheable=False, function_type="FeatureExtraction", batchable=False)
    def setup(
        self,
        convert_to_grayscale: Union[bool, str] = True,
        remove_noise: Union[bool, str] = True,
        tesseract_path: str = None,
    ) -> None:  # type: ignore
        """Check dependencies and record the preprocessing options.

        Args:
            convert_to_grayscale: bool, or its string form, enabling the
                RGB -> grayscale conversion.
            remove_noise: bool, or its string form, enabling the median blur.
            tesseract_path: value for ``pytesseract.pytesseract.tesseract_cmd``.
                When ``None``, pytesseract's own default is left untouched.

        Raises:
            ValueError: if OpenCV or pytesseract cannot be imported, or a
                flag string is not a valid literal.
        """
        try_to_import_pytesseract()
        # The helper imports inside its own scope only, so the module must be
        # imported here before it can be referenced.
        import pytesseract

        # Assigning None would replace pytesseract's default command, so only
        # override it when a path was actually supplied.
        if tesseract_path is not None:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        # Parse once here instead of on every forward() call.
        self.grayscale_flag = _as_bool(convert_to_grayscale)
        self.remove_noise = _as_bool(remove_noise)

    # NOTE(review): the declared input is FLOAT64 with shape (None, 3), while
    # forward() applies COLOR_RGB2GRAY, which implies an (H, W, 3) image, and a
    # 5x5 median blur, which OpenCV restricts to certain depths -- confirm the
    # signature against what the loader actually produces.
    @forward(
        input_signatures=[
            PandasDataframe(
                columns=["data"],
                column_types=[NdArrayType.FLOAT64],
                column_shapes=[(None, 3)],
            ),
        ],
        output_signatures=[
            PandasDataframe(
                columns=["text"],
                column_types=[NdArrayType.STR],
                column_shapes=[(None,)],
            )
        ],
    )
    def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        """Run OCR on every image in ``frames["data"]``.

        Args:
            frames: DataFrame whose ``data`` column holds one image per row.

        Returns:
            DataFrame with a single ``text`` column, one row per input row.
        """
        # Availability was verified in setup(); these imports bind the names
        # in this scope and are served from sys.modules after the first call.
        import cv2
        import pytesseract

        texts = []
        # Iterate by position so a frame whose index does not start at 0 works.
        for image in frames["data"]:
            img_data = np.asarray(image)

            if self.grayscale_flag:
                img_data = cv2.cvtColor(img_data, cv2.COLOR_RGB2GRAY)

            if self.remove_noise:
                img_data = cv2.medianBlur(img_data, 5)

            # apply the OCR
            texts.append(pytesseract.image_to_string(img_data))

        return pd.DataFrame({"text": texts})


# -----------------------------------------------------------------------------
# file: evadb/utils/generic_utils.py  (additions after try_to_import_norfair)
# -----------------------------------------------------------------------------
def try_to_import_pytesseract():
    """Raise ValueError with an install hint if pytesseract is not importable.

    OpenCV is checked first through try_to_import_cv2(), which is defined
    elsewhere in this module.
    """
    try_to_import_cv2()

    try:
        import pytesseract  # noqa: F401
    except ImportError:
        raise ValueError(
            """Could not import pytesseract python package.
                Please install it with pip install pytesseract"""
        )


def is_pytessseract_available():
    """Return True if try_to_import_pytesseract() succeeds, False otherwise."""
    try:
        try_to_import_pytesseract()
        return True
    except ValueError:
        return False


# Correctly spelled alias. The triple-"s" name is kept because test/markers.py
# imports it under that spelling.
is_pytesseract_available = is_pytessseract_available


# -----------------------------------------------------------------------------
# file: test/markers.py  (addition; `is_pytessseract_available` is also added to
# that file's existing `evadb.utils.generic_utils` import list)
# -----------------------------------------------------------------------------
pytesseract_skip_marker = pytest.mark.skipif(
    is_pytessseract_available() is False, reason="requires pytesseract"
)


# -----------------------------------------------------------------------------
# file: setup.py  (addition)
# -----------------------------------------------------------------------------
# "pytesseract" is appended to the dependency list that ends with
# `"kornia",  # SIFT FEATURES` (the list's name is not visible in the hunk).


# -----------------------------------------------------------------------------
# file: test/integration_tests/long/functions/test_pytesseract.py
# -----------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test.markers import pytesseract_skip_marker
from test.util import get_evadb_for_testing

from evadb.server.command_handler import execute_query_fetch_all


class PytesseractTest(unittest.TestCase):
    """Integration test that runs PyTesseractOCRFunction over one loaded image."""

    def setUp(self) -> None:
        self.evadb = get_evadb_for_testing()
        self.evadb.catalog().reset()

        load_image_query = """LOAD IMAGE 'data/ocr/Example.jpg' INTO MyImage;"""

        execute_query_fetch_all(self.evadb, load_image_query)

    def tearDown(self) -> None:
        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyImage;")

    @pytesseract_skip_marker
    def test_pytesseract_function(self):
        function_name = "PyTesseractOCRFunction"
        execute_query_fetch_all(self.evadb, f"DROP FUNCTION IF EXISTS {function_name};")

        # The space after EXISTS is required; without it the keyword and the
        # function name fuse into a single token.
        create_function_query = f"""CREATE FUNCTION IF NOT EXISTS {function_name}
            IMPL  'evadb/functions/pytesseract_function.py';
        """
        execute_query_fetch_all(self.evadb, create_function_query)

        ocr_query = f"SELECT {function_name}(data) FROM MyImage;"
        output_batch = execute_query_fetch_all(self.evadb, ocr_query)
        self.assertEqual(1, len(output_batch))