From b1e774910ab7b9cb967536162750841adb6044db Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Mon, 20 Nov 2023 13:51:33 -0500
Subject: [PATCH 1/4] Add new built-in function for webpage text extraction.

---
 evadb/functions/function_bootstrap_queries.py |   7 +
 evadb/functions/webpage_text_extractor.py     | 152 ++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 evadb/functions/webpage_text_extractor.py

diff --git a/evadb/functions/function_bootstrap_queries.py b/evadb/functions/function_bootstrap_queries.py
index f8186d4dd3..7d6ffa4547 100644
--- a/evadb/functions/function_bootstrap_queries.py
+++ b/evadb/functions/function_bootstrap_queries.py
@@ -176,6 +176,12 @@
 EvaDB_INSTALLATION_DIR
 )

+Web_text_query = """CREATE FUNCTION IF NOT EXISTS WebpageTextExtractor
+    IMPL '{}/functions/webpage_text_extractor.py';
+    """.format(
+    EvaDB_INSTALLATION_DIR
+)
+
 Text_feat_function_query = """CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor
     IMPL '{}/functions/sentence_feature_extractor.py';
     """.format(
@@ -282,6 +288,7 @@ def init_builtin_functions(db: EvaDBDatabase, mode: str = "debug") -> None:
         face_detection_function_query,
         # Mvit_function_query,
         Sift_function_query,
+        Web_text_query,
         Yolo_function_query,
         stablediffusion_function_query,
         dalle_function_query,
diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
new file mode 100644
index 0000000000..54b3b90981
--- /dev/null
+++ b/evadb/functions/webpage_text_extractor.py
@@ -0,0 +1,152 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.common.by import By
+import concurrent.futures
+import pandas as pd
+import time
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.abstract.abstract_function import AbstractFunction
+from evadb.functions.decorators.decorators import forward, setup
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+import easyocr
+from tqdm import tqdm
+
+
+reader = easyocr.Reader(["en"], gpu=True)
+
+
+def scrape_user_page(url):
+    options = FirefoxOptions()
+    options.add_argument("--headless")
+    # Start the driver before the try block so the finally clause only
+    # quits a driver that actually launched.
+    driver = webdriver.Firefox(options=options)
+
+    try:
+        driver.set_window_size(1920, 1080)
+        # Open the GitHub profile page for the given username
+        driver.get(f"https://github.com/{url}")
+
+        # Capture the user profile section
+        user_info_blocks = []
+        try:
+            user_info_blocks.append(driver.find_element(By.CLASS_NAME, "h-card"))
+        except Exception:
+            pass
+        info_ids = ["user-profile-frame", "user-private-profile-frame"]
+        for info_id in info_ids:
+            try:
+                user_info_blocks.append(driver.find_element(By.ID, info_id))
+            except Exception:
+                pass
+
+        extracted_text = ""
+        for info_block in user_info_blocks:
+            screenshot = info_block.screenshot_as_png
+            # OCR the screenshot bytes with EasyOCR
+            result = reader.readtext(screenshot, detail=0)
+            for i in result:
+                extracted_text += i + " "
+
+        return extracted_text
+
+    except Exception as e:
+        print(f"Error for {url}: {str(e)}")
+        return str(e)
+    finally:
+        driver.quit()
+
+
+# Define a function to extract text from a set of URLs
+def extract_text_from_url(url):
+    try:
+        # Scrape user page using Selenium and EasyOCR
+        extracted_text = scrape_user_page(url)
+    except Exception as e:
+        error_msg = f"Error extracting text from {url}: {str(e)}"
+        print(error_msg)
+        return error_msg
+
+    return extracted_text
+
+
+class WebpageTextExtractor(AbstractFunction):
+    """
+    Arguments:
+        None
+
+    Input Signatures:
+        urls (list) : GitHub usernames; each profile page at https://github.com/<username> is scraped.
+
+    Output Signatures:
+        extracted_text (list) : The text extracted from each profile page.
+
+    Example Usage:
+        Pass a column of GitHub usernames to scrape their profile pages:
+
+        urls = ["octocat", "torvalds"]
+    """
+
+    @property
+    def name(self) -> str:
+        return "WebpageTextExtractor"
+
+    @setup(cacheable=False, function_type="web-scraping")
+    def setup(self) -> None:
+        # Any setup or initialization can be done here if needed
+        pass
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["urls"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["extracted_text"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+    )
+    def forward(self, input_df):
+        # Ensure usernames are provided
+        if input_df.empty or input_df.iloc[:, 0].isnull().all():
+            raise ValueError("At least one GitHub username must be provided.")
+
+        print(input_df)
+
+        # Read usernames from the first (and only) input column
+        urls = input_df.iloc[:, 0]
+
+        # Number of worker threads for the thread pool
+        num_workers = 1
+        # Note: CUDA errors in EasyOCR with more than 1 worker
+        ## profiling
+        # 1 worker: 218.00s
+        # 4 workers: 147.44s
+        # 8 workers: 134.55s
+        # 12 workers: 149.89s
+
+        num_urls = len(urls)
+
+        print(f"Extracting text from {num_urls} URLs using {num_workers} workers")
+
+        start = time.time()
+        extracted_text_lists = []
+        # Use ThreadPoolExecutor for concurrent processing
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            # Submit tasks to extract text from each URL
+            extracted_text_lists = list(
+                tqdm(executor.map(extract_text_from_url, urls), total=num_urls)
+            )
+
+        # Create a DataFrame from the extracted text
+        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
+        end = time.time()
+        print("time taken: {:.2f}s".format(end - start))
+        return extracted_text_df

From 759cc605b24766dd2aad9c77fa9a59c703cfa040 Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 07:50:16 -0500
Subject: [PATCH 2/4] test

---
 evadb/functions/webpage_text_extractor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
index 54b3b90981..43f97608cf 100644
--- a/evadb/functions/webpage_text_extractor.py
+++ b/evadb/functions/webpage_text_extractor.py
@@ -15,6 +15,7 @@


 reader = easyocr.Reader(["en"], gpu=True)

+#testtest

 def scrape_user_page(url):

From 9c413e435ac599e1cf9da96ca85544557e6d6c14 Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 09:13:37 -0500
Subject: [PATCH 3/4] Add text recognition as a new built-in function.
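
Registers EasyOCR-based text recognition over image file paths under the
SQL name TextRecognizer. A rough usage sketch; the my_images table and
its image_path column are illustrative only, not part of this patch:

    CREATE TABLE IF NOT EXISTS my_images (image_path TEXT(100));
    SELECT TextRecognizer(image_path) FROM my_images;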
---
 evadb/functions/function_bootstrap_queries.py |   6 +-
 evadb/functions/text_recognition.py           | 101 ++++++++++++++++++
 2 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 evadb/functions/text_recognition.py

diff --git a/evadb/functions/function_bootstrap_queries.py b/evadb/functions/function_bootstrap_queries.py
index 7d6ffa4547..70e2e8c89e 100644
--- a/evadb/functions/function_bootstrap_queries.py
+++ b/evadb/functions/function_bootstrap_queries.py
@@ -176,8 +176,8 @@
 EvaDB_INSTALLATION_DIR
 )

-Web_text_query = """CREATE FUNCTION IF NOT EXISTS WebpageTextExtractor
-    IMPL '{}/functions/webpage_text_extractor.py';
+Text_recognition_query = """CREATE FUNCTION IF NOT EXISTS TextRecognizer
+    IMPL '{}/functions/text_recognition.py';
     """.format(
     EvaDB_INSTALLATION_DIR
 )
@@ -288,7 +288,7 @@ def init_builtin_functions(db: EvaDBDatabase, mode: str = "debug") -> None:
         face_detection_function_query,
         # Mvit_function_query,
         Sift_function_query,
-        Web_text_query,
+        Text_recognition_query,
         Yolo_function_query,
         stablediffusion_function_query,
         dalle_function_query,
diff --git a/evadb/functions/text_recognition.py b/evadb/functions/text_recognition.py
new file mode 100644
index 0000000000..3158d535a6
--- /dev/null
+++ b/evadb/functions/text_recognition.py
@@ -0,0 +1,101 @@
+import concurrent.futures
+import pandas as pd
+import time
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.abstract.abstract_function import AbstractFunction
+from evadb.functions.decorators.decorators import forward, setup
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+import easyocr
+from tqdm import tqdm
+
+
+reader = easyocr.Reader(["en"], gpu=True)
+
+
+def extract_text(image_path):
+    try:
+        extracted_text = ""
+        result = reader.readtext(image_path, detail=0)
+        for element in result:
+            extracted_text += element + " "
+        return extracted_text
+
+    except Exception as e:
+        print(f"Error for {image_path}: {str(e)}")
+        return str(e)
+
+def get_text_from_image(path):
+    try:
+        # Extract text using EasyOCR
+        extracted_text = extract_text(path)
+    except Exception as e:
+        error_msg = f"Error extracting text from {path}: {str(e)}"
+        print(error_msg)
+        return error_msg
+    return extracted_text
+
+
+class TextRecognizer(AbstractFunction):
+    """
+    Arguments:
+        None
+
+    Input Signatures:
+        image_path (str) : The path to the image from which to extract text.
+
+    Output Signatures:
+        extracted_text (str) : The text extracted from the image at that path.
+
+    Example Usage:
+        Apply the function to a column of image paths to recognize their text.
+    """
+
+    @property
+    def name(self) -> str:
+        return "TextRecognizer"
+
+    @setup(cacheable=False, function_type="text-recognition")
+    def setup(self) -> None:
+        pass
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["image_path"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["extracted_text"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+    )
+    def forward(self, input_df):
+        if input_df.empty or input_df.iloc[:, 0].isnull().all():
+            raise ValueError("At least one image path must be provided.")
+
+        print(input_df)
+
+        paths = input_df.iloc[:, 0]  # first (and only) input column: image paths
+
+        num_workers = 1  # EasyOCR hits CUDA errors with more than one worker
+
+        start = time.time()
+        extracted_text_lists = []
+        # Use ThreadPoolExecutor for concurrent processing
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            # Submit tasks to extract text from each image
+            extracted_text_lists = list(
+                tqdm(executor.map(get_text_from_image, paths), total=len(paths))
+            )
+
+        # Create a DataFrame from the extracted text
+        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
+        end = time.time()
+        print("time taken: {:.2f}s".format(end - start))
+        return extracted_text_df

From 87cf590f70acef613567681316d65254d1e39bfd Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 09:18:31 -0500
Subject: [PATCH 4/4] Remove webpage_text_extractor.py

---
 evadb/functions/webpage_text_extractor.py | 153 ----------------------
 1 file changed, 153 deletions(-)
 delete mode 100644 evadb/functions/webpage_text_extractor.py

diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
deleted file mode 100644
index 43f97608cf..0000000000
--- a/evadb/functions/webpage_text_extractor.py
+++ /dev/null
@@ -1,153 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.common.by import By
-import concurrent.futures
-import pandas as pd
-import time
-from evadb.catalog.catalog_type import NdArrayType
-from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.functions.decorators.decorators import forward, setup
-from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
-
-import easyocr
-from tqdm import tqdm
-
-
-reader = easyocr.Reader(["en"], gpu=True)
-
-#testtest
-
-def scrape_user_page(url):
-    options = FirefoxOptions()
-    options.add_argument("--headless")
-    # Start the driver before the try block so the finally clause only
-    # quits a driver that actually launched.
-    driver = webdriver.Firefox(options=options)
-
-    try:
-        driver.set_window_size(1920, 1080)
-        # Open the GitHub profile page for the given username
-        driver.get(f"https://github.com/{url}")
-
-        # Capture the user profile section
-        user_info_blocks = []
-        try:
-            user_info_blocks.append(driver.find_element(By.CLASS_NAME, "h-card"))
-        except Exception:
-            pass
-        info_ids = ["user-profile-frame", "user-private-profile-frame"]
-        for info_id in info_ids:
-            try:
-                user_info_blocks.append(driver.find_element(By.ID, info_id))
-            except Exception:
-                pass
-
-        extracted_text = ""
-        for info_block in user_info_blocks:
-            screenshot = info_block.screenshot_as_png
-            # OCR the screenshot bytes with EasyOCR
-            result = reader.readtext(screenshot, detail=0)
-            for i in result:
-                extracted_text += i + " "
-
-        return extracted_text
-
-    except Exception as e:
-        print(f"Error for {url}: {str(e)}")
-        return str(e)
-    finally:
-        driver.quit()
-
-
-# Define a function to extract text from a set of URLs
-def extract_text_from_url(url):
-    try:
-        # Scrape user page using Selenium and EasyOCR
-        extracted_text = scrape_user_page(url)
-    except Exception as e:
-        error_msg = f"Error extracting text from {url}: {str(e)}"
-        print(error_msg)
-        return error_msg
-
-    return extracted_text
-
-
-class WebpageTextExtractor(AbstractFunction):
-    """
-    Arguments:
-        None
-
-    Input Signatures:
-        urls (list) : GitHub usernames; each profile page at https://github.com/<username> is scraped.
-
-    Output Signatures:
-        extracted_text (list) : The text extracted from each profile page.
-
-    Example Usage:
-        Pass a column of GitHub usernames to scrape their profile pages:
-
-        urls = ["octocat", "torvalds"]
-    """
-
-    @property
-    def name(self) -> str:
-        return "WebpageTextExtractor"
-
-    @setup(cacheable=False, function_type="web-scraping")
-    def setup(self) -> None:
-        # Any setup or initialization can be done here if needed
-        pass
-
-    @forward(
-        input_signatures=[
-            PandasDataframe(
-                columns=["urls"],
-                column_types=[NdArrayType.STR],
-                column_shapes=[(None,)],
-            )
-        ],
-        output_signatures=[
-            PandasDataframe(
-                columns=["extracted_text"],
-                column_types=[NdArrayType.STR],
-                column_shapes=[(None,)],
-            )
-        ],
-    )
-    def forward(self, input_df):
-        # Ensure usernames are provided
-        if input_df.empty or input_df.iloc[:, 0].isnull().all():
-            raise ValueError("At least one GitHub username must be provided.")
-
-        print(input_df)
-
-        # Read usernames from the first (and only) input column
-        urls = input_df.iloc[:, 0]
-
-        # Number of worker threads for the thread pool
-        num_workers = 1
-        # Note: CUDA errors in EasyOCR with more than 1 worker
-        ## profiling
-        # 1 worker: 218.00s
-        # 4 workers: 147.44s
-        # 8 workers: 134.55s
-        # 12 workers: 149.89s
-
-        num_urls = len(urls)
-
-        print(f"Extracting text from {num_urls} URLs using {num_workers} workers")
-
-        start = time.time()
-        extracted_text_lists = []
-        # Use ThreadPoolExecutor for concurrent processing
-        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-            # Submit tasks to extract text from each URL
-            extracted_text_lists = list(
-                tqdm(executor.map(extract_text_from_url, urls), total=num_urls)
-            )
-
-        # Create a DataFrame from the extracted text
-        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
-        end = time.time()
-        print("time taken: {:.2f}s".format(end - start))
-        return extracted_text_df
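
-- 
After this series, text_recognition.py is the only file that remains: the
GitHub-specific WebpageTextExtractor is dropped in favor of the generic
TextRecognizer. A rough end-to-end sketch via the Python client follows;
it assumes the standard evadb connect/cursor API, and the demo table and
sample.png are made up for illustration:

    import evadb

    # Built-in functions, including TextRecognizer, are registered by
    # init_builtin_functions() when the database starts.
    cursor = evadb.connect().cursor()
    cursor.query("CREATE TABLE IF NOT EXISTS demo (image_path TEXT(100));").df()
    cursor.query("INSERT INTO demo (image_path) VALUES ('sample.png');").df()
    # One row of recognized text per input image path.
    print(cursor.query("SELECT TextRecognizer(image_path) FROM demo;").df())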