From b1e774910ab7b9cb967536162750841adb6044db Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Mon, 20 Nov 2023 13:51:33 -0500
Subject: [PATCH 1/4] Add new built-in function for webpage text extraction.

---
 evadb/functions/function_bootstrap_queries.py |   7 +
 evadb/functions/webpage_text_extractor.py     | 152 ++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 evadb/functions/webpage_text_extractor.py

diff --git a/evadb/functions/function_bootstrap_queries.py b/evadb/functions/function_bootstrap_queries.py
index f8186d4dd3..7d6ffa4547 100644
--- a/evadb/functions/function_bootstrap_queries.py
+++ b/evadb/functions/function_bootstrap_queries.py
@@ -176,6 +176,12 @@
 EvaDB_INSTALLATION_DIR
 )

+Web_text_query = """CREATE FUNCTION IF NOT EXISTS WebpageTextExtractor
+    IMPL '{}/functions/webpage_text_extractor.py';
+    """.format(
+    EvaDB_INSTALLATION_DIR
+)
+
 Text_feat_function_query = """CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor
     IMPL '{}/functions/sentence_feature_extractor.py';
     """.format(
@@ -282,6 +288,7 @@ def init_builtin_functions(db: EvaDBDatabase, mode: str = "debug") -> None:
         face_detection_function_query,
         # Mvit_function_query,
         Sift_function_query,
+        Web_text_query,
         Yolo_function_query,
         stablediffusion_function_query,
         dalle_function_query,
diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
new file mode 100644
index 0000000000..54b3b90981
--- /dev/null
+++ b/evadb/functions/webpage_text_extractor.py
@@ -0,0 +1,152 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.common.by import By
+import concurrent.futures
+import pandas as pd
+import time
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.abstract.abstract_function import AbstractFunction
+from evadb.functions.decorators.decorators import forward, setup
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+import easyocr
+from tqdm import tqdm
+
+
+reader = easyocr.Reader(["en"], gpu=True)
+
+
+def scrape_user_page(url):
+    options = FirefoxOptions()
+    options.add_argument("--headless")
+    # Start the driver before the try block so the finally clause only
+    # quits a driver that actually launched.
+    driver = webdriver.Firefox(options=options)
+
+    try:
+        driver.set_window_size(1920, 1080)
+        # Open the GitHub profile page for the given username
+        driver.get(f"https://github.com/{url}")
+
+        # Capture the user profile section
+        user_info_blocks = []
+        try:
+            user_info_blocks.append(driver.find_element(By.CLASS_NAME, "h-card"))
+        except Exception:
+            pass
+        info_ids = ["user-profile-frame", "user-private-profile-frame"]
+        for info_id in info_ids:
+            try:
+                user_info_blocks.append(driver.find_element(By.ID, info_id))
+            except Exception:
+                pass
+
+        extracted_text = ""
+        for info_block in user_info_blocks:
+            screenshot = info_block.screenshot_as_png
+            # OCR the screenshot bytes with EasyOCR
+            result = reader.readtext(screenshot, detail=0)
+            for i in result:
+                extracted_text += i + " "
+
+        return extracted_text
+
+    except Exception as e:
+        print(f"Error for {url}: {str(e)}")
+        return str(e)
+    finally:
+        driver.quit()
+
+
+# Define a function to extract text from a set of URLs
+def extract_text_from_url(url):
+    try:
+        # Scrape user page using Selenium and EasyOCR
+        extracted_text = scrape_user_page(url)
+    except Exception as e:
+        error_msg = f"Error extracting text from {url}: {str(e)}"
+        print(error_msg)
+        return error_msg
+
+    return extracted_text
+
+
+class WebpageTextExtractor(AbstractFunction):
+    """
+    Arguments:
+        None
+
+    Input Signatures:
+        urls (list) : GitHub usernames; each profile page at https://github.com/<username> is scraped.
+
+    Output Signatures:
+        extracted_text (list) : The text extracted from each profile page.
+
+    Example Usage:
+        Pass a column of GitHub usernames to scrape their profile pages:
+
+        urls = ["octocat", "torvalds"]
+    """
+
+    @property
+    def name(self) -> str:
+        return "WebpageTextExtractor"
+
+    @setup(cacheable=False, function_type="web-scraping")
+    def setup(self) -> None:
+        # Any setup or initialization can be done here if needed
+        pass
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["urls"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["extracted_text"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+    )
+    def forward(self, input_df):
+        # Ensure usernames are provided
+        if input_df.empty or input_df.iloc[:, 0].isnull().all():
+            raise ValueError("At least one GitHub username must be provided.")
+
+        print(input_df)
+
+        # Read usernames from the first (and only) input column
+        urls = input_df.iloc[:, 0]
+
+        # Number of worker threads for the thread pool
+        num_workers = 1
+        # Note: CUDA errors in EasyOCR with more than 1 worker
+        ## profiling
+        # 1 worker: 218.00s
+        # 4 workers: 147.44s
+        # 8 workers: 134.55s
+        # 12 workers: 149.89s
+
+        num_urls = len(urls)
+
+        print(f"Extracting text from {num_urls} URLs using {num_workers} workers")
+
+        start = time.time()
+        extracted_text_lists = []
+        # Use ThreadPoolExecutor for concurrent processing
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            # Submit tasks to extract text from each URL
+            extracted_text_lists = list(
+                tqdm(executor.map(extract_text_from_url, urls), total=num_urls)
+            )
+
+        # Create a DataFrame from the extracted text
+        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
+        end = time.time()
+        print("time taken: {:.2f}s".format(end - start))
+        return extracted_text_df

From 759cc605b24766dd2aad9c77fa9a59c703cfa040 Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 07:50:16 -0500
Subject: [PATCH 2/4] test

---
 evadb/functions/webpage_text_extractor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
index 54b3b90981..43f97608cf 100644
--- a/evadb/functions/webpage_text_extractor.py
+++ b/evadb/functions/webpage_text_extractor.py
@@ -15,6 +15,7 @@


 reader = easyocr.Reader(["en"], gpu=True)

+#testtest

 def scrape_user_page(url):

From 9c413e435ac599e1cf9da96ca85544557e6d6c14 Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 09:13:37 -0500
Subject: [PATCH 3/4] Add text recognition as a new built-in function.
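
Registers EasyOCR-based text recognition over image file paths under the
SQL name TextRecognizer. A rough usage sketch; the my_images table and
its image_path column are illustrative only, not part of this patch:

    CREATE TABLE IF NOT EXISTS my_images (image_path TEXT(100));
    SELECT TextRecognizer(image_path) FROM my_images;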
---
 evadb/functions/function_bootstrap_queries.py |   6 +-
 evadb/functions/text_recognition.py           | 101 ++++++++++++++++++
 2 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 evadb/functions/text_recognition.py

diff --git a/evadb/functions/function_bootstrap_queries.py b/evadb/functions/function_bootstrap_queries.py
index 7d6ffa4547..70e2e8c89e 100644
--- a/evadb/functions/function_bootstrap_queries.py
+++ b/evadb/functions/function_bootstrap_queries.py
@@ -176,8 +176,8 @@
 EvaDB_INSTALLATION_DIR
 )

-Web_text_query = """CREATE FUNCTION IF NOT EXISTS WebpageTextExtractor
-    IMPL '{}/functions/webpage_text_extractor.py';
+Text_recognition_query = """CREATE FUNCTION IF NOT EXISTS TextRecognizer
+    IMPL '{}/functions/text_recognition.py';
     """.format(
     EvaDB_INSTALLATION_DIR
 )
@@ -288,7 +288,7 @@ def init_builtin_functions(db: EvaDBDatabase, mode: str = "debug") -> None:
         face_detection_function_query,
         # Mvit_function_query,
         Sift_function_query,
-        Web_text_query,
+        Text_recognition_query,
         Yolo_function_query,
         stablediffusion_function_query,
         dalle_function_query,
diff --git a/evadb/functions/text_recognition.py b/evadb/functions/text_recognition.py
new file mode 100644
index 0000000000..3158d535a6
--- /dev/null
+++ b/evadb/functions/text_recognition.py
@@ -0,0 +1,101 @@
+import concurrent.futures
+import pandas as pd
+import time
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.abstract.abstract_function import AbstractFunction
+from evadb.functions.decorators.decorators import forward, setup
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+import easyocr
+from tqdm import tqdm
+
+
+reader = easyocr.Reader(["en"], gpu=True)
+
+
+def extract_text(image_path):
+    try:
+        extracted_text = ""
+        result = reader.readtext(image_path, detail=0)
+        for element in result:
+            extracted_text += element + " "
+        return extracted_text
+
+    except Exception as e:
+        print(f"Error for {image_path}: {str(e)}")
+        return str(e)
+
+def get_text_from_image(path):
+    try:
+        # Extract text using EasyOCR
+        extracted_text = extract_text(path)
+    except Exception as e:
+        error_msg = f"Error extracting text from {path}: {str(e)}"
+        print(error_msg)
+        return error_msg
+    return extracted_text
+
+
+class TextRecognizer(AbstractFunction):
+    """
+    Arguments:
+        None
+
+    Input Signatures:
+        image_path (str) : The path to the image from which to extract text.
+
+    Output Signatures:
+        extracted_text (str) : The text extracted from the image at that path.
+
+    Example Usage:
+        Apply the function to a column of image paths to recognize their text.
+    """
+
+    @property
+    def name(self) -> str:
+        return "TextRecognizer"
+
+    @setup(cacheable=False, function_type="text-recognition")
+    def setup(self) -> None:
+        pass
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["image_path"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["extracted_text"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(None,)],
+            )
+        ],
+    )
+    def forward(self, input_df):
+        if input_df.empty or input_df.iloc[:, 0].isnull().all():
+            raise ValueError("At least one image path must be provided.")
+
+        print(input_df)
+
+        paths = input_df.iloc[:, 0]  # first (and only) input column: image paths
+
+        num_workers = 1  # EasyOCR hits CUDA errors with more than one worker
+
+        start = time.time()
+        extracted_text_lists = []
+        # Use ThreadPoolExecutor for concurrent processing
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            # Submit tasks to extract text from each image
+            extracted_text_lists = list(
+                tqdm(executor.map(get_text_from_image, paths), total=len(paths))
+            )
+
+        # Create a DataFrame from the extracted text
+        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
+        end = time.time()
+        print("time taken: {:.2f}s".format(end - start))
+        return extracted_text_df

From 87cf590f70acef613567681316d65254d1e39bfd Mon Sep 17 00:00:00 2001
From: sharonchu
Date: Wed, 22 Nov 2023 09:18:31 -0500
Subject: [PATCH 4/4] Remove webpage_text_extractor.py

---
 evadb/functions/webpage_text_extractor.py | 153 ----------------------
 1 file changed, 153 deletions(-)
 delete mode 100644 evadb/functions/webpage_text_extractor.py

diff --git a/evadb/functions/webpage_text_extractor.py b/evadb/functions/webpage_text_extractor.py
deleted file mode 100644
index 43f97608cf..0000000000
--- a/evadb/functions/webpage_text_extractor.py
+++ /dev/null
@@ -1,153 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.common.by import By
-import concurrent.futures
-import pandas as pd
-import time
-from evadb.catalog.catalog_type import NdArrayType
-from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.functions.decorators.decorators import forward, setup
-from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
-
-import easyocr
-from tqdm import tqdm
-
-
-reader = easyocr.Reader(["en"], gpu=True)
-
-#testtest
-
-def scrape_user_page(url):
-    options = FirefoxOptions()
-    options.add_argument("--headless")
-    # Start the driver before the try block so the finally clause only
-    # quits a driver that actually launched.
-    driver = webdriver.Firefox(options=options)
-
-    try:
-        driver.set_window_size(1920, 1080)
-        # Open the GitHub profile page for the given username
-        driver.get(f"https://github.com/{url}")
-
-        # Capture the user profile section
-        user_info_blocks = []
-        try:
-            user_info_blocks.append(driver.find_element(By.CLASS_NAME, "h-card"))
-        except Exception:
-            pass
-        info_ids = ["user-profile-frame", "user-private-profile-frame"]
-        for info_id in info_ids:
-            try:
-                user_info_blocks.append(driver.find_element(By.ID, info_id))
-            except Exception:
-                pass
-
-        extracted_text = ""
-        for info_block in user_info_blocks:
-            screenshot = info_block.screenshot_as_png
-            # OCR the screenshot bytes with EasyOCR
-            result = reader.readtext(screenshot, detail=0)
-            for i in result:
-                extracted_text += i + " "
-
-        return extracted_text
-
-    except Exception as e:
-        print(f"Error for {url}: {str(e)}")
-        return str(e)
-    finally:
-        driver.quit()
-
-
-# Define a function to extract text from a set of URLs
-def extract_text_from_url(url):
-    try:
-        # Scrape user page using Selenium and EasyOCR
-        extracted_text = scrape_user_page(url)
-    except Exception as e:
-        error_msg = f"Error extracting text from {url}: {str(e)}"
-        print(error_msg)
-        return error_msg
-
-    return extracted_text
-
-
-class WebpageTextExtractor(AbstractFunction):
-    """
-    Arguments:
-        None
-
-    Input Signatures:
-        urls (list) : GitHub usernames; each profile page at https://github.com/<username> is scraped.
-
-    Output Signatures:
-        extracted_text (list) : The text extracted from each profile page.
-
-    Example Usage:
-        Pass a column of GitHub usernames to scrape their profile pages:
-
-        urls = ["octocat", "torvalds"]
-    """
-
-    @property
-    def name(self) -> str:
-        return "WebpageTextExtractor"
-
-    @setup(cacheable=False, function_type="web-scraping")
-    def setup(self) -> None:
-        # Any setup or initialization can be done here if needed
-        pass
-
-    @forward(
-        input_signatures=[
-            PandasDataframe(
-                columns=["urls"],
-                column_types=[NdArrayType.STR],
-                column_shapes=[(None,)],
-            )
-        ],
-        output_signatures=[
-            PandasDataframe(
-                columns=["extracted_text"],
-                column_types=[NdArrayType.STR],
-                column_shapes=[(None,)],
-            )
-        ],
-    )
-    def forward(self, input_df):
-        # Ensure usernames are provided
-        if input_df.empty or input_df.iloc[:, 0].isnull().all():
-            raise ValueError("At least one GitHub username must be provided.")
-
-        print(input_df)
-
-        # Read usernames from the first (and only) input column
-        urls = input_df.iloc[:, 0]
-
-        # Number of worker threads for the thread pool
-        num_workers = 1
-        # Note: CUDA errors in EasyOCR with more than 1 worker
-        ## profiling
-        # 1 worker: 218.00s
-        # 4 workers: 147.44s
-        # 8 workers: 134.55s
-        # 12 workers: 149.89s
-
-        num_urls = len(urls)
-
-        print(f"Extracting text from {num_urls} URLs using {num_workers} workers")
-
-        start = time.time()
-        extracted_text_lists = []
-        # Use ThreadPoolExecutor for concurrent processing
-        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-            # Submit tasks to extract text from each URL
-            extracted_text_lists = list(
-                tqdm(executor.map(extract_text_from_url, urls), total=num_urls)
-            )
-
-        # Create a DataFrame from the extracted text
-        extracted_text_df = pd.DataFrame({"extracted_text": extracted_text_lists})
-        end = time.time()
-        print("time taken: {:.2f}s".format(end - start))
-        return extracted_text_df
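
-- 
After this series, text_recognition.py is the only file that remains: the
GitHub-specific WebpageTextExtractor is dropped in favor of the generic
TextRecognizer. A rough end-to-end sketch via the Python client follows;
it assumes the standard evadb connect/cursor API, and the demo table and
sample.png are made up for illustration:

    import evadb

    # Built-in functions, including TextRecognizer, are registered by
    # init_builtin_functions() when the database starts.
    cursor = evadb.connect().cursor()
    cursor.query("CREATE TABLE IF NOT EXISTS demo (image_path TEXT(100));").df()
    cursor.query("INSERT INTO demo (image_path) VALUES ('sample.png');").df()
    # One row of recognized text per input image path.
    print(cursor.query("SELECT TextRecognizer(image_path) FROM demo;").df())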