-
Notifications
You must be signed in to change notification settings - Fork 264
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added fitz and unstructuredio pdf loader py evadb script
- Loading branch information
Showing
1 changed file
with
96 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import pandas as pd | ||
import os | ||
import fitz | ||
import requests | ||
import csv | ||
|
||
# coding=utf-8 | ||
from typing import Iterator, List | ||
import requests | ||
import pandas as pd | ||
|
||
from evadb.catalog.sql_config import ROW_NUM_COLUMN | ||
from evadb.readers.abstract_reader import AbstractReader | ||
from evadb.utils.generic_utils import try_to_import_fitz | ||
|
||
class PDFReader(AbstractReader): | ||
def __init__(self, *args, **kwargs): | ||
""" | ||
Reads a PDF file and yields frame data. | ||
Args: | ||
column_list: list of columns (TupleValueExpression) | ||
to read from the PDF file | ||
""" | ||
super().__init__(*args, **kwargs) | ||
try_to_import_fitz() | ||
|
||
def _read(self) -> Iterator[str]: | ||
import fitz | ||
|
||
doc = fitz.open(self.file_url) | ||
|
||
for page in doc: | ||
text = page.get_text().encode("utf8").decode("utf-8") | ||
yield text | ||
|
||
def process_text(raw_text: str) -> List[str]: | ||
lines = raw_text.split('\n') | ||
sentences = [] | ||
current_sentence = '' | ||
|
||
for line in lines: | ||
stripped_line = line.strip() | ||
if not stripped_line: | ||
continue | ||
if (len(stripped_line) < 60 and not stripped_line.endswith(('.', '!', '?'))): | ||
if current_sentence: | ||
sentences.append(current_sentence) | ||
current_sentence = '' | ||
sentences.append(stripped_line) | ||
continue | ||
if stripped_line.endswith(('.', '!', '?')): | ||
current_sentence += ' ' + stripped_line | ||
sentences.append(current_sentence) | ||
current_sentence = '' | ||
else: | ||
current_sentence += ' ' + stripped_line | ||
|
||
if current_sentence: | ||
sentences.append(current_sentence) | ||
|
||
return sentences | ||
|
||
def send_to_unstructured_io(file_path: str) -> pd.DataFrame: | ||
url = 'https://api.unstructured.io/general/v0/general' | ||
headers = { | ||
'accept': 'application/json', | ||
'unstructured-api-key': 'DTmC4RIyD6E3XMBJcucb3GOUMoKHWK', | ||
} | ||
data = { | ||
"strategy": "ocr_only", | ||
"ocr_languages": ["eng", "kor"] | ||
} | ||
file_data = {'files': open(file_path, 'rb')} | ||
response = requests.post(url, headers=headers, data=data, files=file_data) | ||
file_data['files'].close() | ||
|
||
structured_data = response.json() | ||
df = pd.json_normalize(structured_data) | ||
return df | ||
|
||
# Use the above components: | ||
|
||
reader = PDFReader(file_url="YOUR_PDF_PATH") | ||
raw_texts = list(reader._read()) | ||
|
||
all_sentences = [] | ||
for raw_text in raw_texts: | ||
all_sentences.extend(process_text(raw_text)) | ||
|
||
with open('fitz_processed.txt', 'w', encoding='utf-8') as f: | ||
for sentence in all_sentences: | ||
f.write(sentence + '\n') | ||
|
||
df = send_to_unstructured_io('fitz_processed.txt') | ||
print(df) | ||
df.to_csv("fitz+unstructured.csv", index=False) |