Skip to content

Commit

Permalink
added fitz and unstructuredio pdf loader py evadb script
Browse files Browse the repository at this point in the history
  • Loading branch information
sru3 committed Nov 4, 2023
1 parent c296318 commit 69b7426
Showing 1 changed file with 96 additions and 0 deletions.
96 changes: 96 additions & 0 deletions evadb/readers/fitz_unstructured_pdf_loader_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import pandas as pd
import os
import fitz
import requests
import csv

# coding=utf-8
from typing import Iterator, List
import requests
import pandas as pd

from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.readers.abstract_reader import AbstractReader
from evadb.utils.generic_utils import try_to_import_fitz

class PDFReader(AbstractReader):
def __init__(self, *args, **kwargs):
"""
Reads a PDF file and yields frame data.
Args:
column_list: list of columns (TupleValueExpression)
to read from the PDF file
"""
super().__init__(*args, **kwargs)
try_to_import_fitz()

def _read(self) -> Iterator[str]:
import fitz

doc = fitz.open(self.file_url)

for page in doc:
text = page.get_text().encode("utf8").decode("utf-8")
yield text

def process_text(raw_text: str) -> List[str]:
lines = raw_text.split('\n')
sentences = []
current_sentence = ''

for line in lines:
stripped_line = line.strip()
if not stripped_line:
continue
if (len(stripped_line) < 60 and not stripped_line.endswith(('.', '!', '?'))):
if current_sentence:
sentences.append(current_sentence)
current_sentence = ''
sentences.append(stripped_line)
continue
if stripped_line.endswith(('.', '!', '?')):
current_sentence += ' ' + stripped_line
sentences.append(current_sentence)
current_sentence = ''
else:
current_sentence += ' ' + stripped_line

if current_sentence:
sentences.append(current_sentence)

return sentences

def send_to_unstructured_io(file_path: str) -> pd.DataFrame:
url = 'https://api.unstructured.io/general/v0/general'
headers = {
'accept': 'application/json',
'unstructured-api-key': 'DTmC4RIyD6E3XMBJcucb3GOUMoKHWK',
}
data = {
"strategy": "ocr_only",
"ocr_languages": ["eng", "kor"]
}
file_data = {'files': open(file_path, 'rb')}
response = requests.post(url, headers=headers, data=data, files=file_data)
file_data['files'].close()

structured_data = response.json()
df = pd.json_normalize(structured_data)
return df

# Use the above components:

reader = PDFReader(file_url="YOUR_PDF_PATH")
raw_texts = list(reader._read())

all_sentences = []
for raw_text in raw_texts:
all_sentences.extend(process_text(raw_text))

with open('fitz_processed.txt', 'w', encoding='utf-8') as f:
for sentence in all_sentences:
f.write(sentence + '\n')

df = send_to_unstructured_io('fitz_processed.txt')
print(df)
df.to_csv("fitz+unstructured.csv", index=False)

0 comments on commit 69b7426

Please sign in to comment.