Skip to content

Commit

Permalink
cleaned general prompt
Browse files Browse the repository at this point in the history
  • Loading branch information
torrmal committed Oct 11, 2024
1 parent ae9041d commit 0fbf28c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 12 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ from aipdf import ocr
api_key = 'your_openai_api_key'

file = open('somepdf.pdf', 'rb')
markdown_pages = ocr(file, api_key, prompt="extract markdown, extract tables and turn charts into tables")
markdown_pages = ocr(file, api_key)

```

Expand All @@ -47,7 +47,7 @@ We chose that you pass a file object, because that way it is flexible for you to
pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content)

# extract
pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json")

```
### From S3
Expand All @@ -62,7 +62,7 @@ s3 = boto3.client('s3', config=Config(signature_version='s3v4'),

pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read())
# extract
pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json")
pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart")
```


Expand Down
31 changes: 22 additions & 9 deletions src/aipdf/ocr.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import io
from pdf2image import convert_from_bytes
from PIL import Image
import base64
import requests
import os
import logging
import concurrent.futures

from pdf2image import convert_from_bytes
from openai import OpenAI

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DEFAULT_PROMPT = "Please analyze this image and provide a markdown representation of its content. Include headings, lists, and any other relevant markdown formatting."
# Default instruction text used as the `prompt` argument of
# process_image_to_markdown() and ocr() when the caller does not pass one.
# It asks the model for markdown only, covering text, tables, images and charts.
DEFAULT_PROMPT = """
Extract the full markdown text from the given image, following these guidelines:
- Respond only with markdown, no additional commentary.
- Capture all the text, respecting titles, headers, subheaders, equations, etc.
- If there are tables in this page, convert each one into markdown table format and include it in the response.
- If there are images, provide a brief description of what is shown in each image, and include it in the response.
- If there are charts, for each chart include a markdown table with the data that represents the chart, with a column for each of the variables of the chart and the relevant estimated values.
"""

def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEFAULT_PROMPT):
"""
Expand All @@ -26,9 +32,11 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF
Returns:
str: The markdown representation of the image content, or None if an error occurs.
"""

base64_image = base64.b64encode(file_object.read()).decode('utf-8')
# Log that we're about to process a page
logging.info("About to process a page")

base64_image = base64.b64encode(file_object.read()).decode('utf-8')

try:
response = client.chat.completions.create(
model=model,
Expand All @@ -53,6 +61,7 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF

# Extract the markdown content from the response
markdown_content = response.choices[0].message.content
logging.info("Page processed successfully")
return markdown_content

except Exception as e:
Expand Down Expand Up @@ -99,7 +108,7 @@ def pdf_to_image_files(pdf_file):
return image_files


def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT):
def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT, pages_list = None):
"""
Convert a PDF file to a list of markdown-formatted pages using OpenAI's API.
Expand All @@ -109,13 +118,17 @@ def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1'
model (str, optional): by default is gpt-4o
base_url (str): Base URL of the API endpoint; use it to point the client wherever you need, e.g. at an Ollama server.
prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT.
pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages.
Returns:
list: A list of strings, each containing the markdown representation of a PDF page.
"""
client = OpenAI(api_key=api_key, base_url = base_url) # Create OpenAI client
# Convert PDF to image files
image_files = pdf_to_image_files(pdf_file)

if pages_list:
# Filter image_files to only include pages listed in pages_list (1-based page numbers)
image_files = [img for i, img in enumerate(image_files) if i + 1 in pages_list]

# List to store markdown content for each page
markdown_pages = [None] * len(image_files)
Expand Down

0 comments on commit 0fbf28c

Please sign in to comment.