Skip to content

Commit

Permalink
Merge branch 'main' into feat/dynamic-llm
Browse files Browse the repository at this point in the history
  • Loading branch information
trducng committed Apr 3, 2024
2 parents 8e919ce + ecf09b2 commit 304afc0
Show file tree
Hide file tree
Showing 32 changed files with 1,721 additions and 292 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ $RECYCLE.BIN/
.theflow/

# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
*.py[coid]

logs/
.gitsecret/keys/random_seed
Expand Down
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ repos:
hooks:
- id: mypy
additional_dependencies:
[types-PyYAML==6.0.12.11, "types-requests", "sqlmodel"]
[
types-PyYAML==6.0.12.11,
"types-requests",
"sqlmodel",
"types-Markdown",
]
args: ["--check-untyped-defs", "--ignore-missing-imports"]
exclude: "^templates/"
- repo: https://github.com/codespell-project/codespell
Expand Down
5 changes: 4 additions & 1 deletion libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AdobeReader,
DirectoryReader,
MathpixPDFReader,
OCRReader,
Expand Down Expand Up @@ -41,7 +42,7 @@ class DocumentIngestor(BaseComponent):
The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`
"""

pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
pdf_mode: str = "normal" # "normal", "mathpix", "ocr", "multimodal"
doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
text_splitter: BaseSplitter = TokenSplitter.withx(
chunk_size=1024,
Expand All @@ -61,6 +62,8 @@ def _get_reader(self, input_files: list[str | Path]):
pass # use default loader of llama-index which is pypdf
elif self.pdf_mode == "ocr":
file_extractors[".pdf"] = OCRReader()
elif self.pdf_mode == "multimodal":
file_extractors[".pdf"] = AdobeReader()
else:
file_extractors[".pdf"] = MathpixPDFReader()

Expand Down
14 changes: 6 additions & 8 deletions libs/kotaemon/kotaemon/indices/qa/citation.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,18 +104,16 @@ def invoke(self, context: str, question: str):
print("CitationPipeline: invoking LLM")
llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
print("CitationPipeline: finish invoking LLM")
if not llm_output.messages:
return None
function_output = llm_output.messages[0].additional_kwargs["function_call"][
"arguments"
]
output = QuestionAnswer.parse_raw(function_output)
except Exception as e:
print(e)
return None

if not llm_output.messages:
return None

function_output = llm_output.messages[0].additional_kwargs["function_call"][
"arguments"
]
output = QuestionAnswer.parse_raw(function_output)

return output

async def ainvoke(self, context: str, question: str):
Expand Down
5 changes: 4 additions & 1 deletion libs/kotaemon/kotaemon/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from .adobe_loader import AdobeReader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docx_loader import DocxReader
from .excel_loader import PandasExcelReader
from .html_loader import HtmlReader
from .mathpix_loader import MathpixPDFReader
from .ocr_loader import OCRReader
from .ocr_loader import ImageReader, OCRReader
from .unstructured_loader import UnstructuredReader

# Public API of `kotaemon.loaders`: the reader classes re-exported for
# `from kotaemon.loaders import ...` (and the names pulled in by `import *`).
__all__ = [
    "AutoReader",
    "BaseReader",
    "PandasExcelReader",
    "MathpixPDFReader",
    "ImageReader",
    "OCRReader",
    "DirectoryReader",
    "UnstructuredReader",
    "DocxReader",
    "HtmlReader",
    "AdobeReader",
]
186 changes: 186 additions & 0 deletions libs/kotaemon/kotaemon/loaders/adobe_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional

from decouple import config
from llama_index.readers.base import BaseReader

from kotaemon.base import Document

logger = logging.getLogger(__name__)

# Default Vision Language Model endpoint: the Azure OpenAI chat-completions
# URL for the "gpt-4-vision" deployment. Endpoint base and API version are
# read from the environment via python-decouple (empty string when unset).
DEFAULT_VLM_ENDPOINT = (
    f"{config('AZURE_OPENAI_ENDPOINT', default='')}"
    f"openai/deployments/gpt-4-vision/chat/completions"
    f"?api-version={config('OPENAI_API_VERSION', default='')}"
)


class AdobeReader(BaseReader):
    """Read PDF using the Adobe PDF Services API.

    Able to extract text, tables, and figures with high accuracy. Figures
    are captioned with a Vision Language Model (GPT-4V by default).

    Example:
        ```python
        >>> from kotaemon.loaders import AdobeReader
        >>> reader = AdobeReader()
        >>> documents = reader.load_data("path/to/pdf")
        ```

    Args:
        vlm_endpoint: URL to the Vision Language Model endpoint. If not
            provided, will use the default
            `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`.
        max_figures_to_caption: an int deciding how many figures will be
            captioned. The rest will be ignored (are indexed without
            captions).
    """

    def __init__(
        self,
        vlm_endpoint: Optional[str] = None,
        max_figures_to_caption: int = 100,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Init params"""
        super().__init__(*args)
        # Element paths in Adobe's structured output end in e.g.
        # "/Table[2]" or "/Figure"; these regexes match that trailing part.
        self.table_regex = r"/Table(\[\d+\])?$"
        self.figure_regex = r"/Figure(\[\d+\])?$"
        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT
        self.max_figures_to_caption = max_figures_to_caption

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None, **kwargs
    ) -> List[Document]:
        """Load data by calling the Adobe PDF Services API.

        Args:
            file (Path): Path to the PDF file
            extra_info (Optional[Dict]): unused; kept for reader-interface
                compatibility

        Returns:
            List[Document]: list of documents extracted from the PDF file,
            includes 3 types: text (one per page), table (caption text with
            the raw table in metadata), and image (caption text with the
            image payload in metadata)
        """
        from .utils.adobe import (
            generate_figure_captions,
            load_json,
            parse_figure_paths,
            parse_table_paths,
            request_adobe_service,
        )

        filename = file.name
        filepath = str(Path(file).resolve())
        output_path = request_adobe_service(file_path=str(file), output_path="")
        results_path = os.path.join(output_path, "structuredData.json")

        if not os.path.exists(results_path):
            logger.exception("Fail to parse the document.")
            return []

        data = load_json(results_path)

        texts = defaultdict(list)
        tables = []
        figures = []

        elements = data["elements"]
        for item_id, item in enumerate(elements):
            page_number = item.get("Page", -1) + 1
            item_path = item["Path"]
            item_text = item.get("Text", "")

            file_paths = [
                Path(output_path) / path for path in item.get("filePaths", [])
            ]
            # Use the preceding element's text as a title hint for tables and
            # figures. The first element has no predecessor: the original
            # indexed elements[item_id - 1] unconditionally, which for
            # item_id == 0 wrongly borrowed the LAST element's text.
            title = elements[item_id - 1].get("Text", "") if item_id > 0 else ""

            if re.search(self.table_regex, item_path):
                table_content = parse_table_paths(file_paths)
                if not table_content:
                    continue
                table_caption = (
                    table_content.replace("|", "").replace("---", "")
                    + f"\n(Table in Page {page_number}. {title})"
                )
                tables.append((page_number, table_content, table_caption))

            elif re.search(self.figure_regex, item_path):
                figure_caption = (
                    item_text + f"\n(Figure in Page {page_number}. {title})"
                )
                figure_content = parse_figure_paths(file_paths)
                if not figure_content:
                    continue
                # list, not tuple: the caption element is appended to below
                figures.append([page_number, figure_content, figure_caption])

            else:
                if item_text and "Table" not in item_path and "Figure" not in item_path:
                    texts[page_number].append(item_text)

        # get figure captions using GPT-4V; only the first
        # `max_figures_to_caption` figures receive generated captions
        figure_captions = generate_figure_captions(
            self.vlm_endpoint,
            [item[1] for item in figures],
            self.max_figures_to_caption,
        )
        for item, caption in zip(figures, figure_captions):
            # update figure caption
            item[2] += " " + caption

        # Wrap elements with Document
        documents = []

        # join plain text elements per page
        for page_number, txts in texts.items():
            documents.append(
                Document(
                    text="\n".join(txts),
                    metadata={
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                )
            )

        # table elements: the caption is indexed; the raw table (markdown)
        # is preserved in metadata for retrieval-time display
        for page_number, table_content, table_caption in tables:
            documents.append(
                Document(
                    text=table_caption,
                    metadata={
                        "table_origin": table_content,
                        "type": "table",
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # figure elements: the caption is indexed; the image payload is
        # preserved in metadata
        for page_number, figure_content, figure_caption in figures:
            documents.append(
                Document(
                    text=figure_caption,
                    metadata={
                        "image_origin": figure_content,
                        "type": "image",
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )
        return documents
67 changes: 67 additions & 0 deletions libs/kotaemon/kotaemon/loaders/ocr_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,70 @@ def load_data(
)

return documents


class ImageReader(BaseReader):
    """Read an image file using the FullOCR endpoint.

    Sends the file to the OCR service and wraps each returned result's CSV
    string in a `Document`.

    Example:
        ```python
        >>> from kotaemon.loaders import ImageReader
        >>> reader = ImageReader()
        >>> documents = reader.load_data("path/to/image")
        ```

    Args:
        endpoint: URL to FullOCR endpoint. If not provided, will look for
            environment variable `OCR_READER_ENDPOINT` or use the default
            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`
            (http://127.0.0.1:8000/v2/ai/infer/)
    """

    def __init__(self, endpoint: Optional[str] = None):
        """Init the reader with the OCR endpoint (FullOCR pipeline)"""
        super().__init__()
        self.ocr_endpoint = endpoint or os.getenv(
            "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT
        )

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the OCR reader.

        Args:
            file_path (Path): path to the image file
            extra_info (Optional[dict]): metadata attached to every returned
                `Document`

        Kwargs:
            response_content: pre-fetched OCR results; when present the OCR
                endpoint is NOT called (useful for testing and caching)

        Returns:
            List[Document]: one document per OCR result, whose content is
            that result's CSV string
        """
        file_path = Path(file_path).resolve()

        with file_path.open("rb") as content:
            files = {"input": content}
            # str() so the job id is an explicit form-encodable value
            data = {"job_id": str(uuid4()), "table_only": False}

            # call the API from FullOCR endpoint
            if "response_content" in kwargs:
                # overriding response content if specified
                ocr_results = kwargs["response_content"]
            else:
                # call original API
                resp = tenacious_api_post(
                    url=self.ocr_endpoint, files=files, data=data
                )
                ocr_results = resp.json()["result"]

        extra_info = extra_info or {}
        result = []
        for ocr_result in ocr_results:
            result.append(
                Document(
                    content=ocr_result["csv_string"],
                    metadata=extra_info,
                )
            )

        return result
Loading

0 comments on commit 304afc0

Please sign in to comment.