Skip to content

Commit

Permalink
Merge branch 'main' into feat/dynamic-llm
Browse files Browse the repository at this point in the history
  • Loading branch information
trducng committed Apr 3, 2024
2 parents 8e919ce + ecf09b2 commit 304afc0
Show file tree
Hide file tree
Showing 32 changed files with 1,721 additions and 292 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ $RECYCLE.BIN/
.theflow/

# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
*.py[coid]

logs/
.gitsecret/keys/random_seed
Expand Down
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ repos:
hooks:
- id: mypy
additional_dependencies:
[types-PyYAML==6.0.12.11, "types-requests", "sqlmodel"]
[
types-PyYAML==6.0.12.11,
"types-requests",
"sqlmodel",
"types-Markdown",
]
args: ["--check-untyped-defs", "--ignore-missing-imports"]
exclude: "^templates/"
- repo: https://github.com/codespell-project/codespell
Expand Down
5 changes: 4 additions & 1 deletion libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AdobeReader,
DirectoryReader,
MathpixPDFReader,
OCRReader,
Expand Down Expand Up @@ -41,7 +42,7 @@ class DocumentIngestor(BaseComponent):
The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`
"""

pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
pdf_mode: str = "normal" # "normal", "mathpix", "ocr", "multimodal"
doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
text_splitter: BaseSplitter = TokenSplitter.withx(
chunk_size=1024,
Expand All @@ -61,6 +62,8 @@ def _get_reader(self, input_files: list[str | Path]):
pass # use default loader of llama-index which is pypdf
elif self.pdf_mode == "ocr":
file_extractors[".pdf"] = OCRReader()
elif self.pdf_mode == "multimodal":
file_extractors[".pdf"] = AdobeReader()
else:
file_extractors[".pdf"] = MathpixPDFReader()

Expand Down
14 changes: 6 additions & 8 deletions libs/kotaemon/kotaemon/indices/qa/citation.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,18 +104,16 @@ def invoke(self, context: str, question: str):
print("CitationPipeline: invoking LLM")
llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
print("CitationPipeline: finish invoking LLM")
if not llm_output.messages:
return None
function_output = llm_output.messages[0].additional_kwargs["function_call"][
"arguments"
]
output = QuestionAnswer.parse_raw(function_output)
except Exception as e:
print(e)
return None

if not llm_output.messages:
return None

function_output = llm_output.messages[0].additional_kwargs["function_call"][
"arguments"
]
output = QuestionAnswer.parse_raw(function_output)

return output

async def ainvoke(self, context: str, question: str):
Expand Down
5 changes: 4 additions & 1 deletion libs/kotaemon/kotaemon/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from .adobe_loader import AdobeReader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docx_loader import DocxReader
from .excel_loader import PandasExcelReader
from .html_loader import HtmlReader
from .mathpix_loader import MathpixPDFReader
from .ocr_loader import OCRReader
from .ocr_loader import ImageReader, OCRReader
from .unstructured_loader import UnstructuredReader

# Public API of `kotaemon.loaders`: the reader classes re-exported for
# `from kotaemon.loaders import ...` (and the names pulled in by `import *`).
__all__ = [
    "AutoReader",
    "BaseReader",
    "PandasExcelReader",
    "MathpixPDFReader",
    "ImageReader",
    "OCRReader",
    "DirectoryReader",
    "UnstructuredReader",
    "DocxReader",
    "HtmlReader",
    "AdobeReader",
]
186 changes: 186 additions & 0 deletions libs/kotaemon/kotaemon/loaders/adobe_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional

from decouple import config
from llama_index.readers.base import BaseReader

from kotaemon.base import Document

logger = logging.getLogger(__name__)

# Default Vision Language Model endpoint: the Azure OpenAI chat-completions
# URL for the "gpt-4-vision" deployment. Endpoint base and API version are
# read from the environment via python-decouple (empty string when unset).
DEFAULT_VLM_ENDPOINT = (
    f"{config('AZURE_OPENAI_ENDPOINT', default='')}"
    f"openai/deployments/gpt-4-vision/chat/completions"
    f"?api-version={config('OPENAI_API_VERSION', default='')}"
)


class AdobeReader(BaseReader):
    """Read PDF using the Adobe PDF Services API.

    Able to extract text, tables, and figures with high accuracy. Figures
    are captioned with a Vision Language Model (GPT-4V by default).

    Example:
        ```python
        >>> from kotaemon.loaders import AdobeReader
        >>> reader = AdobeReader()
        >>> documents = reader.load_data("path/to/pdf")
        ```

    Args:
        vlm_endpoint: URL to the Vision Language Model endpoint. If not
            provided, will use the default
            `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`.
        max_figures_to_caption: an int deciding how many figures will be
            captioned. The rest will be ignored (are indexed without
            captions).
    """

    def __init__(
        self,
        vlm_endpoint: Optional[str] = None,
        max_figures_to_caption: int = 100,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Init params"""
        super().__init__(*args)
        # Element paths in Adobe's structured output end in e.g.
        # "/Table[2]" or "/Figure"; these regexes match that trailing part.
        self.table_regex = r"/Table(\[\d+\])?$"
        self.figure_regex = r"/Figure(\[\d+\])?$"
        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT
        self.max_figures_to_caption = max_figures_to_caption

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None, **kwargs
    ) -> List[Document]:
        """Load data by calling the Adobe PDF Services API.

        Args:
            file (Path): Path to the PDF file
            extra_info (Optional[Dict]): unused; kept for reader-interface
                compatibility

        Returns:
            List[Document]: list of documents extracted from the PDF file,
            includes 3 types: text (one per page), table (caption text with
            the raw table in metadata), and image (caption text with the
            image payload in metadata)
        """
        from .utils.adobe import (
            generate_figure_captions,
            load_json,
            parse_figure_paths,
            parse_table_paths,
            request_adobe_service,
        )

        filename = file.name
        filepath = str(Path(file).resolve())
        output_path = request_adobe_service(file_path=str(file), output_path="")
        results_path = os.path.join(output_path, "structuredData.json")

        if not os.path.exists(results_path):
            logger.exception("Fail to parse the document.")
            return []

        data = load_json(results_path)

        texts = defaultdict(list)
        tables = []
        figures = []

        elements = data["elements"]
        for item_id, item in enumerate(elements):
            page_number = item.get("Page", -1) + 1
            item_path = item["Path"]
            item_text = item.get("Text", "")

            file_paths = [
                Path(output_path) / path for path in item.get("filePaths", [])
            ]
            # Use the preceding element's text as a title hint for tables and
            # figures. The first element has no predecessor: the original
            # indexed elements[item_id - 1] unconditionally, which for
            # item_id == 0 wrongly borrowed the LAST element's text.
            title = elements[item_id - 1].get("Text", "") if item_id > 0 else ""

            if re.search(self.table_regex, item_path):
                table_content = parse_table_paths(file_paths)
                if not table_content:
                    continue
                table_caption = (
                    table_content.replace("|", "").replace("---", "")
                    + f"\n(Table in Page {page_number}. {title})"
                )
                tables.append((page_number, table_content, table_caption))

            elif re.search(self.figure_regex, item_path):
                figure_caption = (
                    item_text + f"\n(Figure in Page {page_number}. {title})"
                )
                figure_content = parse_figure_paths(file_paths)
                if not figure_content:
                    continue
                # list, not tuple: the caption element is appended to below
                figures.append([page_number, figure_content, figure_caption])

            else:
                if item_text and "Table" not in item_path and "Figure" not in item_path:
                    texts[page_number].append(item_text)

        # get figure captions using GPT-4V; only the first
        # `max_figures_to_caption` figures receive generated captions
        figure_captions = generate_figure_captions(
            self.vlm_endpoint,
            [item[1] for item in figures],
            self.max_figures_to_caption,
        )
        for item, caption in zip(figures, figure_captions):
            # update figure caption
            item[2] += " " + caption

        # Wrap elements with Document
        documents = []

        # join plain text elements per page
        for page_number, txts in texts.items():
            documents.append(
                Document(
                    text="\n".join(txts),
                    metadata={
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                )
            )

        # table elements: the caption is indexed; the raw table (markdown)
        # is preserved in metadata for retrieval-time display
        for page_number, table_content, table_caption in tables:
            documents.append(
                Document(
                    text=table_caption,
                    metadata={
                        "table_origin": table_content,
                        "type": "table",
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # figure elements: the caption is indexed; the image payload is
        # preserved in metadata
        for page_number, figure_content, figure_caption in figures:
            documents.append(
                Document(
                    text=figure_caption,
                    metadata={
                        "image_origin": figure_content,
                        "type": "image",
                        "page_label": page_number,
                        "file_name": filename,
                        "file_path": filepath,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )
        return documents
67 changes: 67 additions & 0 deletions libs/kotaemon/kotaemon/loaders/ocr_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,70 @@ def load_data(
)

return documents


class ImageReader(BaseReader):
    """Read an image file using the FullOCR endpoint.

    Sends the file to the OCR service and wraps each returned result's CSV
    string in a `Document`.

    Example:
        ```python
        >>> from kotaemon.loaders import ImageReader
        >>> reader = ImageReader()
        >>> documents = reader.load_data("path/to/image")
        ```

    Args:
        endpoint: URL to FullOCR endpoint. If not provided, will look for
            environment variable `OCR_READER_ENDPOINT` or use the default
            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`
            (http://127.0.0.1:8000/v2/ai/infer/)
    """

    def __init__(self, endpoint: Optional[str] = None):
        """Init the reader with the OCR endpoint (FullOCR pipeline)"""
        super().__init__()
        self.ocr_endpoint = endpoint or os.getenv(
            "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT
        )

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the OCR reader.

        Args:
            file_path (Path): path to the image file
            extra_info (Optional[dict]): metadata attached to every returned
                `Document`

        Kwargs:
            response_content: pre-fetched OCR results; when present the OCR
                endpoint is NOT called (useful for testing and caching)

        Returns:
            List[Document]: one document per OCR result, whose content is
            that result's CSV string
        """
        file_path = Path(file_path).resolve()

        with file_path.open("rb") as content:
            files = {"input": content}
            # str() so the job id is an explicit form-encodable value
            data = {"job_id": str(uuid4()), "table_only": False}

            # call the API from FullOCR endpoint
            if "response_content" in kwargs:
                # overriding response content if specified
                ocr_results = kwargs["response_content"]
            else:
                # call original API
                resp = tenacious_api_post(
                    url=self.ocr_endpoint, files=files, data=data
                )
                ocr_results = resp.json()["result"]

        extra_info = extra_info or {}
        result = []
        for ocr_result in ocr_results:
            result.append(
                Document(
                    content=ocr_result["csv_string"],
                    metadata=extra_info,
                )
            )

        return result
Loading

0 comments on commit 304afc0

Please sign in to comment.