diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml index 82b9154da..c94ad1945 100644 --- a/.github/workflows/unit-test.yaml +++ b/.github/workflows/unit-test.yaml @@ -89,7 +89,7 @@ jobs: run: | python -m pip install --upgrade pip cd libs/kotaemon - pip install -U --upgrade-strategy eager -e .[dev] + pip install -U --upgrade-strategy eager -e .[all] - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} if: | diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 80044afae..ed00e5cb7 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -7,7 +7,6 @@ from kotaemon.indices.extractors import BaseDocParser from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.loaders import ( - AutoReader, DirectoryReader, MathpixPDFReader, OCRReader, @@ -59,7 +58,7 @@ def _get_reader(self, input_files: list[str | Path]): file_extractors[ext] = cls() if self.pdf_mode == "normal": - file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore + pass # use default loader of llama-index which is pypdf elif self.pdf_mode == "ocr": file_extractors[".pdf"] = OCRReader() else: diff --git a/libs/kotaemon/kotaemon/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py index ca27e4915..2e52f7292 100644 --- a/libs/kotaemon/kotaemon/loaders/base.py +++ b/libs/kotaemon/kotaemon/loaders/base.py @@ -55,7 +55,7 @@ def _get_wrapped_class(self) -> Type["BaseReader"]: def _get_wrapped_class(self) -> Type["LIBaseReader"]: raise NotImplementedError( - "Please return the relevant Langchain class in in _get_lc_class" + "Please return the relevant llama-index class in in _get_wrapped_class" ) def __init__(self, *args, **kwargs): diff --git a/libs/kotaemon/kotaemon/loaders/docx_loader.py b/libs/kotaemon/kotaemon/loaders/docx_loader.py index b8f77b8f6..dcec53984 100644 --- 
a/libs/kotaemon/kotaemon/loaders/docx_loader.py +++ b/libs/kotaemon/kotaemon/loaders/docx_loader.py @@ -33,7 +33,7 @@ def load_data( """Load data using Docx reader Args: - file_path (Path): Path to PDF file + file_path (Path): Path to .docx file Returns: List[Document]: list of documents extracted from the HTML file diff --git a/libs/kotaemon/kotaemon/loaders/html_loader.py b/libs/kotaemon/kotaemon/loaders/html_loader.py index fd0eddd61..1295cfca3 100644 --- a/libs/kotaemon/kotaemon/loaders/html_loader.py +++ b/libs/kotaemon/kotaemon/loaders/html_loader.py @@ -37,7 +37,7 @@ def load_data( """Load data using Html reader Args: - file_path: path to pdf file + file_path: path to HTML file extra_info: extra information passed to this reader during extracting data Returns: diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 5abfb98f9..1ba69636f 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development." 
dependencies = [ "langchain", "langchain-community", + "langchain-openai", + "openai", "theflow", "llama-index>=0.9.0,<0.10.0", "llama-hub", @@ -27,6 +29,11 @@ dependencies = [ "pandas", "trogon", "tenacity", + "python-dotenv", # currently used to read configs from file, should be removed in the future + "chromadb", + "unstructured", + "pypdf", + "html2text", ] readme = "README.md" license = { text = "MIT License" } @@ -42,31 +49,28 @@ classifiers = [ ] [project.optional-dependencies] -dev = [ - "ipython", - "pytest", - "pre-commit", - "black", - "flake8", - "sphinx", - "coverage", - "openai", - "langchain-openai", - "chromadb", +adv = [ "wikipedia", "duckduckgo-search", "googlesearch-python", "python-docx", - "python-dotenv", "pytest-mock", "unstructured[pdf]", "sentence_transformers", "cohere", "elasticsearch", - "pypdf", - "html2text", "llama-cpp-python", ] +dev = [ + "ipython", + "pytest", + "pre-commit", + "black", + "flake8", + "sphinx", + "coverage", +] +all = ["kotaemon[adv,dev]"] [project.scripts] kh = "kotaemon.cli:main" diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index ff7c89558..d15d2fb49 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -25,7 +25,7 @@ from kotaemon.base import RetrievedDocument from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.indices.ingests import DocumentIngestor -from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking +from kotaemon.indices.rankings import BaseReranking, LLMReranking from .base import BaseFileIndexIndexing, BaseFileIndexRetriever @@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever): vector_retrieval: VectorRetrieval = VectorRetrieval.withx( embedding=embeddings.get_default(), ) - reranker: BaseReranking = CohereReranking.withx( - cohere_api_key=getattr(settings, "COHERE_API_KEY", "") - ) >> LLMReranking.withx(llm=llms.get_lowest_cost()) + 
reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost()) get_extra_table: bool = False def run( diff --git a/libs/ktem/pyproject.toml b/libs/ktem/pyproject.toml index c3160f19d..6fee8a043 100644 --- a/libs/ktem/pyproject.toml +++ b/libs/ktem/pyproject.toml @@ -13,18 +13,14 @@ version = "0.2.0" requires-python = ">= 3.10" description = "RAG-based Question and Answering Application" dependencies = [ - "chromadb", "click", - "cohere", "platformdirs", "pluggy", "python-decouple", - "python-dotenv", "python-pptx", "sqlalchemy", "sqlmodel", "tiktoken", - "unstructured[pdf]", ] readme = "README.md" license = { text = "MIT License" } diff --git a/scripts/run_linux.sh b/scripts/run_linux.sh index 7298b87b2..8e2ea056a 100755 --- a/scripts/run_linux.sh +++ b/scripts/run_linux.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_macos.sh b/scripts/run_macos.sh index 6ad9901d0..de71f75f4 100755 --- a/scripts/run_macos.sh +++ b/scripts/run_macos.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_windows.bat b/scripts/run_windows.bat index 4e5db8621..e365686a3 100644 --- a/scripts/run_windows.bat +++ b/scripts/run_windows.bat @@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 ( ECHO Dependencies are already installed ) ELSE ( ECHO Install kotaemon's requirements - CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]" + CALL python -m pip install -e 
"%CD%\libs\kotaemon" ECHO Install ktem's requirements CALL python -m pip install -e "%CD%\libs\ktem"