diff --git a/api/query.py b/api/query.py index 6e999e05..a1e9c8aa 100644 --- a/api/query.py +++ b/api/query.py @@ -12,5 +12,8 @@ async def query(payload: RequestPayload): ) chunks = await vector_service.query(input=payload.input, top_k=4) documents = await vector_service.convert_to_rerank_format(chunks=chunks) - results = await vector_service.rerank(query=payload.input, documents=documents) - return {"success": True, "data": results} + if len(documents): + documents = await vector_service.rerank( + query=payload.input, documents=documents + ) + return {"success": True, "data": documents} diff --git a/models/file.py b/models/file.py index 74f20db1..91204138 100644 --- a/models/file.py +++ b/models/file.py @@ -7,8 +7,6 @@ class FileType(Enum): docx = "DOCX" txt = "TXT" pptx = "PPTX" - csv = "CSV" - xlsx = "XLSX" md = "MARKDOWN" diff --git a/models/query.py b/models/query.py index aaab3776..19122316 100644 --- a/models/query.py +++ b/models/query.py @@ -1,5 +1,5 @@ from pydantic import BaseModel -from typing import List +from typing import List, Optional from models.vector_database import VectorDatabase @@ -12,7 +12,7 @@ class RequestPayload(BaseModel): class ResponseData(BaseModel): content: str file_url: str - page_label: str + page_label: Optional[str] class ResponsePayload(BaseModel): diff --git a/requirements.txt b/requirements.txt index 59741684..f18174fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ dataclasses-json==0.6.3 Deprecated==1.2.14 distro==1.9.0 dnspython==2.4.2 +docx2txt==0.8 fastapi==0.109.0 fastavro==1.9.3 filelock==3.13.1 diff --git a/service/embedding.py b/service/embedding.py index 781b3d5b..13e16a78 100644 --- a/service/embedding.py +++ b/service/embedding.py @@ -18,7 +18,13 @@ def __init__(self, files: List[File], index_name: str, vector_credentials: dict) self.vector_credentials = vector_credentials def _get_datasource_suffix(self, type: str) -> str: - suffixes = {"TXT": ".txt", "PDF": ".pdf", "MARKDOWN": ".md"} + suffixes = { + "TXT": ".txt", + "PDF": ".pdf", + "MARKDOWN": ".md", + "DOCX": ".docx", + "PPTX": ".pptx", + } try: return suffixes[type] except KeyError: diff --git a/service/vector_database.py b/service/vector_database.py index be1716db..bfa11648 100644 --- a/service/vector_database.py +++ b/service/vector_database.py @@ -152,7 +152,6 @@ async def convert_to_rerank_format(self, chunks: List[rest.PointStruct]): async def upsert(self, embeddings: List[tuple[str, list, dict[str, Any]]]) -> None: points = [] - for _embedding in embeddings: points.append( rest.PointStruct(