From 336344f0a4290674ac5fbf712b606927f0b4bb5e Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Wed, 17 Jan 2024 12:04:38 -0800 Subject: [PATCH 1/4] Add support for markdown files --- api/query.py | 1 + models/query.py | 4 ++-- service/vector_database.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/api/query.py b/api/query.py index 6e999e05..14d4b1a3 100644 --- a/api/query.py +++ b/api/query.py @@ -13,4 +13,5 @@ async def query(payload: RequestPayload): chunks = await vector_service.query(input=payload.input, top_k=4) documents = await vector_service.convert_to_rerank_format(chunks=chunks) results = await vector_service.rerank(query=payload.input, documents=documents) + print(results) return {"success": True, "data": results} diff --git a/models/query.py b/models/query.py index aaab3776..19122316 100644 --- a/models/query.py +++ b/models/query.py @@ -1,5 +1,5 @@ from pydantic import BaseModel -from typing import List +from typing import List, Optional from models.vector_database import VectorDatabase @@ -12,7 +12,7 @@ class RequestPayload(BaseModel): class ResponseData(BaseModel): content: str file_url: str - page_label: str + page_label: Optional[str] class ResponsePayload(BaseModel): diff --git a/service/vector_database.py b/service/vector_database.py index be1716db..a65757d7 100644 --- a/service/vector_database.py +++ b/service/vector_database.py @@ -152,7 +152,7 @@ async def convert_to_rerank_format(self, chunks: List[rest.PointStruct]): async def upsert(self, embeddings: List[tuple[str, list, dict[str, Any]]]) -> None: points = [] - + print(embeddings) for _embedding in embeddings: points.append( rest.PointStruct( From 74c141740ce7ee77353e04cd28869b0424fc8144 Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Wed, 17 Jan 2024 12:27:49 -0800 Subject: [PATCH 2/4] Add support for TXT files --- api/query.py | 1 - service/vector_database.py | 1 - 2 files changed, 2 deletions(-) diff --git a/api/query.py b/api/query.py index 14d4b1a3..6e999e05 100644 --- a/api/query.py +++ b/api/query.py @@ -13,5 +13,4 @@ async def query(payload: RequestPayload): chunks = await vector_service.query(input=payload.input, top_k=4) documents = await vector_service.convert_to_rerank_format(chunks=chunks) results = await vector_service.rerank(query=payload.input, documents=documents) - print(results) return {"success": True, "data": results} diff --git a/service/vector_database.py b/service/vector_database.py index a65757d7..bfa11648 100644 --- a/service/vector_database.py +++ b/service/vector_database.py @@ -152,7 +152,6 @@ async def convert_to_rerank_format(self, chunks: List[rest.PointStruct]): async def upsert(self, embeddings: List[tuple[str, list, dict[str, Any]]]) -> None: points = [] - print(embeddings) for _embedding in embeddings: points.append( rest.PointStruct( From 0a00f4b2c818fee5fe0f80e73f08f4b3ba9db153 Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Wed, 17 Jan 2024 12:46:30 -0800 Subject: [PATCH 3/4] Add support for .docx formats --- requirements.txt | 1 + service/embedding.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 59741684..f18174fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ dataclasses-json==0.6.3 Deprecated==1.2.14 distro==1.9.0 dnspython==2.4.2 +docx2txt==0.8 fastapi==0.109.0 fastavro==1.9.3 filelock==3.13.1 diff --git a/service/embedding.py b/service/embedding.py index 781b3d5b..657d9e12 100644 --- a/service/embedding.py +++ b/service/embedding.py @@ -18,7 +18,7 @@ def __init__(self, files: List[File], index_name: str, vector_credentials: dict) self.vector_credentials = vector_credentials def _get_datasource_suffix(self, type: str) -> str: - suffixes = {"TXT": ".txt", "PDF": ".pdf", "MARKDOWN": ".md"} + suffixes = {"TXT": ".txt", "PDF": ".pdf", "MARKDOWN": ".md", "DOCX": ".docx"} try: return suffixes[type] except KeyError: From a31b531809f2cea83ec30c98984f4ca25a74cc2b Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Wed, 17 Jan 2024 13:07:50 -0800 Subject: [PATCH 4/4] Add support for pptx --- api/query.py | 7 +++++-- models/file.py | 2 -- service/embedding.py | 8 +++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/api/query.py b/api/query.py index 6e999e05..a1e9c8aa 100644 --- a/api/query.py +++ b/api/query.py @@ -12,5 +12,8 @@ async def query(payload: RequestPayload): ) chunks = await vector_service.query(input=payload.input, top_k=4) documents = await vector_service.convert_to_rerank_format(chunks=chunks) - results = await vector_service.rerank(query=payload.input, documents=documents) - return {"success": True, "data": results} + if len(documents): + documents = await vector_service.rerank( + query=payload.input, documents=documents + ) + return {"success": True, "data": documents} diff --git a/models/file.py b/models/file.py index 74f20db1..91204138 100644 --- a/models/file.py +++ b/models/file.py @@ -7,8 +7,6 @@ class FileType(Enum): docx = "DOCX" txt = "TXT" pptx = "PPTX" - csv = "CSV" - xlsx = "XLSX" md = "MARKDOWN" diff --git a/service/embedding.py b/service/embedding.py index 657d9e12..13e16a78 100644 --- a/service/embedding.py +++ b/service/embedding.py @@ -18,7 +18,13 @@ def __init__(self, files: List[File], index_name: str, vector_credentials: dict) self.vector_credentials = vector_credentials def _get_datasource_suffix(self, type: str) -> str: - suffixes = {"TXT": ".txt", "PDF": ".pdf", "MARKDOWN": ".md", "DOCX": ".docx"} + suffixes = { + "TXT": ".txt", + "PDF": ".pdf", + "MARKDOWN": ".md", + "DOCX": ".docx", + "PPTX": ".pptx", + } try: return suffixes[type] except KeyError: