diff --git a/libs/colbert/pyproject.toml b/libs/colbert/pyproject.toml index b9714eb93..ad66cb456 100644 --- a/libs/colbert/pyproject.toml +++ b/libs/colbert/pyproject.toml @@ -15,7 +15,7 @@ colbert-ai = "0.2.19" pyarrow = "14.0.1" torch = "2.2.1" cassio = "~0.1.7" -nest-asyncio = "^1.6.0" +pydantic = "^2.7.1" [tool.poetry.group.test.dependencies] ragstack-ai-tests-utils = { path = "../tests-utils", develop = true } diff --git a/libs/colbert/ragstack_colbert/__init__.py b/libs/colbert/ragstack_colbert/__init__.py index ff753f912..9b617734d 100644 --- a/libs/colbert/ragstack_colbert/__init__.py +++ b/libs/colbert/ragstack_colbert/__init__.py @@ -5,28 +5,31 @@ and constants related to the ColBERT model configuration are also provided. Exports: -- CassandraVectorStore: Implementation of a ColBERT vector store using Cassandra for storage. +- CassandraDatabase: Implementation of a BaseDatabase using Cassandra for storage. - ColbertEmbeddingModel: Class for generating and managing token embeddings using the ColBERT model. +- ColbertVectorStore: Implementation of a BaseVectorStore. - ColbertRetriever: Retriever class for executing ColBERT searches within a vector store. - DEFAULT_COLBERT_MODEL: The default identifier for the ColBERT model. - DEFAULT_COLBERT_DIM: The default dimensionality for ColBERT model embeddings. -- EmbeddedChunk: Data class for representing a chunk of embedded text. -- RetrievedChunk: Data class for representing a chunk of retrieved text. +- Chunk: Data class for representing a chunk of embedded text. """ -from .cassandra_vector_store import CassandraVectorStore -from .colbert_retriever import ColbertRetriever +from .cassandra_database import CassandraDatabase from .colbert_embedding_model import ColbertEmbeddingModel +from .colbert_retriever import ColbertRetriever +from .colbert_vector_store import ColbertVectorStore from .constant import DEFAULT_COLBERT_DIM, DEFAULT_COLBERT_MODEL -from .objects import ChunkData, EmbeddedChunk, RetrievedChunk +from .objects import Chunk, Embedding, Metadata, Vector __all__ = [ - "CassandraVectorStore", - "ChunkData", + "CassandraDatabase", "ColbertEmbeddingModel", "ColbertRetriever", + "ColbertVectorStore", "DEFAULT_COLBERT_DIM", "DEFAULT_COLBERT_MODEL", - "EmbeddedChunk", - "RetrievedChunk", + "Chunk", + "Embedding", + "Metadata", + "Vector", ] diff --git a/libs/colbert/ragstack_colbert/base_database.py b/libs/colbert/ragstack_colbert/base_database.py new file mode 100644 index 000000000..69ce15224 --- /dev/null +++ b/libs/colbert/ragstack_colbert/base_database.py @@ -0,0 +1,79 @@ +""" +This module defines abstract base classes for implementing storage mechanisms for text chunk +embeddings, specifically designed to work with ColBERT or similar embedding models. +""" + +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +from .objects import Chunk, Vector + + +class BaseDatabase(ABC): + """ + Abstract base class (ABC) for a storage system designed to hold vector representations of text chunks, + typically generated by a ColBERT model or similar embedding model. + + This class defines the interface for storing and managing the embedded text chunks, supporting + operations like adding new chunks to the store and deleting existing documents by their identifiers. + """ + + @abstractmethod + def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]: + """ + Stores a list of embedded text chunks in the vector store + + Parameters: + chunks (List[Chunk]): A list of `Chunk` instances to be stored. 
+ + Returns: + a list of tuples: (doc_id, chunk_id) + """ + + @abstractmethod + def delete_chunks(self, doc_ids: List[str]) -> bool: + """ + Deletes chunks from the vector store based on their document id. + + Parameters: + doc_ids (List[str]): A list of document identifiers specifying the chunks to be deleted. + + Returns: + True if the delete was successful. + """ + + @abstractmethod + async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]: + """ + Retrieves 'n' ANN results for an embedded token vector. + + Returns: + A list of Chunks with only `doc_id` and `chunk_id` set. + Fewer than 'n' results may be returned. + """ + + @abstractmethod + async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk: + """ + Retrieve the embedding data for a chunk. + + Returns: + A chunk with `doc_id`, `chunk_id`, and `embedding` set. + """ + + @abstractmethod + async def get_chunk_data( + self, doc_id: str, chunk_id: int, include_embedding: Optional[bool] + ) -> Chunk: + """ + Retrieve the text and metadata for a chunk. + + Returns: + A chunk with `doc_id`, `chunk_id`, `text`, `metadata`, and optionally `embedding` set. + """ + + @abstractmethod + def close(self) -> None: + """ + Cleans up any open resources. + """ diff --git a/libs/colbert/ragstack_colbert/base_embedding_model.py b/libs/colbert/ragstack_colbert/base_embedding_model.py index 2abdf71b2..10cea8bcf 100644 --- a/libs/colbert/ragstack_colbert/base_embedding_model.py +++ b/libs/colbert/ragstack_colbert/base_embedding_model.py @@ -5,9 +5,7 @@ from abc import ABC, abstractmethod from typing import List, Optional -from torch import Tensor - -from .objects import ChunkData, EmbeddedChunk +from .objects import Embedding class BaseEmbeddingModel(ABC): @@ -20,25 +18,15 @@ class BaseEmbeddingModel(ABC): """ @abstractmethod - def embed_chunks( - self, chunks: List[ChunkData], doc_id: Optional[str] = None - ) -> List[EmbeddedChunk]: + def embed_texts(self, texts: List[str]) -> List[Embedding]: """ - Embeds a list of text chunks into their corresponding vector representations. - - This method takes multiple chunks of text and optionally their associated document identifier, - returning a list of `EmbeddedChunk` instances containing the embeddings. + Embeds a list of texts into their corresponding vector embedding representations. Parameters: - chunks (List[ChunkData]): A list of chunks including document text and any associated metadata. - doc_id (Optional[str], optional): An optional document identifier that all chunks belong to. - This can be used for tracing back embeddings to their - source document. If not passed, an uuid will be generated. + texts (List[str]): A list of string texts. Returns: - List[EmbeddedChunk]: A list of `EmbeddedChunks` instances with embeddings populated, - corresponding to the input text chunks, ready for insertion into - a vector store. + List[Embedding]: A list of embeddings, in the order of the input list """ @abstractmethod @@ -47,18 +35,18 @@ def embed_query( query: str, full_length_search: Optional[bool] = False, query_maxlen: int = -1, - ) -> Tensor: + ) -> Embedding: """ Embeds a single query text into its vector representation. If the query has fewer than query_maxlen tokens it will be padded with BERT special [mast] tokens. Parameters: - query (str): The query string to encode. + query (str): The query text to encode. full_length_search (Optional[bool]): Indicates whether to encode the query for a full-length search. Defaults to False. 
query_maxlen (int): The fixed length for the query token embedding. If -1, uses a dynamically calculated value. Returns: - Tensor: A tensor representing the embedded query. + Embedding: A vector embedding representation of the query text """ diff --git a/libs/colbert/ragstack_colbert/base_retriever.py b/libs/colbert/ragstack_colbert/base_retriever.py index 3f762a80d..7158ce1ee 100644 --- a/libs/colbert/ragstack_colbert/base_retriever.py +++ b/libs/colbert/ragstack_colbert/base_retriever.py @@ -4,9 +4,9 @@ """ from abc import ABC, abstractmethod -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple -from .objects import RetrievedChunk +from .objects import Chunk, Embedding class BaseRetriever(ABC): @@ -15,34 +15,112 @@ class BaseRetriever(ABC): the search and retrieval of text chunks based on query embeddings. """ + # handles LlamaIndex query @abstractmethod - def close(self) -> None: + def embedding_search( + self, + query_embedding: Embedding, + k: Optional[int] = None, + include_embedding: Optional[bool] = False, + **kwargs: Any + ) -> List[Tuple[Chunk, float]]: + """ + Retrieves a list of text chunks relevant to a given query from the vector store, ranked by + relevance or other metrics. + + Parameters: + query_embedding (Embedding): The query embedding to search for relevant text chunks. + k (Optional[int]): The number of top results to retrieve. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks + **kwargs (Any): Additional parameters that implementations might require for customized + retrieval operations. + + Returns: + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. + """ + + # handles LlamaIndex async query + @abstractmethod + async def aembedding_search( + self, + query_embedding: Embedding, + k: Optional[int] = None, + include_embedding: Optional[bool] = False, + **kwargs: Any + ) -> List[Tuple[Chunk, float]]: """ - Closes the retriever, releasing any resources or connections used during operation. - Implementations should ensure that all necessary cleanup is performed to avoid resource leaks. + Retrieves a list of text chunks relevant to a given query from the vector store, ranked by + relevance or other metrics. + + Parameters: + query_embedding (Embedding): The query embedding to search for relevant text chunks. + k (Optional[int]): The number of top results to retrieve. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks + **kwargs (Any): Additional parameters that implementations might require for customized + retrieval operations. + + Returns: + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. + """ + + # handles LangChain search + @abstractmethod + def text_search( + self, + query_text: str, + k: Optional[int] = None, + query_maxlen: Optional[int] = None, + include_embedding: Optional[bool] = False, + **kwargs: Any + ) -> List[Tuple[Chunk, float]]: + """ + Retrieves a list of text chunks relevant to a given query from the vector store, ranked by + relevance or other metrics. + + Parameters: + query_text (str): The query text to search for relevant text chunks. + k (Optional[int]): The number of top results to retrieve. 
+ query_maxlen (Optional[int]): The maximum length of the query to consider. If None, the + maxlen will be dynamically generated. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks + **kwargs (Any): Additional parameters that implementations might require for customized + retrieval operations. + + Returns: + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. """ + # handles LangChain async search @abstractmethod - def retrieve( + async def atext_search( self, - query: str, + query_text: str, k: Optional[int] = None, query_maxlen: Optional[int] = None, + include_embedding: Optional[bool] = False, **kwargs: Any - ) -> List[RetrievedChunk]: + ) -> List[Tuple[Chunk, float]]: """ Retrieves a list of text chunks relevant to a given query from the vector store, ranked by relevance or other metrics. Parameters: - query (str): The query text to search for relevant text chunks. + query_text (str): The query text to search for relevant text chunks. k (Optional[int]): The number of top results to retrieve. query_maxlen (Optional[int]): The maximum length of the query to consider. If None, the maxlen will be dynamically generated. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks **kwargs (Any): Additional parameters that implementations might require for customized retrieval operations. Returns: - List[RetrievedChunk]: A list of `RetrievedChunk` instances representing the retrieved - text chunks, ranked by their relevance to the query. + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. """ diff --git a/libs/colbert/ragstack_colbert/base_vector_store.py b/libs/colbert/ragstack_colbert/base_vector_store.py index f73d1d27e..8a965cad7 100644 --- a/libs/colbert/ragstack_colbert/base_vector_store.py +++ b/libs/colbert/ragstack_colbert/base_vector_store.py @@ -1,14 +1,25 @@ """ -This module defines abstract base classes for implementing storage mechanisms for text chunk -embeddings, specifically designed to work with ColBERT or similar embedding models. +This module defines the abstract base class for a standard vector store +specifically designed to work with ColBERT or similar dense embedding models, +and can be used to create a LangChain or LlamaIndex ColBERT vector store. """ from abc import ABC, abstractmethod from typing import List, Optional, Tuple -from torch import Tensor +from .base_retriever import BaseRetriever +from .objects import Chunk, Metadata -from .objects import BaseChunk, ChunkData, EmbeddedChunk +# LlamaIndex Node (chunk) has ids, text, embedding, metadata +# VectorStore.add(nodes: List[Node]) -> List[str](ids): embeds texts OUTside add +# .delete(id) +# .query(embedding) -> Nodes, Scores, Ids + +# LangChain Document (doc or chunk) has page_content, metadata +# VectorStore.add(texts: List[str], metadatas: Optional[List[dict]]) -> List[str](ids): embeds texts INside add +# .delete(ids: List[str]): deletes by id +# .search(query: str) -> List[Document]: uses retriever to search in store +# .as_retriever() -> Retriever class BaseVectorStore(ABC): @@ -20,63 +31,57 @@ class BaseVectorStore(ABC): operations like adding new chunks to the store and deleting existing documents by their identifiers. 
""" + # handles LlamaIndex add @abstractmethod - def put_chunks( - self, chunks: List[EmbeddedChunk], delete_existing: Optional[bool] = False - ) -> None: + def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]: """ - Stores a list of embedded text chunks in the vector store, with an option to delete existing - entries before insertion. + Stores a list of embedded text chunks in the vector store Parameters: - chunks (List[EmbeddedChunk]): A list of `EmbeddedChunk` instances to be stored. - delete_existing (Optional[bool]): If True, any existing chunks with the same doc_ids - as those in the `chunks` list will be deleted before - inserting the new ones. Defaults to False. - """ - - @abstractmethod - def delete_chunks(self, doc_ids: List[str]) -> None: - """ - Deletes chunks from the vector store based on their document id. + chunks (List[Chunk]): A list of `Chunk` instances to be stored. - Parameters: - doc_ids (List[str]): A list of document identifiers specifying the chunks to be deleted. + Returns: + a list of tuples: (doc_id, chunk_id) """ + # handles LangChain add @abstractmethod - async def search_relevant_chunks(self, vector: List[float], n: int) -> List[BaseChunk]: + def add_texts( + self, + texts: List[str], + metadatas: Optional[List[Metadata]], + doc_id: Optional[str] = None, + ) -> List[Tuple[str, int]]: """ - Searches for relevant chunks using ANN for an embedded token vector. + Embeds and stores a list of text chunks and optional metadata into the vector store Parameters: - vector (List[float]): A vector embedding for a query token. - n (int): The number of items to return from the search + texts (List[str]): The list of text chunks to be embedded + metadatas (Optional[List[Metadata]])): An optional list of Metadata to be stored. + If provided, these are set 1 to 1 with the texts list. + doc_id (Optional[str]): The document id associated with the texts. If not provided, + it is generated. Returns: - A list of chunks with doc_id and chunk_id. Fewer than 'n' results may be returned. + a list of tuples: (doc_id, chunk_id) """ + # handles LangChain and LlamaIndex delete @abstractmethod - async def get_chunk_embeddings(self, chunk: BaseChunk) -> Tuple[BaseChunk, List[Tensor]]: + def delete_chunks(self, doc_ids: List[str]) -> bool: """ - Retrieve all the embedding data for a chunk. + Deletes chunks from the vector store based on their document id. Parameters: - chunk (BaseChunk): The chunk to return. + doc_ids (List[str]): A list of document identifiers specifying the chunks to be deleted. Returns: - A RetrievedChunk including doc_id, chunk_id, and the embeddings for the chunk. + True if the delete was successful. """ + # handles LangChain as_retriever @abstractmethod - async def get_chunk_data(self, chunk: BaseChunk) -> Tuple[BaseChunk, ChunkData]: + def as_retriever(self) -> BaseRetriever: """ - Fetches the text and metadata for a given doc_id and chunk_id. - - Parameters: - chunk (BaseChunk): The chunk to return. - - Returns: - ChunkData including text and metadata for the chunk. + Gets a retriever using the vector store. """ diff --git a/libs/colbert/ragstack_colbert/cassandra_database.py b/libs/colbert/ragstack_colbert/cassandra_database.py new file mode 100644 index 000000000..16f4e6532 --- /dev/null +++ b/libs/colbert/ragstack_colbert/cassandra_database.py @@ -0,0 +1,253 @@ +""" +This module provides an implementation of the BaseVectorStore abstract class, specifically designed +for use with a Cassandra database backend. 
It allows for the efficient storage and management of text embeddings
+generated by a ColBERT model, facilitating scalable and high-relevancy retrieval operations.
+"""
+
+import logging
+from typing import List, Optional, Set, Tuple
+
+import cassio
+from cassandra.cluster import ResponseFuture, Session
+from cassio.table.query import Predicate, PredicateOperator
+from cassio.table.tables import ClusteredMetadataVectorCassandraTable
+
+from .base_database import BaseDatabase
+from .constant import DEFAULT_COLBERT_DIM
+from .objects import Chunk, Vector
+
+
+class CassandraDatabase(BaseDatabase):
+    """
+    An implementation of the BaseDatabase abstract base class using Cassandra as the backend
+    storage system. This class provides methods to store, retrieve, and manage text embeddings within
+    a Cassandra database, specifically designed for handling vector embeddings generated by ColBERT.
+
+    The table schema and custom index for ANN queries are automatically created if they do not exist.
+    """
+
+    _table: ClusteredMetadataVectorCassandraTable
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            raise ValueError(
+                "This class cannot be instantiated directly. Please use the `from_astra()` or `from_session()` class methods."
+            )
+        return cls._instance
+
+    @classmethod
+    def from_astra(
+        cls,
+        database_id: str,
+        astra_token: str,
+        keyspace: Optional[str] = "default_keyspace",
+        table_name: Optional[str] = "colbert",
+        timeout: Optional[int] = 180,
+    ):
+        if cls._instance is None:
+            cassio.init(token=astra_token, database_id=database_id, keyspace=keyspace)
+            session = cassio.config.resolve_session()
+            session.default_timeout = timeout
+
+            return cls.from_session(
+                session=session, keyspace=keyspace, table_name=table_name
+            )
+        return cls._instance
+
+    @classmethod
+    def from_session(
+        cls,
+        session: Session,
+        keyspace: Optional[str] = "default_keyspace",
+        table_name: Optional[str] = "colbert",
+    ):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialize(
+                session=session, keyspace=keyspace, table_name=table_name
+            )
+        return cls._instance
+
+    def _initialize(
+        self,
+        session: Session,
+        keyspace: str,
+        table_name: str,
+    ):
+        """
+        Initializes a new instance of the CassandraDatabase.
+
+        Parameters:
+            session (Session): The Cassandra session to use.
+            keyspace (str): The keyspace in which the table exists or will be created.
+            table_name (str): The name of the table to use or create for storing embeddings.
+        """
+
+        try:
+            is_astra = session.cluster.cloud
+        except AttributeError:
+            is_astra = False
+
+        logging.info(
+            f"Cassandra store is running on {'AstraDB' if is_astra else 'Apache Cassandra'}."
+        )
+
+        self._table = ClusteredMetadataVectorCassandraTable(
+            session=session,
+            keyspace=keyspace,
+            table=table_name,
+            row_id_type=["INT", "INT"],
+            vector_dimension=DEFAULT_COLBERT_DIM,
+            vector_source_model="bert" if is_astra else None,
+            vector_similarity_function=None if is_astra else "DOT_PRODUCT",
+        )
+
+    def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:
+        """
+        Stores a list of embedded text chunks in the vector store.
+
+        Parameters:
+            chunks (List[Chunk]): A list of `Chunk` instances to be stored.
+
+        Returns:
+            a list of tuples: (doc_id, chunk_id)
+        """
+
+        futures: List[Tuple[str, int, int, ResponseFuture]] = []
+
+        for chunk in chunks:
+            doc_id = chunk.doc_id
+            chunk_id = chunk.chunk_id
+            text = chunk.text
+            metadata = chunk.metadata
+
+            future = self._table.put_async(
+                partition_id=doc_id,
+                row_id=(chunk_id, -1),
+                body_blob=text,
+                metadata=metadata,
+            )
+
+            futures.append((doc_id, chunk_id, -1, future))
+
+            for index, vector in enumerate(chunk.embedding):
+                future = self._table.put_async(
+                    partition_id=doc_id, row_id=(chunk_id, index), vector=vector
+                )
+                futures.append((doc_id, chunk_id, index, future))
+
+        results: List[Tuple[str, int]] = []
+        for doc_id, chunk_id, embedding_id, future in futures:
+            try:
+                future.result()
+                results.append((doc_id, chunk_id))
+            except Exception as e:
+                if embedding_id == -1:
+                    logging.error(
+                        f"issue inserting document data: {doc_id} chunk: {chunk_id}: {e}"
+                    )
+                else:
+                    logging.error(
+                        f"issue inserting document embedding: {doc_id} chunk: {chunk_id} embedding: {embedding_id}: {e}"
+                    )
+
+        return results
+
+    def delete_chunks(self, doc_ids: List[str]) -> bool:
+        """
+        Deletes chunks from the vector store based on their document id.
+
+        Parameters:
+            doc_ids (List[str]): A list of document identifiers specifying the chunks to be deleted.
+
+        Returns:
+            True if the delete was successful.
+        """
+
+        futures = [
+            (doc_id, self._table.delete_partition_async(partition_id=doc_id))
+            for doc_id in doc_ids
+        ]
+
+        success = True
+        for doc_id, future in futures:
+            try:
+                future.result()
+            except Exception as e:
+                success = False
+                logging.error(f"issue on delete of document: {doc_id}: {e}")
+        return success
+
+    async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]:
+        """
+        Retrieves 'n' ANN results for an embedded token vector.
+
+        Returns:
+            A list of Chunks with only `doc_id` and `chunk_id` set.
+            Fewer than 'n' results may be returned.
+        """
+
+        chunks: Set[Chunk] = set()
+
+        # TODO: only return partition_id and row_id after cassio supports this
+        rows = await self._table.aann_search(vector=vector, n=n)
+        for row in rows:
+            chunks.add(
+                Chunk(
+                    doc_id=row["partition_id"],
+                    chunk_id=row["row_id"][0],
+                )
+            )
+        return list(chunks)
+
+    async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk:
+        """
+        Retrieve the embedding data for a chunk.
+
+        Returns:
+            A chunk with `doc_id`, `chunk_id`, and `embedding` set.
+        """
+
+        row_id = (chunk_id, Predicate(PredicateOperator.GT, -1))
+        rows = await self._table.aget_partition(partition_id=doc_id, row_id=row_id)
+
+        embedding = [row["vector"] for row in rows]
+
+        return Chunk(doc_id=doc_id, chunk_id=chunk_id, embedding=embedding)
+
+    async def get_chunk_data(
+        self, doc_id: str, chunk_id: int, include_embedding: Optional[bool]
+    ) -> Chunk:
+        """
+        Retrieve the text and metadata for a chunk.
+
+        Returns:
+            A chunk with `doc_id`, `chunk_id`, `text`, `metadata`, and optionally `embedding` set.
+        """
+
+        row_id = (chunk_id, Predicate(PredicateOperator.EQ, -1))
+        row = await self._table.aget(partition_id=doc_id, row_id=row_id)
+
+        if include_embedding is True:
+            embedded_chunk = await self.get_chunk_embedding(
+                doc_id=doc_id, chunk_id=chunk_id
+            )
+            embedding = embedded_chunk.embedding
+        else:
+            embedding = None
+
+        return Chunk(
+            doc_id=doc_id,
+            chunk_id=chunk_id,
+            text=row["body_blob"],
+            metadata=row["metadata"],
+            embedding=embedding,
+        )
+
+    def close(self) -> None:
+        """
+        Cleans up any open resources.
+ """ + pass diff --git a/libs/colbert/ragstack_colbert/cassandra_vector_store.py b/libs/colbert/ragstack_colbert/cassandra_vector_store.py deleted file mode 100644 index c93750c45..000000000 --- a/libs/colbert/ragstack_colbert/cassandra_vector_store.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -This module provides an implementation of the BaseVectorStore abstract class, specifically designed -for use with a Cassandra database backend. It allows for the efficient storage and management of text embeddings -generated by a ColBERT model, facilitating scalable and high-relevancy retrieval operations. -""" - -import logging -from typing import Any, Dict, List, Optional, Set, Tuple - -import torch -from cassandra.cluster import ResponseFuture, Session -from cassio.table.query import Predicate, PredicateOperator -from cassio.table.tables import ClusteredMetadataVectorCassandraTable -from torch import Tensor - -from .base_vector_store import BaseVectorStore -from .objects import BaseChunk, ChunkData, EmbeddedChunk - - -class CassandraVectorStore(BaseVectorStore): - """ - An implementation of the BaseVectorStore abstract base class using Cassandra as the backend - storage system. This class provides methods to store, retrieve, and manage text embeddings within - a Cassandra database, specifically designed for handling vector embeddings generated by ColBERT. - - Attributes: - session (Session): The Cassandra session instance used for database operations. - keyspace (str): The name of the Cassandra keyspace to use. - table_name (str): The name of the table within the keyspace for storing chunk embeddings. - full_table_name (str): The full name of the table (including keyspace) for queries. - insert_chunk_stmt (PreparedStatement): Prepared statement for inserting text chunks. - insert_colbert_stmt (PreparedStatement): Prepared statement for inserting embeddings. - query_colbert_ann_stmt (PreparedStatement): Prepared statement for ANN queries. - query_colbert_chunks_stmt (PreparedStatement): Prepared statement for retrieving chunks. - query_chunk_stmt (PreparedStatement): Prepared statement for retrieving chunk bodies. - delete_chunks_by_doc_id_stmt (PreparedStatement): Prepared statement for deleting chunks. - - The table schema and custom index for ANN queries are automatically created if they do not exist. - """ - - _table: ClusteredMetadataVectorCassandraTable - - def __init__( - self, session: Session, keyspace: str, table_name: str, timeout: int = 180 - ): - """ - Initializes a new instance of the CassandraVectorStore. - - Parameters: - session (Session): The Cassandra session to use. - keyspace (str): The keyspace in which the table exists or will be created. - table_name (str): The name of the table to use or create for storing embeddings. - timeout (int, optional): The default timeout in seconds for Cassandra operations. Defaults to 180. 
- """ - - cluster_name = session.cluster.metadata.cluster_name.lower() - is_astra = "cndb" == cluster_name - logging.debug(f"colbert store is running on {'astra' if is_astra else 'apache cassandra'}") - - session.default_timeout = timeout - - self._table = ClusteredMetadataVectorCassandraTable( - session=session, - keyspace=keyspace, - table=table_name, - row_id_type=["INT", "INT"], - vector_dimension=128, - vector_source_model="bert" if is_astra else None, - vector_similarity_function=None if is_astra else "DOT_PRODUCT", - ) - - def put_chunks( - self, chunks: List[EmbeddedChunk], delete_existing: Optional[bool] = False - ) -> None: - """ - Stores a list of EmbeddedChunk instances in the Cassandra database, managing both the text - body and the embeddings of each chunk. Optionally deletes existing chunks for each document - before insertion. - - Parameters: - chunks (List[EmbeddedChunk]): A list of EmbeddedChunk instances to store. - delete_existing (Optional[bool]): A flag indicating whether to delete existing chunks for the - documents related to the chunks being inserted. - """ - - if delete_existing: - doc_ids = [c.doc_id for c in chunks] - self.delete_chunks(list(set(doc_ids))) - - futures: List[Tuple[str, int, int, ResponseFuture]] = [] - - for chunk in chunks: - doc_id = chunk.doc_id - chunk_id = chunk.chunk_id - text = chunk.data.text - metadata = {} if len(chunk.data.metadata) == 0 else chunk.data.metadata - - future = self._table.put_async(partition_id=doc_id, row_id=(chunk_id, -1), body_blob=text, metadata=metadata) - - futures.append((doc_id, chunk_id, -1, future)) - - for index, vector in enumerate(chunk.embeddings.tolist()): - future = self._table.put_async(partition_id=doc_id, row_id=(chunk_id, index), vector=vector) - futures.append((doc_id, chunk_id, index, future)) - - for (doc_id, chunk_id, embedding_id, future) in futures: - try: - future.result() - except Exception as e: - if embedding_id == -1: - logging.error(f"issue inserting document data: {doc_id} chunk: {chunk_id}: {e}") - else: - logging.error(f"issue inserting document embedding: {doc_id} chunk: {chunk_id} embedding: {embedding_id}: {e}") - - def delete_chunks(self, doc_ids: List[str]) -> None: - """ - Deletes all chunks associated with the specified document identifiers. - - Parameters: - doc_ids (List[str]): A list of document identifiers whose chunks should be deleted. - """ - - futures = [(doc_id, self._table.delete_partition_async(partition_id = doc_id)) for doc_id in doc_ids] - - for (doc_id, future) in futures: - try: - future.result() - except Exception as e: - logging.error(f"issue on delete of document: {doc_id}: {e}") - - async def search_relevant_chunks(self, vector: List[float], n: int) -> List[BaseChunk]: - """ - Retrieves 'n' ANN results for an embedded token vector. - - Returns: - A set of tuples of (doc_id, chunk_id). Fewer than 'n' results may be returned. - """ - - chunks: Set[BaseChunk] = set() - - # TODO: only return partition_id and row_id after cassio supports this - rows = await self._table.aann_search(vector=vector, n=n) - for row in rows: - chunks.add(BaseChunk( - doc_id=row["partition_id"], - chunk_id=row["row_id"][0], - )) - return list(chunks) - - async def get_chunk_embeddings(self, chunk: BaseChunk) -> Tuple[BaseChunk, List[Tensor]]: - """ - Retrieve all the embedding data for a chunk. - - Returns: - A tuple where the first value is BaseChunk, and the second - value is the list of embeddings for the chunk. 
- """ - - row_id = (chunk.chunk_id, Predicate(PredicateOperator.GT, -1)) - rows = await self._table.aget_partition(partition_id=chunk.doc_id, row_id=row_id) - - embeddings = [torch.Tensor(row["vector"]) for row in rows] - - return (chunk, embeddings) - - async def get_chunk_data(self, chunk: BaseChunk) -> Tuple[BaseChunk, ChunkData]: - """ - Fetches the text for a given chunk. - - Returns: - Tuple containing the chunk, and chunk_data - """ - - row_id = (chunk.chunk_id, Predicate(PredicateOperator.EQ, -1)) - row = await self._table.aget(partition_id=chunk.doc_id, row_id=row_id) - - chunk_data = ChunkData( - text=row["body_blob"], - metadata=row["metadata"] - ) - return (chunk, chunk_data) - - def close(self) -> None: - """ - Closes the Cassandra session and any other resources. This method should be overridden to - ensure proper cleanup if necessary. - """ - pass diff --git a/libs/colbert/ragstack_colbert/colbert_embedding_model.py b/libs/colbert/ragstack_colbert/colbert_embedding_model.py index 3d712c2d3..e4bf24df0 100644 --- a/libs/colbert/ragstack_colbert/colbert_embedding_model.py +++ b/libs/colbert/ragstack_colbert/colbert_embedding_model.py @@ -8,42 +8,14 @@ """ import logging -import uuid -from typing import Dict, List, Optional +from typing import List, Optional -import torch -import torch.distributed as dist -from torch import Tensor - -from colbert.indexing.collection_encoder import CollectionEncoder -from colbert.infra import ColBERTConfig, Run, RunConfig -from colbert.modeling.checkpoint import Checkpoint -from colbert.modeling.tokenization import QueryTokenizer +from colbert.infra import ColBERTConfig from .base_embedding_model import BaseEmbeddingModel from .constant import DEFAULT_COLBERT_MODEL -from .distributed import Distributed, Runner, reconcile_nranks -from .objects import ChunkData, EmbeddedChunk, EmbeddedText - - -def calculate_query_maxlen(tokens: List[List[str]]) -> int: - """ - Calculates an appropriate maximum query length for token embeddings, based on the length of the tokenized input. - - Parameters: - tokens (List[List[str]]): A nested list where each sublist contains tokens from a single query or chunk. - - Returns: - int: The calculated maximum length for query tokens, adhering to the specified minimum and maximum bounds, - and adjusted to the nearest power of two. - """ - - max_token_length = max(len(inner_list) for inner_list in tokens) - - # tokens from the query tokenizer does not include the SEP, CLS - # SEP, CLS, and Q tokens are added to the query - # although there could be more SEP tokens if there are more than one sentences, we only add one - return max_token_length + 3 +from .objects import Chunk, Embedding +from .text_encoder import TextEncoder class ColbertEmbeddingModel(BaseEmbeddingModel): @@ -53,40 +25,21 @@ class ColbertEmbeddingModel(BaseEmbeddingModel): retrieval tasks. It leverages a pre-trained ColBERT model and supports distributed computing environments. The class supports both GPU and CPU operations, with GPU usage recommended for performance efficiency. - - Attributes: - colbert_config (ColBERTConfig): Configuration parameters for the Colbert model. - checkpoint (Checkpoint): Manages the loading of the model and its parameters. - encoder (CollectionEncoder): Facilitates the encoding of texts into embeddings. - query_tokenizer (QueryTokenizer): Tokenizes queries for embedding. 
""" - colbert_config: ColBERTConfig - checkpoint: Checkpoint - encoder: CollectionEncoder - query_tokenizer: QueryTokenizer - - """ - checkpoint is the where the ColBERT model can be specified or downloaded from huggingface - colbert_model_url overwrites the checkpoint value if it exists - doc_maxlen is the number tokens each passage is truncated to - nbits is the number bits that each dimension encodes to - kmeans_niters specifies the number of iterations of kmeans clustering - nrank is the number of processors embeddings can run on - under the default value of -1, the program runs on all available GPUs under CUDA - query_maxlen is the fixed length of the tokens for query/recall encoding. Anything less will be padded. - """ + _query_maxlen: int + _chunk_batch_size: int def __init__( self, - checkpoint: str = DEFAULT_COLBERT_MODEL, - doc_maxlen: int = 220, - nbits: int = 2, - kmeans_niters: int = 4, - nranks: int = -1, - query_maxlen: int = -1, - verbose: int = 3, # 3 is the default on ColBERT checkpoint - distributed_communication: bool = False, + checkpoint: Optional[str] = DEFAULT_COLBERT_MODEL, + doc_maxlen: Optional[int] = 256, + nbits: Optional[int] = 2, + kmeans_niters: Optional[int] = 4, + nranks: Optional[int] = -1, + query_maxlen: Optional[int] = None, + verbose: Optional[int] = 3, # 3 is the default on ColBERT checkpoint + chunk_batch_size: Optional[int] = 640, **kwargs, ): """ @@ -94,124 +47,65 @@ def __init__( loading the necessary checkpoints, and preparing the tokenizer and encoder. Parameters: - checkpoint (str): Path or URL to the Colbert model checkpoint. Default is a pre-defined model. - doc_maxlen (int): Maximum number of tokens for document chunks. - nbits (int): The number bits that each dimension encodes to. - kmeans_niters (int): Number of iterations for k-means clustering during quantization. - nranks (int): Number of ranks (processors) to use for distributed computing; -1 uses all available CPUs/GPUs. - query_maxlen (int): Maximum length of query tokens for embedding. - verbose (int): Verbosity level for logging. - distributed_communication (bool): Flag to enable distributed computation. + checkpoint (Optional[str]): Path or URL to the Colbert model checkpoint. Default is a pre-defined model. + doc_maxlen (Optional[int]): Maximum number of tokens for document chunks. Should equal the chunk_size. + nbits (Optional[int]): The number bits that each dimension encodes to. + kmeans_niters (Optional[int]): Number of iterations for k-means clustering during quantization. + nranks (Optional[int]): Number of ranks (processors) to use for distributed computing; -1 uses all available CPUs/GPUs. + query_maxlen (Optional[int]): Maximum length of query tokens for embedding. + verbose (Optional[int]): Verbosity level for logging. + chunk_batch_size (Optional[int]): The number of chunks to batch during embedding. Defaults to 640. **kwargs: Additional keyword arguments for future extensions. - - Note: - This initializer also prepares the system for distributed computation if specified and available. 
""" - self.__cuda = torch.cuda.is_available() - self.__nranks = reconcile_nranks(nranks) - total_visible_gpus = torch.cuda.device_count() - logging.info(f"run nranks {self.__nranks}") - if ( - self.__nranks > 1 - and not dist.is_initialized() - and distributed_communication - ): - logging.info(f"distribution initialization must complete on {nranks} gpus") - Distributed(self.__nranks) - logging.info("distribution initialization completed") + if query_maxlen is None: + query_maxlen = -1 - with Run().context(RunConfig(nranks=nranks)): - if self.__cuda: - torch.cuda.empty_cache() - self.colbert_config = ColBERTConfig( - doc_maxlen=doc_maxlen, - nbits=nbits, - kmeans_niters=kmeans_niters, - nranks=self.__nranks, - checkpoint=checkpoint, - query_maxlen=query_maxlen, - gpus=total_visible_gpus, - ) - logging.info("creating checkpoint") - self.checkpoint = Checkpoint( - self.colbert_config.checkpoint, - colbert_config=self.colbert_config, - verbose=verbose, - ) - self.encoder = CollectionEncoder( - config=self.colbert_config, checkpoint=self.checkpoint + self._query_maxlen = query_maxlen + self._chunk_batch_size = chunk_batch_size + + colbert_config = ColBERTConfig( + doc_maxlen=doc_maxlen, + nbits=nbits, + kmeans_niters=kmeans_niters, + nranks=nranks, + checkpoint=checkpoint, ) - self.query_tokenizer = QueryTokenizer(self.colbert_config) - if self.__cuda: - self.checkpoint = self.checkpoint.cuda() + self._encoder = TextEncoder(config=colbert_config, verbose=verbose) - def embed_chunks( - self, chunks: List[ChunkData], doc_id: Optional[str] = None - ) -> List[EmbeddedChunk]: + # implements the Abstract Class Method + def embed_texts(self, texts: List[str]) -> List[Embedding]: """ - Encodes a list of text chunks into embeddings, returning them as a list of EmbeddedChunk objects. - Each chunk text is converted into a dense vector representation. + Embeds a list of texts into their corresponding vector embedding representations. Parameters: - texts (List[str]): The list of chunk texts to be embedded. - doc_id (Optional[str]): An optional document identifier. If not provided, a UUID is generated. + texts (List[str]): A list of string texts. Returns: - List[EmbeddedChunk]: A list of EmbeddedChunk objects containing the embeddings and document/chunk identifiers. 
+ List[Embedding]: A list of embeddings, in the order of the input list """ - if doc_id is None: - doc_id = str(uuid.uuid4()) - - timeout = 60 + len(chunks) - - chunk_data_map: Dict[int, ChunkData] = {} - texts: List[str] = [] - - for chunk_idx, chunk in enumerate(chunks): - chunk_data_map[chunk_idx] = chunk - texts.append(chunk.text) + chunks = [ + Chunk(doc_id="dummy", chunk_id=i, text=t) for i, t in enumerate(texts) + ] - embedded_texts = self._encode_texts_using_distributed(texts=texts, timeout=timeout) + embedded_chunks = [] - output: List[EmbeddedChunk] = [] - for embedded_text in embedded_texts: - chunk_data = chunk_data_map[embedded_text.original_index] - output.append(EmbeddedChunk( - data=chunk_data, - doc_id=doc_id, - chunk_id=embedded_text.chunk_id, - embeddings=embedded_text.embeddings, - )) + for i in range(0, len(chunks), self._chunk_batch_size): + chunk_batch = chunks[i : i + self._chunk_batch_size] + embedded_chunks.extend(self._encoder.encode_chunks(chunks=chunk_batch)) - return output + sorted_embedded_chunks = sorted(embedded_chunks, key=lambda c: c.chunk_id) - # this is an alternative method for embedding the query that might give better results - def embed_query_alternative(self, query_text: str) -> List[Tensor]: - """ - Encodes a single query text into its embedding representation, optimized for retrieval tasks. - - Parameters: - query_text (str): The query text to be encoded. - - Returns: - Tensor: A tensor representing the encoded query's embedding. - - Note: - This method does not pad the query text to query_maxlen. Additionally, it does not - reload the checkpoint, therefore improving embedding speed. - """ - - embedded_text = self._encode_texts_using_distributed(texts=[query_text])[0] - return embedded_text.embeddings + return [c.embedding for c in sorted_embedded_chunks] + # implements the Abstract Class Method def embed_query( self, query: str, full_length_search: Optional[bool] = False, - query_maxlen: int = -1, - ) -> Tensor: + query_maxlen: Optional[int] = None, + ) -> Embedding: """ Embeds a single query text into its vector representation. @@ -221,78 +115,16 @@ def embed_query( query (str): The query string to encode. full_length_search (Optional[bool]): Indicates whether to encode the query for a full-length search. Defaults to False. - query_maxlen (int): The fixed length for the query token embedding. If -1, uses a dynamically calculated value. - - Returns: - Tensor: A tensor representing the embedded query. - """ - - embeddings = self._encode_queries_using_local( - [query], full_length_search, query_maxlen=query_maxlen - ) - return embeddings[0] - - def _encode_texts_using_distributed( - self, - texts: List[str], - timeout: int = 60, - ) -> List[EmbeddedText]: - """ - Encodes a list of texts chunks into embeddings, represented as EmbeddedChunk objects. This - method leverages the ColBERT model's encoding capabilities to convert textual content into - dense vector representations suitable for semantic search and retrieval applications. - - Parameters: - texts (List[str]): The list of text chunks to encode. - doc_id: An identifier for the document from which the chunks are derived. - timeout (int): The timeout in seconds for the encoding operation. Defaults to 60 seconds. + query_maxlen (int): The fixed length for the query token embedding. If None, uses a dynamically calculated value. 
Returns: - List[EmbeddedChunk]: A list of EmbeddedChunk objects containing the embeddings for each chunk text, along - with their associated document and chunk identifiers. + Embedding: A vector embedding representation of the query text """ - runner = Runner(self.__nranks) - return runner.encode( - config=self.colbert_config, - texts=texts, - timeout=timeout, - ) - - def _encode_queries_using_local( - self, - queries: List[str], - full_length_search: Optional[bool] = False, - query_maxlen: int = -1, - ) -> Tensor: - """ - Encodes one or more texts (queries) into dense vector representations. It supports encoding queries to a fixed - length, adjusting for the maximum token length or padding as necessary. The method is suitable for both - single and batch query processing, with optional support for full-length search encoding. - - Parameters: - queries (List[str]): A single query string or a list of query strings to be encoded. - full_length_search (Optional[bool]): If True, encodes queries for full-length search. Defaults to False. - query_maxlen (int): A fixed length for query token embeddings. If -1, uses a dynamically calculated value. - - Returns: - Tensor: A tensor containing the encoded queries. If multiple queries are provided, the tensor will - contain one row per query. - """ - - tokens = self.query_tokenizer.tokenize(queries) - _query_maxlen = max(query_maxlen, self.colbert_config.query_maxlen) - if _query_maxlen < 0: - _query_maxlen = calculate_query_maxlen(tokens) - logging.debug(f"Calculated dynamic query_maxlen of {_query_maxlen}") - - self.checkpoint.query_tokenizer.query_maxlen = _query_maxlen + if query_maxlen is None: + query_maxlen = -1 - # All query embeddings in the ColBERT documentation - # this name, EQ or Q, maps the exact name in most colBERT papers - batch_size = 128 if len(queries) > 128 else None - to_cpu = not self.__cuda - queriesQ = self.checkpoint.queryFromText( - queries, bsize=batch_size, to_cpu=to_cpu, full_length_search=full_length_search + query_maxlen = max(query_maxlen, self._query_maxlen) + return self._encoder.encode_query( + text=query, query_maxlen=query_maxlen, full_length_search=full_length_search ) - return queriesQ diff --git a/libs/colbert/ragstack_colbert/colbert_retriever.py b/libs/colbert/ragstack_colbert/colbert_retriever.py index dce8c1aed..5b72435bd 100644 --- a/libs/colbert/ragstack_colbert/colbert_retriever.py +++ b/libs/colbert/ragstack_colbert/colbert_retriever.py @@ -11,16 +11,14 @@ import asyncio import logging import math -import nest_asyncio -from typing import Any, Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, Tuple import torch -from torch import Tensor +from .base_database import BaseDatabase from .base_embedding_model import BaseEmbeddingModel from .base_retriever import BaseRetriever -from .base_vector_store import BaseVectorStore -from .objects import BaseChunk, ChunkData, RetrievedChunk +from .objects import Chunk, Embedding, Vector def all_gpus_support_fp16(is_cuda: Optional[bool] = False): @@ -49,18 +47,18 @@ def all_gpus_support_fp16(is_cuda: Optional[bool] = False): def max_similarity_torch( - query_vector: Tensor, - embedding_list: List[Tensor], + query_vector: Vector, + chunk_embedding: Embedding, is_cuda: Optional[bool] = False, is_fp16: Optional[bool] = False, -) -> Tensor: +) -> float: """ - Calculates the maximum similarity (dot product) between a query vector and a list of embedding vectors, + Calculates the maximum similarity (dot product) between a query vector and a chunk 
embedding, leveraging PyTorch for efficient computation. Parameters: - query_vector (Tensor): A 1D tensor representing the query vector. - embedding_list (List[Tensor]): A list of 1D tensors, each representing an embedding vector. + query_vector (Vector): A list of float representing the query text. + chunk_embedding (Embedding): A list of Vector, each representing an chunk embedding vector. is_cuda (Optional[bool]): A flag indicating whether to use CUDA (GPU) for computation. Defaults to False. is_fp16 (bool): A flag indicating whether to half-precision floating point operations on CUDA (GPU). Has no effect on CPU computation. Defaults to False. @@ -73,27 +71,29 @@ def max_similarity_torch( This function is designed to run on GPU for enhanced performance but can also execute on CPU. """ - # Convert embedding list to a tensor - embedding_tensor = torch.stack(embedding_list) + # Convert inputs to tensors + query_tensor = torch.Tensor(query_vector) + embedding_tensor = torch.stack([torch.Tensor(v) for v in chunk_embedding]) if is_cuda: device = torch.device("cuda") - query_vector = query_vector.to(device) + query_tensor = query_tensor.to(device) embedding_tensor = embedding_tensor.to(device) # Use half-precision operations if supported if is_fp16: - query_vector = query_vector.half() + query_tensor = query_tensor.half() embedding_tensor = embedding_tensor.half() # Perform the dot product operation - sims = torch.matmul(embedding_tensor, query_vector) + sims = torch.matmul(embedding_tensor, query_tensor) # Find the maximum similarity max_sim = torch.max(sims) # returns a tensor; the item() is the score - return max_sim + return float(max_sim.item()) + def get_trace(e: Exception) -> str: trace = "" @@ -122,32 +122,32 @@ class ColbertRetriever(BaseRetriever): computation if a GPU is not available. """ - vector_store: BaseVectorStore - embedding_model: BaseEmbeddingModel - is_cuda: bool = False - is_fp16: bool = False + _database: BaseDatabase + _embedding_model: BaseEmbeddingModel + _is_cuda: bool + _is_fp16: bool class Config: arbitrary_types_allowed = True def __init__( self, - vector_store: BaseVectorStore, + database: BaseDatabase, embedding_model: BaseEmbeddingModel, ): """ Initializes the retriever with a specific vector store and Colbert embeddings model. Parameters: - vector_store (BaseVectorStore): The vector store to be used for retrieving embeddings. + database (BaseDatabase): The data store to be used for retrieving embeddings. embedding_model (BaseEmbeddingModel): The ColBERT embeddings model to be used for encoding queries. """ - self.vector_store = vector_store - self.embedding_model = embedding_model - self.is_cuda = torch.cuda.is_available() - self.is_fp16 = all_gpus_support_fp16(self.is_cuda) + self._database = database + self._embedding_model = embedding_model + self._is_cuda = torch.cuda.is_available() + self._is_fp16 = all_gpus_support_fp16(self._is_cuda) def close(self) -> None: """ @@ -156,189 +156,256 @@ def close(self) -> None: pass async def _query_relevant_chunks( - self, query_embeddings: List[Tensor], top_k: int - ) -> Set[BaseChunk]: + self, query_embedding: Embedding, top_k: int + ) -> Set[Chunk]: """ - Retrieves the top_k ANN results for each embedded query token. + Retrieves the top_k ANN Chunks (`doc_id` and `chunk_id` only) for each embedded query token. 
""" - chunks: Set[BaseChunk] = set() + chunks: Set[Chunk] = set() # Collect all tasks - tasks = [self.vector_store.search_relevant_chunks(vector=v, n=top_k) for v in query_embeddings] + tasks = [ + self._database.search_relevant_chunks(vector=v, n=top_k) + for v in query_embedding + ] results = await asyncio.gather(*tasks, return_exceptions=True) # Process results and handle potential exceptions for result in results: if isinstance(result, Exception): - logging.error(f"Issue on vector_store.get_relevant_chunks(): {result} at {get_trace(result)}") + logging.error( + f"Issue on database.get_relevant_chunks(): {result} at {get_trace(result)}" + ) else: chunks.update(result) return chunks - async def _retrieve_chunks( - self, chunks: Set[BaseChunk] - ) -> Dict[BaseChunk, List[Tensor]]: + async def _get_chunk_embeddings(self, chunks: Set[Chunk]) -> List[Chunk]: """ - Retrieves embeddings for a list of chunks, returning a dictionary mapping chunk to a list of PyTorch tensors. + Retrieves Chunks with `doc_id`, `chunk_id`, and `embedding` set. """ - chunk_embeddings: Dict[BaseChunk, List[Tensor]] = {} - # Collect all tasks - tasks = [self.vector_store.get_chunk_embeddings(chunk) for chunk in chunks] + tasks = [ + self._database.get_chunk_embedding(doc_id=c.doc_id, chunk_id=c.chunk_id) + for c in chunks + ] results = await asyncio.gather(*tasks, return_exceptions=True) # Process results and handle potential exceptions for result in results: if isinstance(result, Exception): - logging.error(f"Issue on vector_store.get_chunk_embeddings(): {result} at {get_trace(result)}") - else: - chunk, embeddings = result - chunk_embeddings[chunk] = embeddings + logging.error( + f"Issue on database.get_chunk_embeddings(): {result} at {get_trace(result)}" + ) - return chunk_embeddings + return results def _score_chunks( - self, query_embeddings: Tensor, chunk_data: Dict[BaseChunk, List[Tensor]] - ) -> Dict[BaseChunk, Tensor]: + self, query_embedding: Embedding, chunk_embeddings: List[Chunk] + ) -> Dict[Chunk, float]: """ Process the retrieved chunk data to calculate scores. """ chunk_scores = {} - for chunk, embeddings in chunk_data.items(): + for chunk in chunk_embeddings: chunk_scores[chunk] = sum( max_similarity_torch( - query_vector=qv, - embedding_list=embeddings, - is_cuda=self.is_cuda, - is_fp16=self.is_fp16, + query_vector=query_vector, + chunk_embedding=chunk.embedding, + is_cuda=self._is_cuda, + is_fp16=self._is_fp16, ) - for qv in query_embeddings + for query_vector in query_embedding ) return chunk_scores - async def _fetch_chunk_data( + async def _get_chunk_data( self, - chunks_by_score: List[BaseChunk], - chunk_scores: Dict[BaseChunk, Tensor], - ) -> List[RetrievedChunk]: + chunks: List[Chunk], + include_embedding: Optional[bool] = False, + ) -> List[Chunk]: """ - Fetches text and metadata for each chunk and ranks them based on scores. - - Parameters: - chunks_by_score (List[Tuple[str, int]]): List of tuples containing (doc_id, chunk_id) sorted by score. - chunk_scores (Dict[Tuple[str, int], Tensor]): Dictionary mapping (doc_id, chunk_id) to their respective scores. + Fetches text and metadata for each chunk. Returns: - List[RetrievedChunk]: A list of RetrievedChunk objects with populated fields. + List[Chunk]: A list of chunks with `doc_id`, `chunk_id`, `text`, `metadata`, and optionally `embedding` set. 
""" # Collect all tasks - tasks = [self.vector_store.get_chunk_data(chunk=chunk) for chunk in chunks_by_score] + tasks = [ + self._database.get_chunk_data( + doc_id=c.doc_id, + chunk_id=c.chunk_id, + include_embedding=include_embedding, + ) + for c in chunks + ] results = await asyncio.gather(*tasks, return_exceptions=True) - # Process results and handle potential exceptions - chunk_data_map: Dict[BaseChunk, ChunkData] = {} for result in results: if isinstance(result, Exception): - logging.error(f"Issue on vector_store.get_chunk_text_and_metadata(): {result} at {get_trace(result)}") - else: - chunk, chunk_data = result - chunk_data_map[chunk] = chunk_data - - answers: List[RetrievedChunk] = [] - - for idx, chunk in enumerate(chunks_by_score): - score = chunk_scores[chunk] - chunk_data = chunk_data_map[chunk] - answers.append( - RetrievedChunk( - doc_id=chunk.doc_id, - chunk_id=chunk.chunk_id, - score=score.item(), # Ensure score is a scalar if it's a tensor - rank=idx + 1, - data=chunk_data, - ) + logging.error( + f"Issue on database.get_chunk_data(): {result} at {get_trace(result)}" ) - return answers + return results - async def aretrieve( + async def atext_search( self, - query: str, - k: int = 10, - query_maxlen: int = 64, + query_text: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + include_embedding: Optional[bool] = False, **kwargs: Any, - ) -> List[RetrievedChunk]: + ) -> List[Tuple[Chunk, float]]: """ Retrieves a list of text chunks most relevant to the given query, using semantic similarity as the criteria. Parameters: - query (str): The text query for which relevant chunks are to be retrieved. - k (int, optional): The number of top relevant chunks to retrieve. Defaults to 10. - query_maxlen (int, optional): The maximum number of tokens in the query. If -1, this will be calculated dynamically. - query_timeout (int, optional): The timeout in seconds for query execution. Defaults to 180. - **kwargs (Any): Additional keyword arguments that can be used for extending functionality. + query_text (str): The query text to search for relevant text chunks. + k (Optional[int]): The number of top results to retrieve. Default 5. + query_maxlen (Optional[int]): The maximum length of the query to consider. If None, the + maxlen will be dynamically generated. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks + **kwargs (Any): Additional parameters that implementations might require for customized + retrieval operations. Returns: - List[RetrievedChunk]: A list of RetrievedChunk objects, each representing a text chunk that is relevant - to the query, along with its similarity score and rank. + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. + """ + + query_embedding = self._embedding_model.embed_query( + query=query_text, query_maxlen=query_maxlen + ) + + return await self.aembedding_search( + query_embedding=query_embedding, + k=k, + include_embedding=include_embedding, + **kwargs, + ) + + async def aembedding_search( + self, + query_embedding: Embedding, + k: Optional[int] = 5, + include_embedding: Optional[bool] = False, + **kwargs: Any, + ) -> List[Tuple[Chunk, float]]: + """ + Retrieves a list of text chunks most relevant to the given query, using semantic similarity as the criteria. 
- Note: - The actual retrieval process involves encoding the query, performing an ANN search to find relevant - embeddings, scoring these embeddings for similarity, and retrieving the corresponding text chunks. + Parameters: + query_embedding (Embedding): The query embedding to search for relevant text chunks. + k (Optional[int]): The number of top results to retrieve. Default 5. + include_embedding (Optional[bool]): Optional (default False) flag to include the + embedding vectors in the returned chunks + **kwargs (Any): Additional parameters that implementations might require for customized + retrieval operations. + + Returns: + List[Tuple[Chunk, float]]: A list of retrieved Chunk, float Tuples, each representing a text chunk that is relevant + to the query, along with its similarity score. """ - query_embeddings = self.embedding_model.embed_query( - query, query_maxlen=query_maxlen + top_k = max(math.floor(len(query_embedding) / 2), 16) + logging.debug( + f"based on query length of {len(query_embedding)} tokens, retrieving {top_k} results per token-embedding" ) - top_k = max(math.floor(len(query_embeddings) / 2), 16) - logging.debug(f"based on query length of {len(query_embeddings)} tokens, retrieving {top_k} results per token-embedding") + # search for relevant chunks (only with `doc_id` and `chunk_id` set) + relevant_chunks: List[Chunk] = await self._query_relevant_chunks( + query_embedding=query_embedding, top_k=top_k + ) - chunks = await self._query_relevant_chunks( - query_embeddings=query_embeddings, top_k=top_k + # get the embedding for each chunk (with `doc_id`, `chunk_id`, and `embedding` set) + chunk_embeddings: List[Chunk] = await self._get_chunk_embeddings( + chunks=relevant_chunks ) - # score each chunk - chunk_data = await self._retrieve_chunks(chunks=chunks) - chunk_scores = self._score_chunks( - query_embeddings=query_embeddings, chunk_data=chunk_data + # score the chunks using max_similarity + chunk_scores: Dict[Chunk, float] = self._score_chunks( + query_embedding=query_embedding, + chunk_embeddings=chunk_embeddings, ) - # load the source chunk for the top k documents - chunks_by_score = sorted(chunk_scores, key=chunk_scores.get, reverse=True)[:k] + # only keep the top k sorted results + top_k_chunks: List[Chunk] = sorted( + chunk_scores, key=chunk_scores.get, reverse=True + )[:k] - answers = await self._fetch_chunk_data( - chunks_by_score=chunks_by_score, chunk_scores=chunk_scores + chunks: List[Chunk] = await self._get_chunk_data( + chunks=top_k_chunks, include_embedding=include_embedding ) - return answers - def retrieve( + return [(chunk, chunk_scores[chunk]) for chunk in chunks] + + def text_search( self, - query: str, - k: int = 10, - query_maxlen: int = 64, + query_text: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + include_embedding: Optional[bool] = False, **kwargs: Any, - ) -> List[RetrievedChunk]: + ) -> List[Tuple[Chunk, float]]: """ - Retrieves a list of text chunks most relevant to the given query, using semantic similarity as the criteria. + Retrieves a list of text chunks relevant to a given query from the vector store, ranked by + relevance or other metrics. Parameters: - query (str): The text query for which relevant chunks are to be retrieved. - k (int, optional): The number of top relevant chunks to retrieve. Defaults to 10. - query_maxlen (int, optional): //TODO figure out a better description for this parameter, and/or a better name. - **kwargs (Any): Additional keyword arguments that can be used for extending functionality. 
+            query_text (str): The query text to search for relevant text chunks.
+            k (Optional[int]): The number of top results to retrieve. Defaults to 5.
+            query_maxlen (Optional[int]): The maximum query token length to consider. If None, the
+                                          maxlen is derived dynamically from the query.
+            include_embedding (Optional[bool]): Whether to include the embedding vectors
+                                                in the returned chunks. Defaults to False.
+            **kwargs (Any): Additional parameters that implementations might require for customized
+                            retrieval operations.

         Returns:
-            List[RetrievedChunk]: A list of RetrievedChunk objects, each representing a text chunk that is relevant
-                  to the query, along with its similarity score and rank.
+            List[Tuple[Chunk, float]]: A list of (Chunk, float) tuples, each pairing a text chunk
+                  relevant to the query with its similarity score.
+        """
+
+        return asyncio.run(
+            self.atext_search(
+                query_text=query_text,
+                k=k,
+                query_maxlen=query_maxlen,
+                include_embedding=include_embedding,
+            )
+        )

-        Note:
-            The actual retrieval process involves encoding the query, performing an ANN search to find relevant
-            embeddings, scoring these embeddings for similarity, and retrieving the corresponding text chunks.
+    def embedding_search(
+        self,
+        query_embedding: Embedding,
+        k: Optional[int] = 5,
+        include_embedding: Optional[bool] = False,
+        **kwargs: Any,
+    ) -> List[Tuple[Chunk, float]]:
+        """
+        Retrieves a list of text chunks relevant to a given query embedding from the vector store,
+        ranked by relevance or other metrics.
+
+        Parameters:
+            query_embedding (Embedding): The query embedding to search for relevant text chunks.
+            k (Optional[int]): The number of top results to retrieve. Defaults to 5.
+            include_embedding (Optional[bool]): Whether to include the embedding vectors
+                                                in the returned chunks. Defaults to False.
+            **kwargs (Any): Additional parameters that implementations might require for customized
+                            retrieval operations.
+
+        Returns:
+            List[Tuple[Chunk, float]]: A list of (Chunk, float) tuples, each pairing a text chunk
+                  relevant to the query with its similarity score.
         """
-        # nest_asyncio does not allow a new event loop to be created
-        # in the case there is already an event loop such as colab, it's required
-        nest_asyncio.apply()
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(self.aretrieve(query=query, k=k, query_maxlen=query_maxlen))
+
+        return asyncio.run(
+            self.aembedding_search(
+                query_embedding=query_embedding,
+                k=k,
+                include_embedding=include_embedding,
+            )
+        )
diff --git a/libs/colbert/ragstack_colbert/colbert_vector_store.py b/libs/colbert/ragstack_colbert/colbert_vector_store.py
new file mode 100644
index 000000000..8b0650726
--- /dev/null
+++ b/libs/colbert/ragstack_colbert/colbert_vector_store.py
@@ -0,0 +1,129 @@
+"""
+This module provides an implementation of the BaseVectorStore abstract class, specifically designed
+for use with a Cassandra database backend. It allows for the efficient storage and management of text embeddings
+generated by a ColBERT model, facilitating scalable and high-relevancy retrieval operations.
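+
+Example (an illustrative sketch of typical wiring; the `session` object is assumed
+to be a live Cassandra session created by the caller):
+
+    database = CassandraDatabase.from_session(
+        keyspace="default_keyspace",
+        table_name="colbert_embeddings",
+        session=session,
+    )
+    embedding_model = ColbertEmbeddingModel()
+    store = ColbertVectorStore(database=database, embedding_model=embedding_model)
+    store.add_texts(texts=["chunk one", "chunk two"], doc_id="my_doc")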
+""" + +import logging +import uuid +from typing import List, Optional, Tuple + +from .base_database import BaseDatabase +from .base_embedding_model import BaseEmbeddingModel +from .base_retriever import BaseRetriever +from .base_vector_store import BaseVectorStore +from .colbert_retriever import ColbertRetriever +from .objects import Chunk, Metadata + + +class ColbertVectorStore(BaseVectorStore): + """ + An implementation of the BaseVectorStore abstract base class. + """ + + _database: BaseDatabase + _embedding_model: BaseEmbeddingModel + + def __init__( + self, + database: BaseDatabase, + embedding_model: Optional[BaseEmbeddingModel] = None, + ): + """ + Initializes a new instance of the ColbertVectorStore. + + Parameters: + database (BaseDatabase): The database to use for storage + embedding_model (Optional[BaseEmbeddingModel]): The embedding model to use for embedding text and queries + """ + + self._database = database + self._embedding_model = embedding_model + + def _validate_embedding_model(self): + if self._embedding_model is None: + raise AttributeError( + "To use this method, `embedding_model` must be set on class creation." + ) + + # implements the abc method to handle LlamaIndex add + def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]: + """ + Stores a list of embedded text chunks in the vector store + + Parameters: + chunks (List[Chunk]): A list of `Chunk` instances to be stored. + + Returns: + a list of tuples: (doc_id, chunk_id) + """ + + return self._database.add_chunks(chunks=chunks) + + # implements the abc method to handle LangChain add + def add_texts( + self, + texts: List[str], + metadatas: Optional[List[Metadata]] = None, + doc_id: Optional[str] = None, + ) -> List[Tuple[str, int]]: + """ + Embeds and stores a list of text chunks and optional metadata into the vector store + + Parameters: + texts (List[str]): The list of text chunks to be embedded + metadatas (Optional[List[Metadata]])): An optional list of Metadata to be stored. + If provided, these are set 1 to 1 with the texts list. + doc_id (Optional[str]): The document id associated with the texts. If not provided, + it is generated. + + Returns: + a list of tuples: (doc_id, chunk_id) + """ + self._validate_embedding_model() + + if metadatas is not None and len(texts) != len(metadatas): + raise ValueError("Length of texts and metadatas must match.") + + if doc_id is None: + doc_id = str(uuid.uuid4()) + + embeddings = self._embedding_model.embed_texts(texts=texts) + + chunks: List[Chunk] = [] + for i, text in enumerate(texts): + chunks.append( + Chunk( + doc_id=doc_id, + chunk_id=i, + text=text, + metadata={} if metadatas is None else metadatas[i], + embedding=embeddings[i], + ) + ) + + return self._database.add_chunks(chunks=chunks) + + # implements the abc method to handle LangChain and LlamaIndex delete + def delete_chunks(self, doc_ids: List[str]) -> bool: + """ + Deletes chunks from the vector store based on their document id. + + Parameters: + doc_ids (List[str]): A list of document identifiers specifying the chunks to be deleted. + + Returns: + True if the delete was successful. + """ + + return self._database.delete_chunks(doc_ids=doc_ids) + + def as_retriever(self) -> BaseRetriever: + """ + Gets a retriever using the vector store. 
+ """ + + self._validate_embedding_model() + return ColbertRetriever( + database=self._database, embedding_model=self._embedding_model + ) diff --git a/libs/colbert/ragstack_colbert/constant.py b/libs/colbert/ragstack_colbert/constant.py index 0fd6e7638..806a98681 100644 --- a/libs/colbert/ragstack_colbert/constant.py +++ b/libs/colbert/ragstack_colbert/constant.py @@ -14,5 +14,3 @@ DEFAULT_COLBERT_DIM = 128 MAX_MODEL_TOKENS = 512 - -CHUNK_MAX_PER_DOC = 100000000 diff --git a/libs/colbert/ragstack_colbert/distributed/__init__.py b/libs/colbert/ragstack_colbert/distributed/__init__.py deleted file mode 100644 index a4757d7bc..000000000 --- a/libs/colbert/ragstack_colbert/distributed/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .distributed import Distributed, reconcile_nranks -from .runner import Runner - -__all__ = [ - "Distributed", - "reconcile_nranks", - "Runner", -] diff --git a/libs/colbert/ragstack_colbert/distributed/chunk_encoder.py b/libs/colbert/ragstack_colbert/distributed/chunk_encoder.py deleted file mode 100644 index ff7ebe741..000000000 --- a/libs/colbert/ragstack_colbert/distributed/chunk_encoder.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -This module provides functionalities to encode text chunks into dense vector representations using a ColBERT -model. It supports encoding chunks in batches to efficiently manage memory usage and prevent out-of-memory errors -when processing large datasets. The module is designed for use in semantic search and retrieval systems, where such -dense embeddings are used to measure the semantic similarity between text chunks. -""" - -import logging -from typing import List - -import torch -from torch import Tensor - -from colbert.infra import ColBERTConfig -from colbert.modeling.checkpoint import Checkpoint -from colbert.utils.utils import batch - -from ..constant import CHUNK_MAX_PER_DOC -from ..objects import BaseText, EmbeddedText - - -def encode_chunks( - config: ColBERTConfig, rank: int, texts: List[BaseText] -) -> List[EmbeddedText]: - """ - Encodes a text chunks using a specified ColBERT model configuration. This function initializes - a ChunkEncoder with the given model configuration and checkpoint, then encodes the texts. - - Parameters: - config (ColBERTConfig): Configuration for the ColBERT model. - rank (int): The rank of the process. - texts (List[Text]): A list of text chunks to encode. - - Returns: - Encoded representations of the chunks, along with their mapped indices. - """ - - checkpoint = Checkpoint(config.checkpoint, colbert_config=config) - encoder = ChunkEncoder(config=config, checkpoint=checkpoint) - return encoder.encode_and_map(rank, texts) - - -class ChunkEncoder: - """ - Encapsulates the logic for encoding chunks into dense vector representations using a specified ColBERT model - configuration and checkpoint. This class is optimized for batch processing to manage GPU memory usage efficiently. - """ - - def __init__(self, config: ColBERTConfig, checkpoint: Checkpoint) -> None: - """ - Initializes the ChunkEncoder with a given ColBERT model configuration and checkpoint. - - Parameters: - config (ColBERTConfig): The configuration for the Colbert model. - checkpoint (Checkpoint): The checkpoint containing the pre-trained model weights. 
- """ - - self._checkpoint = checkpoint - self._use_cpu = config.total_visible_gpus == 0 - - def encode_chunks( - self, texts: List[str], batch_size: int = 64 - ) -> tuple[Tensor, List[int]]: - """ - Encodes a list of chunks into embeddings, processing in batches to efficiently manage memory. - - Parameters: - texts (List[str]): The text chunks to encode. - batch_size (int): The size of batches for processing to avoid memory overflow. Defaults to 64. - - Returns: - A tuple containing the concatenated tensor of embeddings and a list of document lengths. - """ - - logging.info(f"#> Encoding {len(texts)} chunks..") - - if len(texts) == 0: - return None, None - - with torch.inference_mode(): - embs, doclens = [], [] - - # Batch here to avoid OOM from storing intermediate embeddings on GPU. - # Storing on the GPU helps with speed of masking, etc. - # But ideally this batching happens internally inside docFromText. - for chunks_batch in batch(texts, batch_size * 10): - logging.info(f"#> Encoding batch of {len(chunks_batch)} chunks..") - embs_, doclens_ = self._checkpoint.docFromText( - chunks_batch, - bsize=batch_size, - to_cpu=self._use_cpu, - keep_dims="flatten", - showprogress=self._use_cpu, - ) - embs.append(embs_) - doclens.extend(doclens_) - - embs = torch.cat(embs) - - return embs, doclens - - def encode_and_map( - self, rank: int, texts: list[BaseText] - ) -> List[EmbeddedText]: - """ - Encodes texts and maps them to their original index, adjusting for process rank in a - distributed setting. - - Parameters: - rank (int): The process rank, used to adjust chunk indexing in distributed settings. - texts (List[Text]): The texts to encode. - - Returns: - A list of EmbeddedText objects, each containing the chunk text, - chunk_id, embeddings, and original index. - """ - # this returns an list of tensors (vectors) and a list of counts - # where the list of counts has the same size as the list of input texts - # - # for each chunk text, we need to pull off "count" vectors to create - # the ColBERT embedding - _texts = [text.text for text in texts] - embeddings, counts = self.encode_chunks(_texts) - - # if the function runs on cuda device, we use base_chunk_idx as offset - # rank should be 0 on single GPU or CPU device - chunk_idx_offset = rank * CHUNK_MAX_PER_DOC - # Starting index for slicing the embeddings tensor - start_idx = 0 - - embedded_texts: List[EmbeddedText] = [] - for text_index, text in enumerate(texts): - # The end index for slicing - end_idx = start_idx + counts[text_index] - - embedded_texts.append( - EmbeddedText( - chunk_id=text_index + chunk_idx_offset, - embeddings=embeddings[start_idx:end_idx], - text=text.text, - original_index=text.original_index, - ) - ) - - # Reset for the next chunk - start_idx = end_idx - return embedded_texts diff --git a/libs/colbert/ragstack_colbert/distributed/distributed.py b/libs/colbert/ragstack_colbert/distributed/distributed.py deleted file mode 100644 index 8cc17cbe4..000000000 --- a/libs/colbert/ragstack_colbert/distributed/distributed.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -Provides utilities for setting up and managing a distributed computing environment using PyTorch. This includes -functions for determining the optimal number of ranks (processes) based on available CUDA devices, finding free -network ports, and initializing PyTorch's distributed process group. The module is designed to facilitate -distributed training or computation tasks across multiple GPUs or nodes, enhancing performance and efficiency. 
- -Although not required for single-device setups, these utilities become crucial for scaling to multi-device or -multi-node projects, offering streamlined setup and teardown processes for distributed operations. - -Note: -This module currently only distributes work on a single machine. -""" - -import logging -import os - -import torch -import torch.distributed as dist -import torch.multiprocessing as mp - - -def reconcile_nranks(nranks: int) -> int: - """ - Determines the appropriate number of ranks (parallel processes) for distributed operations based on the - passed value and the available CUDA (GPU) or CPU devices. - - Parameters: - nranks (int): The desired number of ranks (parallel processes). If less than 1, the function aims to use all available processors. - - Returns: - int: The number of ranks to be used, which may be adjusted based on the availability of processors. - """ - __cuda = torch.cuda.is_available() - if __cuda: - __cuda_device_count = torch.cuda.device_count() - if nranks < 1: - return __cuda_device_count - else: - return min(nranks, __cuda_device_count) - else: - if nranks < 1: - return 1 - else: - # currently let user set nranks on CPU - return nranks - - -def find_free_port(): - """ - Finds a free network port on the localhost that can be used for inter-process communication in distributed setups. - - Returns: - str: A string representation of the free port number. - """ - import socket - from contextlib import closing - - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(("", 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return str(s.getsockname()[1]) - - -def setup_process( - rank: int, - master_addr: str, - master_port: str, - world_size: int, - backend="nccl", -) -> None: - """ - Initializes a distributed process group for a given rank within the world size of a distributed computing - environment. This setup is crucial for coordinated operations across multiple processes. - - Parameters: - rank (int): The rank of the current process within the distributed group. - master_addr (str): The IP address of the master node for coordination. - master_port (str): The port on the master node used for communication. - world_size (int): The total number of processes in the distributed environment. - backend (str, optional): The backend to use for distributed operations. Defaults to "nccl". - """ - - logging.info( - f"setting up {rank=} {world_size=} {backend=} {master_addr=} {master_port=}" - ) - - # set up the master's ip address so this child process can coordinate - os.environ["MASTER_ADDR"] = master_addr - os.environ["MASTER_PORT"] = master_port - - # Initializes the default distributed process group, and this will also initialize the distributed package. - dist.init_process_group( - backend, rank=rank, world_size=world_size, init_method="env://" - ) - logging.info( - f"{rank=} init_process_group completed {world_size=} {backend=} {master_port=}" - ) - # TODO: find out when to destroy the process group - # dist.destroy_process_group() - - -class Distributed: - """ - A singleton class designed to initialize and manage the distributed environment for PyTorch applications. It - ensures that the distributed setup is only initialized once and provides access to the total number of ranks - or processes involved in the computation. 
- - The class automatically determines the optimal configuration based on available resources and the desired - number of ranks, setting up inter-process communication and preparing the environment for distributed - operations. - """ - - _instance = None # Keep instance reference - _is_initialized = False - _world_size = 0 - - def __new__(cls, *args, **kwargs): - """ - Ensures that only one instance of the Distributed class is created (Singleton pattern). - This method checks if an instance already exists; if not, it creates a new one, ensuring that the - distributed environment setup is only executed once. - - Parameters: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - The singleton instance of the Distributed class. - """ - - if not cls._instance: - cls._instance = super(Distributed, cls).__new__(cls) - return cls._instance - - def __init__(self, nranks: int): - """ - Initializes the Distributed class instance by setting up the distributed environment if it hasn't been - initialized already. This setup includes determining the number of ranks and configuring the process group. - - Parameters: - nranks (int): The desired number of ranks (processes) to use in the distributed environment. If the - number is less than 1, the system will attempt to use all available CUDA devices. - """ - if not self._is_initialized: - self._setup(nranks) - self._is_initialized = True - - def _setup(self, nranks: int): - """ - Configures the distributed environment by determining the optimal number of ranks and initializing the - process group for distributed operations. This method is intended to be called internally once during the - initialization of the Distributed class. - - Parameters: - nranks (int): The desired number of ranks (processes) for the distributed environment. This value is - used to calculate the effective world size and to initialize the process group accordingly. - """ - - master_addr = "127.0.0.1" - master_port = find_free_port() - if nranks < 1: - nranks = torch.cuda.device_count() - world_size = min(torch.cuda.device_count(), nranks) - logging.info(f"setting up resource group {world_size}") - self._world_size = world_size - mp.spawn( - setup_process, - args=( - master_addr, - master_port, - world_size, - ), - nprocs=world_size, - ) - logging.info(f"resource group setup completed {self._world_size}") - - def world_size(self): - """ - Retrieves the world size (the total number of processes participating in the distributed environment) - that was determined during the setup. - - Returns: - int: The world size of the distributed environment. - """ - return self._world_size diff --git a/libs/colbert/ragstack_colbert/distributed/runner.py b/libs/colbert/ragstack_colbert/distributed/runner.py deleted file mode 100644 index 862b09db4..000000000 --- a/libs/colbert/ragstack_colbert/distributed/runner.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Facilitates parallel processing of text chunk encoding by distributing workload across multiple processors or CUDA -devices. This module includes utilities to evenly distribute collections of text for encoding, leveraging multiprocessing -and CUDA capabilities for improved performance. The `Runner` class orchestrates the parallel encoding process, managing -processes and collecting their results efficiently. 
- -Designed to optimize encoding tasks in distributed computing environments, the module ensures workload balance and -maximizes resource utilization by dynamically adjusting to the number of available processors or CUDA-enabled GPUs. -""" - -import logging -from typing import Dict, List - -import torch -import torch.multiprocessing as mp - -from colbert.infra import ColBERTConfig - -from ..objects import BaseText, EmbeddedText -from .chunk_encoder import encode_chunks -from .distributed import reconcile_nranks - - -def distribute_work_load( - work_load_size: int = 1, processors: int = 1 -) -> List[List[int]]: - """ - Distributes a given workload size across a specified number of processors. - - Parameters: - work_load_size (int): The total size of the workload to be distributed. - processors (int): The number of processors available for workload distribution. - - Returns: - List[List[int]]: A nested list where each sublist contains indices representing the workload assigned to each processor. - """ - - if work_load_size == 0: - return [] - # ensure no empty workload assigns to a processor - processors = min(work_load_size, processors) - # Initialize an empty list for each processor - result = [[] for _ in range(processors)] - - # Distribute each workload to a processor in a round-robin fashion - for i in range(work_load_size): - result[i % processors].append(i) - return result - - -def map_work_load(texts: List[BaseText], processors: int = 1) -> List[List[BaseText]]: - """ - Maps a list of text chunks to a specified number of processors for distributed processing. This function - leverages `distribute_work_load` to evenly distribute the collections among available processors. - - Parameters: - texts (List[str]): The chunk texts to be processed. - processors (int): The number of processors available for distribution. - - Returns: - List[List[str]]: A nested list where each sublist contains the texts assigned to each processor. - """ - - work_loads = distribute_work_load(len(texts), processors) - return [[texts[i] for i in workload] for workload in work_loads] - - -def cuda_encode_texts( - config: ColBERTConfig, - rank: int, - texts: List[BaseText], - return_dict: Dict[int, List[EmbeddedText]], -): - """ - Encodes a collection of text chunks using CUDA-enabled devices, storing the results in a shared dictionary. - This function is designed to be run in a separate process for each chunk of the workload. - - Parameters: - config: The configuration settings for the encoding process. - rank (int): The rank of the current process in the distributed setting. - collection (List[str]): The collection of text chunks to encode. - return_dict: A multiprocessing.Manager().dict() to store the results of the encoding process. - """ - if torch.cuda.is_available(): - logging.info(f"encoder runs on cuda id {torch.cuda.current_device()}") - results = encode_chunks(config=config, rank=rank, texts=texts) - return_dict[rank] = results - - -class Runner: - """ - Orchestrates the distribution and parallel processing of text chunk encoding tasks across multiple processors or CUDA (GPU) - devices. Utilizes multiprocessing to initiate separate encoding processes and aggregates their results upon completion. - - Attributes: - _is_cuda (bool): Indicates if CUDA is available for GPU acceleration. - _nranks (int): The number of processor ranks determined based on availability and the provided configuration. 
- """ - - def __init__(self, nranks: int = 1) -> None: - """ - Initializes the Runner with a specified number of ranks, adjusting for the availability of CUDA devices. - - Parameters: - nranks (int): The desired number of ranks (processors) for distributing the encoding workload. - """ - - # this runner is only useful when nranks > 1 - self._is_cuda = torch.cuda.is_available() - self._nranks = 1 - if self._is_cuda: - self._nranks = reconcile_nranks(nranks) - - # this is the entrypoint to the distributed embedding code - def encode( - self, - config: ColBERTConfig, - texts: List[str], - timeout: int = 60, - ) -> List[EmbeddedText]: - """ - Encodes a collection of text across multiple processors or CUDA devices in parallel. Manages the lifecycle - of subprocesses, ensuring timely completion and aggregating their results. - - Parameters: - config: The configuration settings for the encoding process. - texts (List[str]): The text chunks to encode. - timeout (int): The maximum time (in seconds) allowed for each subprocess to complete. - - Returns: - A list of encoded results aggregated from all subprocesses. - """ - - manager = mp.Manager() - return_dict = manager.dict() - - _texts = [BaseText(original_index=index, text=text) for index, text in enumerate(texts)] - - work_loads = map_work_load(_texts, self._nranks) - logging.info(f"encoding {len(work_loads)} texts on nranks {self._nranks}") - - processes = [] - proc_info = [] - ranks = len(work_loads) - for rank, work_load in enumerate(work_loads): - p = mp.Process( - target=cuda_encode_texts, - args=(config, rank, work_load, return_dict), - ) - p.start() - processes.append(p) - proc_info.append((p.pid, p.name)) - logging.debug(f"start process on rank {rank} of {self._nranks} nranks") - - timed_out_processes = [] - for p, info in zip(processes, proc_info): - p.join(timeout=timeout) - if p.is_alive(): - logging.error( - f"embedding process timed out process PID: {info[0]}, Name: {info[1]}" - ) - timed_out_processes.append(p) - else: - logging.debug( - f"joined embedding process ID: {info[0]}, Name: {info[1]}" - ) - - if timed_out_processes: - raise Exception( - "one or more processes did not complete within the timeout period" - ) - else: - logging.info("all processes completed") - - # Aggregate results from each GPU - result_list:List[EmbeddedText] = [] - for new_rank in range(ranks): - result_list.extend(return_dict[new_rank]) - - return result_list diff --git a/libs/colbert/ragstack_colbert/objects.py b/libs/colbert/ragstack_colbert/objects.py index fa441e750..bfc1eeee5 100644 --- a/libs/colbert/ragstack_colbert/objects.py +++ b/libs/colbert/ragstack_colbert/objects.py @@ -3,97 +3,53 @@ processing within the ColBERT retrieval system. """ -from dataclasses import dataclass, field -from numbers import Number -from typing import Any, Dict, List - -from torch import Tensor - - -@dataclass(frozen=True) -class BaseChunk: - """ - The base class a chunk of text from a document - - Attributes: - doc_id (str): The document id from which this chunk originates. - chunk_id (int): The id of this chunk. - """ - - doc_id: str - chunk_id: int - - -@dataclass(frozen=True) -class ChunkData(): - """ - Represents a chunk of text from a document including any associated metadata - - Attributes: - text (str): The text content of this chunk. - metadata (dict): The metadata of this chunk. 
-    """
-
-    text: str
-    metadata: Dict[str, Any]
-
-
-@dataclass(frozen=True)
-class EmbeddedChunk(BaseChunk):
-    """
-    Extends BaseChunk with the ColBERT embedding for the chunk's text.
-
-    Attributes:
-        data (ChunkData): Contains the chunk text and metadata
-        embeddings (Tensor): A tensor representing the embeddings of the chunk's text. The dimensions
-                             are 'the count of tokens in the chunk' by 'the Colbert embedding size
-                             per chunk (default 128)'
-
-    Inherits from:
-        BaseChunk: Inherits all attributes and methods from the BaseChunk class.
-    """
-
-    data: ChunkData
-    embeddings: Tensor
-
-
-    def __len__(self):
-        """
-        Returns the length of the embeddings tensor, representing the number of dimensions
-        in the embedded space.
-
-        Returns:
-            int: The number of dimensions in the embeddings tensor.
-        """
-        return len(self.embeddings)
-
-
-@dataclass(frozen=True)
-class RetrievedChunk(BaseChunk):
-    """
-    Represents a chunk of text that has been retrieved, including ranking and scoring information.
-
-    Attributes:
-        data (ChunkData): Contains the chunk text and metadata
-        rank (int): The rank of this chunk in the context of ColBERT retrieval, with a lower number
-                    indicating a higher relevance or quality.
-        score (Number): The score assigned to this chunk by the ColBERT retrieval system, indicating
-                        its relevancy. Higher scores are better.
-
-    Inherits from:
-        BaseChunk: Inherits all attributes and methods from the BaseChunk class.
-    """
-
-    data: ChunkData
-    rank: int
-    score: Number
-
-@dataclass(frozen=True)
-class BaseText:
-    original_index: int
-    text: str
-
-@dataclass(frozen=True)
-class EmbeddedText(BaseText):
-    chunk_id: int
-    embeddings: List[Tensor]
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+# LlamaIndex Node (chunk) has ids, text, embedding, metadata
+# VectorStore.add(nodes: List[Node]) -> List[str](ids): embeds texts OUTside add
+#            .delete(id)
+#            .query(embedding) -> Nodes, Scores, Ids
+
+# LangChain Document (doc or chunk) has page_content, metadata
+# VectorStore.add(texts: List[str], metadatas: Optional[List[dict]]) -> List[str](ids): embeds texts INside add
+#            .delete(ids: List[str]): deletes by id
+#            .search(query: str) -> List[Document]: uses retriever to search in store
+#            .as_retriever() -> Retriever
+
+# Define Vector and Embedding types
+Vector = List[float]
+Embedding = List[Vector]
+Metadata = Dict[str, Any]
+
+
+class Chunk(BaseModel):
+    doc_id: str = Field(..., description="id of the parent document", frozen=True)
+    chunk_id: int = Field(..., description="id of the chunk", frozen=True, ge=0)
+    text: Optional[str] = Field(default=None, description="text of the chunk")
+    metadata: Metadata = Field(
+        default_factory=dict, description="flat metadata of the chunk"
+    )
+    embedding: Optional[Embedding] = Field(
+        default=None, description="embedding of the chunk"
+    )
+
+    class Config:
+        validate_assignment = True
+
+    # Define equality based on doc_id and chunk_id only
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, Chunk):
+            return (self.doc_id == other.doc_id) and (self.chunk_id == other.chunk_id)
+        return False
+
+    # Define less than for ordering
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, Chunk):
+            return NotImplemented
+        return (self.doc_id, self.chunk_id) < (other.doc_id, other.chunk_id)
+
+    # Make chunks hashable so they can be used in sets and as dict keys
+    # (the retriever scores chunks in a Dict[Chunk, float])
+    def __hash__(self):
+        return hash((self.doc_id, self.chunk_id))
diff --git 
a/libs/colbert/ragstack_colbert/text_encoder.py b/libs/colbert/ragstack_colbert/text_encoder.py
new file mode 100644
index 000000000..2225e7ddf
--- /dev/null
+++ b/libs/colbert/ragstack_colbert/text_encoder.py
@@ -0,0 +1,123 @@
+"""
+This module provides functionalities to encode text chunks into dense vector representations using a ColBERT
+model. It supports encoding chunks in batches to efficiently manage memory usage and prevent out-of-memory errors
+when processing large datasets. The module is designed for use in semantic search and retrieval systems, where such
+dense embeddings are used to measure the semantic similarity between text chunks.
+"""
+
+import logging
+from typing import List, Optional
+
+import torch
+
+from colbert.infra import ColBERTConfig
+from colbert.modeling.checkpoint import Checkpoint
+
+from .objects import Chunk, Embedding
+
+
+def calculate_query_maxlen(tokens: List[List[str]]) -> int:
+    """
+    Calculates an appropriate maximum query length for token embeddings, based on the length of the tokenized input.
+
+    Parameters:
+        tokens (List[List[str]]): A nested list where each sublist contains tokens from a single query or chunk.
+
+    Returns:
+        int: The calculated maximum length for query tokens: the length of the longest
+             tokenized input plus three, to account for the special CLS, SEP, and query
+             marker tokens added to each query.
+    """
+
+    max_token_length = max(len(inner_list) for inner_list in tokens)
+
+    # tokens from the query tokenizer do not include the SEP and CLS tokens
+    # SEP, CLS, and Q tokens are added to the query
+    # although there could be more SEP tokens if there is more than one sentence, we only add one
+    return max_token_length + 3
+
+
+class TextEncoder:
+    """
+    Encapsulates the logic for encoding text chunks and queries into dense vector representations using a specified ColBERT model
+    configuration and checkpoint. This class is optimized for batch processing to manage GPU memory usage efficiently.
+    """
+
+    def __init__(self, config: ColBERTConfig, verbose: Optional[int] = 3) -> None:
+        """
+        Initializes the TextEncoder with a given ColBERT model configuration and checkpoint.
+
+        Parameters:
+            config (ColBERTConfig): The configuration for the ColBERT model.
+            verbose (int): The level of logging to use.
+        """
+
+        logging.info(f"Cuda enabled GPU available: {torch.cuda.is_available()}")
+
+        self._checkpoint = Checkpoint(
+            config.checkpoint, colbert_config=config, verbose=verbose
+        )
+        self._use_cpu = config.total_visible_gpus == 0
+
+    def encode_chunks(self, chunks: List[Chunk], batch_size: int = 640) -> List[Chunk]:
+        """
+        Encodes a list of chunks into embeddings, processing in batches to efficiently manage memory.
+
+        Parameters:
+            chunks (List[Chunk]): The chunks whose `text` will be encoded.
+            batch_size (int): The number of chunks per processing batch, to avoid memory overflow. Defaults to 640.
+
+        Returns:
+            The input chunks with their `embedding` field populated.
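+
+        Example (an illustrative sketch; assumes `chunks` is a list of `Chunk` objects
+        with their `text` fields already set, and that DEFAULT_COLBERT_MODEL is imported
+        from this package's `constant` module):
+            config = ColBERTConfig(checkpoint=DEFAULT_COLBERT_MODEL)
+            encoder = TextEncoder(config=config)
+            embedded_chunks = encoder.encode_chunks(chunks=chunks)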
+ """ + + logging.debug(f"#> Encoding {len(chunks)} chunks..") + + embedded_chunks: List[Chunk] = [] + + if len(chunks) == 0: + return embedded_chunks + + with torch.inference_mode(): + texts = [chunk.text for chunk in chunks] + + embeddings, counts = self._checkpoint.docFromText( + texts, + bsize=batch_size, + to_cpu=self._use_cpu, + keep_dims="flatten", + ) + + start_idx = 0 + for index, chunk in enumerate(chunks): + # The end index for slicing + end_idx = start_idx + counts[index] + chunk.embedding = embeddings[start_idx:end_idx] + + embedded_chunks.append(chunk) + + # Reset for the next chunk + start_idx = end_idx + + return embedded_chunks + + def encode_query( + self, text: str, query_maxlen: int, full_length_search: Optional[bool] = False + ) -> Embedding: + if query_maxlen < 0: + tokens = self._checkpoint.query_tokenizer.tokenize([text]) + query_maxlen = calculate_query_maxlen(tokens) + logging.debug(f"Calculated dynamic query_maxlen of {query_maxlen}") + + prev_query_maxlen = self._checkpoint.query_tokenizer.query_maxlen + self._checkpoint.query_tokenizer.query_maxlen = query_maxlen + + with torch.inference_mode(): + query_embedding = self._checkpoint.queryFromText( + queries=[text], + to_cpu=self._use_cpu, + full_length_search=full_length_search, + ) + + self._checkpoint.query_tokenizer.query_maxlen = prev_query_maxlen + + return query_embedding.tolist()[0] diff --git a/libs/colbert/tests/integration_tests/test_colbert_embedding_retrieval.py b/libs/colbert/tests/integration_tests/test_colbert_embedding_retrieval.py index 119da25fd..ce4e047ac 100644 --- a/libs/colbert/tests/integration_tests/test_colbert_embedding_retrieval.py +++ b/libs/colbert/tests/integration_tests/test_colbert_embedding_retrieval.py @@ -3,10 +3,9 @@ import pytest from ragstack_colbert import ( - CassandraVectorStore, - ChunkData, + CassandraDatabase, ColbertEmbeddingModel, - ColbertRetriever, + ColbertVectorStore, ) from tests.integration_tests.conftest import ( get_astradb_test_store, @@ -24,7 +23,7 @@ def astra_db(): return get_astradb_test_store() -#@pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"]) +# @pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"]) @pytest.mark.parametrize("vector_store", ["cassandra"]) def test_embedding_cassandra_retriever(request, vector_store: str): vector_store = request.getfixturevalue(vector_store) @@ -67,35 +66,41 @@ def generate_texts(text, chunk_size, overlap_size): for i, text in enumerate(texts[:3]): # Displaying the first 3 chunks for brevity logging.info(f"Chunk {i + 1}:\n{text}\n{'-' * 50}\n") - doc_id = "Marine Animals habitat" + doc_id = "marine_animals" - # colbert stuff starts - colbert = ColbertEmbeddingModel( + session = vector_store.create_cassandra_session() + session.default_timeout = 180 + + database = CassandraDatabase.from_session( + keyspace="default_keyspace", + table_name="colbert_embeddings", + session=session, + ) + + embedding_model = ColbertEmbeddingModel( doc_maxlen=220, nbits=2, kmeans_niters=4, ) - chunks = [ChunkData(text=text, metadata={}) for text in texts] + store = ColbertVectorStore( + database=database, + embedding_model=embedding_model, + ) - embedded_chunks = colbert.embed_chunks(chunks=chunks, doc_id=doc_id) + store.add_texts(texts=texts, doc_id=doc_id) - logging.info(f"embedded chunks size {len(embedded_chunks)}") + retriever = store.as_retriever() - store = CassandraVectorStore( - keyspace="default_keyspace", - table_name="colbert_embeddings", - session=vector_store.create_cassandra_session(), + 
chunk_scores = retriever.text_search( + query_text="what kind fish lives shallow coral reefs", k=5 ) - store.put_chunks(chunks=embedded_chunks, delete_existing=True) - - retriever = ColbertRetriever( - vector_store=store, embedding_model=colbert + assert len(chunk_scores) == 5 + for chunk, score in chunk_scores: + logging.info(f"got chunk_id {chunk.chunk_id} with score {score}") + + best_chunk = chunk_scores[0][0] + assert len(best_chunk.text) > 0 + logging.info( + f"Highest scoring chunk_id: {best_chunk.chunk_id} with text: {best_chunk.text}" ) - chunks = retriever.retrieve("what kind fish lives shallow coral reefs", k=5) - for chunk in chunks: - logging.info(f"got {chunk}") - assert len(chunks) == 5 - assert len(chunks[0].data.text) > 0 - assert chunks[0].rank == 1 - assert chunks[1].rank == 2 diff --git a/libs/colbert/tests/integration_tests/test_colbert_embeddings.py b/libs/colbert/tests/integration_tests/test_colbert_embeddings.py deleted file mode 100644 index 43c7fabec..000000000 --- a/libs/colbert/tests/integration_tests/test_colbert_embeddings.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch - -from ragstack_colbert import ChunkData, ColbertEmbeddingModel -from ragstack_colbert.constant import DEFAULT_COLBERT_DIM, DEFAULT_COLBERT_MODEL - - -def test_colbert_token_embeddings(): - colbert = ColbertEmbeddingModel() - assert colbert.colbert_config is not None - - chunks = [ - ChunkData(text="test1", metadata={}), - ChunkData(text="test2", metadata={}), - ] - - embedded_chunks = colbert.embed_chunks(chunks=chunks) - - assert len(embedded_chunks) == 2 - - assert embedded_chunks[0].data.text == "test1" - assert embedded_chunks[1].data.text == "test2" - - # generate uuid based id - assert embedded_chunks[0].doc_id != "" - assert embedded_chunks[1].doc_id != "" - - embedded_chunks = colbert.embed_chunks(chunks=chunks, doc_id="test-id") - - assert embedded_chunks[0].data.text == "test1" - assert embedded_chunks[0].doc_id == "test-id" - assert embedded_chunks[1].doc_id == "test-id" - - embeddings = embedded_chunks[0].embeddings - assert len(embeddings[0]) == DEFAULT_COLBERT_DIM - - -def test_colbert_token_embeddings_with_params(): - colbert = ColbertEmbeddingModel( - doc_maxlen=220, - nbits=2, - kmeans_niters=4, - checkpoint=DEFAULT_COLBERT_MODEL, - query_maxlen=32, - ) - assert colbert.colbert_config is not None - - chunks = [ - ChunkData(text="test1", metadata={}), - ChunkData(text="test2", metadata={}), - ChunkData(text="test3", metadata={}), - ] - - embedded_chunks = colbert.embed_chunks(chunks=chunks) - - assert len(embedded_chunks) == 3 - - assert embedded_chunks[0].data.text == "test1" - assert embedded_chunks[1].data.text == "test2" - - embeddings = embedded_chunks[0].embeddings - assert len(embeddings) > 1 - assert len(embeddings[0]) == DEFAULT_COLBERT_DIM - - -def test_colbert_query_embeddings(): - colbert = ColbertEmbeddingModel() - - queryTensor = colbert.embed_query("who is the president of the united states?") - assert isinstance(queryTensor, torch.Tensor) - assert queryTensor.shape == (12, 128) - - # test query encoding - queryEncoding = colbert.embed_query("test-query", query_maxlen=512) - assert len(queryEncoding) == 512 diff --git a/libs/colbert/tests/integration_tests/test_colbert_baseline_embeddings.py b/libs/colbert/tests/unit_tests/baseline_tensors.py similarity index 98% rename from libs/colbert/tests/integration_tests/test_colbert_baseline_embeddings.py rename to libs/colbert/tests/unit_tests/baseline_tensors.py index 4b3cbade6..b7e51a778 100644 --- 
a/libs/colbert/tests/integration_tests/test_colbert_baseline_embeddings.py +++ b/libs/colbert/tests/unit_tests/baseline_tensors.py @@ -1,16 +1,6 @@ -import logging -from typing import List - import torch -from torch import Tensor -from torch.nn.functional import cosine_similarity - -from colbert.indexing.collection_encoder import CollectionEncoder -from colbert.infra.config import ColBERTConfig -from colbert.modeling.checkpoint import Checkpoint -from ragstack_colbert import ChunkData, ColbertEmbeddingModel, EmbeddedChunk -from ragstack_colbert.constant import DEFAULT_COLBERT_MODEL +# fmt: off baseline_tensors = [ torch.tensor([-0.0855, 0.0171, -0.0917, 0.0023, 0.0630, 0.0211, 0.0054, 0.1178, 0.0086, -0.1145, -0.0220, 0.0094, 0.0973, -0.2913, -0.1577, -0.1720, @@ -11843,159 +11833,4 @@ -0.0205, 0.1338, 0.0167, 0.1097, -0.0123, 0.0783, -0.0861, 0.0394, 0.2047, 0.0487, -0.0595, 0.0944, 0.1929, 0.1593, 0.1176, -0.0145]), ] - -""" -These tests are for embedding drift and evaluate if the embeddings are changing over time due to -1) changes in the model or drift in model weights - model drift does not necessarily mean the test fails, but it is a signal that the model is changing -2) changes in the embedding implementation - if the implementation changes, we need to investigate if the change is intentional or not -""" - -# 8 chunks with 220 tokens each for testing -# the above 645 per token embeddings matches arctic_botany_chunks's token embedding - -arctic_botany_dict = { - "Introduction to Arctic Botany": "Arctic botany is the study of plant life in the Arctic, a region characterized by extreme cold, permafrost, and minimal sunlight for much of the year. Despite these harsh conditions, a diverse range of flora thrives here, adapted to survive with minimal water, low temperatures, and high light levels during the summer. This introduction aims to shed light on the resilience and adaptation of Arctic plants, setting the stage for a deeper dive into the unique botanical ecosystem of the Arctic.", - "Arctic Plant Adaptations": "Plants in the Arctic have developed unique adaptations to endure the extreme climate. Perennial growth, antifreeze proteins, and a short growth cycle are among the evolutionary solutions. These adaptations not only allow the plants to survive but also to reproduce in short summer months. Arctic plants often have small, dark leaves to absorb maximum sunlight, and some species grow in cushion or mat forms to resist cold winds. Understanding these adaptations provides insights into the resilience of Arctic flora.", - "The Tundra Biome": "The Arctic tundra is a vast, treeless biome where the subsoil is permanently frozen. Here, the vegetation is predominantly composed of dwarf shrubs, grasses, mosses, and lichens. The tundra supports a surprisingly rich biodiversity, adapted to its cold, dry, and windy conditions. The biome plays a crucial role in the Earth's climate system, acting as a carbon sink. However, it's sensitive to climate change, with thawing permafrost and shifting vegetation patterns.", - "Arctic Plant Biodiversity": "Despite the challenging environment, the Arctic boasts a significant variety of plant species, each adapted to its niche. From the colorful blooms of Arctic poppies to the hardy dwarf willows, these plants form a complex ecosystem. The biodiversity of Arctic flora is vital for local wildlife, providing food and habitat. 
This diversity also has implications for Arctic peoples, who depend on certain plant species for food, medicine, and materials.", - "Climate Change and Arctic Flora": "Climate change poses a significant threat to Arctic botany, with rising temperatures, melting permafrost, and changing precipitation patterns. These changes can lead to shifts in plant distribution, phenology, and the composition of the Arctic flora. Some species may thrive, while others could face extinction. This dynamic is critical to understanding future Arctic ecosystems and their global impact, including feedback loops that may exacerbate global warming.", - "Research and Conservation in the Arctic": "Research in Arctic botany is crucial for understanding the intricate balance of this ecosystem and the impacts of climate change. Scientists conduct studies on plant physiology, genetics, and ecosystem dynamics. Conservation efforts are focused on protecting the Arctic's unique biodiversity through protected areas, sustainable management practices, and international cooperation. These efforts aim to preserve the Arctic flora for future generations and maintain its role in the global climate system.", - "Traditional Knowledge and Arctic Botany": "Indigenous peoples of the Arctic have a deep connection with the land and its plant life. Traditional knowledge, passed down through generations, includes the uses of plants for nutrition, healing, and materials. This body of knowledge is invaluable for both conservation and understanding the ecological relationships in Arctic ecosystems. Integrating traditional knowledge with scientific research enriches our comprehension of Arctic botany and enhances conservation strategies.", - "Future Directions in Arctic Botanical Studies": "The future of Arctic botany lies in interdisciplinary research, combining traditional knowledge with modern scientific techniques. As the Arctic undergoes rapid changes, understanding the ecological, cultural, and climatic dimensions of Arctic flora becomes increasingly important. Future research will need to address the challenges of climate change, explore the potential for Arctic plants in biotechnology, and continue to conserve this unique biome. The resilience of Arctic flora offers lessons in adaptation and survival relevant to global challenges." -} - -arctic_botany_chunks = [ChunkData(text=text, metadata={}) for text in arctic_botany_dict.values()] - -# a uility function to evaluate similarity of two embeddings at per token level -def are_they_similar(embedded_chunks: List[EmbeddedChunk], tensors: List[Tensor]): - n = 0 - pdist = torch.nn.PairwiseDistance(p=2) - for embedded_chunk in embedded_chunks: - for embedding in embedded_chunk.embeddings: - assert embedding.shape == tensors[n].shape - - # we still have outlier over the specified limit but almost 0 - assert pdist(embedding, tensors[n]).item() < 0.0001 - - similarity = cosine_similarity(embedding.unsqueeze(0), tensors[n].unsqueeze(0)) - assert similarity.item() > 0.999 - n = n + 1 - - assert n == len(tensors) - - -def test_embeddings_with_baseline(): - colbert = ColbertEmbeddingModel( - doc_maxlen=220, - nbits=2, - kmeans_niters=4, - checkpoint=DEFAULT_COLBERT_MODEL, - ) - - """ - 1. test any drift from the baseline - 2. 
test any deviation from the embedding functions - - since we don't have a graph or storage to keep track any degreation, - please add to the model and implementions resultsed euclidian and cosine threshold change - 2024-04-08 default model - https://huggingface.co/colbert-ir/colbertv2.0 - """ - embedded_chunks = colbert.embed_chunks(arctic_botany_chunks, doc_id="arctic_botany") - - pdist = torch.nn.PairwiseDistance(p=2) - embedded_tensors = [] - n = 0 - for embedded_chunk in embedded_chunks: - for embedding in embedded_chunk.embeddings: - embedded_tensors.append(embedding) - distance = torch.norm(embedding - baseline_tensors[n]) - assert abs(distance) < 0.001 - # another way to measure pairwise distance - # it must be a positive since it's from square root - assert pdist(embedding, baseline_tensors[n]).item() < 0.001 - - similarity = cosine_similarity(embedding.unsqueeze(0), baseline_tensors[n].unsqueeze(0)) - assert similarity.shape == torch.Size([1]) # this has to be scalar - # debug code to identify which token deviates - if similarity.item() < 0.99: - logging.warning(f"n = {n}, similarity = {similarity.item()}") - assert similarity.item() > 0.99 - n = n + 1 - - assert len(embedded_tensors) == 645 - - """ - test against the same function to make sure to generate the same embeddings - use the same ColBERT configurations but reload the checkpoint with the default settings - this also make sure the default ColBERT configurations have not changed - """ - colbert2 = ColbertEmbeddingModel( - checkpoint=DEFAULT_COLBERT_MODEL, - ) - embedded_chunks2 = colbert2.embed_chunks(arctic_botany_chunks) - - are_they_similar(embedded_chunks2, embedded_tensors) - - -def test_colbert_embedding_against_vanilla_impl(): - # this is a vanilla ColBERT embedding in a list of per token embeddings - # based on the just Stanford ColBERT library - cf = ColBERTConfig(checkpoint='colbert-ir/colbertv2.0') - cp = Checkpoint(cf.checkpoint, colbert_config=cf) - encoder = CollectionEncoder(cf, cp) - - texts = [chunk.text for chunk in arctic_botany_chunks] - - embeddings_flat, _ = encoder.encode_passages(texts) - - colbertSvc = ColbertEmbeddingModel( - checkpoint=DEFAULT_COLBERT_MODEL, - ) - embedded_chunks = colbertSvc.embed_chunks(arctic_botany_chunks) - - are_they_similar(embedded_chunks, embeddings_flat) - - -def model_embedding(model: str): - logging.info(f"test model compatibility {model}") - colbertSvc = ColbertEmbeddingModel( - checkpoint=model, - query_maxlen=32, - ) - embedded_chunks = colbertSvc.embed_chunks(arctic_botany_chunks) - - assert len(embedded_chunks) == 8 - n = 0 - for embedded_chunk in embedded_chunks: - for embedding in embedded_chunk.embeddings: - assert embedding.shape == (128, ) - n = n + 1 - - assert n == 645 - - # recall embeddings test - encoded = colbertSvc.embed_query( - query="What adaptations enable Arctic plants to survive and thrive in extremely cold temperatures and minimal sunlight?", - query_maxlen=32, - ) - assert encoded.shape == torch.Size([32,128]) - - -def test_compatible_models(): - # ColBERT models and Google BERT models on HF - # test representive models's compatibility with this repo's ColBERT embedding - # evaluation is not within this test scope - models = [ - "colbert-ir/colbertv1.9", - # "colbert-ir/colbertv2.0_msmarco_64way", # this model is large - "mixedbread-ai/mxbai-colbert-large-v1", - # "antoinelouis/colbert-xm", # XMOD based - # "jinaai/jina-colbert-v1-en", # requires HF token and code changes - "google-bert/bert-base-uncased", # BERT compatibility test only, do 
not recommend - # some colbert is trained on uncased - # "google-bert/bert-base-cased", # already tested uncased - ] - - [model_embedding(model) for model in models] +# fmt: on diff --git a/libs/colbert/tests/unit_tests/test_colbert_baseline_embeddings.py b/libs/colbert/tests/unit_tests/test_colbert_baseline_embeddings.py new file mode 100644 index 000000000..033368198 --- /dev/null +++ b/libs/colbert/tests/unit_tests/test_colbert_baseline_embeddings.py @@ -0,0 +1,175 @@ +import logging +from typing import List + +import torch +from torch import Tensor +from torch.nn.functional import cosine_similarity + +from colbert.indexing.collection_encoder import CollectionEncoder +from colbert.infra.config import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from ragstack_colbert import ColbertEmbeddingModel, Embedding +from ragstack_colbert.constant import DEFAULT_COLBERT_MODEL + +from .baseline_tensors import baseline_tensors + +""" +These tests are for embedding drift and evaluate if the embeddings are changing over time due to +1) changes in the model or drift in model weights + model drift does not necessarily mean the test fails, but it is a signal that the model is changing +2) changes in the embedding implementation + if the implementation changes, we need to investigate if the change is intentional or not +""" + +# 8 chunks with 220 tokens each for testing +# the above 645 per token embeddings matches arctic_botany_chunks's token embedding + +arctic_botany_dict = { + "Introduction to Arctic Botany": "Arctic botany is the study of plant life in the Arctic, a region characterized by extreme cold, permafrost, and minimal sunlight for much of the year. Despite these harsh conditions, a diverse range of flora thrives here, adapted to survive with minimal water, low temperatures, and high light levels during the summer. This introduction aims to shed light on the resilience and adaptation of Arctic plants, setting the stage for a deeper dive into the unique botanical ecosystem of the Arctic.", + "Arctic Plant Adaptations": "Plants in the Arctic have developed unique adaptations to endure the extreme climate. Perennial growth, antifreeze proteins, and a short growth cycle are among the evolutionary solutions. These adaptations not only allow the plants to survive but also to reproduce in short summer months. Arctic plants often have small, dark leaves to absorb maximum sunlight, and some species grow in cushion or mat forms to resist cold winds. Understanding these adaptations provides insights into the resilience of Arctic flora.", + "The Tundra Biome": "The Arctic tundra is a vast, treeless biome where the subsoil is permanently frozen. Here, the vegetation is predominantly composed of dwarf shrubs, grasses, mosses, and lichens. The tundra supports a surprisingly rich biodiversity, adapted to its cold, dry, and windy conditions. The biome plays a crucial role in the Earth's climate system, acting as a carbon sink. However, it's sensitive to climate change, with thawing permafrost and shifting vegetation patterns.", + "Arctic Plant Biodiversity": "Despite the challenging environment, the Arctic boasts a significant variety of plant species, each adapted to its niche. From the colorful blooms of Arctic poppies to the hardy dwarf willows, these plants form a complex ecosystem. The biodiversity of Arctic flora is vital for local wildlife, providing food and habitat. 
This diversity also has implications for Arctic peoples, who depend on certain plant species for food, medicine, and materials.", + "Climate Change and Arctic Flora": "Climate change poses a significant threat to Arctic botany, with rising temperatures, melting permafrost, and changing precipitation patterns. These changes can lead to shifts in plant distribution, phenology, and the composition of the Arctic flora. Some species may thrive, while others could face extinction. This dynamic is critical to understanding future Arctic ecosystems and their global impact, including feedback loops that may exacerbate global warming.", + "Research and Conservation in the Arctic": "Research in Arctic botany is crucial for understanding the intricate balance of this ecosystem and the impacts of climate change. Scientists conduct studies on plant physiology, genetics, and ecosystem dynamics. Conservation efforts are focused on protecting the Arctic's unique biodiversity through protected areas, sustainable management practices, and international cooperation. These efforts aim to preserve the Arctic flora for future generations and maintain its role in the global climate system.", + "Traditional Knowledge and Arctic Botany": "Indigenous peoples of the Arctic have a deep connection with the land and its plant life. Traditional knowledge, passed down through generations, includes the uses of plants for nutrition, healing, and materials. This body of knowledge is invaluable for both conservation and understanding the ecological relationships in Arctic ecosystems. Integrating traditional knowledge with scientific research enriches our comprehension of Arctic botany and enhances conservation strategies.", + "Future Directions in Arctic Botanical Studies": "The future of Arctic botany lies in interdisciplinary research, combining traditional knowledge with modern scientific techniques. As the Arctic undergoes rapid changes, understanding the ecological, cultural, and climatic dimensions of Arctic flora becomes increasingly important. Future research will need to address the challenges of climate change, explore the potential for Arctic plants in biotechnology, and continue to conserve this unique biome. The resilience of Arctic flora offers lessons in adaptation and survival relevant to global challenges.", +} + +arctic_botany_chunks = [text for text in arctic_botany_dict.values()] + + +# a uility function to evaluate similarity of two embeddings at per token level +def are_they_similar(embedded_chunks: List[Embedding], tensors: List[Tensor]): + n = 0 + pdist = torch.nn.PairwiseDistance(p=2) + for embedding in embedded_chunks: + for vector in embedding: + vector_tensor = torch.tensor(vector) + assert vector_tensor.shape == tensors[n].shape + + # we still have outlier over the specified limit but almost 0 + assert pdist(vector_tensor, tensors[n]).item() < 0.0001 + + similarity = cosine_similarity( + vector_tensor.unsqueeze(0), tensors[n].unsqueeze(0) + ) + assert similarity.item() > 0.999 + n = n + 1 + + assert n == len(tensors) + + +def test_embeddings_with_baseline(): + colbert = ColbertEmbeddingModel( + doc_maxlen=220, + nbits=2, + kmeans_niters=4, + checkpoint=DEFAULT_COLBERT_MODEL, + ) + + """ + 1. test any drift from the baseline + 2. 
+    2. test any deviation from the embedding functions
+
+    since we don't have a graph or storage to keep track of any degradation,
+    please record the resulting Euclidean and cosine threshold changes of the model and implementation here
+    2024-04-08 default model - https://huggingface.co/colbert-ir/colbertv2.0
+    """
+    embeddings: List[Embedding] = colbert.embed_texts(arctic_botany_chunks)
+
+    pdist = torch.nn.PairwiseDistance(p=2)
+    embedded_tensors = []
+    n = 0
+    for embedding in embeddings:
+        for vector in embedding:
+            vector_tensor = torch.tensor(vector)
+            embedded_tensors.append(vector_tensor)
+            distance = torch.norm(vector_tensor - baseline_tensors[n])
+            assert abs(distance) < 0.001
+            # another way to measure pairwise distance;
+            # it must be positive since it comes from a square root
+            assert pdist(vector_tensor, baseline_tensors[n]).item() < 0.001
+
+            similarity = cosine_similarity(
+                vector_tensor.unsqueeze(0), baseline_tensors[n].unsqueeze(0)
+            )
+            assert similarity.shape == torch.Size([1])  # this has to be a scalar
+            # debug code to identify which token deviates
+            if similarity.item() < 0.99:
+                logging.warning(f"n = {n}, similarity = {similarity.item()}")
+            assert similarity.item() > 0.99
+            n = n + 1
+
+    assert len(embedded_tensors) == 645
+
+    """
+    test against the same function to make sure it generates the same embeddings:
+    use the same ColBERT configurations but reload the checkpoint with the default settings;
+    this also makes sure the default ColBERT configurations have not changed
+    """
+    colbert2 = ColbertEmbeddingModel(
+        checkpoint=DEFAULT_COLBERT_MODEL,
+    )
+    embedded_chunks2 = colbert2.embed_texts(arctic_botany_chunks)
+
+    are_they_similar(embedded_chunks2, embedded_tensors)
+
+
+def test_colbert_embedding_against_vanilla_impl():
+    # this is a vanilla ColBERT embedding as a list of per-token embeddings,
+    # based on just the Stanford ColBERT library
+    cf = ColBERTConfig(checkpoint="colbert-ir/colbertv2.0")
+    cp = Checkpoint(cf.checkpoint, colbert_config=cf)
+    encoder = CollectionEncoder(cf, cp)
+
+    embeddings_flat, _ = encoder.encode_passages(arctic_botany_chunks)
+
+    colbert_svc = ColbertEmbeddingModel(
+        checkpoint=DEFAULT_COLBERT_MODEL,
+    )
+    embedded_chunks = colbert_svc.embed_texts(arctic_botany_chunks)
+
+    are_they_similar(embedded_chunks, embeddings_flat)
+
+
+def model_embedding(model: str):
+    logging.info(f"test model compatibility {model}")
+    colbert_svc = ColbertEmbeddingModel(
+        checkpoint=model,
+        query_maxlen=32,
+    )
+    embeddings = colbert_svc.embed_texts(arctic_botany_chunks)
+
+    assert len(embeddings) == 8
+    n = 0
+    for embedding in embeddings:
+        for vector in embedding:
+            assert len(vector) == 128
+            n = n + 1
+
+    assert n == 645
+
+    # recall embeddings test
+    embedding = colbert_svc.embed_query(
+        query="What adaptations enable Arctic plants to survive and thrive in extremely cold temperatures and minimal sunlight?",
+        query_maxlen=32,
+    )
+    assert len(embedding) == 32
+
+
+def test_compatible_models():
+    # ColBERT models and Google BERT models on HF
+    # test representative models' compatibility with this repo's ColBERT embedding
+    # evaluation is not within this test's scope
+    models = [
+        "colbert-ir/colbertv1.9",
+        # "colbert-ir/colbertv2.0_msmarco_64way",  # this model is large
+        "mixedbread-ai/mxbai-colbert-large-v1",
+        # "antoinelouis/colbert-xm",  # XMOD based
+        # "jinaai/jina-colbert-v1-en",  # requires HF token and code changes
+        "google-bert/bert-base-uncased",  # BERT compatibility test only, not recommended
+        # some colbert models are trained on uncased text
+        # "google-bert/bert-base-cased",  # already tested uncased
+    ]
+
+    for model in models:
+        model_embedding(model)
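Note: if a drift detected by these tests turns out to be intentional (for example, after an upstream checkpoint update), the checked-in `baseline_tensors` module has to be regenerated. A minimal sketch of how that could look, reusing the `arctic_botany_chunks` fixture above; the `regenerate_baseline` helper and the `torch.save` output path are hypothetical and not part of this change:

```python
# Hypothetical helper to regenerate the drift-test baseline; not part of this PR.
from typing import List

import torch

from ragstack_colbert import ColbertEmbeddingModel
from ragstack_colbert.constant import DEFAULT_COLBERT_MODEL


def regenerate_baseline(chunks: List[str], path: str = "baseline_tensors.pt") -> None:
    model = ColbertEmbeddingModel(
        doc_maxlen=220,
        nbits=2,
        kmeans_niters=4,
        checkpoint=DEFAULT_COLBERT_MODEL,
    )
    embeddings = model.embed_texts(texts=chunks)
    # flatten the per-chunk embeddings into a single list of per-token tensors,
    # in the same order the tests above iterate over baseline_tensors
    tensors = [torch.tensor(vector) for embedding in embeddings for vector in embedding]
    torch.save(tensors, path)  # the tests import a Python module instead; adapt as needed
```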
diff --git a/libs/colbert/tests/unit_tests/test_colbert_embeddings.py b/libs/colbert/tests/unit_tests/test_colbert_embeddings.py
new file mode 100644
index 000000000..46044d0d7
--- /dev/null
+++ b/libs/colbert/tests/unit_tests/test_colbert_embeddings.py
@@ -0,0 +1,45 @@
+import torch
+
+from ragstack_colbert import ColbertEmbeddingModel
+from ragstack_colbert.constant import DEFAULT_COLBERT_DIM, DEFAULT_COLBERT_MODEL
+
+
+def test_colbert_token_embeddings():
+    colbert = ColbertEmbeddingModel()
+
+    texts = ["test1", "test2"]
+
+    embeddings = colbert.embed_texts(texts=texts)
+
+    assert len(embeddings) == 2
+    assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM
+
+
+def test_colbert_token_embeddings_with_params():
+    colbert = ColbertEmbeddingModel(
+        doc_maxlen=220,
+        nbits=2,
+        kmeans_niters=4,
+        checkpoint=DEFAULT_COLBERT_MODEL,
+        query_maxlen=32,
+    )
+
+    texts = ["test1", "test2", "text3"]
+
+    embeddings = colbert.embed_texts(texts=texts)
+
+    assert len(embeddings) == 3
+
+    assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM
+
+
+def test_colbert_query_embeddings():
+    colbert = ColbertEmbeddingModel()
+
+    embedding = colbert.embed_query("who is the president of the united states?")
+    query_tensor = torch.tensor(embedding)
+    assert query_tensor.shape == (12, 128)
+
+    # test query encoding with an explicit maximum query length
+    embedding = colbert.embed_query("test-query", query_maxlen=512)
+    assert len(embedding) == 512
diff --git a/libs/colbert/tests/unit_tests/test_colbert_retriever.py b/libs/colbert/tests/unit_tests/test_colbert_retriever.py
index 05f38260c..aa3e5151a 100644
--- a/libs/colbert/tests/unit_tests/test_colbert_retriever.py
+++ b/libs/colbert/tests/unit_tests/test_colbert_retriever.py
@@ -1,7 +1,7 @@
 import torch
 
 from ragstack_colbert.colbert_retriever import max_similarity_torch
-from ragstack_colbert.colbert_embedding_model import calculate_query_maxlen
+from ragstack_colbert.text_encoder import calculate_query_maxlen
 
 
 def test_max_similarity_torch():
@@ -21,11 +21,11 @@ def test_max_similarity_torch():
     )  # Should be the highest
 
     # Call the function under test
-    max_sim = max_similarity_torch(query_vector, embedding_list, is_cuda=False)
+    max_sim = max_similarity_torch(query_vector, embedding_list)
 
     # Check if the returned max similarity matches the expected value
     assert (
-        max_sim.item() == expected_max_similarity.item()
+        max_sim == expected_max_similarity.item()
     ), "The max similarity does not match the expected value."
 
 
@@ -33,6 +33,5 @@ def test_query_maxlen_calculation():
     tokens = [["word1"], ["word2", "word3"]]
     assert calculate_query_maxlen(tokens) == 5
 
-
     tokens = [["word1", "word2", "word3"], ["word1", "word2"]]
     assert calculate_query_maxlen(tokens) == 6
diff --git a/libs/colbert/tests/unit_tests/test_cuda_runner.py b/libs/colbert/tests/unit_tests/test_cuda_runner.py
deleted file mode 100644
index b9a80f4e1..000000000
--- a/libs/colbert/tests/unit_tests/test_cuda_runner.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import torch
-
-from ragstack_colbert.distributed.distributed import Distributed
-from ragstack_colbert.distributed.runner import distribute_work_load, map_work_load
-
-
-def test_distributed():
-    distributed = Distributed(nranks=-1)
-    assert distributed.world_size() == 0
-
-    distributed = Distributed(nranks=8)
-    assert distributed.world_size() == 0
-
-
-def test_distributed_with_nranks():
-    # this is a singletone so this would not work.
- # we need to test on cuda initialization - if torch.cuda.is_available() == False: - distributed = Distributed(nranks=8) - assert distributed.world_size() == 0 - else: - world_size = torch.cuda.device_count() - distributed = Distributed(nranks=world_size) - assert distributed.world_size() == world_size - -def test_even_distribution(): - result = distribute_work_load(work_load_size=4, processors=2) - assert all(len(chunk) == 2 for chunk in result), "All chunks should have equal size" - -def test_uneven_distribution(): - result = distribute_work_load(work_load_size=5, processors=2) - assert len(result[-1]) < len(result[0]), "The last chunk should be smaller" - -def test_single_processor(): - result = distribute_work_load(work_load_size=5, processors=1) - assert len(result) == 1 and len(result[0]) == 5, "Should return the entire workload as a single chunk" - -def test_single_workload(): - result = distribute_work_load(work_load_size=1, processors=2) - assert len(result) == 1 and len(result[0]) == 1, "Should return the single workload as a single chunk" - -def test_no_workload(): - result = distribute_work_load(work_load_size=0, processors=2) - assert len(result) == 0, "Should return an empty list" - -def test_more_processors_than_workloads(): - result = distribute_work_load(work_load_size=2, processors=4) - # Expecting 2 chunks since workload cannot be split into more chunks than there are units of work - assert len(result) == 2, "Should not create more chunks than there are workloads, even if there are more processors" - -def test_even_distribution(): - texts = ["item1", "item2", "item3", "item4"] - expected = [["item1", "item3"], ["item2", "item4"]] - assert map_work_load(texts, 2) == expected, "Failed to evenly distribute an even number of items" - -def test_uneven_distribution(): - texts = ["item1", "item2", "item3", "item4", "item5"] - expected = [["item1", "item3", "item5"], ["item2", "item4"]] - assert map_work_load(texts, 2) == expected, "Failed to distribute an odd number of items" - -def test_single_processor(): - texts = ["item1", "item2", "item3"] - expected = [texts] # All items should be assigned to the single processor - assert map_work_load(texts, 1) == expected, "Failed with a single processor" - -def test_more_processors_than_text_chunks(): - texts = ["item1", "item2"] - # Expecting each item in its own list, assuming the function handles more processors than text chunks by limiting processors - expected = [["item1"], ["item2"]] - assert map_work_load(texts, 4) == expected, "Failed when there are more processors than text chunks" - -def test_no_text_chunks(): - texts = [] - expected = [] # Expecting an empty list when there are no texts - assert map_work_load(texts, 2) == expected, "Failed with no text chunks" - -def test_single_text(): - texts = ["item1"] - expected = [["item1"]] # Single item should be in its own list - assert map_work_load(texts, 1) == expected, "Failed with a single text chunk" - assert map_work_load(texts, 2) == expected, "Failed with a single text chunk and more processors" - -def test_fifteen_items_four_processors(): - texts = ["Item1", "Item2", "Item3", "Item4", "Item5", - "Item6", "Item7", "Item8", "Item9", "Item10", - "Item11", "Item12", "Item13", "Item14", "Item15"] - expected_output = [ - ["Item1", "Item5", "Item9", "Item13"], - ["Item2", "Item6", "Item10", "Item14"], - ["Item3", "Item7", "Item11", "Item15"], - ["Item4", "Item8", "Item12"] - ] - assert map_work_load(texts, 4) == expected_output, "The distribution of 15 items across 4 processors 
is incorrect"
\ No newline at end of file
diff --git a/libs/langchain/ragstack_langchain/colbert/__init__.py b/libs/langchain/ragstack_langchain/colbert/__init__.py
index 93aa646fb..2dba4cff8 100644
--- a/libs/langchain/ragstack_langchain/colbert/__init__.py
+++ b/libs/langchain/ragstack_langchain/colbert/__init__.py
@@ -6,6 +6,10 @@
         "Please install it with `pip install ragstack-ai-langchain[colbert]`."
     )
 
-from .retriever import ColbertLCRetriever
+from .colbert_retriever import ColbertRetriever
+from .colbert_vector_store import ColbertVectorStore
 
-__all__ = ["ColbertLCRetriever"]
+__all__ = [
+    "ColbertRetriever",
+    "ColbertVectorStore",
+]
diff --git a/libs/langchain/ragstack_langchain/colbert/colbert_retriever.py b/libs/langchain/ragstack_langchain/colbert/colbert_retriever.py
new file mode 100644
index 000000000..3f05d57d9
--- /dev/null
+++ b/libs/langchain/ragstack_langchain/colbert/colbert_retriever.py
@@ -0,0 +1,86 @@
+from typing import Any, List, Optional, Tuple
+
+from langchain_core.callbacks.manager import (
+    AsyncCallbackManagerForRetrieverRun,
+    CallbackManagerForRetrieverRun,
+)
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from ragstack_colbert import Chunk
+from ragstack_colbert.base_retriever import BaseRetriever as ColbertBaseRetriever
+
+
+class ColbertRetriever(BaseRetriever):
+    """Chain for LangChain retrieval using a ColBERT vector store.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.chains import RetrievalQA
+            from langchain_openai import AzureChatOpenAI
+
+            llm = AzureChatOpenAI()
+            retriever = ColbertRetriever(colbert_retriever, k=5)
+            qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
+            qa.run("what happened on June 4th?")
+    """
+
+    _retriever: ColbertBaseRetriever
+    _k: int
+    _query_maxlen: Optional[int]
+
+    def __init__(
+        self,
+        retriever: ColbertBaseRetriever,
+        k: Optional[int] = 5,
+        query_maxlen: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        super().__init__(retriever=retriever, k=k, **kwargs)
+        self._retriever = retriever
+        self._k = k
+        self._query_maxlen = query_maxlen
+
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun,  # noqa
+    ) -> List[Document]:
+        """Get documents relevant to a query.
+        Args:
+            query: String to find relevant documents for
+            run_manager: The callbacks handler to use
+        Returns:
+            List of relevant documents
+        """
+        chunk_scores: List[Tuple[Chunk, float]] = self._retriever.text_search(
+            query_text=query, k=self._k, query_maxlen=self._query_maxlen
+        )
+
+        return [
+            Document(page_content=c.text, metadata=c.metadata)
+            for (c, _) in chunk_scores
+        ]
+
+    async def _aget_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: AsyncCallbackManagerForRetrieverRun,  # noqa
+    ) -> List[Document]:
+        """Asynchronously get documents relevant to a query.
+        Args:
+            query: String to find relevant documents for
+            run_manager: The callbacks handler to use
+        Returns:
+            List of relevant documents
+        """
+        chunk_scores: List[Tuple[Chunk, float]] = await self._retriever.atext_search(
+            query_text=query, k=self._k, query_maxlen=self._query_maxlen
+        )
+
+        return [
+            Document(page_content=c.text, metadata=c.metadata)
+            for (c, _) in chunk_scores
+        ]
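For reference, a minimal usage sketch of the new retriever outside of `RetrievalQA`; the `session` object is a placeholder for an existing Cassandra session, and the table name and texts are illustrative:

```python
# A minimal usage sketch; `session` is a placeholder for an existing Cassandra session.
from ragstack_colbert import CassandraDatabase, ColbertEmbeddingModel, ColbertVectorStore
from ragstack_langchain.colbert import ColbertRetriever

database = CassandraDatabase.from_session(session=session, table_name="colbert_demo")
store = ColbertVectorStore(database=database, embedding_model=ColbertEmbeddingModel())
store.add_texts(texts=["chunk one", "chunk two"], doc_id="demo-doc")

retriever = ColbertRetriever(retriever=store.as_retriever(), k=5)
docs = retriever.get_relevant_documents("what is in chunk one?")
```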
diff --git a/libs/langchain/ragstack_langchain/colbert/colbert_vector_store.py b/libs/langchain/ragstack_langchain/colbert/colbert_vector_store.py
new file mode 100644
index 000000000..4fb6e5207
--- /dev/null
+++ b/libs/langchain/ragstack_langchain/colbert/colbert_vector_store.py
@@ -0,0 +1,217 @@
+from typing import Any, Iterable, List, Optional, Tuple, Type, TypeVar
+
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.runnables.config import run_in_executor
+from langchain_core.vectorstores import VectorStore
+from ragstack_colbert import Chunk
+from ragstack_colbert import ColbertVectorStore as RagstackColbertVectorStore
+from ragstack_colbert.base_database import BaseDatabase as ColbertBaseDatabase
+from ragstack_colbert.base_embedding_model import (
+    BaseEmbeddingModel as ColbertBaseEmbeddingModel,
+)
+from ragstack_colbert.base_retriever import BaseRetriever as ColbertBaseRetriever
+from ragstack_colbert.base_vector_store import BaseVectorStore as ColbertBaseVectorStore
+
+from .colbert_retriever import ColbertRetriever
+
+CVS = TypeVar("CVS", bound="ColbertVectorStore")
+
+
+class ColbertVectorStore(VectorStore):
+
+    _vector_store: ColbertBaseVectorStore
+    _retriever: ColbertBaseRetriever
+
+    def __init__(
+        self,
+        database: ColbertBaseDatabase,
+        embedding_model: ColbertBaseEmbeddingModel,
+    ):
+        self._initialize(database=database, embedding_model=embedding_model)
+
+    def _initialize(
+        self,
+        database: ColbertBaseDatabase,
+        embedding_model: ColbertBaseEmbeddingModel,
+    ):
+        self._vector_store = RagstackColbertVectorStore(
+            database=database, embedding_model=embedding_model
+        )
+        self._retriever = self._vector_store.as_retriever()
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        doc_id: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            kwargs: vectorstore specific parameters
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        return self._vector_store.add_texts(
+            texts=list(texts), metadatas=metadatas, doc_id=doc_id
+        )
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
+        """Delete by vector ID or other criteria.
+
+        Args:
+            ids: List of ids to delete.
+            **kwargs: Other keyword arguments that subclasses might use.
+
+        Returns:
+            Optional[bool]: True if deletion is successful,
+            False otherwise, None if not implemented.
+ """ + return None if ids is None else self._vector_store.delete(ids=ids) + + def similarity_search( + self, + query: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query.""" + chunk_scores: List[Tuple[Chunk, float]] = self._retriever.text_search( + query_text=query, k=k, query_maxlen=query_maxlen + ) + + return [ + Document(page_content=c.text, metadata=c.metadata) + for (c, _) in chunk_scores + ] + + def similarity_search_with_score( + self, + query: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Run similarity search with distance.""" + chunk_scores: List[Tuple[Chunk, float]] = self._retriever.text_search( + query_text=query, k=k, query_maxlen=query_maxlen + ) + + return [ + (Document(page_content=c.text, metadata=c.metadata), s) + for (c, s) in chunk_scores + ] + + async def asimilarity_search( + self, + query: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query.""" + chunk_scores: List[Tuple[Chunk, float]] = await self._retriever.atext_search( + query_text=query, k=k, query_maxlen=query_maxlen + ) + + return [ + Document(page_content=c.text, metadata=c.metadata) + for (c, _) in chunk_scores + ] + + async def asimilarity_search_with_score( + self, + query: str, + k: Optional[int] = 5, + query_maxlen: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Run similarity search with distance.""" + chunk_scores: List[Tuple[Chunk, float]] = await self._retriever.atext_search( + query_text=query, k=k, query_maxlen=query_maxlen + ) + + return [ + (Document(page_content=c.text, metadata=c.metadata), s) + for (c, s) in chunk_scores + ] + + @classmethod + def from_documents( + cls, + documents: List[Document], + database: ColbertBaseDatabase, + embedding_model: ColbertBaseEmbeddingModel, + **kwargs: Any, + ) -> CVS: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return cls.from_texts( + texts=texts, + database=database, + embedding_model=embedding_model, + metadatas=metadatas, + **kwargs, + ) + + @classmethod + async def afrom_documents( + cls: Type[CVS], + documents: List[Document], + database: ColbertBaseDatabase, + embedding_model: ColbertBaseEmbeddingModel, + **kwargs: Any, + ) -> CVS: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return await cls.afrom_texts( + texts=texts, + database=database, + embedding_model=embedding_model, + metadatas=metadatas, + **kwargs, + ) + + @classmethod + def from_texts( + cls: Type[CVS], + texts: List[str], + database: ColbertBaseDatabase, + embedding_model: ColbertBaseEmbeddingModel, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> CVS: + """Return VectorStore initialized from texts and embeddings.""" + + instance = super().__new__(cls) + instance._initialize(database=database, embedding_model=embedding_model) + instance.add_texts(texts=texts, metadatas=metadatas) + return instance + + @classmethod + async def afrom_texts( + cls: Type[CVS], + texts: List[str], + database: ColbertBaseDatabase, + embedding_model: ColbertBaseEmbeddingModel, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> CVS: + """Return 
VectorStore initialized from texts and embeddings.""" + return await run_in_executor( + None, cls.from_texts, texts, database, embedding_model, metadatas, **kwargs + ) + + def as_retriever(self, k: Optional[int] = 5, **kwargs: Any) -> BaseRetriever: + """Return a VectorStoreRetriever initialized from this VectorStore.""" + return ColbertRetriever( + retriever=self._vector_store.as_retriever(), k=k, **kwargs + ) diff --git a/libs/langchain/ragstack_langchain/colbert/retriever.py b/libs/langchain/ragstack_langchain/colbert/retriever.py deleted file mode 100644 index 09b681b78..000000000 --- a/libs/langchain/ragstack_langchain/colbert/retriever.py +++ /dev/null @@ -1,57 +0,0 @@ -from typing import Any, List - -from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun -from langchain_core.documents import Document -from langchain_core.retrievers import BaseRetriever as LangChainBaseRetriever -from pydantic import Field - -from ragstack_colbert.base_retriever import BaseRetriever - -class ColbertLCRetriever(LangChainBaseRetriever): - """Chain for langchain retrieve using ColBERT vector store. - - Example: - .. code-block:: python - - from langchain.chains import RetrievalQA - from langchain_openai import AzureChatOpenAI - - llm = AzureChatOpenAI() - retriever = ColbertLCRetriever(colbert_retriever, k=5) - qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever) - qa.run("what happened on June 4th?") - """ - - retriever: BaseRetriever = Field(default=None) - kwargs: dict = {} - k: int = 10 - - class Config: - """Configuration for this pydantic object.""" - - arbitrary_types_allowed = True - - def __init__( - self, retriever: BaseRetriever, k: int = 10, **kwargs: Any - ): - super().__init__(retriever=retriever, k=k, **kwargs) - self.retriever = retriever - self.k = k - - def _get_relevant_documents( - self, - query: str, - *, - run_manager: CallbackManagerForRetrieverRun, # noqa - ) -> List[Document]: - """Get documents relevant to a query.""" - chunks = self.retriever.retrieve(query, self.k) - - output: List[Document] = [] - for chunk in chunks: - page_content = chunk.data.text - metadata=chunk.data.metadata - metadata["rank"] = chunk.rank - output.append(Document(page_content=page_content, metadata=metadata)) - - return output diff --git a/libs/langchain/tests/integration_tests/conftest.py b/libs/langchain/tests/integration_tests/conftest.py index 486e316f5..211c52909 100644 --- a/libs/langchain/tests/integration_tests/conftest.py +++ b/libs/langchain/tests/integration_tests/conftest.py @@ -1,7 +1,5 @@ -from ragstack_tests_utils import LocalCassandraTestStore, AstraDBTestStore - import pytest - +from ragstack_tests_utils import AstraDBTestStore, LocalCassandraTestStore status = { "local_cassandra_test_store": None, diff --git a/libs/langchain/tests/integration_tests/test_colbert.py b/libs/langchain/tests/integration_tests/test_colbert.py new file mode 100644 index 000000000..5de63bdb8 --- /dev/null +++ b/libs/langchain/tests/integration_tests/test_colbert.py @@ -0,0 +1,116 @@ +import logging +from typing import List, Tuple + +import pytest +from langchain_core.documents import Document +from ragstack_colbert import CassandraDatabase, ColbertEmbeddingModel +from ragstack_tests_utils import TestData +from transformers import BertTokenizer + +from langchain.text_splitter import RecursiveCharacterTextSplitter +from ragstack_langchain.colbert import ColbertVectorStore + +logging.getLogger("cassandra").setLevel(logging.ERROR) + +from 
tests.integration_tests.conftest import (
+    get_astradb_test_store,
+    get_local_cassandra_test_store,
+)
+
+
+def validate_retrieval(results: List[Document], key_value: str):
+    passed = False
+    for result in results:
+        if key_value in result.page_content:
+            passed = True
+    return passed
+
+
+@pytest.fixture
+def cassandra():
+    return get_local_cassandra_test_store()
+
+
+@pytest.fixture
+def astra_db():
+    return get_astradb_test_store()
+
+
+@pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"])
+def test_sync(request, vector_store: str):
+    vector_store = request.getfixturevalue(vector_store)
+    session = vector_store.create_cassandra_session()
+    session.default_timeout = 180
+
+    table_name = "LangChain_colbert_sync"
+
+    database = CassandraDatabase.from_session(session=session, table_name=table_name)
+
+    docs: List[Document] = []
+    docs.append(
+        Document(
+            page_content=TestData.marine_animals_text(),
+            metadata={"name": "marine_animals"},
+        )
+    )
+    docs.append(
+        Document(
+            page_content=TestData.nebula_voyager_text(),
+            metadata={"name": "nebula_voyager"},
+        )
+    )
+
+    batch_size = 5  # 640 recommended for production use
+    chunk_size = 250
+    chunk_overlap = 50
+
+    embedding_model = ColbertEmbeddingModel(doc_maxlen=chunk_size)
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    def _len_function(text: str) -> int:
+        return len(tokenizer.tokenize(text))
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=_len_function,
+    )
+
+    doc_chunks: List[Document] = text_splitter.split_documents(docs)
+
+    logging.info(f"split {len(docs)} documents into {len(doc_chunks)} chunks")
+
+    embedding_model = ColbertEmbeddingModel(
+        doc_maxlen=chunk_size,
+        chunk_batch_size=batch_size,
+    )
+
+    logging.debug("Starting to embed ColBERT docs and save them to the database")
+
+    vector_store: ColbertVectorStore = ColbertVectorStore.from_documents(
+        documents=doc_chunks, database=database, embedding_model=embedding_model
+    )
+
+    results: List[Document] = vector_store.similarity_search(
+        "What challenges does the Quantum Opacity phenomenon present to the crew of the Nebula Voyager"
+    )
+    assert validate_retrieval(results, key_value="Quantum Opacity")
+
+    results: List[Tuple[Document, float]] = vector_store.similarity_search_with_score(
+        "What are Xenospheric Particulates?"
+    )
+
+    assert len(results) > 3
+    assert results[1][1] > 0  # check score from result 2
+    assert results[2][1] > 0  # check score from result 3
+    assert results[1][1] > results[2][1]  # check that scores are returned in order
+
+    assert validate_retrieval(
+        [r[0] for r in results], key_value="Xenospheric Particulates"
+    )
+
+    results: List[Document] = vector_store.similarity_search(
+        "What communication methods do dolphins use within their pods?"
+    )
+    assert validate_retrieval(results, key_value="dolphin")
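The store's async variants (`asimilarity_search`, `asimilarity_search_with_score`, `afrom_texts`, `afrom_documents`) are not exercised by the sync test above; a minimal sketch of the async path under the same assumptions, reusing the `database` and `embedding_model` objects set up in the test, with an illustrative query:

```python
# Hypothetical async usage of the new LangChain ColbertVectorStore; assumes the
# `database` and `embedding_model` objects from the surrounding test.
import asyncio

from ragstack_langchain.colbert import ColbertVectorStore


async def search_async():
    store = await ColbertVectorStore.afrom_texts(
        texts=["dolphins live in social groups called pods"],
        database=database,
        embedding_model=embedding_model,
    )
    # asimilarity_search awaits the store's async text search; afrom_texts itself
    # runs the sync ingestion path in an executor
    docs = await store.asimilarity_search("How do dolphins communicate?", k=2)
    for doc in docs:
        print(doc.page_content)


asyncio.run(search_async())
```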
diff --git a/libs/langchain/tests/integration_tests/test_colbert_embedding_retrieval.py b/libs/langchain/tests/integration_tests/test_colbert_embedding_retrieval.py
deleted file mode 100644
index 61f440dcc..000000000
--- a/libs/langchain/tests/integration_tests/test_colbert_embedding_retrieval.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import logging
-
-import pytest
-from ragstack_langchain.colbert import ColbertLCRetriever
-from ragstack_colbert import (
-    CassandraVectorStore,
-    ColbertRetriever,
-    ColbertEmbeddingModel,
-    ChunkData,
-)
-from tests.integration_tests.conftest import (
-    get_astradb_test_store,
-    get_local_cassandra_test_store,
-)
-
-
-@pytest.fixture
-def cassandra():
-    return get_local_cassandra_test_store()
-
-
-@pytest.fixture
-def astra_db():
-    return get_astradb_test_store()
-
-
-@pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"])
-def test_embedding_cassandra_retriever(request, vector_store: str):
-    vector_store = request.getfixturevalue(vector_store)
-    narrative = """
-    Marine animals inhabit some of the most diverse environments on our planet. From the shallow coral reefs teeming with colorful fish to the dark depths of the ocean where mysterious creatures lurk, the marine world is full of wonder and mystery.
-
-    One of the most iconic marine animals is the dolphin, known for its intelligence, social behavior, and playful antics. Dolphins communicate with each other using a variety of clicks, whistles, and body movements. They live in social groups called pods and often display behaviors that suggest a high level of social complexity, including cooperation for hunting and care for the injured or sick members of their pod.
-
-    Another remarkable creature is the sea turtle, which navigates vast oceans to return to the very beaches where they were born to lay their eggs. These ancient mariners are true survivors, having roamed the oceans for millions of years. However, they face numerous threats from human activities, including plastic pollution, accidental capture in fishing gear, and the loss of nesting beaches due to climate change.
-
-    Deep in the ocean's abyss, where sunlight fails to penetrate, live the bioluminescent creatures, such as the anglerfish. These eerie-looking fish use a natural light produced by bacteria in their lure to attract prey in the pitch-black waters. This fascinating adaptation is a perfect example of the unique strategies marine animals have evolved to survive in the ocean's different layers.
-
-    Coral reefs, often referred to as the "rainforests of the sea," are another critical habitat. They are bustling with life and serve as a vital ecosystem for many marine species. Corals themselves are fascinating organisms. They are made up of thousands of tiny creatures called polyps and have a symbiotic relationship with algae, which provides them with food through photosynthesis.
-
-    The diversity of marine life is vast, and each species plays a crucial role in the ocean's ecosystem. From the microscopic plankton that form the base of the oceanic food chain to the majestic blue whale, the largest animal to have ever lived on Earth, marine animals are an integral part of our world's biodiversity. Protecting these creatures and their habitats is essential for maintaining the health of our oceans and the planet as a whole.
- """ - - # Define the desired chunk size and overlap size - chunk_size = 450 - overlap_size = 50 - - # Function to generate chunks with the specified size and overlap - def generate_texts(text, chunk_size, overlap_size): - texts = [] - start = 0 - end = chunk_size - while start < len(text): - # If this is not the first chunk, move back 'overlap_size' characters to create the overlap - if start != 0: - start -= overlap_size - texts.append(text[start:end]) - start = end - end += chunk_size - return texts - - # Generate the chunks based on the narrative - texts = generate_texts(narrative, chunk_size, overlap_size) - - # Output the first few chunks to ensure they meet the specifications - for i, text in enumerate(texts[:3]): # Displaying the first 3 chunks for brevity - logging.info(f"Chunk {i + 1}:\n{text}\n{'-' * 50}\n") - - doc_id = "Marine Animals habitat" - - # colbert stuff starts - colbert = ColbertEmbeddingModel( - doc_maxlen=220, - nbits=2, - kmeans_niters=4, - ) - - chunks = [ChunkData(text=text, metadata={}) for text in texts] - - embedded_chunks = colbert.embed_chunks(chunks=chunks, doc_id=doc_id) - - logging.info(f"embedded chunks size {len(embedded_chunks)}") - - store = CassandraVectorStore( - keyspace="default_keyspace", - table_name="colbert_embeddings", - session=vector_store.create_cassandra_session(), - ) - store.put_chunks(chunks=embedded_chunks, delete_existing=True) - - retriever = ColbertRetriever( - vector_store=store, embedding_model=colbert - ) - lc_retriever = ColbertLCRetriever(retriever, k=2) - docs = lc_retriever.get_relevant_documents( - "what kind fish lives shallow coral reefs atlantic, india ocean, red sea, gulf of mexico, pacific, and arctic ocean" - ) - assert len(docs) == 2 - assert len(docs[0].page_content) > 0 \ No newline at end of file diff --git a/libs/langchain/tests/unit_tests/test_import.py b/libs/langchain/tests/unit_tests/test_import.py index 5724a184b..45dff34f8 100644 --- a/libs/langchain/tests/unit_tests/test_import.py +++ b/libs/langchain/tests/unit_tests/test_import.py @@ -1,13 +1,14 @@ def test_import(): - from langchain.vectorstores import AstraDB # noqa - from langchain_astradb import AstraDBVectorStore # noqa - import langchain_core # noqa - import langsmith # noqa import astrapy # noqa import cassio # noqa - import unstructured # noqa + import langchain_core # noqa + import langsmith # noqa import openai # noqa import tiktoken # noqa + import unstructured # noqa + from langchain_astradb import AstraDBVectorStore # noqa + + from langchain.vectorstores import AstraDB # noqa def test_meta(): @@ -19,5 +20,3 @@ def check_meta(package: str): assert meta["license"] == "BUSL-1.1" check_meta("ragstack-ai-langchain") - - diff --git a/libs/llamaindex/.python-version b/libs/llamaindex/.python-version new file mode 100644 index 000000000..2c0733315 --- /dev/null +++ b/libs/llamaindex/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/libs/llamaindex/pyproject.toml b/libs/llamaindex/pyproject.toml index 9cefd855d..17d0500da 100644 --- a/libs/llamaindex/pyproject.toml +++ b/libs/llamaindex/pyproject.toml @@ -34,6 +34,7 @@ llama-index-llms-gemini = { version = "0.1.7", optional = true } llama-index-multi-modal-llms-gemini = { version = "0.1.5", optional = true } llama-index-llms-vertex = { version = "0.1.5", optional = true } llama-index-embeddings-gemini = { version = "0.1.6", optional = true } +cffi = "^1.16.0" [tool.poetry.extras] colbert = ["ragstack-ai-colbert"] diff --git a/libs/llamaindex/ragstack_llamaindex/colbert/__init__.py 
b/libs/llamaindex/ragstack_llamaindex/colbert/__init__.py index 5d8388f4a..0cabcce4f 100644 --- a/libs/llamaindex/ragstack_llamaindex/colbert/__init__.py +++ b/libs/llamaindex/ragstack_llamaindex/colbert/__init__.py @@ -6,6 +6,8 @@ "Please install it with `pip install ragstack-ai-llamaindex[colbert]`." ) -from .retriever import ColbertLIRetriever +from .colbert_retriever import ColbertRetriever -__all__ = ["ColbertLIRetriever"] +__all__ = [ + "ColbertRetriever", +] diff --git a/libs/llamaindex/ragstack_llamaindex/colbert/retriever.py b/libs/llamaindex/ragstack_llamaindex/colbert/colbert_retriever.py similarity index 57% rename from libs/llamaindex/ragstack_llamaindex/colbert/retriever.py rename to libs/llamaindex/ragstack_llamaindex/colbert/colbert_retriever.py index 84292631f..26c1b4372 100644 --- a/libs/llamaindex/ragstack_llamaindex/colbert/retriever.py +++ b/libs/llamaindex/ragstack_llamaindex/colbert/colbert_retriever.py @@ -1,14 +1,14 @@ +from typing import Any, List, Optional, Tuple -from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode from llama_index.core.callbacks.base import CallbackManager -from llama_index.core.retrievers import BaseRetriever as LlamaIndexBaseRetriever from llama_index.core.constants import DEFAULT_SIMILARITY_TOP_K -from typing import Any, List, Optional - -from ragstack_colbert.base_retriever import BaseRetriever +from llama_index.core.retrievers import BaseRetriever +from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode +from ragstack_colbert import Chunk +from ragstack_colbert.base_retriever import BaseRetriever as ColbertBaseRetriever -class ColbertLIRetriever(LlamaIndexBaseRetriever): +class ColbertRetriever(BaseRetriever): """ColBERT vector store retriever. Args: @@ -16,9 +16,13 @@ class ColbertLIRetriever(LlamaIndexBaseRetriever): similarity_top_k (int): number of top k results to return. 
""" + _retriever: ColbertBaseRetriever + _k: int + _query_maxlen: Optional[int] + def __init__( self, - retriever: BaseRetriever, + retriever: ColbertBaseRetriever, similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K, callback_manager: Optional[CallbackManager] = None, object_map: Optional[dict] = None, @@ -28,7 +32,7 @@ def __init__( ) -> None: """Initialize params.""" self._retriever = retriever - self._similarity_top_k = similarity_top_k + self._k = similarity_top_k self._query_maxlen = query_maxlen super().__init__( callback_manager=callback_manager, @@ -40,14 +44,12 @@ def _retrieve( self, query_bundle: QueryBundle, ) -> List[NodeWithScore]: - nodes: List[NodeWithScore] = [] - - chunks = self._retriever.retrieve(query_bundle.query_str, k=self._similarity_top_k, query_maxlen=self._query_maxlen) - for chunk in chunks: - text = chunk.data.text - metadata=chunk.data.metadata - metadata["rank"] = chunk.rank - - node = TextNode(text=text, metadata=metadata) - nodes.append(NodeWithScore(node=node, score=chunk.score)) - return nodes + chunk_scores: List[Tuple[Chunk, float]] = self._retriever.text_search( + query_text=query_bundle.query_str, + k=self._k, + query_maxlen=self._query_maxlen, + ) + return [ + NodeWithScore(node=TextNode(text=c.text, metadata=c.metadata), score=s) + for (c, s) in chunk_scores + ] diff --git a/libs/llamaindex/tests/integration_tests/conftest.py b/libs/llamaindex/tests/integration_tests/conftest.py index 486e316f5..211c52909 100644 --- a/libs/llamaindex/tests/integration_tests/conftest.py +++ b/libs/llamaindex/tests/integration_tests/conftest.py @@ -1,7 +1,5 @@ -from ragstack_tests_utils import LocalCassandraTestStore, AstraDBTestStore - import pytest - +from ragstack_tests_utils import AstraDBTestStore, LocalCassandraTestStore status = { "local_cassandra_test_store": None, diff --git a/libs/llamaindex/tests/integration_tests/test_colbert.py b/libs/llamaindex/tests/integration_tests/test_colbert.py new file mode 100644 index 000000000..6bd642c14 --- /dev/null +++ b/libs/llamaindex/tests/integration_tests/test_colbert.py @@ -0,0 +1,129 @@ +import logging +from typing import Dict, List, Tuple + +import pytest +from llama_index.core.ingestion import IngestionPipeline +from llama_index.core import get_response_synthesizer, Settings +from llama_index.core.query_engine import RetrieverQueryEngine +from llama_index.core.schema import Document, NodeWithScore +from llama_index.core.text_splitter import SentenceSplitter +from ragstack_colbert import ( + CassandraDatabase, + ColbertEmbeddingModel, + ColbertVectorStore, + Metadata, +) +from ragstack_tests_utils import TestData + +from ragstack_llamaindex.colbert import ColbertRetriever + +logging.getLogger("cassandra").setLevel(logging.ERROR) + +from tests.integration_tests.conftest import ( + get_astradb_test_store, + get_local_cassandra_test_store, +) + + +def validate_retrieval(results: List[NodeWithScore], key_value: str): + passed = False + for result in results: + if key_value in result.text: + passed = True + return passed + + +@pytest.fixture +def cassandra(): + return get_local_cassandra_test_store() + + +@pytest.fixture +def astra_db(): + return get_astradb_test_store() + + +@pytest.mark.parametrize("vector_store", [ "astra_db"]) #"cassandra", +def test_sync(request, vector_store: str): + vector_store = request.getfixturevalue(vector_store) + session = vector_store.create_cassandra_session() + session.default_timeout = 180 + + table_name = "LlamaIndex_colbert_sync" + + batch_size = 5 # 640 recommended for production use + 
+    chunk_size = 256
+    chunk_overlap = 50
+
+    database = CassandraDatabase.from_session(session=session, table_name=table_name)
+    embedding_model = ColbertEmbeddingModel(
+        doc_maxlen=chunk_size,
+        chunk_batch_size=batch_size,
+    )
+
+    vector_store = ColbertVectorStore(
+        database=database,
+        embedding_model=embedding_model,
+    )
+
+    docs: List[Document] = []
+    docs.append(
+        Document(
+            text=TestData.marine_animals_text(), metadata={"name": "marine_animals"}
+        )
+    )
+    docs.append(
+        Document(
+            text=TestData.nebula_voyager_text(), metadata={"name": "nebula_voyager"}
+        )
+    )
+
+    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    pipeline = IngestionPipeline(transformations=[splitter])
+
+    nodes = pipeline.run(documents=docs)
+
+    chunks_by_doc: Dict[str, Tuple[List[str], List[Metadata]]] = {}
+
+    for node in nodes:
+        doc_id = node.metadata["name"]
+        if doc_id not in chunks_by_doc:
+            chunks_by_doc[doc_id] = ([], [])
+        chunks_by_doc[doc_id][0].append(node.text)
+        chunks_by_doc[doc_id][1].append(node.metadata)
+
+    logging.debug("Starting to embed ColBERT docs and save them to the database")
+
+    for doc_id in chunks_by_doc:
+        texts = chunks_by_doc[doc_id][0]
+        metadatas = chunks_by_doc[doc_id][1]
+
+        logging.debug(f"processing {doc_id} that has {len(texts)} chunks")
+
+        vector_store.add_texts(texts=texts, metadatas=metadatas, doc_id=doc_id)
+
+    retriever = ColbertRetriever(
+        retriever=vector_store.as_retriever(), similarity_top_k=5
+    )
+
+    Settings.llm = None
+
+    response_synthesizer = get_response_synthesizer()
+
+    query_engine = RetrieverQueryEngine(
+        retriever=retriever,
+        response_synthesizer=response_synthesizer,
+    )
+
+    results = query_engine.retrieve("Who developed the Astroflux Navigator?")
+    assert validate_retrieval(results, key_value="Astroflux Navigator")
+
+    results = query_engine.retrieve(
+        "Describe the phenomena known as 'Chrono-spatial Echoes'"
+    )
+    assert validate_retrieval(results, key_value="Chrono-spatial Echoes")
+
+    results = query_engine.retrieve(
+        "How do anglerfish adapt to the deep ocean's darkness?"
+    )
+    assert validate_retrieval(results, key_value="anglerfish")
diff --git a/libs/llamaindex/tests/integration_tests/test_colbert_embedding_retrieval.py b/libs/llamaindex/tests/integration_tests/test_colbert_embedding_retrieval.py
deleted file mode 100644
index e17eecb48..000000000
--- a/libs/llamaindex/tests/integration_tests/test_colbert_embedding_retrieval.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import logging
-
-import pytest
-
-from ragstack_colbert import (
-    CassandraVectorStore,
-    ColbertRetriever,
-    ColbertEmbeddingModel,
-    ChunkData,
-)
-from ragstack_llamaindex.colbert import ColbertLIRetriever
-from tests.integration_tests.conftest import (
-    get_astradb_test_store,
-    get_local_cassandra_test_store,
-)
-
-
-@pytest.fixture
-def cassandra():
-    return get_local_cassandra_test_store()
-
-
-@pytest.fixture
-def astra_db():
-    return get_astradb_test_store()
-
-
-@pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"])
-def test_embedding_cassandra_retriever(request, vector_store: str):
-    vector_store = request.getfixturevalue(vector_store)
-    narrative = """
-    Marine animals inhabit some of the most diverse environments on our planet. From the shallow coral reefs teeming with colorful fish to the dark depths of the ocean where mysterious creatures lurk, the marine world is full of wonder and mystery.
-
-    One of the most iconic marine animals is the dolphin, known for its intelligence, social behavior, and playful antics. Dolphins communicate with each other using a variety of clicks, whistles, and body movements.
They live in social groups called pods and often display behaviors that suggest a high level of social complexity, including cooperation for hunting and care for the injured or sick members of their pod. - - Another remarkable creature is the sea turtle, which navigates vast oceans to return to the very beaches where they were born to lay their eggs. These ancient mariners are true survivors, having roamed the oceans for millions of years. However, they face numerous threats from human activities, including plastic pollution, accidental capture in fishing gear, and the loss of nesting beaches due to climate change. - - Deep in the ocean's abyss, where sunlight fails to penetrate, live the bioluminescent creatures, such as the anglerfish. These eerie-looking fish use a natural light produced by bacteria in their lure to attract prey in the pitch-black waters. This fascinating adaptation is a perfect example of the unique strategies marine animals have evolved to survive in the ocean's different layers. - - Coral reefs, often referred to as the "rainforests of the sea," are another critical habitat. They are bustling with life and serve as a vital ecosystem for many marine species. Corals themselves are fascinating organisms. They are made up of thousands of tiny creatures called polyps and have a symbiotic relationship with algae, which provides them with food through photosynthesis. - - The diversity of marine life is vast, and each species plays a crucial role in the ocean's ecosystem. From the microscopic plankton that form the base of the oceanic food chain to the majestic blue whale, the largest animal to have ever lived on Earth, marine animals are an integral part of our world's biodiversity. Protecting these creatures and their habitats is essential for maintaining the health of our oceans and the planet as a whole. 
- """ - - # Define the desired chunk size and overlap size - chunk_size = 450 - overlap_size = 50 - - # Function to generate chunks with the specified size and overlap - def generate_texts(text, chunk_size, overlap_size): - texts = [] - start = 0 - end = chunk_size - while start < len(text): - # If this is not the first chunk, move back 'overlap_size' characters to create the overlap - if start != 0: - start -= overlap_size - texts.append(text[start:end]) - start = end - end += chunk_size - return texts - - # Generate the chunks based on the narrative - texts = generate_texts(narrative, chunk_size, overlap_size) - - # Output the first few chunks to ensure they meet the specifications - for i, text in enumerate(texts[:3]): # Displaying the first 3 chunks for brevity - logging.info(f"Chunk {i + 1}:\n{text}\n{'-' * 50}\n") - - doc_id = "Marine Animals habitat" - - # colbert stuff starts - colbert = ColbertEmbeddingModel( - doc_maxlen=220, - nbits=2, - kmeans_niters=4, - ) - - chunks = [ChunkData(text=text, metadata={}) for text in texts] - - embedded_chunks = colbert.embed_chunks(chunks=chunks, doc_id=doc_id) - - logging.info(f"embedded chunks size {len(embedded_chunks)}") - - store = CassandraVectorStore( - keyspace="default_keyspace", - table_name="colbert_embeddings", - session=vector_store.create_cassandra_session(), - ) - store.put_chunks(chunks=embedded_chunks, delete_existing=True) - - retriever = ColbertRetriever( - vector_store=store, embedding_model=colbert - ) - li_retriever = ColbertLIRetriever(retriever, similarity_top_k=3) - nodes = li_retriever.retrieve("what kind fish lives shallow coral reefs") - assert len(nodes) == 3 diff --git a/libs/llamaindex/tests/unit_tests/test_import.py b/libs/llamaindex/tests/unit_tests/test_import.py index 92c0c6b2c..8946d77ab 100644 --- a/libs/llamaindex/tests/unit_tests/test_import.py +++ b/libs/llamaindex/tests/unit_tests/test_import.py @@ -2,13 +2,13 @@ def test_import(): - from llama_index.vector_stores.astra_db import AstraDBVectorStore # noqa - from llama_index.vector_stores.cassandra import CassandraVectorStore # noqa import astrapy # noqa import cassio # noqa - import unstructured # noqa import openai # noqa import tiktoken # noqa + import unstructured # noqa + from llama_index.vector_stores.astra_db import AstraDBVectorStore # noqa + from llama_index.vector_stores.cassandra import CassandraVectorStore # noqa def check_no_import(fn: callable): @@ -18,12 +18,14 @@ def check_no_import(fn: callable): except ImportError: pass + def test_not_import(): check_no_import(lambda: importlib.import_module("langchain.vectorstores")) check_no_import(lambda: importlib.import_module("langchain_astradb")) check_no_import(lambda: importlib.import_module("langchain_core")) check_no_import(lambda: importlib.import_module("langsmith")) + def test_meta(): from importlib import metadata @@ -33,5 +35,3 @@ def check_meta(package: str): assert meta["license"] == "BUSL-1.1" check_meta("ragstack-ai-llamaindex") - - diff --git a/libs/tests-utils/ragstack_tests_utils/__init__.py b/libs/tests-utils/ragstack_tests_utils/__init__.py index 26642d2d9..e061780c7 100644 --- a/libs/tests-utils/ragstack_tests_utils/__init__.py +++ b/libs/tests-utils/ragstack_tests_utils/__init__.py @@ -1,9 +1,11 @@ from .cassandra_container import CassandraContainer +from .test_data import TestData from .test_store import TestStore, LocalCassandraTestStore, AstraDBTestStore __all__ = [ + "AstraDBTestStore", "CassandraContainer", - "TestStore", "LocalCassandraTestStore", - "AstraDBTestStore" + 
"TestData", + "TestStore", ] diff --git a/libs/tests-utils/ragstack_tests_utils/test_data.py b/libs/tests-utils/ragstack_tests_utils/test_data.py new file mode 100644 index 000000000..b23cb8c2b --- /dev/null +++ b/libs/tests-utils/ragstack_tests_utils/test_data.py @@ -0,0 +1,46 @@ +import os + +class TestData(): + + @staticmethod + def _get_test_data_path(file_name: str) -> str: + script_directory = os.path.dirname(os.path.abspath(__file__)) + return os.path.join(script_directory, "test_data", file_name) + + @staticmethod + def _get_text_file(file_name:str) -> str: + with open(TestData._get_test_data_path(file_name), 'r') as f: + return f.read() + + @staticmethod + def marine_animals_text() -> str: + """ + Returns: + A story of approx 350 words about marine animals. + + Potential queries on the text: + - What kind of fish lives in shallow coral reefs? + - What communication methods do dolphins use within their pods? + - How do anglerfish adapt to the deep ocean's darkness? + - What role do coral reefs play in marine ecosystems? + """ + + return TestData._get_text_file("marine_animals.txt") + + @staticmethod + def nebula_voyager_text() -> str: + """ + Returns: + A story of approx 2500 words about a theoretical spaceship. + Includes very technical names and terms that can be + difficult for standard retrieval systems. + + Potential queries on the text: + - Who developed the Astroflux Navigator? + - Describe the phenomena known as "Chrono-spatial Echoes"? + - What challenges does the Quantum Opacity phenomenon present to the crew of the Nebula Voyager? + - How does the Bioquantum Array aid Dr. Nyx Moreau in studying the Nebuloforms within Orion’s Whisper? + - What are Xenospheric Particulates? + - What is the significance of the Cryptolingual Synthesizer used by Jiro Takashi, and how does it function? + """ + return TestData._get_text_file("nebula_voyager.txt") diff --git a/libs/tests-utils/ragstack_tests_utils/test_data/marine_animals.txt b/libs/tests-utils/ragstack_tests_utils/test_data/marine_animals.txt new file mode 100644 index 000000000..5bdbb0d1c --- /dev/null +++ b/libs/tests-utils/ragstack_tests_utils/test_data/marine_animals.txt @@ -0,0 +1,11 @@ + Marine animals inhabit some of the most diverse environments on our planet. From the shallow coral reefs teeming with colorful fish to the dark depths of the ocean where mysterious creatures lurk, the marine world is full of wonder and mystery. + + One of the most iconic marine animals is the dolphin, known for its intelligence, social behavior, and playful antics. Dolphins communicate with each other using a variety of clicks, whistles, and body movements. They live in social groups called pods and often display behaviors that suggest a high level of social complexity, including cooperation for hunting and care for the injured or sick members of their pod. + + Another remarkable creature is the sea turtle, which navigates vast oceans to return to the very beaches where they were born to lay their eggs. These ancient mariners are true survivors, having roamed the oceans for millions of years. However, they face numerous threats from human activities, including plastic pollution, accidental capture in fishing gear, and the loss of nesting beaches due to climate change. + + Deep in the ocean's abyss, where sunlight fails to penetrate, live the bioluminescent creatures, such as the anglerfish. These eerie-looking fish use a natural light produced by bacteria in their lure to attract prey in the pitch-black waters. 
This fascinating adaptation is a perfect example of the unique strategies marine animals have evolved to survive in the ocean's different layers. + + Coral reefs, often referred to as the "rainforests of the sea," are another critical habitat. They are bustling with life and serve as a vital ecosystem for many marine species. Corals themselves are fascinating organisms. They are made up of thousands of tiny creatures called polyps and have a symbiotic relationship with algae, which provides them with food through photosynthesis. + + The diversity of marine life is vast, and each species plays a crucial role in the ocean's ecosystem. From the microscopic plankton that form the base of the oceanic food chain to the majestic blue whale, the largest animal to have ever lived on Earth, marine animals are an integral part of our world's biodiversity. Protecting these creatures and their habitats is essential for maintaining the health of our oceans and the planet as a whole. diff --git a/libs/tests-utils/ragstack_tests_utils/test_data/nebula_voyager.txt b/libs/tests-utils/ragstack_tests_utils/test_data/nebula_voyager.txt new file mode 100644 index 000000000..aee8ec02f --- /dev/null +++ b/libs/tests-utils/ragstack_tests_utils/test_data/nebula_voyager.txt @@ -0,0 +1,135 @@ +In the cold vastness of space, the Nebula Voyager glides silently towards its destination: Orion's Whisper. This nebula, known for its vibrant colors and mysterious energies, sits on the outer edges of human-explored space. Commanded by Captain Elara Thorne, a veteran of interstellar expeditions, the Voyager is equipped with the most advanced technology designed specifically for this mission. Among these, the primary tool is the Chrono-spatial Echo Locator (CEL), a device capable of detecting and analyzing the unusual energy patterns known as "Chrono-spatial Echoes." + +Chrono-spatial Echoes are anomalous energy patterns that have puzzled scientists for decades. They exhibit non-linear temporal fluctuations—appearing and disappearing across various timelines in a manner that defies the known laws of quantum causality. Traditional theories of time and space suggest that events occur in a linear sequence, but these echoes behave as if time itself loops or folds unpredictably. Such phenomena not only challenge the foundational principles of quantum mechanics but also hint at the existence of previously unimagined cosmic processes. + +As the Voyager approaches the nebula, Captain Thorne reviews the mission parameters with her crew in the holo-conference room. The walls of the room display real-time images of deep space, the nebula looming larger with each passing moment. + +"Team, our primary goal here is to understand the source and mechanics of the Chrono-spatial Echoes. These patterns are not just scientific anomalies; they could be keys to new forms of energy or even new models of the universe," Captain Thorne explains, her voice steady and clear. + +Dr. Soren Petrovich, the chief engineer, interjects with a technical overview. "The CEL has been calibrated to detect fluctuations within the quantum field that are as minute as one part in ten billion. This sensitivity is crucial, as the echoes appear to interact weakly with conventional matter and energy." + +As the crew listens intently, Science Officer Dr. Nyx Moreau adds, "Not only are these echoes elusive, but their temporal irregularities could potentially allow us to observe events out of sequence. Imagine the implications for understanding causality and time!" 
+ +With the crew briefed, the Voyager enters the nebula. The ship's exterior cameras and sensors capture stunning visuals of gas clouds swirling in a kaleidoscope of colors, illuminated by distant stars. However, the beauty of the nebula is a stark contrast to the dangers it presents. The density and activity within make navigation perilous, requiring constant adjustments and monitoring. + +As they delve deeper, the first set of echoes are detected. The CEL pings rapidly, signaling the detection of an echo sequence. The data appears on the main screen, a complex array of signals represented in multidimensional graphs and charts. + +"Initial analysis indicates that these echoes are not just reflections of energy; they seem to have their own distinct pattern of behavior," reports Dr. Moreau, her eyes scanning the data intently. + +Captain Thorne decides to deploy a series of quantum probes to track the echoes more accurately. These probes, equipped with temporal sensors and quantum entanglement communicators, can relay information back to the Voyager without the delays normally caused by the vast distances of space. + +The crew watches as the probes disappear into the nebula's depths. Each probe sends back streams of data, painting a clearer picture of the nebula's internal structure and the nature of the echoes. + +"This is unprecedented," murmurs Dr. Petrovich, analyzing the incoming data. "The echoes are not just anomalies—they're consistent, repeatable. It's as if we're looking at a shadow of time itself, cast by events that happen out of our conventional understanding." + +As the investigation progresses, the crew of the Nebula Voyager finds themselves on the brink of a revolutionary discovery. What began as a mission to explore a nebula could very well end up rewriting the rules of physics, propelling humanity into a new era of cosmic understanding. The data collected here promises to hold secrets that go beyond the mere scientific curiosity, possibly unraveling new truths about the universe itself. + +As the Nebula Voyager delves deeper into the heart of Orion’s Whisper, the cosmic environment grows increasingly complex and perilous. The nebula is filled with what the crew has termed "Stellar Labyrinths," vast regions of chaotic celestial formations. These zones are known for their high-density gravitonic waves, which distort gravitational fields and make standard space navigation systems virtually useless. It's within this challenging scenario that the Astroflux Navigator, a masterpiece of engineering crafted by Chief Engineer Dr. Soren Petrovich, becomes crucial. + +The Astroflux Navigator is not a typical space navigation device. It is an advanced, hyper-dimensional navigation system engineered specifically to handle the unpredictable conditions within nebulae like Orion’s Whisper. This system integrates a series of quantum computers and sensors that map the gravitational anomalies and translate them into navigable data. These computers use a form of advanced algorithmic processing called "Quantum Coherence Mapping," which synthesizes real-time spatial data with theoretical models of space-time anomalies. + +Dr. Petrovich, a man whose life’s work revolves around defying the conventional limitations of space travel, explains the system to the crew with palpable enthusiasm. "The Astroflux Navigator doesn’t just see the universe as we do. 
It perceives layers of quantum fluctuations invisible to traditional instruments, interpreting disruptions in the gravitational field as navigable pathways." + +As the Voyager approaches the first of these Stellar Labyrinths, the effectiveness of the Astroflux Navigator is immediately put to the test. Traditional sensors begin to show conflicting data, a symptom of "Quantum Opacity," where the dense energy of the nebula interferes with the ability to make accurate space-time measurements. Quantum Opacity is not merely a navigational hazard; it represents a breakdown in the fundamental ways humanity understands and traverses space. + +Navigating through a Stellar Labyrinth is akin to sailing through a stormy sea where the waves are made of the fabric of space itself, and the wind is the cosmic energy that flows through the nebula. Each wave of gravitonic disturbance could easily throw the ship off course or, worse, into a gravitational vortex. + +Captain Elara Thorne watches the data streaming in from the Astroflux Navigator. "Adjust heading to 47-mark-3. Increase power to forward shields. Let’s not get brushed aside by these gravitonic waves," she commands, her voice steady despite the increasing tension on the bridge. + +The crew's trust in the Astroflux Navigator is soon vindicated as they maneuver through the labyrinth with a precision that would be unimaginable with conventional systems. The Navigator predicts gravitational anomalies before they become apparent, allowing the Voyager to weave through the cosmic maze. + +The journey through the Stellar Labyrinths is taxing for both the crew and the ship. As they progress, the nebula reveals its secrets slowly, showcasing bizarre phenomena that challenge their understanding of physics. Pockets of anti-time, areas where time seems to flow backward relative to the universe outside the nebula, and spatial folds, where space itself seems to double back on itself, are just a few of the wonders and dangers they encounter. + +Dr. Nyx Moreau, the ship’s science officer, is particularly fascinated by these discoveries. "Each of these anomalies could represent a textbook’s worth of new physics. The implications for our understanding of the universe are profound," she notes, recording the observations that flood in from the various sensors and probes. + +Throughout this challenging navigation, the crew remains resilient, guided by the steady hand of Captain Thorne and the genius of Dr. Petrovich’s Astroflux Navigator. Their voyage through the Stellar Labyrinths not only tests their skills and technology but also deepens their bond as a team, each member driven by a shared quest for knowledge and the thrill of discovery. As they emerge from the labyrinth, they are not the same crew that entered; they are pioneers on the threshold of new cosmic revelations. + +As the Nebula Voyager emerges from the treacherous Stellar Labyrinths, the crew's scientific curiosity is immediately reignited by a new discovery: Xenospheric Particulates. Dr. Nyx Moreau, the Science Officer, observes anomalous readings on her sensors—fleeting blips of quantum irregularities scattered throughout the nebula. Unlike anything previously cataloged, these microscopic entities, which Dr. Moreau dubs "Nebuloforms," exhibit the unique ability to alter their phase state in response to observational stimuli. + +Nebuloforms are not merely physical entities; they challenge the conventional boundaries of biology and quantum physics.
Existing in a state of quantum flux, they seem to flicker at the edges of detectability, responding dynamically to the act of observation itself—a phenomenon that recalls the observer effect in quantum mechanics, where the state of a particle is affected by its observation. + +Recognizing the significance of these findings, Dr. Moreau convenes a meeting with the crew to discuss the implications. "These particulates aren’t just unusual; they might represent a fundamentally new form of life, or at least, a new state of matter that behaves like life," she explains, her voice filled with excitement and wonder. + +To study these enigmatic particles more closely, Dr. Moreau proposes the deployment of the Bioquantum Array, an advanced suite of instruments specifically designed for the interaction and analysis of quantum biological systems. The Bioquantum Array, developed through a collaborative effort between astrobiologists and quantum physicists, uses a combination of high-resolution quantum coherence imaging and bio-molecular resonance mapping to decode the complex molecular quantum states of these particulates. + +The deployment of the Bioquantum Array is a delicate operation, requiring precise calibration to handle the ethereal nature of the Nebuloforms. The Array sends out modulated quantum pulses that resonate with the particulates, allowing the crew to observe and record their behavior without inducing phase collapse—a critical challenge when dealing with quantum-sensitive entities. + +As the Bioquantum Array hums to life, the data begins to flood in. The Nebuloforms, when stimulated by the Array, reveal their ability to shift between different quantum states, displaying a spectrum of colors and patterns that suggest a complex internal structure. "It's almost as if they're communicating or reacting to our probes," Dr. Moreau observes, intrigued by the display. + +This interaction leads Dr. Moreau to hypothesize that the Nebuloforms could be the foundation of an ecosystem that operates on principles different from any known biology. They seem to exist in a superposition of states, each configuration potentially corresponding to different biological functions—reproduction, energy conversion, perhaps even cognition. + +The implications of this discovery are profound. If these Nebuloforms are indeed a form of life, they represent a biology that operates not just at the chemical level, but fundamentally at the quantum level. This could mean that life in the universe might be far more varied and widespread than previously thought, existing in forms and places unimaginable to conventional science. + +This hypothesis energizes the entire crew, each member aware that they are on the frontier of a new biological science. Dr. Moreau continues to lead the exploration, her reports combining detailed scientific analysis with philosophical reflections on the nature of life and observation. The data collected is not just a series of measurements; it is a dialogue with an unknown form of life, offering a glimpse into the vast potential diversity of the cosmos. + +The journey of the Nebula Voyager into the depths of Orion's Whisper becomes a journey into the unknown territories of life itself, with each discovery challenging the crew's understanding of nature and pushing the boundaries of science. As they document and study these quantum life forms, they not only uncover secrets of the universe but also reflect on the broader question of what it means to be alive. 
+ +Within the confines of his communications suite aboard the Nebula Voyager, Communications Specialist Jiro Takashi is ensconced amidst a labyrinth of monitors and interfacing systems. His main focus now lies on the Galacto-semiotic Sequences—complex, structured emissions that the nebula Orion’s Whisper has been projecting since the Voyager's arrival. These sequences are not merely random noise or cosmic background interference; they possess a structure and repetition that hint at intentional communication, though their origin and purpose remain enigmatic. + +Jiro, with his background in xenolinguistics and cryptographic analysis, is tasked with one of the most pivotal roles in this mission: to decode and translate these sequences. To aid in this monumental task, he utilizes the Cryptolingual Synthesizer, a sophisticated piece of technology capable of analyzing, decoding, and simulating non-human linguistic structures. This device uses a combination of quantum computing and neural network algorithms to model potential linguistic frameworks that could match the patterns observed. + +The Cryptolingual Synthesizer operates by first capturing the Galacto-semiotic Sequences through a high-fidelity quantum entanglement receiver, which ensures that the subtlest nuances of the emissions are preserved. Once captured, the sequences are processed through a series of analytical algorithms designed to identify repetitive patterns and structural symmetries—hallmarks of language. + +Jiro watches the data streams flow across his screens, a kaleidoscope of symbols and waveforms that represent the raw language of the cosmos. “If these patterns are indeed a form of communication, they could be expressions of thought, attempts at contact, or something entirely beyond our current understanding,” he muses aloud, aware that his efforts could potentially mark the first human contact with a non-human intelligence. + +As the synthesizer works through countless computational cycles, Jiro cross-references known linguistic structures from hundreds of Earth languages and several theoretical extraterrestrial models. He hypothesizes that the sequences might represent a form of high-dimensional communication, where information is conveyed not just through linear symbols but through multi-layered quantum states that interact with each other in complex ways. + +The challenge is not merely technical but also conceptual. The idea of high-dimensional communication suggests that the sender—whatever it may be—perceives and interacts with the universe in ways fundamentally different from humans. This form of communication could involve concepts like time, causality, and existence being intertwined and expressed simultaneously. + +Days turn into weeks as Jiro and the Cryptolingual Synthesizer work tirelessly, slowly making progress. The synthesizer begins to isolate what appear to be key lexical and grammatical structures within the sequences, suggesting that they are not random but possess a consistent internal logic. + +Through iterative modeling and adjustments, Jiro starts to translate fragments of the Galacto-semiotic Sequences. The initial translations are fragmented and abstract, full of metaphors relating to quantum states, gravitational waves, and cosmic phenomena—phrases like "the dance of gravity’s embrace" or "the echo of the universe's breath." 
+ +As he delves deeper into the translation process, Jiro finds that understanding the sequences requires not only linguistic intuition but also a philosophical rethinking of communication itself. What does it mean to "speak" in the language of the universe? How does one interpret phrases that might encapsulate concepts alien to human experience? + +The work is slow and often frustrating, but every small breakthrough brings a surge of excitement. The potential to unlock a truly universal language—one that might enable humans to communicate with other forms of intelligence across the cosmos—is a compelling motivator. Jiro's work, translating the mysterious Galacto-semiotic Sequences, stands on the precipice of expanding humanity’s place within the galactic community, promising a future where humans are not only explorers of space but also interpreters of its deepest, most ancient conversations. + +As the Nebula Voyager navigates the deeper regions of Orion's Whisper, the scientific breakthroughs and discoveries are momentarily overshadowed by an emergent crisis. The ship's primary operational system, known as the Voyager Core—a highly sophisticated quantum computing matrix that coordinates everything from life support to propulsion—begins to experience irregular disruptions. These disturbances are identified as "Interstellar Quantum Interference" (IQI), a phenomenon caused by intense quantum fluctuations within the nebula. + +IQI presents an unprecedented challenge. The quantum fluctuations intrude into the operational fabric of the Voyager Core, causing unpredictable malfunctions. Systems flicker without warning; navigational arrays recalibrate spontaneously, and life-support cycles exhibit uncharacteristic irregularities. These disruptions pose a direct threat not only to the mission but also to the safety of the entire crew. + +At the forefront of addressing this crisis is Dr. Lian Zhao, the ship’s chief medical officer, who quickly realizes that the quantum disturbances are having a peculiar effect on the crew themselves. Early symptoms are subtle and varied, ranging from transient bouts of disorientation to more severe neurological disruptions. Dr. Zhao, with her background in genetic medicine, suspects that the quantum interference is interacting with the crew's biogenetics—specifically, the quantum signatures that are part of the cellular makeup of every human. + +As Dr. Zhao investigates further, she discovers that the crew’s exposure to the nebula’s unique quantum energies, amplified by the IQI, has led to "Genetic Reverberations." These are cascading genetic anomalies triggered by the distorted quantum fields affecting the Voyager Core. In essence, the quantum fluctuations of the nebula interact with the intrinsic quantum fields within human DNA, leading to unpredictable genetic shifts. + +Recognizing the urgency of the situation, Dr. Zhao mobilizes her medical team to monitor the crew's health while collaborating with the engineering department to devise a mitigation strategy. She initiates a ship-wide bio-scan to assess the extent of genetic alterations, using the ship’s advanced biometric monitoring systems to track any deviations from each crew member’s baseline genetic data. + +The results are troubling yet fascinating. Some crew members show signs of accelerated epigenetic modifications, changes that normally occur over generations now happening in mere days. 
Others display fluctuations in their gene expression patterns, potentially altering everything from physiological functions to more subtle psychological behaviors. + +To counteract these Genetic Reverberations, Dr. Zhao proposes the use of a "Quantum Genomic Stabilizer" (QGS), a theoretical device that could recalibrate the crew's quantum genetic signatures to their original state. The engineering team, led by Dr. Soren Petrovich, springs into action to assemble the QGS, using a combination of quantum computing elements and biogenetic templates specific to each crew member. + +The stabilization process is tense, with each crew member undergoing a personalized quantum recalibration. The procedure is not without its risks, involving delicate adjustments to the very quantum foundation of biological existence. As the QGS activates, it envelops each crew member in a cocoon of quantum-corrective energy, designed to shield and then methodically restore their original genetic architecture. + +Throughout this crisis, the bond among the crew members strengthens. They face these challenges not just as colleagues but as a family bound by a common ordeal. Dr. Zhao and her team work tirelessly, monitoring the effects of the QGS, adjusting its parameters, and ensuring that no permanent genetic alterations take root. + +After several intense hours, the QGS proves effective. The crew’s genetic markers begin to stabilize, returning to their pre-interference state. The relief is palpable across the Voyager, but the incident leaves a lasting impact. It serves as a stark reminder of the unpredictable nature of space exploration, particularly within such a mysterious and quantumly active environment as Orion’s Whisper. + +As stability returns, so does the crew’s resolve to continue their mission. They press on, driven by a renewed sense of vulnerability and a profound respect for the forces they seek to understand and explore. The experience of facing and overcoming the Genetic Reverberations not only deepens their commitment to the mission but also to each other, underscoring the inherent risks and rewards of venturing into the unknown. + +As the Nebula Voyager continues its journey through the depths of Orion's Whisper, the crew confronts not only the physical and technological challenges posed by the nebula's intricate environments but also the profound philosophical questions that arise from their discoveries. The nature of existence, the fabric of reality, and the potential for intelligence far beyond human comprehension come into stark relief against the backdrop of this uncharted celestial phenomenon. + +The scientific breakthroughs aboard the Voyager—such as the interactions with Xenospheric Particulates, the decoding of Galacto-semiotic Sequences, and the handling of Genetic Reverberations—have all pointed to a universe much more complex and intricately woven than previously imagined. Each discovery peels back a layer of cosmic mystery, revealing not just new data, but also new enigmas. + +Captain Elara Thorne finds herself increasingly pondering the broader implications of their mission. "What does it mean for us, as a species, to find that the building blocks of life might be quantum in nature, or that communication might exist in dimensions beyond our perception?" she muses during a crew meeting. These questions are not just rhetorical; they shape the crew's understanding of their place in the universe. + +Science Officer Dr. 
Nyx Moreau, whose work with the Nebuloforms has revealed the potential for life forms based on quantum states, often discusses these questions with the crew. "If life can exist in the quantum foam of space, essentially as a part of the fabric of the universe itself, what does that say about consciousness? Could intelligence be as ubiquitous as matter, woven into the very structure of reality?" Her questions challenge the crew’s foundational beliefs about life and intelligence. + +These discussions often take place in the ship's communal areas, where the crew gathers to share meals and thoughts. It’s here that Communications Specialist Jiro Takashi, who has spent countless hours attempting to translate the nebula's emissions, shares his own reflections. "If we're right, and these Galacto-semiotic Sequences are a form of communication, it suggests that intelligence and intention might not be limited to forms of life we understand. Perhaps, in a way, the universe itself might be attempting to communicate." + +Dr. Lian Zhao, who has seen firsthand the impact of the nebula’s energy on human genetics, contributes a medical and biological perspective to these philosophical discussions. "The changes we observed in our own DNA suggest that what we consider 'human' might be more fluid than we thought. If our genetic makeup can interact so dynamically with the universe, are we, in some small way, also a reflection of the cosmos?" + +These discussions often delve into the idea of interconnectedness, a concept that the crew begins to view not just as a philosophical or spiritual notion, but as a potential scientific truth: that all elements of the universe—from the smallest particles to the largest celestial structures—might be more directly connected than ever conceived. + +As the mission progresses, these philosophical explorations become as much a part of the journey as the physical exploration of Orion’s Whisper. The crew’s growing understanding of the nebula’s complexities and the mysterious phenomena they encounter broadens their conception of existence itself. They grapple with the implications of their findings not only for human knowledge but for human identity. + +Navigating through Orion’s Whisper thus transforms the crew of the Nebula Voyager. They began as explorers of space, driven by the human quest for knowledge. But as they delve deeper, they become explorers of possibility, pioneers at the edge of humanity’s understanding, looking into the unknown not just to see what is there but to question what it means for everything we thought we knew about the universe and our place within it. This journey, filled with wonders and mysteries, leaves each crew member changed, expanded by the vastness of what they encounter and the depth of the questions they are compelled to ask. + +As the Nebula Voyager prepares to conclude its mission within the enigmatic depths of Orion’s Whisper, the crew gathers in the observation deck, watching as the nebula's vibrant colors swirl outside the panoramic windows. Each crew member carries the weight and wonder of their experiences, forever altered by what they've encountered. + +Captain Elara Thorne stands before her team, her expression contemplative. "This mission has shown us wonders few humans have ever seen and presented us with questions that we may spend lifetimes attempting to answer," she begins.
"We've seen that the universe is more intricate and more mysterious than our wildest imaginings and that our pursuit of understanding it is an unending journey." + +She continues, her voice firm yet inspired, "We return not just with data, but with a new understanding of our place in the cosmos. We’ve learned that the universe speaks in many languages—of quantum, of biology, of energy. It’s our task now to continue this conversation, to listen and perhaps, one day, to truly understand." + +The crew nods, a shared sense of resolve and awe binding them together. They are no longer just witnesses to the universe’s complexity but participants in its unraveling mystery. As the Nebula Voyager turns towards home, its trajectory marks not just a path through space but a continued journey into the vast, uncharted territories of knowledge and existence. + +The mission of the Nebula Voyager may be ending, but the journey for its crew and for humanity—armed now with new questions and a broader vision—has just begun. As they continue to ponder and decode the secrets of Orion’s Whisper, their discoveries promise to reshape human understanding of life, intelligence, and the fabric of the cosmos itself.