Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tablestore vector store. #312

Merged
merged 4 commits into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion docs/config_guide_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ source = [PaiEas, OpenAI, DashScope]

## rag.index

vector_store.type = [FAISS, Hologres, ElasticSearch, AnalyticDB, Milvus]
vector_store.type = [FAISS, Hologres, ElasticSearch, AnalyticDB, Milvus, Tablestore]

目前, pai_rag 支持多种方式创建和存储索引。

Expand Down Expand Up @@ -153,6 +153,19 @@ vector_store.type = [FAISS, Hologres, ElasticSearch, AnalyticDB, Milvus]
database = "pairag"
collection = "pairag_collection"

如果 vector_store.type = "Tablestore", 需要提供如下信息:

[rag.index]
persist_path = "localdata/storage"

[rag.index.vector_store]
type = "Tablestore"
endpoint = ""
instance_name = ""
access_key_id = ""
access_key_secret = ""
table_name = "pai_rag"

该设置也可在网页中配置。

## rag.node_parser
Expand Down
15 changes: 14 additions & 1 deletion docs/config_guide_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ This setting is also available in webui.

## rag.index

vector_store.type = [FAISS, Hologres, ElasticSearch, AnalyticDB, Milvus]
vector_store.type = [FAISS, Hologres, ElasticSearch, AnalyticDB, Milvus, Tablestore]

Currently, pai_rag provides a variety of approaches for creating & storing indices.

Expand Down Expand Up @@ -154,6 +154,19 @@ If vector_store.type = "Milvus", you need to provide the following information:
database = "pairag"
collection = "pairag_collection"

If vector_store.type = "Tablestore", you need to provide the following information:

[rag.index]
persist_path = "localdata/storage"

[rag.index.vector_store]
type = "Tablestore"
endpoint = ""
instance_name = ""
access_key_id = ""
access_key_secret = ""
table_name = "pai_rag"

This setting is also available in webui.

## rag.node_parser
Expand Down
198 changes: 197 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ aspose-slides = "^24.10.0"
ultralytics = "8.3.43"
datasketch = "^1.6.5"
primp = "0.9.1"
tablestore = "^6.1.0"

[tool.poetry.scripts]
pai_rag = "pai_rag.main:run"
Expand Down
4 changes: 4 additions & 0 deletions src/pai_rag/app/web/event_listeners.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def change_vectordb_conn(vectordb_type):
milvus_visible = False
opensearch_visible = False
postgresql_visible = False
tablestore_visible = False
if vectordb_type.lower() == "analyticdb":
adb_visible = True
elif vectordb_type.lower() == "hologres":
Expand All @@ -175,6 +176,8 @@ def change_vectordb_conn(vectordb_type):
opensearch_visible = True
elif vectordb_type.lower() == "postgresql":
postgresql_visible = True
elif vectordb_type.lower() == "tablestore":
tablestore_visible = True

return [
gr.update(visible=adb_visible),
Expand All @@ -184,6 +187,7 @@ def change_vectordb_conn(vectordb_type):
gr.update(visible=milvus_visible),
gr.update(visible=opensearch_visible),
gr.update(visible=postgresql_visible),
gr.update(visible=tablestore_visible),
]


Expand Down
40 changes: 40 additions & 0 deletions src/pai_rag/app/web/index_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
MilvusVectorStoreConfig,
OpenSearchVectorStoreConfig,
PostgreSQLVectorStoreConfig,
TablestoreVectorStoreConfig,
)


Expand Down Expand Up @@ -72,6 +73,11 @@
"postgresql_username",
"postgresql_password",
"postgresql_table_name",
"tablestore_endpoint",
"tablestore_instance_name",
"tablestore_access_key_id",
"tablestore_access_key_secret",
"tablestore_table_name",
]


Expand Down Expand Up @@ -292,6 +298,26 @@ def index_to_components_settings(
{"value": ""},
]
)
if isinstance(vector_store_config, TablestoreVectorStoreConfig):
vector_component_settings.extend(
[
{"value": vector_store_config.endpoint},
{"value": vector_store_config.instance_name},
{"value": vector_store_config.access_key_id},
{"value": vector_store_config.access_key_secret},
{"value": vector_store_config.table_name},
]
)
else:
vector_component_settings.extend(
[
{"value": ""},
{"value": ""},
{"value": ""},
{"value": ""},
{"value": ""},
]
)
component_settings = [
*index_component_settings,
*embed_component_settings,
Expand Down Expand Up @@ -359,6 +385,11 @@ def components_to_index(
milvus_password,
milvus_database,
milvus_collection_name,
tablestore_endpoint,
tablestore_instance_name,
tablestore_access_key_id,
tablestore_access_key_secret,
tablestore_table_name,
**kwargs,
) -> RagIndexEntry:
if vector_index is None or vector_index.lower() == "new":
Expand Down Expand Up @@ -443,6 +474,15 @@ def components_to_index(
"username": postgresql_username,
"password": postgresql_password,
}
elif vectordb_type.lower() == "tablestore":
vector_store = {
"type": vectordb_type.lower(),
"endpoint": tablestore_endpoint,
"instance_name": tablestore_instance_name,
"access_key_id": tablestore_access_key_id,
"access_key_secret": tablestore_access_key_secret,
"table_name": tablestore_table_name,
}
else:
raise ValueError(f"Unknown vector db type: {vectordb_type}")

Expand Down
39 changes: 39 additions & 0 deletions src/pai_rag/app/web/tabs/vector_db_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def create_vector_db_panel() -> Dict[str, Any]:
"faiss",
"opensearch",
"postgresql",
"tablestore",
],
label="Which VectorStore do you want to use?",
elem_id="vectordb_type",
Expand Down Expand Up @@ -232,6 +233,37 @@ def create_vector_db_panel() -> Dict[str, Any]:
interactive=True,
)

with gr.Column(visible=(vectordb_type == "tablestore")) as tablestore_col:
with gr.Row():
tablestore_endpoint = gr.Textbox(
label="tablestore_endpoint",
elem_id="tablestore_endpoint",
interactive=True,
)
tablestore_instance_name = gr.Textbox(
label="tablestore_instance_name",
elem_id="tablestore_instance_name",
interactive=True,
)
with gr.Row():
tablestore_access_key_id = gr.Textbox(
label="tablestore_access_key_id",
elem_id="tablestore_access_key_id",
interactive=True,
)
tablestore_access_key_secret = gr.Textbox(
label="tablestore_access_key_secret",
type="password",
elem_id="tablestore_access_key_secret",
interactive=True,
)
with gr.Row():
tablestore_table_name = gr.Textbox(
label="tablestore_table_name",
elem_id="tablestore_table_name",
interactive=True,
)

vectordb_type.change(
fn=ev_listeners.change_vectordb_conn,
inputs=vectordb_type,
Expand All @@ -243,6 +275,7 @@ def create_vector_db_panel() -> Dict[str, Any]:
milvus_col,
opensearch_col,
postgresql_col,
tablestore_col,
],
)
db_related_elements = [
Expand Down Expand Up @@ -291,6 +324,12 @@ def create_vector_db_panel() -> Dict[str, Any]:
adb_account,
adb_account_password,
adb_namespace,
# tablestore
tablestore_endpoint,
tablestore_instance_name,
tablestore_access_key_id,
tablestore_access_key_secret,
tablestore_table_name,
]
components.extend(db_related_elements)
return db_related_elements, components_to_dict(components)
87 changes: 87 additions & 0 deletions src/pai_rag/integrations/index/pai/utils/vector_store_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@
import faiss
import os
import json
import tablestore
from llama_index.core.vector_stores.simple import DEFAULT_VECTOR_STORE, NAMESPACE_SEP
from llama_index.core.vector_stores.types import DEFAULT_PERSIST_FNAME
from elasticsearch.helpers.vectorstore import AsyncDenseVectorStrategy
from pai_rag.integrations.index.pai.utils.sparse_embed_function import (
BGEM3SparseEmbeddingFunction,
)
from pai_rag.integrations.vector_stores.tablestore.tablestore import (
TablestoreVectorStore,
)
from pai_rag.integrations.vector_stores.hologres.hologres import HologresVectorStore
from pai_rag.integrations.vector_stores.elasticsearch.my_elasticsearch import (
MyElasticsearchStore,
Expand All @@ -27,6 +31,7 @@
ElasticSearchVectorStoreConfig,
OpenSearchVectorStoreConfig,
HologresVectorStoreConfig,
TablestoreVectorStoreConfig,
)


Expand Down Expand Up @@ -57,6 +62,8 @@ def create_vector_store(
create_vector_store_func = create_postgresql
elif isinstance(vectordb_config, OpenSearchVectorStoreConfig):
create_vector_store_func = create_opensearch
elif isinstance(vectordb_config, TablestoreVectorStoreConfig):
create_vector_store_func = create_tablestore
else:
raise ValueError(f"Unknown vector store config {vectordb_config}.")

Expand Down Expand Up @@ -236,6 +243,86 @@ def create_opensearch(
return opensearch_store


def create_tablestore(
    tablestore_config: TablestoreVectorStoreConfig,
    embed_dims: int,
    is_image_store: bool = False,
):
    """Build a ``TablestoreVectorStore`` from config and prepare it for use.

    Args:
        tablestore_config: Connection settings (endpoint, instance name,
            access key id/secret and table name).
        embed_dims: Embedding dimension, forwarded as ``vector_dimension``.
        is_image_store: When true, ``"__image"`` is appended to the
            configured table name so the image store uses its own table.

    Returns:
        The constructed store, after ``create_table_if_not_exist()`` and
        ``create_search_index_if_not_exist()`` have been called on it.
    """
    base_name = tablestore_config.table_name
    target_table = f"{base_name}__image" if is_image_store else base_name

    def _sortable(field_name, field_type, **extra):
        # Indexed field that also supports sorting/aggregation.
        return tablestore.FieldSchema(
            field_name,
            field_type,
            index=True,
            enable_sort_and_agg=True,
            **extra,
        )

    def _text(field_name):
        # Indexed TEXT field; sorting/aggregation is disabled for these.
        return tablestore.FieldSchema(
            field_name,
            tablestore.FieldType.TEXT,
            index=True,
            enable_sort_and_agg=False,
        )

    def _date(field_name):
        # Each DATE field receives its own fresh list of accepted formats.
        return _sortable(
            field_name,
            tablestore.FieldType.DATE,
            date_formats=[
                "yyyy-MM-dd",
                "yyyy-MM-dd HH:mm",
                "yyyy-MM-dd HH:mm:ss",
                "yyyy-MM-dd HH:mm:ss.SSS",
            ],
        )

    # Non-vector metadata fields registered with the search index
    # (the original note: metadata mapping is used to filter non-vector fields).
    field_schemas = [
        _sortable("file_name", tablestore.FieldType.KEYWORD),
        _sortable("file_type", tablestore.FieldType.KEYWORD),
        _sortable("file_size", tablestore.FieldType.LONG),
        _text("file_path"),
        _text("image_url"),
        _date("creation_date"),
        _date("last_modified_date"),
    ]

    store = TablestoreVectorStore(
        endpoint=tablestore_config.endpoint,
        instance_name=tablestore_config.instance_name,
        access_key_id=tablestore_config.access_key_id,
        access_key_secret=tablestore_config.access_key_secret,
        table_name=target_table,
        index_name="pai_rag_vector_store_ots_index_v1",
        vector_dimension=embed_dims,
        metadata_mappings=field_schemas,
    )

    # Table first, then the search index on top of it.
    store.create_table_if_not_exist()
    store.create_search_index_if_not_exist()
    return store


def create_postgresql(
pg_config: PostgreSQLVectorStoreConfig,
embed_dims: int,
Expand Down
13 changes: 13 additions & 0 deletions src/pai_rag/integrations/index/pai/vector_store_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class SupportedVectorStoreType(str, Enum):
opensearch = "opensearch"
milvus = "milvus"
hologres = "hologres"
tablestore = "tablestore"


class VectorIndexRetrievalType(str, Enum):
Expand All @@ -25,6 +26,7 @@ class VectorIndexRetrievalType(str, Enum):
SupportedVectorStoreType.elasticsearch,
SupportedVectorStoreType.postgresql,
SupportedVectorStoreType.milvus,
SupportedVectorStoreType.tablestore,
]


Expand Down Expand Up @@ -99,6 +101,17 @@ class OpenSearchVectorStoreConfig(BaseVectorStoreConfig):
table_name: str


class TablestoreVectorStoreConfig(BaseVectorStoreConfig):
    # Settings for a Tablestore-backed vector store. All fields except
    # `type` have no default and must therefore be supplied.

    # Fixed to the `tablestore` member of SupportedVectorStoreType.
    # NOTE(review): presumably used to select this config class among the
    # other vector store configs - confirm against the config loader.
    type: Literal[
        SupportedVectorStoreType.tablestore
    ] = SupportedVectorStoreType.tablestore
    # Tablestore service endpoint.
    endpoint: str
    # Name of the Tablestore instance.
    instance_name: str
    # Access key id / secret pair used as credentials.
    access_key_id: str
    access_key_secret: str
    # Name of the table to use.
    table_name: str


class PostgreSQLVectorStoreConfig(BaseVectorStoreConfig):
type: Literal[
SupportedVectorStoreType.postgresql
Expand Down
Loading
Loading