Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

Poor performance of building index for binary vectors #494

Open
yhmo opened this issue Oct 8, 2022 · 1 comment
Open

Poor performance of building index for binary vectors #494

yhmo opened this issue Oct 8, 2022 · 1 comment
Assignees

Comments

@yhmo
Copy link

yhmo commented Oct 8, 2022

Deploy milvus 2.1.4 and run this script:

import numpy as np
import pandas as pd
import time
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
from bitstring import BitArray
from random import randint

# Dimension passed to the binary fingerprint vector field.
fp_size = 8192

# Address of the Milvus server used by this script.
milvus_host = "127.0.0.1"
milvus_port = "19530"

collection_name = "emols_fp_1024_test"

print("*** Connect to milvus ***")
connections.connect("default", host=milvus_host, port=milvus_port)

collection_exists = utility.has_collection(collection_name)
print(f"Does collection exist in Milvus: {collection_exists}")

if collection_exists:
    # Open the collection that is already present on the server.
    ligands_collection = Collection(collection_name)
    print(ligands_collection)
else:
    print("*** Create collection ***")
    # Auto-generated INT64 primary key.
    id_field = FieldSchema(
        name="id", dtype=DataType.INT64, is_primary=True, auto_id=True
    )
    smiles_field = FieldSchema(
        name="smiles", dtype=DataType.VARCHAR, max_length=200
    )
    fingerprint_field = FieldSchema(
        name="morgan_fingerprint",
        dtype=DataType.BINARY_VECTOR,
        dim=fp_size,
        is_primary=False,
    )
    schema = CollectionSchema(
        [id_field, smiles_field, fingerprint_field], collection_name
    )
    ligands_collection = Collection(
        collection_name, schema, consistency_level="Strong"
    )

    print("Collection created")

# Number of rows sent to Milvus per insert call; 240 batches are inserted.
chunk_size = 100000
smiles = ["test_string"] * chunk_size
# Each character of the '0'/'1' string built below is encoded as one byte,
# so fp_size // 8 characters yield the fp_size // 8 bytes that a binary
# vector of dimension fp_size occupies (assumes Milvus packs 8 dims per byte).
fp_chars = fp_size // 8
for i in range(0, 240):
    print("i", i)
    # One random fingerprint per batch; every row of the batch shares it.
    fp_text = BitArray([randint(0, 1) for _ in range(fp_chars)]).bin
    # Encode once and repeat the result: the rows are identical, so encoding
    # the same string chunk_size times was loop-invariant work.
    fps_bytes = [bytes(fp_text, encoding='utf-8')] * chunk_size
    ligands_collection.insert([smiles, fps_bytes])

print("*** Load collection ***")
ligands_collection.load()

print("*** Create index TANIMOTO BIN_IVF_FLAT ***")
# Request a BIN_IVF_FLAT index with the TANIMOTO metric on the fingerprint
# field, using 1024 as the nlist parameter.
ligands_collection.create_index(
    "morgan_fingerprint",
    {
        "index_type": "BIN_IVF_FLAT",
        "metric_type": "TANIMOTO",
        "params": {"nlist": 1024},
    },
)
print("index finished")

# Print the query segment info and how many entries it contains.
segments = utility.get_query_segment_info(collection_name=collection_name)
print(segments)
print("segments count", len(segments))

# Poll the index-building progress every 3 seconds and report how long each
# change in the indexed row count took.
indexed_before = 0
# Start the timer now: starting from 0 made the first report print the number
# of seconds since the Unix epoch instead of an elapsed time.
segment_started = time.time()
while True:
    progress = utility.index_building_progress(collection_name=collection_name)
    print(progress)
    if progress['indexed_rows'] != indexed_before:
        now = time.time()
        print("index a segment cost", (now - segment_started), "s")
        segment_started = now
        indexed_before = progress['indexed_rows']

    # Stop once every row is reported as indexed instead of polling forever.
    if progress['indexed_rows'] >= progress['total_rows']:
        break

    time.sleep(3)

This script inserts 24M binary vectors (8192 dimensions) into Milvus and builds a BIN_IVF_FLAT index with nlist=1024.
After the insert finishes, 120 segments have been generated, and each segment has about 200,000 rows.
The indexing process is very slow: building an index for one segment takes about 5 minutes. It seems that index building for binary vectors is single-threaded (only one CPU is busy).

@cydrain
Copy link
Collaborator

cydrain commented Jun 19, 2023

/assign @cydrain

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants