Commit

optimized zero vector flushing

swetavooda committed Apr 11, 2024
1 parent da8c4a2 commit 29138aa
Showing 8 changed files with 176 additions and 69 deletions.
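
In summary, zero vectors no longer abort an insert or index build: validate_vector_nonzero() now returns a bool and reports a WARNING instead of an ERROR, tuple_get_pinecone_vector() returns NULL for a zero vector, the build and flush paths skip NULL vectors, and the buffer scan in pinecone_scan.c is additionally bounded by pinecone.max_buffer_scan. A minimal sketch of the resulting control flow, paraphrased from the hunks below (signatures and report details are abbreviated, so this is not the exact committed code):

/* pinecone_validate.c: warn instead of erroring on an all-zero vector */
bool validate_vector_nonzero(Vector *vector) {
    if (vector_eq_zero_internal(vector)) {
        ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                          errmsg("Invalid vector: zero vector")));
        return false;   /* was ereport(ERROR, ...) before this commit */
    }
    return true;
}

/* pinecone_utils.c: a zero vector produces no JSON payload */
cJSON *tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull, char *vector_id) {
    Vector *vector = DatumGetVector(values[0]);
    if (!validate_vector_nonzero(vector))
        return NULL;
    /* ... build the float array and metadata and return the json_vector, as before ... */
}

/* pinecone_build.c / pinecone_insert.c: callers skip a NULL vector instead of aborting */
json_vector = tuple_get_pinecone_vector(itup_desc, values, isnull, pinecone_id);
if (json_vector == NULL)
    return;   /* in FlushToPinecone: append to json_vectors only when non-NULL */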
2 changes: 1 addition & 1 deletion src/pinecone/pinecone.h
@@ -203,7 +203,7 @@ IndexBulkDeleteResult *no_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteRe
void pinecone_spec_validator(const char *spec);
void pinecone_host_validator(const char *spec);
void validate_api_key(void);
void validate_vector_nonzero(Vector* vector);
bool validate_vector_nonzero(Vector* vector);
bool no_validate(Oid opclassoid);

// utils

1 change: 1 addition & 0 deletions src/pinecone/pinecone_build.c
@@ -153,6 +153,7 @@ void pinecone_build_callback(Relation index, ItemPointer tid, Datum *values, boo
cJSON *json_vector;
char* pinecone_id = pinecone_id_from_heap_tid(*tid);
json_vector = tuple_get_pinecone_vector(itup_desc, values, isnull, pinecone_id);
if (json_vector == NULL) return;
cJSON_AddItemToArray(buildstate->json_vectors, json_vector);
if (cJSON_GetArraySize(buildstate->json_vectors) >= PINECONE_BATCH_SIZE) {
pinecone_bulk_upsert(pinecone_api_key, buildstate->host, buildstate->json_vectors, pinecone_vectors_per_request);

6 changes: 2 additions & 4 deletions src/pinecone/pinecone_insert.c
@@ -156,15 +156,12 @@ bool AppendBufferTupleInCtx(Relation index, Datum *values, bool *isnull, ItemPoi
MemoryContext oldCtx;
MemoryContext insertCtx;
bool checkpoint_created;
Vector* vector;
// use a memory context because index_form_tuple can allocate
insertCtx = AllocSetContextCreate(CurrentMemoryContext,
"Pinecone insert tuple temporary context",
ALLOCSET_DEFAULT_SIZES);
oldCtx = MemoryContextSwitchTo(insertCtx);

vector = DatumGetVector(values[0]);
validate_vector_nonzero(vector);

checkpoint_created = AppendBufferTuple(index, values, isnull, heap_tid, heapRel);
MemoryContextSwitchTo(oldCtx);
@@ -286,7 +283,8 @@ void FlushToPinecone(Relation index)

vector_id = pinecone_id_from_heap_tid(buffer_tup.tid);
json_vector = tuple_get_pinecone_vector(index->rd_att, index_values, index_isnull, vector_id);
cJSON_AddItemToArray(json_vectors, json_vector);
if (json_vector != NULL)
cJSON_AddItemToArray(json_vectors, json_vector);
}
}

2 changes: 1 addition & 1 deletion src/pinecone/pinecone_scan.c
@@ -315,7 +315,7 @@ void load_buffer_into_sort(Relation index, PineconeScanOpaque so, Datum query_da
page = BufferGetPage(buf);

// add all tuples on the page to the sortstate
for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page); offno = OffsetNumberNext(offno)) {
for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page) && n_sortedtuple < pinecone_max_buffer_scan; offno = OffsetNumberNext(offno)) {
// get the tid and the vector from the heap tuple
ItemId itemid;
Item item;

26 changes: 4 additions & 22 deletions src/pinecone/pinecone_utils.c
@@ -12,8 +12,11 @@ cJSON* tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull
cJSON *metadata = cJSON_CreateObject();
Vector *vector;
cJSON *json_values;
bool isNonZero;

vector = DatumGetVector(values[0]);
validate_vector_nonzero(vector);
isNonZero = validate_vector_nonzero(vector);
if (!isNonZero) return NULL;
json_values = cJSON_CreateFloatArray(vector->x, vector->dim);
// prepare metadata
for (int i = 1; i < tup_desc->natts; i++) // skip the first column which is the vector
@@ -52,27 +55,6 @@
return json_vector;
}

cJSON* index_tuple_get_pinecone_vector(Relation index, IndexTuple itup) {
int natts = index->rd_att->natts;
Datum *itup_values = (Datum *) palloc(sizeof(Datum) * natts);
bool *itup_isnull = (bool *) palloc(sizeof(bool) * natts);
TupleDesc itup_desc = index->rd_att;
char* vector_id;
index_deform_tuple(itup, itup_desc, itup_values, itup_isnull);
vector_id = pinecone_id_from_heap_tid(itup->t_tid);
return tuple_get_pinecone_vector(itup_desc, itup_values, itup_isnull, vector_id);
}

cJSON* heap_tuple_get_pinecone_vector(Relation heap, HeapTuple htup) {
int natts = heap->rd_att->natts;
Datum *htup_values = (Datum *) palloc(sizeof(Datum) * natts);
bool *htup_isnull = (bool *) palloc(sizeof(bool) * natts);
TupleDesc htup_desc = heap->rd_att;
char* vector_id;
heap_deform_tuple(htup, htup_desc, htup_values, htup_isnull);
vector_id = pinecone_id_from_heap_tid(htup->t_self);
return tuple_get_pinecone_vector(htup_desc, htup_values, htup_isnull, vector_id);
}

ItemPointerData pinecone_id_get_heap_tid(char *id)
{

6 changes: 4 additions & 2 deletions src/pinecone/pinecone_validate.c
@@ -12,12 +12,14 @@ void validate_api_key(void) {
}
}

void validate_vector_nonzero(Vector* vector) {
bool validate_vector_nonzero(Vector* vector) {
if (vector_eq_zero_internal(vector)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("Invalid vector: zero vector"),
errhint("Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.")));
return false;
}
return true;
}


117 changes: 97 additions & 20 deletions test/expected/pinecone_zero_vector_insert.out
@@ -7,6 +7,7 @@ SET client_min_messages = 'notice';
-- flush each vector individually
SET pinecone.vectors_per_request = 1;
SET pinecone.requests_per_batch = 1;
SET pinecone.max_buffer_scan = 0;
-- disable flat scan to force use of the index
SET enable_seqscan = off;
-- CREATE TABLE
@@ -35,29 +36,105 @@ VALUES ('https://api.pinecone.io/indexes', 'POST', $${
}$$);
-- mock describe index stats
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":0}');
INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
-- create index after insering 0 vector - Throws an error
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
ERROR: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
-- Truncate the table to remove the values for creating an index successfully
TRUNCATE TABLE t;
VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":2}');
-- mock upsert
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert', 'POST', '{"upsertedCount":1}');
-- mock query
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);
-- mock fetch
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/fetch', 'GET', $${
"code": 3,
"message": "No IDs provided for fetch query",
"details": []
}$$);
-- create index
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert',
'{ "vectors": [{
"id": "000000000001",
"values": [100, 1, 1],
"metadata": {
}
}]
}',
'{"upsertedCount":1}'
);
-- insert vectors: a warning is raised when the zero vector is flushed
INSERT INTO t (id, val) VALUES (1, '[100,1,1]');
INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
ERROR: Invalid vector: zero vector
INSERT INTO t (id, val) VALUES (3, '[10120,76,1]');
WARNING: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
WARNING: No vectors to flush to pinecone
-- returns only id = 1 as it is flushed to pinecone (the zero vector is not flushed to pinecone)
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
NOTICE: Buffer is too large
HINT: There are 0 tuples in the buffer that have not yet been flushed to pinecone and 2 tuples in pinecone that are not yet live. You may want to consider flushing the buffer.
NOTICE: Reached max local scan
id | val
----+-----------
1 | [100,1,1]
(1 row)

SELECT * FROM t;
id | val
----+--------------
1 | [100,1,1]
2 | [0,0,0]
3 | [10120,76,1]
(3 rows)

DROP INDEX i2;
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
id | val
----+--------------
2 | [0,0,0]
1 | [100,1,1]
3 | [10120,76,1]
(3 rows)

DELETE FROM pinecone_mock
WHERE url_prefix = 'https://fakehost/query' AND method = 'POST';
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
},
{
"id": "000000000003",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);
-- displays warning while flushing zero vector to pinecone
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
WARNING: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
NOTICE: Reached max local scan
id | val
----+--------------
1 | [100,1,1]
3 | [10120,76,1]
(2 rows)

SELECT * FROM t;
id | val
----+--------------
1 | [100,1,1]
2 | [0,0,0]
3 | [10120,76,1]
(3 rows)

DROP TABLE t;
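
Condensed from the expected output above, the user-visible behavior after this change (same table t and index i2, with the mock responses configured in this test; session output paraphrased, not a separate run):

INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
-- accepted locally; a WARNING (not an ERROR) is reported when the buffer is flushed:
--   WARNING:  Invalid vector: zero vector
--   HINT:     Pinecone insists that dense vectors cannot be zero in all dimensions. ...

SELECT * FROM t ORDER BY val <-> '[3,3,3]';  -- index scan: only the flushed, non-zero vectors
SELECT * FROM t;                             -- still returns every row, including the zero vector
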
85 changes: 66 additions & 19 deletions test/sql/pinecone_zero_vector_insert.sql
@@ -7,6 +7,8 @@ SET client_min_messages = 'notice';
-- flush each vector individually
SET pinecone.vectors_per_request = 1;
SET pinecone.requests_per_batch = 1;
SET pinecone.max_buffer_scan = 0;

-- disable flat scan to force use of the index
SET enable_seqscan = off;
-- CREATE TABLE
@@ -36,33 +38,78 @@ VALUES ('https://api.pinecone.io/indexes', 'POST', $${

-- mock describe index stats
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":0}');

VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":2}');

INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
-- mock upsert
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert', 'POST', '{"upsertedCount":1}');

-- create index after insering 0 vector - Throws an error
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
-- mock query
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);

-- Truncate the table to remove the values for creating an index successfully
TRUNCATE TABLE t;
-- mock fetch
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/fetch', 'GET', $${
"code": 3,
"message": "No IDs provided for fetch query",
"details": []
}$$);

-- create index
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');

INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert',
'{ "vectors": [{
"id": "000000000001",
"values": [100, 1, 1],
"metadata": {
}
}]
}',
'{"upsertedCount":1}'
);

-- insert vectors: a warning is raised when the zero vector is flushed
INSERT INTO t (id, val) VALUES (1, '[100,1,1]');
INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
INSERT INTO t (id, val) VALUES (3, '[10120,76,1]');

-- returns only id = 1 as it is flushed to pinecone (the zero vector is not flushed to pinecone)
SELECT * FROM t ORDER BY val <-> '[3,3,3]';

SELECT * FROM t;

DROP INDEX i2;

SELECT * FROM t ORDER BY val <-> '[3,3,3]';

DELETE FROM pinecone_mock
WHERE url_prefix = 'https://fakehost/query' AND method = 'POST';

INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
},
{
"id": "000000000003",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);

-- displays warning while flushing zero vector to pinecone
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');

SELECT * FROM t ORDER BY val <-> '[3,3,3]';
SELECT * FROM t;

DROP TABLE t;
