diff --git a/src/pinecone/pinecone.h b/src/pinecone/pinecone.h
index c67d8019..484fd43f 100644
--- a/src/pinecone/pinecone.h
+++ b/src/pinecone/pinecone.h
@@ -203,7 +203,7 @@ IndexBulkDeleteResult *no_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteRe
 void pinecone_spec_validator(const char *spec);
 void pinecone_host_validator(const char *spec);
 void validate_api_key(void);
-void validate_vector_nonzero(Vector* vector);
+bool validate_vector_nonzero(Vector* vector);
 bool no_validate(Oid opclassoid);
 
 // utils
diff --git a/src/pinecone/pinecone_build.c b/src/pinecone/pinecone_build.c
index 1bc3f8dc..fbdeea01 100644
--- a/src/pinecone/pinecone_build.c
+++ b/src/pinecone/pinecone_build.c
@@ -153,6 +153,7 @@ void pinecone_build_callback(Relation index, ItemPointer tid, Datum *values, boo
     cJSON *json_vector;
     char* pinecone_id = pinecone_id_from_heap_tid(*tid);
     json_vector = tuple_get_pinecone_vector(itup_desc, values, isnull, pinecone_id);
+    if (json_vector == NULL) return; // vector failed validation (e.g. zero vector); skip this tuple
     cJSON_AddItemToArray(buildstate->json_vectors, json_vector);
     if (cJSON_GetArraySize(buildstate->json_vectors) >= PINECONE_BATCH_SIZE) {
         pinecone_bulk_upsert(pinecone_api_key, buildstate->host, buildstate->json_vectors, pinecone_vectors_per_request);
diff --git a/src/pinecone/pinecone_insert.c b/src/pinecone/pinecone_insert.c
index 3f08817c..c5bd0ada 100644
--- a/src/pinecone/pinecone_insert.c
+++ b/src/pinecone/pinecone_insert.c
@@ -156,15 +156,12 @@ bool AppendBufferTupleInCtx(Relation index, Datum *values, bool *isnull, ItemPoi
     MemoryContext oldCtx;
     MemoryContext insertCtx;
     bool checkpoint_created;
-    Vector* vector;
 
     // use a memory context because index_form_tuple can allocate
     insertCtx = AllocSetContextCreate(CurrentMemoryContext,
                                       "Pinecone insert tuple temporary context",
                                       ALLOCSET_DEFAULT_SIZES);
     oldCtx = MemoryContextSwitchTo(insertCtx);
 
-    vector = DatumGetVector(values[0]);
-    validate_vector_nonzero(vector);
     checkpoint_created = AppendBufferTuple(index, values, isnull, heap_tid, heapRel);
 
     MemoryContextSwitchTo(oldCtx);
@@ -286,7 +283,8 @@ void FlushToPinecone(Relation index)
 
             vector_id = pinecone_id_from_heap_tid(buffer_tup.tid);
             json_vector = tuple_get_pinecone_vector(index->rd_att, index_values, index_isnull, vector_id);
-            cJSON_AddItemToArray(json_vectors, json_vector);
+            if (json_vector != NULL) // NULL means the vector failed validation; do not upsert it
+                cJSON_AddItemToArray(json_vectors, json_vector);
         }
     }
 
diff --git a/src/pinecone/pinecone_scan.c b/src/pinecone/pinecone_scan.c
index 12eca13a..08b79174 100644
--- a/src/pinecone/pinecone_scan.c
+++ b/src/pinecone/pinecone_scan.c
@@ -315,7 +315,7 @@ void load_buffer_into_sort(Relation index, PineconeScanOpaque so, Datum query_da
         page = BufferGetPage(buf);
 
         // add all tuples on the page to the sortstate
-        for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page); offno = OffsetNumberNext(offno)) {
+        for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page) && n_sortedtuple < pinecone_max_buffer_scan; offno = OffsetNumberNext(offno)) {
             // get the tid and the vector from the heap tuple
             ItemId itemid;
             Item item;
diff --git a/src/pinecone/pinecone_utils.c b/src/pinecone/pinecone_utils.c
index 9a5dbfce..80ec9dee 100644
--- a/src/pinecone/pinecone_utils.c
+++ b/src/pinecone/pinecone_utils.c
@@ -12,8 +12,11 @@ cJSON* tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull
     cJSON *metadata = cJSON_CreateObject();
     Vector *vector;
     cJSON *json_values;
+    bool isNonZero;
+
     vector = DatumGetVector(values[0]);
-    validate_vector_nonzero(vector);
+    isNonZero = validate_vector_nonzero(vector);
+    if (!isNonZero) return NULL; // NULL tells callers to skip this tuple (zero vector)
     json_values = cJSON_CreateFloatArray(vector->x, vector->dim);
     // prepare metadata
     for (int i = 1; i < tup_desc->natts; i++) // skip the first column which is the vector
@@ -52,27 +55,6 @@ cJSON* tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull
     return json_vector;
 }
 
-cJSON* index_tuple_get_pinecone_vector(Relation index, IndexTuple itup) {
-    int natts = index->rd_att->natts;
-    Datum *itup_values = (Datum *) palloc(sizeof(Datum) * natts);
-    bool *itup_isnull = (bool *) palloc(sizeof(bool) * natts);
-    TupleDesc itup_desc = index->rd_att;
-    char* vector_id;
-    index_deform_tuple(itup, itup_desc, itup_values, itup_isnull);
-    vector_id = pinecone_id_from_heap_tid(itup->t_tid);
-    return tuple_get_pinecone_vector(itup_desc, itup_values, itup_isnull, vector_id);
-}
-
-cJSON* heap_tuple_get_pinecone_vector(Relation heap, HeapTuple htup) {
-    int natts = heap->rd_att->natts;
-    Datum *htup_values = (Datum *) palloc(sizeof(Datum) * natts);
-    bool *htup_isnull = (bool *) palloc(sizeof(bool) * natts);
-    TupleDesc htup_desc = heap->rd_att;
-    char* vector_id;
-    heap_deform_tuple(htup, htup_desc, htup_values, htup_isnull);
-    vector_id = pinecone_id_from_heap_tid(htup->t_self);
-    return tuple_get_pinecone_vector(htup_desc, htup_values, htup_isnull, vector_id);
-}
 
 ItemPointerData pinecone_id_get_heap_tid(char *id) {
diff --git a/src/pinecone/pinecone_validate.c b/src/pinecone/pinecone_validate.c
index 7a4b73e5..50932c8a 100644
--- a/src/pinecone/pinecone_validate.c
+++ b/src/pinecone/pinecone_validate.c
@@ -12,12 +12,14 @@ void validate_api_key(void) {
     }
 }
 
-void validate_vector_nonzero(Vector* vector) {
+bool validate_vector_nonzero(Vector* vector) {
     if (vector_eq_zero_internal(vector)) {
-        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+        ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("Invalid vector: zero vector"),
                         errhint("Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.")));
+        return false;
     }
+    return true;
 }
 
diff --git a/test/expected/pinecone_zero_vector_insert.out b/test/expected/pinecone_zero_vector_insert.out
index c8d73b80..ad662959 100644
--- a/test/expected/pinecone_zero_vector_insert.out
+++ b/test/expected/pinecone_zero_vector_insert.out
@@ -7,6 +7,7 @@ SET client_min_messages = 'notice';
 -- flush each vector individually
 SET pinecone.vectors_per_request = 1;
 SET pinecone.requests_per_batch = 1;
+SET pinecone.max_buffer_scan = 0;
 -- disable flat scan to force use of the index
 SET enable_seqscan = off;
 -- CREATE TABLE
@@ -35,29 +36,105 @@ VALUES ('https://api.pinecone.io/indexes', 'POST', $${
 }$$);
 -- mock describe index stats
 INSERT INTO pinecone_mock (url_prefix, method, response)
-VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":0}');
-INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
--- create index after insering 0 vector - Throws an error
-CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
-ERROR:  Invalid vector: zero vector
-HINT:  Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
--- Truncate the table to remove the values for creating an index successfully
-TRUNCATE TABLE t;
+VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":2}');
+-- mock upsert
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/vectors/upsert', 'POST', '{"upsertedCount":1}');
+-- mock query
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/query', 'POST', $${
+    "results": [],
+    "matches": [{
+        "id": "000000000001",
+        "score": 2,
+        "values": []
+    }],
+    "namespace": "",
+    "usage": {
+        "readUnits": 5
+    }
+}$$);
+-- mock fetch
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/vectors/fetch', 'GET', $${
+    "code": 3,
+    "message": "No IDs provided for fetch query",
+    "details": []
+}$$);
 -- create index
 CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
-INSERT INTO pinecone_mock (url_prefix, method, response)
-VALUES ('https://fakehost/vectors/upsert',
-'{ "vectors": [{
-        "id": "000000000001",
-        "values": [100, 1, 1],
-        "metadata": {
-        }
-    }]
-   }',
-'{"upsertedCount":1}'
-);
+-- insert vectors: throws a warning when the zero vector is flushed
 INSERT INTO t (id, val) VALUES (1, '[100,1,1]');
 INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
-ERROR:  Invalid vector: zero vector
+INSERT INTO t (id, val) VALUES (3, '[10120,76,1]');
+WARNING:  Invalid vector: zero vector
+HINT:  Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
+WARNING:  No vectors to flush to pinecone
+-- returns only id = 1 as it is the only vector flushed to pinecone (the zero vector is not flushed)
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+NOTICE:  Buffer is too large
+HINT:  There are 0 tuples in the buffer that have not yet been flushed to pinecone and 2 tuples in pinecone that are not yet live. You may want to consider flushing the buffer.
+NOTICE:  Reached max local scan
+ id |    val    
+----+-----------
+  1 | [100,1,1]
+(1 row)
+
+SELECT * FROM t;
+ id |     val      
+----+--------------
+  1 | [100,1,1]
+  2 | [0,0,0]
+  3 | [10120,76,1]
+(3 rows)
+
+DROP INDEX i2;
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+ id |     val      
+----+--------------
+  2 | [0,0,0]
+  1 | [100,1,1]
+  3 | [10120,76,1]
+(3 rows)
+
+DELETE FROM pinecone_mock
+WHERE url_prefix = 'https://fakehost/query' AND method = 'POST';
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/query', 'POST', $${
+    "results": [],
+    "matches": [{
+        "id": "000000000001",
+        "score": 2,
+        "values": []
+    },
+    {
+        "id": "000000000003",
+        "score": 2,
+        "values": []
+    }],
+    "namespace": "",
+    "usage": {
+        "readUnits": 5
+    }
+}$$);
+-- displays a warning while flushing the zero vector to pinecone
+CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
+WARNING:  Invalid vector: zero vector
 HINT:  Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+NOTICE:  Reached max local scan
+ id |     val      
+----+--------------
+  1 | [100,1,1]
+  3 | [10120,76,1]
+(2 rows)
+
+SELECT * FROM t;
+ id |     val      
+----+--------------
+  1 | [100,1,1]
+  2 | [0,0,0]
+  3 | [10120,76,1]
+(3 rows)
+
 DROP TABLE t;
diff --git a/test/sql/pinecone_zero_vector_insert.sql b/test/sql/pinecone_zero_vector_insert.sql
index 4bbd61c1..b6dbd631 100644
--- a/test/sql/pinecone_zero_vector_insert.sql
+++ b/test/sql/pinecone_zero_vector_insert.sql
@@ -7,6 +7,8 @@ SET client_min_messages = 'notice';
 -- flush each vector individually
 SET pinecone.vectors_per_request = 1;
 SET pinecone.requests_per_batch = 1;
+SET pinecone.max_buffer_scan = 0;
+
 -- disable flat scan to force use of the index
 SET enable_seqscan = off;
 -- CREATE TABLE
@@ -36,33 +38,78 @@ VALUES ('https://api.pinecone.io/indexes', 'POST', $${
 -- mock describe index stats
 INSERT INTO pinecone_mock (url_prefix, method, response)
-VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":0}');
-
-INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
+VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":2}');
 
+-- mock upsert
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/vectors/upsert', 'POST', '{"upsertedCount":1}');
 
--- create index after insering 0 vector - Throws an error
-CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
+-- mock query
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/query', 'POST', $${
+    "results": [],
+    "matches": [{
+        "id": "000000000001",
+        "score": 2,
+        "values": []
+    }],
+    "namespace": "",
+    "usage": {
+        "readUnits": 5
+    }
+}$$);
 
--- Truncate the table to remove the values for creating an index successfully
-TRUNCATE TABLE t;
+-- mock fetch
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/vectors/fetch', 'GET', $${
+    "code": 3,
+    "message": "No IDs provided for fetch query",
+    "details": []
+}$$);
 
 -- create index
 CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
 
-INSERT INTO pinecone_mock (url_prefix, method, response)
-VALUES ('https://fakehost/vectors/upsert',
-'{ "vectors": [{
-        "id": "000000000001",
-        "values": [100, 1, 1],
-        "metadata": {
-        }
-    }]
-   }',
-'{"upsertedCount":1}'
-);
-
+-- insert vectors: throws a warning when the zero vector is flushed
 INSERT INTO t (id, val) VALUES (1, '[100,1,1]');
 INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
+INSERT INTO t (id, val) VALUES (3, '[10120,76,1]');
+
+-- returns only id = 1 as it is the only vector flushed to pinecone (the zero vector is not flushed)
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+
+SELECT * FROM t;
+
+DROP INDEX i2;
+
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+
+DELETE FROM pinecone_mock
+WHERE url_prefix = 'https://fakehost/query' AND method = 'POST';
+
+INSERT INTO pinecone_mock (url_prefix, method, response)
+VALUES ('https://fakehost/query', 'POST', $${
+    "results": [],
+    "matches": [{
+        "id": "000000000001",
+        "score": 2,
+        "values": []
+    },
+    {
+        "id": "000000000003",
+        "score": 2,
+        "values": []
+    }],
+    "namespace": "",
+    "usage": {
+        "readUnits": 5
+    }
+}$$);
+
+-- displays a warning while flushing the zero vector to pinecone
+CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
+
+SELECT * FROM t ORDER BY val <-> '[3,3,3]';
+SELECT * FROM t;
 DROP TABLE t;
\ No newline at end of file