Skip to content

Commit

Permalink
refactor: remove unnecessary tokenizer options from to_svector functions (#20)
Browse files Browse the repository at this point in the history

When building the BM25 matrix with `bm25_create`, we store the tokenizer
options (tokenizer and model) in the catalog. The `bm25_document_to_svector`
and `bm25_query_to_svector` functions can therefore look these options up
themselves, so callers no longer need to pass them again.

**as is:**

```SQL
SELECT bm25_create('documents', 'passage', 'documents_passage_bm25', 'hf', 'google-bert/bert-base-uncased', 0.75, 1.2);

SELECT bm25_document_to_svector('documents_passage_bm25', 'requiring error for due process claim', 'hf', 'google-bert/bert-base-uncased');
SELECT bm25_query_to_svector('documents_passage_bm25', 'requiring error for due process claim', 'hf', 'google-bert/bert-base-uncased');
```

**to be:**

```SQL
SELECT bm25_create('documents', 'passage', 'documents_passage_bm25', 'hf', 'google-bert/bert-base-uncased', 0.75, 1.2);

SELECT bm25_document_to_svector('documents_passage_bm25', 'requiring error for due process claim');
SELECT bm25_query_to_svector('documents_passage_bm25', 'requiring error for due process claim');
```

Signed-off-by: jwnz <[email protected]>
  • Loading branch information
jwnz authored Oct 11, 2024
1 parent 9a7fcae commit 2924f67
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions src/sql/finalize.sql
Original file line number Diff line number Diff line change
Expand Up @@ -94,26 +94,30 @@ BEGIN
END;
$fn$ LANGUAGE plpgsql;

-- bm25_document_to_svector: reads the parameters recorded for matrix `mat` in
-- bm_catalog.pg_bm25 and forwards them, together with the text `t` and `style`,
-- to bm_catalog.bm25_document_to_svector_internal, returning its text result.
--   mat   : relation matched against pg_bm25.matrelid
--   t     : text forwarded to the internal function
--   style : forwarded unchanged; defaults to 'pgvecto.rs'
-- NOTE: this block is a rendered diff without +/- markers. Where a line was
-- changed, the pre-commit line is immediately followed by its post-commit
-- replacement; the comments below mark which is which.
-- Pre-commit signature (removed by this commit): tokenizer and model were caller-supplied.
CREATE FUNCTION bm25_document_to_svector(mat regclass, t TEXT, tokenizer TEXT, model TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$
-- Post-commit signature (added by this commit): tokenizer and model are no longer parameters.
CREATE FUNCTION bm25_document_to_svector(mat regclass, t TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$
DECLARE
-- Locals filled from the bm_catalog.pg_bm25 row whose matrelid equals mat.
idx regclass;
p_b REAL;
p_k1 REAL;
p_words INT;
p_docs INT;
p_dims INT;
-- Added by this commit: receive the tokenizer and model columns of that row.
p_tokenizer TEXT;
p_model TEXT;
BEGIN
-- Pre-commit lookup and call (removed): tokenizer/model came from the function arguments.
SELECT indexrelid, b, k1, words, docs, dims INTO idx, p_b, p_k1, p_words, p_docs, p_dims FROM bm_catalog.pg_bm25 WHERE matrelid = mat;
RETURN bm_catalog.bm25_document_to_svector_internal(mat::oid, idx::oid, p_b, p_k1, p_words, p_docs, p_dims, t, style, tokenizer, model);
-- Post-commit lookup and call (added): tokenizer/model are read from the same catalog row.
-- NOTE(review): SELECT ... INTO is used without STRICT, so presumably a missing
-- catalog row leaves every target NULL rather than raising -- confirm the
-- internal function tolerates that.
SELECT indexrelid, b, k1, words, docs, dims, tokenizer, model INTO idx, p_b, p_k1, p_words, p_docs, p_dims, p_tokenizer, p_model FROM bm_catalog.pg_bm25 WHERE matrelid = mat;
RETURN bm_catalog.bm25_document_to_svector_internal(mat::oid, idx::oid, p_b, p_k1, p_words, p_docs, p_dims, t, style, p_tokenizer, p_model);
END;
$fn$ LANGUAGE plpgsql;

-- bm25_query_to_svector: reads the index relation, dims and tokenizer settings
-- recorded for matrix `mat` in bm_catalog.pg_bm25 and forwards them, together
-- with the text `t` and `style`, to bm_catalog.bm25_query_to_svector_internal,
-- returning its text result.
--   mat   : relation matched against pg_bm25.matrelid
--   t     : text forwarded to the internal function
--   style : forwarded unchanged; defaults to 'pgvecto.rs'
-- NOTE: this block is a rendered diff without +/- markers. Where a line was
-- changed, the pre-commit line is immediately followed by its post-commit
-- replacement; the comments below mark which is which.
-- Pre-commit signature (removed by this commit): tokenizer and model were caller-supplied.
CREATE FUNCTION bm25_query_to_svector(mat regclass, t TEXT, tokenizer TEXT, model TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$
-- Post-commit signature (added by this commit): tokenizer and model are no longer parameters.
CREATE FUNCTION bm25_query_to_svector(mat regclass, t TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$
DECLARE
-- Locals filled from the bm_catalog.pg_bm25 row whose matrelid equals mat.
idx regclass;
p_dims INT;
-- Added by this commit: receive the tokenizer and model columns of that row.
p_tokenizer TEXT;
p_model TEXT;
BEGIN
-- Pre-commit lookup and call (removed): tokenizer/model came from the function arguments.
SELECT indexrelid, dims INTO idx, p_dims FROM bm_catalog.pg_bm25 WHERE matrelid = mat;
RETURN bm_catalog.bm25_query_to_svector_internal(mat::oid, idx::oid, p_dims, t, style, tokenizer, model);
-- Post-commit lookup and call (added): tokenizer/model are read from the same catalog row.
-- NOTE(review): SELECT ... INTO is used without STRICT, so presumably a missing
-- catalog row leaves every target NULL rather than raising -- confirm the
-- internal function tolerates that.
SELECT indexrelid, dims, tokenizer, model INTO idx, p_dims, p_tokenizer, p_model FROM bm_catalog.pg_bm25 WHERE matrelid = mat;
RETURN bm_catalog.bm25_query_to_svector_internal(mat::oid, idx::oid, p_dims, t, style, p_tokenizer, p_model);
END;
$fn$ LANGUAGE plpgsql;

0 comments on commit 2924f67

Please sign in to comment.