Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: indexing on scalar8 #141

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/sql/finalize.sql
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ CREATE OPERATOR FAMILY vector_cosine_ops USING vchordrq;
CREATE OPERATOR FAMILY halfvec_l2_ops USING vchordrq;
CREATE OPERATOR FAMILY halfvec_ip_ops USING vchordrq;
CREATE OPERATOR FAMILY halfvec_cosine_ops USING vchordrq;
-- Operator families for the scalar8 type under the vchordrq access method.
-- One family is declared per operator class of the same name (scalar8_l2_ops,
-- scalar8_ip_ops, scalar8_cosine_ops); each CREATE OPERATOR CLASS statement for
-- scalar8 names its family via the FAMILY clause.
CREATE OPERATOR FAMILY scalar8_l2_ops USING vchordrq;
CREATE OPERATOR FAMILY scalar8_ip_ops USING vchordrq;
CREATE OPERATOR FAMILY scalar8_cosine_ops USING vchordrq;

CREATE OPERATOR FAMILY vector_l2_ops USING Vchordrqfscan;
CREATE OPERATOR FAMILY vector_ip_ops USING Vchordrqfscan;
Expand Down Expand Up @@ -199,6 +202,24 @@ CREATE OPERATOR CLASS halfvec_cosine_ops
OPERATOR 2 <<=>> (halfvec, sphere_halfvec) FOR SEARCH,
FUNCTION 1 _vchordrq_support_halfvec_cosine_ops();

-- Operator class that lets vchordrq indexes be built on scalar8 columns,
-- placed in the scalar8_l2_ops operator family.
--   * Strategy 1: `<->` on (scalar8, scalar8), registered for ORDER BY and
--     sorted through the float_ops family.
--   * Strategy 2: `<<->>` on (scalar8, sphere_scalar8), registered for search.
--   * Support function 1: _vchordrq_support_scalar8_l2_ops().
-- NOTE(review): presumably `<->` is the L2 distance operator, going by the
-- class name -- confirm against the operator definition.
CREATE OPERATOR CLASS scalar8_l2_ops
FOR TYPE scalar8 USING vchordrq FAMILY scalar8_l2_ops AS
OPERATOR 1 <-> (scalar8, scalar8) FOR ORDER BY float_ops,
OPERATOR 2 <<->> (scalar8, sphere_scalar8) FOR SEARCH,
FUNCTION 1 _vchordrq_support_scalar8_l2_ops();

-- Operator class that lets vchordrq indexes be built on scalar8 columns,
-- placed in the scalar8_ip_ops operator family.
--   * Strategy 1: `<#>` on (scalar8, scalar8), registered for ORDER BY and
--     sorted through the float_ops family.
--   * Strategy 2: `<<#>>` on (scalar8, sphere_scalar8), registered for search.
--   * Support function 1: _vchordrq_support_scalar8_ip_ops().
-- NOTE(review): presumably `<#>` is the inner-product distance operator, going
-- by the class name -- confirm against the operator definition.
CREATE OPERATOR CLASS scalar8_ip_ops
FOR TYPE scalar8 USING vchordrq FAMILY scalar8_ip_ops AS
OPERATOR 1 <#> (scalar8, scalar8) FOR ORDER BY float_ops,
OPERATOR 2 <<#>> (scalar8, sphere_scalar8) FOR SEARCH,
FUNCTION 1 _vchordrq_support_scalar8_ip_ops();

-- Operator class that lets vchordrq indexes be built on scalar8 columns,
-- placed in the scalar8_cosine_ops operator family.
--   * Strategy 1: `<=>` on (scalar8, scalar8), registered for ORDER BY and
--     sorted through the float_ops family.
--   * Strategy 2: `<<=>>` on (scalar8, sphere_scalar8), registered for search.
--   * Support function 1: _vchordrq_support_scalar8_cosine_ops().
-- NOTE(review): presumably `<=>` is the cosine distance operator, going by the
-- class name -- confirm against the operator definition.
CREATE OPERATOR CLASS scalar8_cosine_ops
FOR TYPE scalar8 USING vchordrq FAMILY scalar8_cosine_ops AS
OPERATOR 1 <=> (scalar8, scalar8) FOR ORDER BY float_ops,
OPERATOR 2 <<=>> (scalar8, sphere_scalar8) FOR SEARCH,
FUNCTION 1 _vchordrq_support_scalar8_cosine_ops();

CREATE OPERATOR CLASS vector_l2_ops
FOR TYPE vector USING Vchordrqfscan FAMILY vector_l2_ops AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
Expand Down
8 changes: 4 additions & 4 deletions src/vchordrq/algorithm/insert.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::postgres::Relation;
use crate::vchordrq::algorithm::rabitq::fscan_process_lowerbound;
use crate::vchordrq::algorithm::rabitq::process_lowerbound;
use crate::vchordrq::algorithm::tuples::*;
use crate::vchordrq::algorithm::vectors;
use base::always_equal::AlwaysEqual;
Expand Down Expand Up @@ -31,7 +31,7 @@ pub fn insert<V: Vector>(
let vector = vector.as_borrowed();
let is_residual = meta_tuple.is_residual;
let default_lut = if !is_residual {
Some(V::rabitq_fscan_preprocess(vector))
Some(V::rabitq_preprocess(vector))
} else {
None
};
Expand Down Expand Up @@ -74,7 +74,7 @@ pub fn insert<V: Vector>(
let mut results = Vec::new();
{
let lut = if is_residual {
&V::rabitq_fscan_preprocess(
&V::rabitq_preprocess(
V::residual(vector, list.1.as_ref().map(|x| x.as_borrowed()).unwrap())
.as_borrowed(),
)
Expand All @@ -91,7 +91,7 @@ pub fn insert<V: Vector>(
.map(rkyv::check_archived_root::<Height1Tuple>)
.expect("data corruption")
.expect("data corruption");
let lowerbounds = fscan_process_lowerbound(
let lowerbounds = process_lowerbound(
distance_kind,
dims,
lut,
Expand Down
8 changes: 4 additions & 4 deletions src/vchordrq/algorithm/rabitq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,19 @@ pub fn code(dims: u32, vector: &[f32]) -> Code {

pub type Lut = (f32, f32, f32, f32, (Vec<u64>, Vec<u64>, Vec<u64>, Vec<u64>));

pub fn fscan_preprocess(vector: &[f32]) -> Lut {
pub fn preprocess(vector: &[f32]) -> Lut {
use base::simd::quantize;
let dis_v_2 = f32::reduce_sum_of_x2(vector);
let (k, b, qvector) = quantize::quantize(vector, 15.0);
let qvector_sum = if vector.len() <= 4369 {
let qvector_sum = if qvector.len() <= 4369 {
base::simd::u8::reduce_sum_of_x_as_u16(&qvector) as f32
} else {
base::simd::u8::reduce_sum_of_x_as_u32(&qvector) as f32
};
(dis_v_2, b, k, qvector_sum, binarize(&qvector))
}

pub fn fscan_process_lowerbound(
pub fn process_lowerbound(
distance_kind: DistanceKind,
_dims: u32,
lut: &Lut,
Expand Down Expand Up @@ -104,7 +104,7 @@ pub fn fscan_process_lowerbound(
}
}

fn binarize(vector: &[u8]) -> (Vec<u64>, Vec<u64>, Vec<u64>, Vec<u64>) {
pub fn binarize(vector: &[u8]) -> (Vec<u64>, Vec<u64>, Vec<u64>, Vec<u64>) {
let n = vector.len();
let mut t0 = vec![0u64; n.div_ceil(64)];
let mut t1 = vec![0u64; n.div_ceil(64)];
Expand Down
12 changes: 6 additions & 6 deletions src/vchordrq/algorithm/scan.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::postgres::Relation;
use crate::vchordrq::algorithm::rabitq::fscan_process_lowerbound;
use crate::vchordrq::algorithm::rabitq::process_lowerbound;
use crate::vchordrq::algorithm::tuples::*;
use crate::vchordrq::algorithm::vectors;
use base::always_equal::AlwaysEqual;
Expand Down Expand Up @@ -32,7 +32,7 @@ pub fn scan<V: Vector>(
let vector = V::random_projection(vector);
let is_residual = meta_tuple.is_residual;
let default_lut = if !is_residual {
Some(V::rabitq_fscan_preprocess(vector.as_borrowed()))
Some(V::rabitq_preprocess(vector.as_borrowed()))
} else {
None
};
Expand All @@ -53,7 +53,7 @@ pub fn scan<V: Vector>(
let mut results = Vec::new();
for list in lists {
let lut = if is_residual {
&V::rabitq_fscan_preprocess(
&V::rabitq_preprocess(
V::residual(
vector.as_borrowed(),
list.1.as_ref().map(|x| x.as_borrowed()).unwrap(),
Expand All @@ -73,7 +73,7 @@ pub fn scan<V: Vector>(
.map(rkyv::check_archived_root::<Height1Tuple>)
.expect("data corruption")
.expect("data corruption");
let lowerbounds = fscan_process_lowerbound(
let lowerbounds = process_lowerbound(
distance_kind,
dims,
lut,
Expand Down Expand Up @@ -125,7 +125,7 @@ pub fn scan<V: Vector>(
let mut results = Vec::new();
for list in lists {
let lut = if is_residual {
&V::rabitq_fscan_preprocess(
&V::rabitq_preprocess(
V::residual(
vector.as_borrowed(),
list.1.as_ref().map(|x| x.as_borrowed()).unwrap(),
Expand All @@ -145,7 +145,7 @@ pub fn scan<V: Vector>(
.map(rkyv::check_archived_root::<Height0Tuple>)
.expect("data corruption")
.expect("data corruption");
let lowerbounds = fscan_process_lowerbound(
let lowerbounds = process_lowerbound(
distance_kind,
dims,
lut,
Expand Down
153 changes: 148 additions & 5 deletions src/vchordrq/algorithm/tuples.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use super::rabitq::{self, Code, Lut};
use crate::types::scalar8::Scalar8Owned;
use crate::vchordrq::types::OwnedVector;
use base::distance::DistanceKind;
use base::simd::ScalarLike;
use base::vector::VectorBorrowed;
use base::vector::{VectOwned, VectorOwned};
use half::f16;
use rkyv::{Archive, ArchiveUnsized, CheckBytes, Deserialize, Serialize};
Expand Down Expand Up @@ -56,7 +58,7 @@ pub trait Vector: VectorOwned {

fn residual(vector: Self::Borrowed<'_>, center: Self::Borrowed<'_>) -> Self;

fn rabitq_fscan_preprocess(vector: Self::Borrowed<'_>) -> Lut;
fn rabitq_preprocess(vector: Self::Borrowed<'_>) -> Lut;

fn rabitq_code(dims: u32, vector: Self::Borrowed<'_>) -> Code;

Expand Down Expand Up @@ -129,8 +131,8 @@ impl Vector for VectOwned<f32> {
Self::new(ScalarLike::vector_sub(vector.slice(), center.slice()))
}

fn rabitq_fscan_preprocess(vector: Self::Borrowed<'_>) -> Lut {
rabitq::fscan_preprocess(vector.slice())
fn rabitq_preprocess(vector: Self::Borrowed<'_>) -> Lut {
rabitq::preprocess(vector.slice())
}

fn rabitq_code(dims: u32, vector: Self::Borrowed<'_>) -> Code {
Expand Down Expand Up @@ -212,8 +214,8 @@ impl Vector for VectOwned<f16> {
Self::new(ScalarLike::vector_sub(vector.slice(), center.slice()))
}

fn rabitq_fscan_preprocess(vector: Self::Borrowed<'_>) -> Lut {
rabitq::fscan_preprocess(&f16::vector_to_f32(vector.slice()))
fn rabitq_preprocess(vector: Self::Borrowed<'_>) -> Lut {
rabitq::preprocess(&f16::vector_to_f32(vector.slice()))
}

fn rabitq_code(dims: u32, vector: Self::Borrowed<'_>) -> Code {
Expand All @@ -229,6 +231,147 @@ impl Vector for VectOwned<f16> {
}
}

/// [`Vector`] implementation for `Scalar8Owned`, whose elements are `u8` codes.
///
/// The per-vector metadata is the tuple `(sum_of_x2, k, b, sum_of_code)`; a code
/// `c` is turned back into a float as `k * c + b` (see `build_to_vecf32`).
impl Vector for Scalar8Owned {
    type Metadata = (f32, f32, f32, f32);

    type Element = u8;

    /// Copies the four archived metadata fields into an owned tuple.
    fn metadata_from_archived(
        archived: &<Self::Metadata as ArchiveUnsized>::Archived,
    ) -> Self::Metadata {
        let sum_of_x2 = archived.0;
        let k = archived.1;
        let b = archived.2;
        let sum_of_code = archived.3;
        (sum_of_x2, k, b, sum_of_code)
    }

    /// Splits a vector into its metadata and one or more code slices.
    ///
    /// Up to 3840 codes stay in a single slice, 3841..=5120 codes are cut in two
    /// at offset 2560, and anything longer is cut into chunks of at most 7680.
    // NOTE(review): the thresholds presumably track how many bytes fit in one
    // tuple/page -- confirm against the tuple layout.
    fn vector_split(vector: Self::Borrowed<'_>) -> (Self::Metadata, Vec<&[Self::Element]>) {
        let code = vector.code();
        let metadata = (
            vector.sum_of_x2(),
            vector.k(),
            vector.b(),
            vector.sum_of_code(),
        );
        let len = code.len();
        let pieces = if len <= 3840 {
            vec![code]
        } else if len <= 5120 {
            let (head, tail) = code.split_at(2560);
            vec![head, tail]
        } else {
            code.chunks(7680).collect()
        };
        (metadata, pieces)
    }

    /// Rebuilds an owned vector from its metadata and the full code slice.
    fn vector_merge(metadata: Self::Metadata, slice: &[Self::Element]) -> Self {
        let (sum_of_x2, k, b, sum_of_code) = metadata;
        Scalar8Owned::new(sum_of_x2, k, b, sum_of_code, slice.to_vec())
    }

    /// Unwraps the `Scalar8` variant; any other variant is treated as impossible.
    fn from_owned(vector: OwnedVector) -> Self {
        let OwnedVector::Scalar8(inner) = vector else {
            unreachable!()
        };
        inner
    }

    /// `(distance kind, running sum of code products, number of elements seen)`.
    type DistanceAccumulator = (DistanceKind, u32, u32);

    /// Starts an empty accumulator for `distance_kind`.
    fn distance_begin(distance_kind: DistanceKind) -> Self::DistanceAccumulator {
        (distance_kind, 0, 0)
    }

    /// Folds one pair of code slices into the accumulator.
    ///
    /// L2 and Dot both only need the sum of code products here; the difference
    /// between them is applied in `distance_end`.
    fn distance_next(
        accumulator: &mut Self::DistanceAccumulator,
        left: &[Self::Element],
        right: &[Self::Element],
    ) {
        let (kind, sum_of_xy, count) = accumulator;
        match *kind {
            DistanceKind::L2 | DistanceKind::Dot => {
                *sum_of_xy += base::simd::u8::reduce_sum_of_xy(left, right);
            }
            DistanceKind::Hamming | DistanceKind::Jaccard => unreachable!(),
        }
        *count += left.len() as u32;
    }

    /// Turns the accumulated integer sums plus both vectors' metadata into a
    /// float distance.
    ///
    /// With `x = k * c + b` per element, the dot product expands to
    /// `k_u*k_v*Σ(c_u*c_v) + b_u*b_v*n + k_u*b_v*Σc_u + b_u*k_v*Σc_v`.
    fn distance_end(
        accumulator: Self::DistanceAccumulator,
        (sum_of_x2_u, k_u, b_u, sum_of_code_u): Self::Metadata,
        (sum_of_x2_v, k_v, b_v, sum_of_code_v): Self::Metadata,
    ) -> f32 {
        let (kind, sum_of_xy, count) = accumulator;
        // Evaluated lazily so the unsupported kinds hit `unreachable!()` directly.
        let dot = || {
            k_u * k_v * sum_of_xy as f32
                + b_u * b_v * count as f32
                + k_u * b_v * sum_of_code_u
                + b_u * k_v * sum_of_code_v
        };
        match kind {
            DistanceKind::L2 => sum_of_x2_u + sum_of_x2_v - 2.0 * dot(),
            DistanceKind::Dot => -dot(),
            DistanceKind::Hamming | DistanceKind::Jaccard => unreachable!(),
        }
    }

    /// No projection is applied here; the vector is just converted to owned.
    fn random_projection(vector: Self::Borrowed<'_>) -> Self {
        vector.own()
    }

    /// Residuals are not implemented for this type; calling this panics.
    fn residual(_: Self::Borrowed<'_>, _: Self::Borrowed<'_>) -> Self {
        unimplemented!()
    }

    /// Builds the lookup table directly from the stored 8-bit codes.
    ///
    /// Each code in `0..=255` is mapped to `0..=15` via `(c + 8) / 17` (integer
    /// division), and `k` is multiplied by 17 to match the coarser step.
    fn rabitq_preprocess(vector: Self::Borrowed<'_>) -> Lut {
        let coarse: Vec<u8> = vector
            .code()
            .iter()
            .map(|&c| ((c as u32 + 8) / 17) as u8)
            .collect();
        // 4369 * 15 == 65535, the largest length whose sum still fits in a u16.
        let coarse_sum = if coarse.len() <= 4369 {
            base::simd::u8::reduce_sum_of_x_as_u16(&coarse) as f32
        } else {
            base::simd::u8::reduce_sum_of_x_as_u32(&coarse) as f32
        };
        let scaled_k = vector.k() * 17.0;
        (
            vector.sum_of_x2(),
            vector.b(),
            scaled_k,
            coarse_sum,
            rabitq::binarize(&coarse),
        )
    }

    /// Dequantizes the vector to `f32` and encodes that with `rabitq::code`.
    fn rabitq_code(dims: u32, vector: Self::Borrowed<'_>) -> Code {
        let (k, b) = (vector.k(), vector.b());
        let floats: Vec<f32> = vector.code().iter().map(|&c| k * c as f32 + b).collect();
        rabitq::code(dims, &floats)
    }

    /// Dequantizes every code as `k * c + b`.
    fn build_to_vecf32(vector: Self::Borrowed<'_>) -> Vec<f32> {
        let (k, b) = (vector.k(), vector.b());
        vector.code().iter().map(|&c| k * c as f32 + b).collect()
    }

    /// Quantizes an `f32` slice to 8-bit codes (scale argument `255.0`).
    ///
    /// `sum_of_x2` is taken from the original floats, while `sum_of_code` is the
    /// sum of the resulting codes.
    fn build_from_vecf32(x: &[f32]) -> Self {
        let sum_of_x2 = f32::reduce_sum_of_x2(x);
        let floats = f32::vector_to_f32_borrowed(x);
        let (k, b, code) = base::simd::quantize::quantize(floats.as_ref(), 255.0);
        let sum_of_code = base::simd::u8::reduce_sum_of_x_as_u32(&code) as f32;
        Self::new(sum_of_x2, k, b, sum_of_code, code)
    }
}

#[derive(Clone, PartialEq, Archive, Serialize, Deserialize)]
#[archive(check_bytes)]
pub struct MetaTuple {
Expand Down
Loading
Loading