fix with comments

Signed-off-by: cutecutecat <[email protected]>
tensorchord · Jul 23, 2024 · c6acb93 · c6acb93
1 parent 59c5e8a
commit c6acb93
Show file tree

Hide file tree

Showing 7 changed files with 59 additions and 64 deletions.
diff --git a/README.md b/README.md
@@ -277,7 +277,7 @@ class Item(models.Model):
                 fields=["embedding"],
                 opclasses=["vector_l2_ops"],
                 # don't pass any of `m`, `ef_construction`, `threads`, `quantization_type` or `quantization_ratio`
-                # if created by `with_option`, they will be overwritten
+                # if created by `with_option`, they will be overridden
             ).with_option(
                 IndexOption(index=Hnsw(m=16, ef_construction=100), threads=1)
             ),

diff --git a/src/pgvecto_rs/django/indexes.py b/src/pgvecto_rs/django/indexes.py
@@ -81,6 +81,14 @@ def get_with_params(self):
         return [f"options = $${option.dumps()}$$"]
 
     def with_option(self, option: IndexOption):
+        """
+        Fill a partially initialized HnswIndex object from `option`, overriding these arguments:
+        - m
+        - ef_construction
+        - threads
+        - quantization_type
+        - quantization_ratio
+        """
         if not isinstance(option.index, Hnsw):
             raise IndexOptionTypeError(Hnsw, type(option.index))
         self.m = option.index.m
@@ -126,6 +134,13 @@ def get_with_params(self):
         return [f"options = $${option.dumps()}$$"]
 
     def with_option(self, option: IndexOption):
+        """
+        Fill a partially initialized IvfIndex object from `option`, overriding these arguments:
+        - nlist
+        - threads
+        - quantization_type
+        - quantization_ratio
+        """
         if not isinstance(option.index, Ivf):
             raise IndexOptionTypeError(Ivf, type(option.index))
         self.nlist = option.index.nlist
@@ -167,6 +182,12 @@ def get_with_params(self):
         return [f"options = $${option.dumps()}$$"]
 
     def with_option(self, option: IndexOption):
+        """
+        Fill a partially initialized FlatIndex object from `option`, overriding these arguments:
+        - threads
+        - quantization_type
+        - quantization_ratio
+        """
         if not isinstance(option.index, Flat):
             raise IndexOptionTypeError(Flat, type(option.index))
         super().with_option(option)

diff --git a/src/pgvecto_rs/errors.py b/src/pgvecto_rs/errors.py
@@ -67,3 +67,8 @@ def __init__(self, required_type: type, dtype: type) -> None:
         super().__init__(
             f"the index requires IndexOption of {required_type} type, but got {dtype}"
         )
+
+
+class TextParseError(PGVectoRsError):
+    def __init__(self, payload: str, dtype: type) -> None:
+        super().__init__(f"failed to parse text of '{payload}' as a {dtype}")
diff --git a/src/pgvecto_rs/types/bvector.py b/src/pgvecto_rs/types/bvector.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from pgvecto_rs.errors import NDArrayDimensionError, ToDBDimUnequalError
+from pgvecto_rs.errors import NDArrayDimensionError, TextParseError, ToDBDimUnequalError
 
 
 class BinaryVector:
@@ -42,14 +42,18 @@ def to_binary(self):
 
     @classmethod
     def from_text(cls, value):
-        return cls([int(v) for v in value[1:-1].split(",")])
+        left, right = value.find("["), value.rfind("]")
+        if left == -1 or right == -1 or left > right:
+            raise TextParseError(value, cls)
+        return cls([int(v) for v in value[left + 1 : right].split(",")])
 
     @classmethod
     def from_binary(cls, value):
+        view = memoryview(value)
         # start reading buffer from the 3rd byte (first 2 bytes are for dimension info)
-        dim = unpack("<H", value[:2])[0]
+        dim = unpack("<H", view[:2])[0]
         length = math.ceil(dim / 64)
-        data = np.frombuffer(value, dtype="<u8", count=length, offset=2).view(np.uint8)
+        data = np.frombuffer(view, dtype="<u8", count=length, offset=2).view(np.uint8)
         return cls(np.unpackbits(data, bitorder="little", count=dim))
 
     @classmethod

diff --git a/src/pgvecto_rs/types/svector.py b/src/pgvecto_rs/types/svector.py
@@ -11,6 +11,7 @@
     SparseExtraArgError,
     SparseMissingArgError,
     SparseShapeError,
+    TextParseError,
     ToDBDimUnequalError,
 )
 
@@ -142,30 +143,33 @@ def _from_dense(self, value):
         self._values = [float(value[i]) for i in self._indices]
 
     @classmethod
-    def from_text(cls, value):
+    def from_text(cls, value: str):
         elements, dim = value.split("/", 2)
+        left, right = elements.find("{"), elements.rfind("}")
+        if left == -1 or right == -1 or left > right:
+            raise TextParseError(value, cls)
         indices = []
         values = []
-        for e in elements[1:-1].split(","):
+        for e in elements[left + 1 : right].split(","):
             i, v = e.split(":", 2)
             indices.append(int(i))
             values.append(float(v))
         return cls._from_parts(int(dim), indices, values)
 
     @classmethod
     def from_binary(cls, value):
+        view = memoryview(value)
         # unpack dims and length as little-endian uint32, keep same endian with pgvecto.rs
-        dims = unpack("<I", value[:4])[0]
-        length = unpack("<I", value[4:8])[0]
-        bytes = value[8:]
+        dims = unpack("<I", view[:4])[0]
+        length = unpack("<I", view[4:8])[0]
+        bytes = view[8:]
         # unpack indices and values as little-endian uint32 and float32, keep same endian with pgvecto.rs
         indices = np.frombuffer(bytes, dtype="<I", count=length, offset=0).astype(
             np.uint32
         )
-        bytes = bytes[4 * length :]
-        values = np.frombuffer(bytes, dtype="<f", count=length, offset=0).astype(
-            np.float32
-        )
+        values = np.frombuffer(
+            bytes, dtype="<f", count=length, offset=4 * length
+        ).astype(np.float32)
         return cls.from_parts(dims, indices, values)
 
     @classmethod

diff --git a/src/pgvecto_rs/types/vecf16.py b/src/pgvecto_rs/types/vecf16.py
@@ -1,11 +1,12 @@
-from struct import pack, unpack
+from struct import unpack
 
 import numpy as np
 
-from pgvecto_rs.errors import NDArrayDimensionError, ToDBDimUnequalError
+from pgvecto_rs.errors import NDArrayDimensionError
+from pgvecto_rs.types import Vector
 
 
-class Float16Vector:
+class Float16Vector(Vector):
     def __init__(self, value):
         # asarray still copies if same dtype
         if not isinstance(value, np.ndarray) or value.dtype != "<f2":
@@ -19,27 +20,6 @@ def __init__(self, value):
     def __repr__(self):
         return f"Float16Vector({self.to_list()})"
 
-    def dimensions(self):
-        return len(self._value)
-
-    def to_list(self):
-        return self._value.tolist()
-
-    def to_numpy(self):
-        return self._value
-
-    def to_text(self):
-        return "[" + ",".join([str(float(v)) for v in self._value]) + "]"
-
-    def to_binary(self):
-        # pack to little-endian uint16, keep same endian with pgvecto.rs
-        dims: bytes = pack("<H", self._value.shape[0])
-        return dims + self._value.tobytes()
-
-    @classmethod
-    def from_text(cls, value):
-        return cls([float(v) for v in value[1:-1].split(",")])
-
     @classmethod
     def from_binary(cls, value):
         dim = unpack("<H", value[:2])[0]
@@ -48,29 +28,6 @@ def from_binary(cls, value):
             np.frombuffer(value, dtype="<f2", count=dim, offset=2).astype(np.float16)
         )
 
-    @classmethod
-    def _to_db(cls, value, dim=None):
-        if value is None:
-            return value
-
-        if not isinstance(value, cls):
-            value = cls(value)
-
-        if dim is not None and value.dimensions() != dim:
-            raise ToDBDimUnequalError(dim, value.dimensions())
-
-        return value.to_text()
-
-    @classmethod
-    def _to_db_binary(cls, value):
-        if value is None:
-            return value
-
-        if not isinstance(value, cls):
-            value = cls(value)
-
-        return value.to_binary()
-
     @classmethod
     def _from_db(cls, value):
         if value is None or isinstance(value, cls):

diff --git a/src/pgvecto_rs/types/vector.py b/src/pgvecto_rs/types/vector.py
@@ -38,14 +38,18 @@ def to_binary(self):
 
     @classmethod
     def from_text(cls, value):
-        return cls([float(v) for v in value[1:-1].split(",")])
+        left, right = value.find("["), value.rfind("]")
+        if left == -1 or right == -1 or left > right:
+            raise ValueError
+        return cls([float(v) for v in value[left + 1 : right].split(",")])
 
     @classmethod
     def from_binary(cls, value):
-        dim = unpack("<H", value[:2])[0]
+        view = memoryview(value)
+        dim = unpack("<H", view[:2])[0]
         # start reading buffer from the 3rd byte (first 2 bytes are for dimension info)
         return cls(
-            np.frombuffer(value, dtype="<f", count=dim, offset=2).astype(np.float32)
+            np.frombuffer(view, dtype="<f", count=dim, offset=2).astype(np.float32)
         )
 
     @classmethod