Skip to content

Commit

Permalink
unify the serialize and deserialize method for hnsw (#14)
Browse files Browse the repository at this point in the history
Signed-off-by: LHT129 <[email protected]>
  • Loading branch information
LHT129 authored Aug 30, 2024
1 parent fba46db commit 677492b
Show file tree
Hide file tree
Showing 68 changed files with 594 additions and 604 deletions.
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ file (GLOB CPP_SRCS "*.cpp")
file (GLOB CPP_FACTORY_SRCS "factory/*.cpp")
file (GLOB CPP_CONJUGATE_GRAPH_SRCS "impl/*.cpp")
file (GLOB CPP_INDEX_SRCS "index/*.cpp")
file (GLOB CPP_HNSWLIB_SRCS "algorithm/hnswlib/*.cpp")
list (FILTER CPP_SRCS EXCLUDE REGEX "_test.cpp")
list (FILTER CPP_FACTORY_SRCS EXCLUDE REGEX "_test.cpp")
list (FILTER CPP_CONJUGATE_GRAPH_SRCS EXCLUDE REGEX "_test.cpp")
list (FILTER CPP_INDEX_SRCS EXCLUDE REGEX "_test.cpp")

set (VSAG_SRCS ${CPP_SRCS} ${CPP_FACTORY_SRCS} ${CPP_INDEX_SRCS} ${CPP_CONJUGATE_GRAPH_SRCS})
set (VSAG_SRCS ${CPP_SRCS} ${CPP_FACTORY_SRCS} ${CPP_INDEX_SRCS} ${CPP_CONJUGATE_GRAPH_SRCS} ${CPP_HNSWLIB_SRCS})
add_library (vsag SHARED ${VSAG_SRCS})
add_library (vsag_static STATIC ${VSAG_SRCS})

Expand Down
167 changes: 167 additions & 0 deletions src/algorithm/hnswlib/block_manager.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@

// Copyright 2024-present the vsag project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "block_manager.h"

namespace hnswlib {

BlockManager::BlockManager(size_t max_elements,
size_t size_data_per_element,
size_t block_size_limit,
vsag::Allocator* allocator)
: max_elements_(max_elements),
size_data_per_element_(size_data_per_element),
allocator_(allocator) {
data_num_per_block_ = block_size_limit / size_data_per_element_;
block_size_ = size_data_per_element * data_num_per_block_;
size_t full_blocks = (max_elements * size_data_per_element) / block_size_;
size_t remaining_size = (max_elements * size_data_per_element) % block_size_;
for (size_t i = 0; i < full_blocks; ++i) {
blocks_.emplace_back(static_cast<char*>(allocator_->Allocate(block_size_)));
block_lens_.emplace_back(block_size_);
}
if (remaining_size > 0) {
blocks_.emplace_back(static_cast<char*>(allocator_->Allocate(remaining_size)));
block_lens_.emplace_back(remaining_size);
}
}

BlockManager::~BlockManager() {
for (char* block : blocks_) {
allocator_->Deallocate(block);
}
}

char*
BlockManager::GetElementPtr(size_t index, size_t offset) {
if (index >= max_elements_) {
throw std::out_of_range("Index is out of range:" + std::to_string(index));
}

size_t block_index = (index * size_data_per_element_) / block_size_;
size_t offset_in_block = (index * size_data_per_element_) % block_size_;
return blocks_[block_index] + offset_in_block + offset;
}

bool
BlockManager::Resize(size_t new_max_elements) {
if (new_max_elements < max_elements_) {
throw std::runtime_error("new_max_elements is less than max_elements_");
}

size_t new_full_blocks = (new_max_elements * size_data_per_element_) / block_size_;
size_t new_remaining_size = (new_max_elements * size_data_per_element_) % block_size_;

try {
bool append_more_block = blocks_.size() <= new_full_blocks;
// Adjust the size of the last block. There are two scenarios here: when more blocks
// need to be padded, the last block should be converted from a remaining_block to a
// full_block; otherwise, the size of the remaining_block should be increased to make
// it a larger remaining_block.
if (!blocks_.empty() && blocks_.back() != nullptr && block_lens_.back() != block_size_) {
char* last_block = blocks_.back();

size_t new_last_block_size = append_more_block ? block_size_ : new_remaining_size;
auto new_last_block = allocator_->Reallocate(last_block, new_last_block_size);
if (new_last_block == nullptr) {
return false;
}
blocks_.back() = static_cast<char*>(new_last_block);
block_lens_.back() = new_last_block_size;
}

// If the current number of blocks is less than the number of complete blocks needed, proceed with padding.
while (blocks_.size() < new_full_blocks) {
blocks_.push_back(static_cast<char*>(allocator_->Allocate(block_size_)));
block_lens_.push_back(block_size_);
}

// Padding the last block is necessary only when there are not enough blocks.
if (new_remaining_size > 0 && append_more_block) {
blocks_.push_back(static_cast<char*>(allocator_->Allocate(new_remaining_size)));
block_lens_.push_back(new_remaining_size);
}
max_elements_ = new_max_elements;
return true;
} catch (const std::bad_alloc&) {
return false;
}
}

bool
BlockManager::Serialize(char*& buffer, size_t cur_element_count) {
BufferStreamWriter writer(buffer);
return this->SerializeImpl(writer, cur_element_count);
}

bool
BlockManager::Serialize(std::ostream& ofs, size_t cur_element_count) {
IOStreamWriter writer(ofs);
return this->SerializeImpl(writer, cur_element_count);
}

bool
BlockManager::Deserialize(std::function<void(uint64_t, uint64_t, void*)> read_func,
uint64_t cursor,
size_t cur_element_count) {
ReadFuncStreamReader reader(read_func, cursor);
return this->DeserializeImpl(reader, cur_element_count);
}

bool
BlockManager::Deserialize(std::istream& ifs, size_t cur_element_count) {
IOStreamReader reader(ifs);
return this->DeserializeImpl(reader, cur_element_count);
}

bool
BlockManager::SerializeImpl(StreamWriter& writer, uint64_t cur_element_count) {
size_t store_size = cur_element_count * size_data_per_element_;
try {
size_t offset = 0;
for (int i = 0; i < blocks_.size(); ++i) {
size_t new_offset = offset + block_lens_[i];
size_t current_block_size = std::min(new_offset, store_size) - offset;
writer.Write(blocks_[i], current_block_size);
offset = new_offset;
if (new_offset >= store_size) {
break;
}
}
} catch (const std::ios_base::failure&) {
return false;
}
return true;
}

bool
BlockManager::DeserializeImpl(StreamReader& reader, uint64_t cur_element_count) {
try {
size_t offset = 0;
size_t need_read_size = cur_element_count * size_data_per_element_;
for (size_t i = 0; i < blocks_.size(); ++i) {
size_t current_read_size = std::min(need_read_size, offset + block_lens_[i]) - offset;
reader.Read(blocks_[i], current_read_size);
offset += block_lens_[i];
if (offset >= need_read_size) {
break;
}
}
} catch (const std::ios_base::failure&) {
return false;
}
return true;
}
} // namespace hnswlib
78 changes: 78 additions & 0 deletions src/algorithm/hnswlib/block_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@

// Copyright 2024-present the vsag project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstring>
#include <deque>
#include <functional>
#include <mutex>

#include "../../default_allocator.h"
#include "stream_reader.h"
#include "stream_writer.h"

namespace hnswlib {

class BlockManager {
public:
BlockManager(size_t max_elements,
size_t size_data_per_element,
size_t block_size_limit,
vsag::Allocator* allocator);

~BlockManager();

char*
GetElementPtr(size_t index, size_t offset);

bool
Resize(size_t new_max_elements);

bool
Serialize(char*& buffer, size_t cur_element_count);

bool
Serialize(std::ostream& ofs, size_t cur_element_count);

bool
Deserialize(std::function<void(uint64_t, uint64_t, void*)> read_func,
uint64_t cursor,
size_t cur_element_count);

bool
Deserialize(std::istream& ifs, size_t cur_element_count);

inline size_t
GetSize() const {
return max_elements_ * size_data_per_element_;
}

bool
SerializeImpl(StreamWriter& writer, uint64_t cur_element_count);

bool
DeserializeImpl(StreamReader& reader, uint64_t cur_element_count);

private:
std::vector<char*> blocks_ = {};
size_t data_num_per_block_ = 0;
size_t block_size_ = 0;
size_t size_data_per_element_ = 0;
size_t max_elements_ = 0;
std::vector<size_t> block_lens_ = {};
vsag::Allocator* const allocator_ = nullptr;
};
} // namespace hnswlib
1 change: 1 addition & 0 deletions src/algorithm/hnswlib/bruteforce.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

// Copyright 2024-present the vsag project
//
// Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
Loading

0 comments on commit 677492b

Please sign in to comment.