From 256eaa69019f2e7b6e63fd1e12e96f1eca13515f Mon Sep 17 00:00:00 2001 From: Victor Ung Date: Mon, 12 Aug 2024 09:27:22 -0700 Subject: [PATCH] rename asin_str2int to fixed_len_10_str2int --- .github/style_type_check_cfg/.flake8 | 2 +- pecos/core/base.py | 6 +-- pecos/core/libpecos.cpp | 22 ++++---- pecos/core/utils/mmap_hashmap.hpp | 61 ++++++++++++---------- pecos/utils/mmap_hashmap_util.py | 12 ++++- test/pecos/utils/test_mmap_hashmap_util.py | 36 +++++++++---- 6 files changed, 84 insertions(+), 55 deletions(-) diff --git a/.github/style_type_check_cfg/.flake8 b/.github/style_type_check_cfg/.flake8 index dbe66680..5326990d 100644 --- a/.github/style_type_check_cfg/.flake8 +++ b/.github/style_type_check_cfg/.flake8 @@ -1,3 +1,3 @@ [flake8] -ignore = E203,E501,W605,F541 +extend-ignore = E203,E501,W605,F541 max_line_length = 100 diff --git a/pecos/core/base.py b/pecos/core/base.py index 6a80b4b5..8cbe6660 100644 --- a/pecos/core/base.py +++ b/pecos/core/base.py @@ -2070,7 +2070,7 @@ def link_mmap_hashmap_methods(self): Specify C-lib's Memory-mappable Hashmap methods arguments and return types. """ fn_prefix = "mmap_hashmap" - map_type_list = ["str2int", "fixed_len_str2int", "asin_str2int", "int2int"] + map_type_list = ["str2int", "fixed_len_str2int", "fixed_len_10_str2int", "int2int"] key_args_dict = { "str2int": [ c_char_p, # pointer of key string @@ -2080,7 +2080,7 @@ def link_mmap_hashmap_methods(self): c_char_p, # pointer of key string c_uint32, # length of key string ], - "asin_str2int": [ + "fixed_len_10_str2int": [ c_char_p, # pointer of key string c_uint32, # length of key string ], @@ -2097,7 +2097,7 @@ def link_mmap_hashmap_methods(self): c_void_p, # List of pointer of key string POINTER(c_uint32), # List of length of key string ], - "asin_str2int": [ + "fixed_len_10_str2int": [ c_void_p, # List of pointer of key string POINTER(c_uint32), # List of length of key string ], diff --git a/pecos/core/libpecos.cpp b/pecos/core/libpecos.cpp index 9337ab37..7e3b2636 100644 --- a/pecos/core/libpecos.cpp +++ b/pecos/core/libpecos.cpp @@ -663,7 +663,7 @@ extern "C" { typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_str2int; typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_str2int; - typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_asin_str2int; + typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_10_str2int; typedef pecos::mmap_hashmap::Int2IntMap mmap_hashmap_int2int; // New @@ -672,7 +672,7 @@ extern "C" { return static_cast(new mmap_hashmap_ ## SUFFIX()); } MMAP_MAP_NEW(str2int) MMAP_MAP_NEW(fixed_len_str2int) - MMAP_MAP_NEW(asin_str2int) + MMAP_MAP_NEW(fixed_len_10_str2int) MMAP_MAP_NEW(int2int) // Destruct @@ -681,7 +681,7 @@ extern "C" { delete static_cast(map_ptr); } MMAP_MAP_DESTRUCT(str2int) MMAP_MAP_DESTRUCT(fixed_len_str2int) - MMAP_MAP_DESTRUCT(asin_str2int) + MMAP_MAP_DESTRUCT(fixed_len_10_str2int) MMAP_MAP_DESTRUCT(int2int) // Save @@ -690,7 +690,7 @@ extern "C" { static_cast(map_ptr)->save(map_dir); } MMAP_MAP_SAVE(str2int) MMAP_MAP_SAVE(fixed_len_str2int) - MMAP_MAP_SAVE(asin_str2int) + MMAP_MAP_SAVE(fixed_len_10_str2int) MMAP_MAP_SAVE(int2int) // Load @@ -701,7 +701,7 @@ extern "C" { return static_cast(map_ptr); } MMAP_MAP_LOAD(str2int) MMAP_MAP_LOAD(fixed_len_str2int) - MMAP_MAP_LOAD(asin_str2int) + MMAP_MAP_LOAD(fixed_len_10_str2int) MMAP_MAP_LOAD(int2int) // Size @@ -710,7 +710,7 @@ extern "C" { return static_cast(map_ptr)->size(); } MMAP_MAP_SIZE(str2int) MMAP_MAP_SIZE(fixed_len_str2int) - MMAP_MAP_SIZE(asin_str2int) + MMAP_MAP_SIZE(fixed_len_10_str2int) MMAP_MAP_SIZE(int2int) // Insert @@ -720,7 +720,7 @@ extern "C" { static_cast(map_ptr)->insert(FUNC_CALL_KEY, val); } MMAP_MAP_INSERT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_INSERT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_INSERT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_INSERT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_INSERT(int2int, uint64_t key, key) // Get @@ -729,7 +729,7 @@ extern "C" { return static_cast(map_ptr)->get(FUNC_CALL_KEY); } MMAP_MAP_GET(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_GET(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET(int2int, uint64_t key, key) #define MMAP_MAP_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ @@ -737,7 +737,7 @@ extern "C" { return static_cast(map_ptr)->get_w_default(FUNC_CALL_KEY, def_val); } MMAP_MAP_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET_W_DEFAULT(int2int, uint64_t key, key) #define MMAP_MAP_BATCH_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ @@ -745,7 +745,7 @@ extern "C" { static_cast(map_ptr)->batch_get_w_default(n_key, FUNC_CALL_KEY, def_val, vals, threads); } MMAP_MAP_BATCH_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) - MMAP_MAP_BATCH_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) + MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) MMAP_MAP_BATCH_GET_W_DEFAULT(int2int, const uint64_t* key, key) // Contains @@ -754,7 +754,7 @@ extern "C" { return static_cast(map_ptr)->contains(FUNC_CALL_KEY); } MMAP_MAP_CONTAINS(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_CONTAINS(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_CONTAINS(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_CONTAINS(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_CONTAINS(int2int, uint64_t key, key) diff --git a/pecos/core/utils/mmap_hashmap.hpp b/pecos/core/utils/mmap_hashmap.hpp index b6a90af2..da34db9b 100644 --- a/pecos/core/utils/mmap_hashmap.hpp +++ b/pecos/core/utils/mmap_hashmap.hpp @@ -285,15 +285,20 @@ class AnkerlFixedLenStr2IntMmapableVector { std::forward_as_tuple(size_), std::forward< std::tuple >(args)); + + size_type key_length = k.size(); + // Length of new key should be the same as previous keys - if (fixed_str_len_ != -1 && fixed_str_len_ != k.size()) { - throw std::runtime_error("String length differs from previous keys."); + if (key_length == 0) { + throw std::runtime_error("Key length should be greater than 0."); + } else if (fixed_str_len_ != 0 && fixed_str_len_ != key_length) { + throw std::runtime_error("Key length differs from previous keys."); } else { - fixed_str_len_ = k.size(); + fixed_str_len_ = key_length; } // Append key string - str_store_.insert(str_store_.end(), k.data(), k.data() + k.size()); + str_store_.insert(str_store_.end(), k.data(), k.data() + key_length); // Update pointers size_ = store_.size(); @@ -349,7 +354,7 @@ class AnkerlFixedLenStr2IntMmapableVector { value_type* data_ = nullptr; char* str_data_ = nullptr; - size_type fixed_str_len_ = -1; + size_type fixed_str_len_ = 0; // Actual data storage for in-memory case std::vector store_; @@ -432,26 +437,31 @@ class AnkerlFixedLenStr2IntMmapableVector { }; -// Memory-mappable vector of std::pair for Ankerl -// This vector takes/gets std::string_view as the key, but emplace back as the special mmap format StrView +// Memory-mappable vector of std::pair for Ankerl +// This vector takes/gets std::string_view as the key, but emplace back as the special mmap format FixedLen10Str // The key must be of length 10 -class AnkerlAsinStr2IntMmapableVector { +class AnkerlFixedLen10Str2IntMmapableVector { template class iter_t; - struct StrView { - char str[10]; + // Fixed Length of 10 for keys + static constexpr std::size_t fixed_str_len = 10; + + struct FixedLen10Str { + char str[fixed_str_len]; - StrView(const char* input_str = nullptr) { + FixedLen10Str(const char* input_str = nullptr) { if (input_str) { - std::strncpy(str, input_str, 10); + std::memcpy(str, input_str, fixed_str_len); + } else { + throw std::runtime_error("Illegal initialization of FixLen10Str with nullptr."); } } }; public: using key_type = std::string_view; - using value_type = std::pair; + using value_type = std::pair; using size_type = std::size_t; using difference_type = std::ptrdiff_t; using allocator_type = std::allocator; @@ -463,8 +473,8 @@ class AnkerlAsinStr2IntMmapableVector { using iterator = iter_t; using const_iterator = iter_t; - AnkerlAsinStr2IntMmapableVector() = default; - AnkerlAsinStr2IntMmapableVector(allocator_type alloc) + AnkerlFixedLen10Str2IntMmapableVector() = default; + AnkerlFixedLen10Str2IntMmapableVector(allocator_type alloc) : store_(alloc) {} value_type* data() { return data_; } @@ -487,23 +497,20 @@ class AnkerlAsinStr2IntMmapableVector { void shrink_to_fit() { store_.shrink_to_fit(); } void reserve(size_t new_capacity) { store_.reserve(new_capacity); } - /* Emplace string-like key and int value as std::pair*/ + /* Emplace string-like key and int value as std::pair*/ template auto emplace_back(std::piecewise_construct_t, std::tuple key, std::tuple args) { // Extract key key_type key_string = std::get<0>(key); - if (key_string.size() != 10) { + if (key_string.size() != fixed_str_len) { throw std::runtime_error("ASIN string length is not 10."); } - char key_arr[10]; - std::strncpy(key_arr, key_string.data(), key_string.size()); - - // Emplace back std::pair + // Emplace back std::pair auto eb_val = store_.emplace_back( std::piecewise_construct, - std::forward_as_tuple(key_arr), + std::forward_as_tuple(key_string.data()), std::forward< std::tuple >(args)); // Update pointers @@ -524,7 +531,7 @@ class AnkerlAsinStr2IntMmapableVector { /* Get key for given member */ key_type get_key(value_type const& vt) const { - return key_type(vt.first.str, 10); + return key_type(vt.first.str, fixed_str_len); } /* Mmap save/load with MmapStore */ @@ -564,7 +571,7 @@ class AnkerlAsinStr2IntMmapableVector { template class iter_t { using ptr_t = typename std::conditional_t; + AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>; ptr_t iter_data_{}; template @@ -572,12 +579,12 @@ class AnkerlAsinStr2IntMmapableVector { public: using iterator_category = std::forward_iterator_tag; - using difference_type = AnkerlAsinStr2IntMmapableVector::difference_type; - using value_type = AnkerlAsinStr2IntMmapableVector::value_type; + using difference_type = AnkerlFixedLen10Str2IntMmapableVector::difference_type; + using value_type = AnkerlFixedLen10Str2IntMmapableVector::value_type; using reference = typename std::conditional_t; using pointer = typename std::conditional_t; + AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>; iter_t() noexcept = default; diff --git a/pecos/utils/mmap_hashmap_util.py b/pecos/utils/mmap_hashmap_util.py index 74c13053..b9d1cc0e 100644 --- a/pecos/utils/mmap_hashmap_util.py +++ b/pecos/utils/mmap_hashmap_util.py @@ -187,7 +187,11 @@ def init(cls, map_type, map_dir, lazy_load): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["load"](map_dir.encode("utf-8"), lazy_load) - if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntReadOnly(map_ptr, fn_dict) elif map_type == "int2int": return _MmapHashmapInt2IntReadOnly(map_ptr, fn_dict) @@ -340,7 +344,11 @@ def init(cls, map_type, map_dir): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["new"]() - if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntWrite(map_ptr, fn_dict, map_dir) elif map_type == "int2int": return _MmapHashmapInt2IntWrite(map_ptr, fn_dict, map_dir) diff --git a/test/pecos/utils/test_mmap_hashmap_util.py b/test/pecos/utils/test_mmap_hashmap_util.py index eb74be84..1bed124a 100644 --- a/test/pecos/utils/test_mmap_hashmap_util.py +++ b/test/pecos/utils/test_mmap_hashmap_util.py @@ -64,6 +64,8 @@ def test_str2int_mmap_hashmap(tmpdir): ) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 def test_fixed_len_str2int_mmap_hashmap(tmpdir): @@ -119,19 +121,27 @@ def test_fixed_len_str2int_mmap_hashmap(tmpdir): ) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 -def test_asin_str2int_mmap_hashmap(tmpdir): +def test_fixed_len_10_str2int_mmap_hashmap(tmpdir): from pecos.utils.mmap_hashmap_util import MmapHashmap, MmapHashmapBatchGetter + len_10_a_string = "a" * 10 + len_10_b_string = "b" * 10 + len_10_c_string = "c" * 10 - map_dir = tmpdir.join("asin_str2int").realpath().strpath - kv_dict = {"aaaaaaaaaa".encode("utf-8"): 2, "bbbbbbbbbb".encode("utf-8"): 3} + map_dir = tmpdir.join("fixed_len_10_str2int").realpath().strpath + kv_dict = { + len_10_a_string.encode("utf-8"): 2, + len_10_b_string.encode("utf-8"): 3 + } # Write-only Mode - w_map = MmapHashmap("asin_str2int") + w_map = MmapHashmap("fixed_len_10_str2int") w_map.open("w", map_dir) # Insert - w_map.map.insert("aaaaaaaaaa".encode("utf-8"), 1) # Test for overwrite later + w_map.map.insert(len_10_a_string.encode("utf-8"), 1) # Test for overwrite later for k, v in kv_dict.items(): w_map.map.insert(k, v) # Size @@ -139,7 +149,7 @@ def test_asin_str2int_mmap_hashmap(tmpdir): w_map.close() # Read-only Mode - r_map = MmapHashmap("asin_str2int") + r_map = MmapHashmap("fixed_len_10_str2int") r_map.open("r", map_dir) # Get for k, v in kv_dict.items(): @@ -147,11 +157,11 @@ def test_asin_str2int_mmap_hashmap(tmpdir): # Get with default for k, v in kv_dict.items(): assert r_map.map.get(k, 10) == v - assert r_map.map.get("cccccccccc".encode("utf-8"), 10) == 10 + assert r_map.map.get(len_10_c_string.encode("utf-8"), 10) == 10 # Contains for k, _ in kv_dict.items(): assert k in r_map.map - assert not ("cccccccccc".encode("utf-8") in r_map.map) + assert not (len_10_c_string.encode("utf-8") in r_map.map) # Size assert r_map.map.size() == len(kv_dict) @@ -159,21 +169,23 @@ def test_asin_str2int_mmap_hashmap(tmpdir): max_batch_size = 5 # max_batch_size > num of key r_map_batch_getter = MmapHashmapBatchGetter(r_map.map, max_batch_size) - ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] # Non-exist key + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] # Non-exist key vs = list(kv_dict.values()) + [10] assert r_map_batch_getter.get(ks, 10).tolist() == vs # max_batch_size = num of key - ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] * ( + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * ( max_batch_size - len(kv_dict) ) # Non-exist key vs = list(kv_dict.values()) + [10] * (max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(ks, 10).tolist() == vs # max_batch_size = num of key * 3 - ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] * ( + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * ( 3 * max_batch_size - len(kv_dict) ) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 def test_int2int_mmap_hashmap(tmpdir): @@ -226,3 +238,5 @@ def test_int2int_mmap_hashmap(tmpdir): ks = list(kv_dict.keys()) + [1000] * (3 * max_batch_size - len(kv_dict)) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(np.array(ks, dtype=np.int64), 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15