From 45489256155c798e5a859e7d7bbe3e8f5272f6f2 Mon Sep 17 00:00:00 2001 From: Ye Cao Date: Thu, 20 Jun 2024 10:16:41 +0800 Subject: [PATCH] Refine the config and the ci test of llm kv cache. (#1915) A follow up of https://github.com/v6d-io/v6d/pull/1913 Signed-off-by: Ye Cao --- .github/workflows/build-test.yml | 1 + modules/llm-cache/README.md | 12 +++++------ modules/llm-cache/tests/k8s-test/worker.py | 5 +++-- .../tests/kv_cache_local_file_test.cc | 20 +++++++++---------- modules/llm-cache/tests/kv_cache_test.cc | 18 ++++++++--------- python/vineyard/llm/cache.py | 3 +-- python/vineyard/llm/tests/test_llm.py | 8 ++++---- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 406e33d7..edbd8e2e 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -377,6 +377,7 @@ jobs: rm -rf default.etcd rm -rf /dev/shm/etcd* + mkdir -p /tmp/vineyard/llm_cache python3 test/runner.py $RUNNER_ARGS --with-llm-python - name: Run contrib/thirdparty tests diff --git a/modules/llm-cache/README.md b/modules/llm-cache/README.md index 6259728d..dead6ba9 100644 --- a/modules/llm-cache/README.md +++ b/modules/llm-cache/README.md @@ -152,8 +152,8 @@ import vineyard from vineyard.llm import KVCache from vineyard.llm import KVTensor -from vineyard.llm.config import FileCacheConfig -from vineyard.llm.config import VineyardCacheConfig +from vineyard.llm.cache import FileCacheConfig +from vineyard.llm.cache import VineyardCacheConfig vineyard_cache_config = VineyardCacheConfig( socket="/tmp/vineyard_test.sock" @@ -205,7 +205,7 @@ for _ in range(len(tokens)): ] ) -matched = cache.query(tokens, kv_tensors_to_query) +matched = cache.query(None, tokens, kv_tensors_to_query) kv_tensors_from_cache = kv_tensors_to_query[:matched] assert matched == len(tokens) @@ -243,8 +243,8 @@ import vineyard from vineyard.llm import KVCache from vineyard.llm import KVTensor -from vineyard.llm.config import FileCacheConfig -from vineyard.llm.config import VineyardCacheConfig +from vineyard.llm.cache import FileCacheConfig +from vineyard.llm.cache import VineyardCacheConfig file_cache_config = FileCacheConfig( chunk_size=2, @@ -299,7 +299,7 @@ for _ in range(len(tokens)): for _ in range(cache.layer) ] ) -matched = cache.query(tokens, kv_tensors) +matched = cache.query(None, tokens, kv_tensors) assert matched == len(tokens) assert len(kv_tensors) == len(kv_tensors_from_cache) diff --git a/modules/llm-cache/tests/k8s-test/worker.py b/modules/llm-cache/tests/k8s-test/worker.py index f46da83c..1cbaf7b1 100644 --- a/modules/llm-cache/tests/k8s-test/worker.py +++ b/modules/llm-cache/tests/k8s-test/worker.py @@ -6,7 +6,7 @@ import numpy as np from vineyard.llm import KVCache, KVTensor -from vineyard.llm.config import FileCacheConfig +from vineyard.llm.cache import FileCacheConfig def start_server(port=8888): serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -79,11 +79,12 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor): tokens = tokens.decode('utf-8') tokens = tokens.replace('\n', '').split(' ') tokens = [int(token) for token in tokens] + tokens = tokens[:len(tokens) - len(tokens) % batch_size] kv_cache_list = reserve_kv_tensors(kv_cache_list, len(tokens), kv_tensor) query_start_time = time.time() - matched = cache.query(tokens, kv_cache_list) + matched = cache.query(None, tokens, kv_cache_list) query_end_time = time.time() if matched > 0: total_query_time += query_end_time - query_start_time diff --git a/modules/llm-cache/tests/kv_cache_local_file_test.cc b/modules/llm-cache/tests/kv_cache_local_file_test.cc index ee9e4f4d..5892d6ad 100644 --- a/modules/llm-cache/tests/kv_cache_local_file_test.cc +++ b/modules/llm-cache/tests/kv_cache_local_file_test.cc @@ -59,7 +59,7 @@ void print_current_tokens(const std::vector& prefix, int next_token) { } void print_kv_state(const std::vector>& kv_state) { - LOG(INFO) << "kv_state: "; + VLOG(100) << "kv_state: "; for (size_t i = 0; i < kv_state.size(); ++i) { uint8_t* key_state_data = reinterpret_cast(kv_state[i].first.data); @@ -72,10 +72,10 @@ void print_kv_state(const std::vector>& kv_state) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } - LOG(INFO) << "layer " << i << ":"; - LOG(INFO) << "key_state: " << key_state_str; - LOG(INFO) << "value_state: " << value_state_str; - LOG(INFO) << "---------------------"; + VLOG(100) << "layer " << i << ":"; + VLOG(100) << "key_state: " << key_state_str; + VLOG(100) << "value_state: " << value_state_str; + VLOG(100) << "---------------------"; } } @@ -107,16 +107,16 @@ void check_kv_state(const std::vector>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { - LOG(INFO) << "kv_state length: " << kv_state[index].first.length + VLOG(100) << "kv_state length: " << kv_state[index].first.length << "tensorNBytes: " << tensorNBytes << "layer: " << layer; VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes + VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; - LOG(INFO) << "key_state[" << i << "]: " + VLOG(100) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] << ". But is should be " << (static_cast(token)) + i + index; @@ -124,9 +124,9 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes + VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; - LOG(INFO) << "value_state[" << i << "]: " + VLOG(100) << "value_state[" << i << "]: " << (reinterpret_cast( kv_state[index].second.data))[i] << ". But is should be " diff --git a/modules/llm-cache/tests/kv_cache_test.cc b/modules/llm-cache/tests/kv_cache_test.cc index 25b39060..9e8b8a46 100644 --- a/modules/llm-cache/tests/kv_cache_test.cc +++ b/modules/llm-cache/tests/kv_cache_test.cc @@ -64,7 +64,7 @@ void print_current_tokens(const std::vector& prefix, int next_token) { } void print_kv_state(const std::vector>& kv_state) { - LOG(INFO) << "kv_state: "; + VLOG(100) << "kv_state: "; for (size_t i = 0; i < kv_state.size(); ++i) { uint8_t* key_state_data = reinterpret_cast(kv_state[i].first.data); @@ -77,10 +77,10 @@ void print_kv_state(const std::vector>& kv_state) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } - LOG(INFO) << "layer " << i << ":"; - LOG(INFO) << "key_state: " << key_state_str; - LOG(INFO) << "value_state: " << value_state_str; - LOG(INFO) << "---------------------"; + VLOG(100) << "layer " << i << ":"; + VLOG(100) << "key_state: " << key_state_str; + VLOG(100) << "value_state: " << value_state_str; + VLOG(100) << "---------------------"; } } @@ -116,9 +116,9 @@ void check_kv_state(const std::vector>& kv_state, for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes + VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; - LOG(INFO) << "key_state[" << i << "]: " + VLOG(100) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] << ". But is should be " << (static_cast(token)) + i + index; @@ -126,9 +126,9 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes + VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; - LOG(INFO) << "value_state[" << i << "]: " + VLOG(100) << "value_state[" << i << "]: " << (reinterpret_cast( kv_state[index].second.data))[i] << ". But is should be " diff --git a/python/vineyard/llm/cache.py b/python/vineyard/llm/cache.py index 3e8859af..925a04ea 100644 --- a/python/vineyard/llm/cache.py +++ b/python/vineyard/llm/cache.py @@ -78,7 +78,6 @@ def __init__( """ import vineyard - self.socket = socket self.block_size = block_size self.sync_interval = sync_interval self.llm_cache_sync_lock = llm_cache_sync_lock @@ -91,7 +90,7 @@ def __init__( def __repr__(self): return ( f'VineyardCacheConfig(' - f'socket={self.socket}, ' + f'ipc_client={self.ipc_client}, ' f'block_size={self.block_size}, ' f'sync_interval={self.sync_interval}, ' f'llm_cache_sync_lock={self.llm_cache_sync_lock}, ' diff --git a/python/vineyard/llm/tests/test_llm.py b/python/vineyard/llm/tests/test_llm.py index 57549e4c..85688563 100644 --- a/python/vineyard/llm/tests/test_llm.py +++ b/python/vineyard/llm/tests/test_llm.py @@ -20,8 +20,8 @@ from vineyard.llm import KVCache from vineyard.llm import KVTensor -from vineyard.llm.config import FileCacheConfig -from vineyard.llm.config import VineyardCacheConfig +from vineyard.llm.cache import FileCacheConfig +from vineyard.llm.cache import VineyardCacheConfig def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets): @@ -75,7 +75,7 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets): ] ) - matched = cache.query(tokens, kv_tensors_to_query) + matched = cache.query(None, tokens, kv_tensors_to_query) kv_tensors_from_cache = kv_tensors_to_query[:matched] assert matched == len(tokens) @@ -151,7 +151,7 @@ def test_kv_cache_update_and_query_on_fs(): for _ in range(cache.layer) ] ) - matched = cache.query(tokens, kv_tensors) + matched = cache.query(None, tokens, kv_tensors) assert matched == len(tokens) assert len(kv_tensors) == len(kv_tensors_from_cache)