Refine the config and the CI test of the LLM KV cache. (#1915)

A follow-up to #1913.

Signed-off-by: Ye Cao <[email protected]>
v6d-io · Jun 20, 2024 · 4548925 · 4548925
1 parent b6e0bfa
commit 4548925
Show file tree

Hide file tree

Showing 7 changed files with 34 additions and 33 deletions.
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
@@ -377,6 +377,7 @@ jobs:
 
           rm -rf default.etcd
           rm -rf /dev/shm/etcd*
+          mkdir -p /tmp/vineyard/llm_cache
           python3 test/runner.py $RUNNER_ARGS --with-llm-python
 
       - name: Run contrib/thirdparty tests

diff --git a/modules/llm-cache/README.md b/modules/llm-cache/README.md
@@ -152,8 +152,8 @@ import vineyard
 
 from vineyard.llm import KVCache
 from vineyard.llm import KVTensor
-from vineyard.llm.config import FileCacheConfig
-from vineyard.llm.config import VineyardCacheConfig
+from vineyard.llm.cache import FileCacheConfig
+from vineyard.llm.cache import VineyardCacheConfig
 
 vineyard_cache_config = VineyardCacheConfig(
     socket="/tmp/vineyard_test.sock"
@@ -205,7 +205,7 @@ for _ in range(len(tokens)):
         ]
     )
 
-matched = cache.query(tokens, kv_tensors_to_query)
+matched = cache.query(None, tokens, kv_tensors_to_query)
 kv_tensors_from_cache = kv_tensors_to_query[:matched]
 assert matched == len(tokens)
 
@@ -243,8 +243,8 @@ import vineyard
 
 from vineyard.llm import KVCache
 from vineyard.llm import KVTensor
-from vineyard.llm.config import FileCacheConfig
-from vineyard.llm.config import VineyardCacheConfig
+from vineyard.llm.cache import FileCacheConfig
+from vineyard.llm.cache import VineyardCacheConfig
 
 file_cache_config = FileCacheConfig(
     chunk_size=2,
@@ -299,7 +299,7 @@ for _ in range(len(tokens)):
             for _ in range(cache.layer)
         ]
     )
-matched = cache.query(tokens, kv_tensors)
+matched = cache.query(None, tokens, kv_tensors)
 assert matched == len(tokens)
 
 assert len(kv_tensors) == len(kv_tensors_from_cache)

diff --git a/modules/llm-cache/tests/k8s-test/worker.py b/modules/llm-cache/tests/k8s-test/worker.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from vineyard.llm import KVCache, KVTensor
-from vineyard.llm.config import FileCacheConfig
+from vineyard.llm.cache import FileCacheConfig
 
 def start_server(port=8888):
     serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -79,11 +79,12 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
         tokens = tokens.decode('utf-8')
         tokens = tokens.replace('\n', '').split(' ')
         tokens = [int(token) for token in tokens]
+        tokens = tokens[:len(tokens) - len(tokens) % batch_size]
 
         kv_cache_list = reserve_kv_tensors(kv_cache_list, len(tokens), kv_tensor)
 
         query_start_time = time.time()
-        matched = cache.query(tokens, kv_cache_list)
+        matched = cache.query(None, tokens, kv_cache_list)
         query_end_time = time.time()
         if matched > 0:
             total_query_time += query_end_time - query_start_time

diff --git a/modules/llm-cache/tests/kv_cache_local_file_test.cc b/modules/llm-cache/tests/kv_cache_local_file_test.cc
@@ -59,7 +59,7 @@ void print_current_tokens(const std::vector<int>& prefix, int next_token) {
 }
 
 void print_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
-  LOG(INFO) << "kv_state: ";
+  VLOG(100) << "kv_state: ";
   for (size_t i = 0; i < kv_state.size(); ++i) {
     uint8_t* key_state_data =
         reinterpret_cast<uint8_t*>(kv_state[i].first.data);
@@ -72,10 +72,10 @@ void print_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
       key_state_str += std::to_string(key_state_data[j]) + " ";
       value_state_str += std::to_string(value_state_data[j]) + " ";
     }
-    LOG(INFO) << "layer " << i << ":";
-    LOG(INFO) << "key_state: " << key_state_str;
-    LOG(INFO) << "value_state: " << value_state_str;
-    LOG(INFO) << "---------------------";
+    VLOG(100) << "layer " << i << ":";
+    VLOG(100) << "key_state: " << key_state_str;
+    VLOG(100) << "value_state: " << value_state_str;
+    VLOG(100) << "---------------------";
   }
 }
 
@@ -107,26 +107,26 @@ void check_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state,
                     int& token) {
   VINEYARD_ASSERT(kv_state.size() == (size_t) layer);
   for (size_t index = 0; index < kv_state.size(); ++index) {
-    LOG(INFO) << "kv_state length: " << kv_state[index].first.length
+    VLOG(100) << "kv_state length: " << kv_state[index].first.length
               << "tensorNBytes: " << tensorNBytes << "layer: " << layer;
     VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes);
     VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes);
     for (int i = 0; i < tensorNBytes; ++i) {
       if ((reinterpret_cast<uint8_t*>(kv_state[index].first.data))[i] !=
           (static_cast<uint8_t>(token)) + i + index) {
-        LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes
+        VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes
                   << " layer:" << index;
-        LOG(INFO) << "key_state[" << i << "]: "
+        VLOG(100) << "key_state[" << i << "]: "
                   << (reinterpret_cast<uint8_t*>(kv_state[index].first.data))[i]
                   << ". But is should be "
                   << (static_cast<uint8_t>(token)) + i + index;
         throw std::runtime_error("key_state error!");
       }
       if (reinterpret_cast<uint8_t*>(kv_state[index].second.data)[i] !=
           (static_cast<uint8_t>(token)) + i + index) {
-        LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes
+        VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes
                   << " layer:" << index;
-        LOG(INFO) << "value_state[" << i << "]: "
+        VLOG(100) << "value_state[" << i << "]: "
                   << (reinterpret_cast<uint8_t*>(
                          kv_state[index].second.data))[i]
                   << ". But is should be "

diff --git a/modules/llm-cache/tests/kv_cache_test.cc b/modules/llm-cache/tests/kv_cache_test.cc
@@ -64,7 +64,7 @@ void print_current_tokens(const std::vector<int>& prefix, int next_token) {
 }
 
 void print_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
-  LOG(INFO) << "kv_state: ";
+  VLOG(100) << "kv_state: ";
   for (size_t i = 0; i < kv_state.size(); ++i) {
     uint8_t* key_state_data =
         reinterpret_cast<uint8_t*>(kv_state[i].first.data);
@@ -77,10 +77,10 @@ void print_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
       key_state_str += std::to_string(key_state_data[j]) + " ";
       value_state_str += std::to_string(value_state_data[j]) + " ";
     }
-    LOG(INFO) << "layer " << i << ":";
-    LOG(INFO) << "key_state: " << key_state_str;
-    LOG(INFO) << "value_state: " << value_state_str;
-    LOG(INFO) << "---------------------";
+    VLOG(100) << "layer " << i << ":";
+    VLOG(100) << "key_state: " << key_state_str;
+    VLOG(100) << "value_state: " << value_state_str;
+    VLOG(100) << "---------------------";
   }
 }
 
@@ -116,19 +116,19 @@ void check_kv_state(const std::vector<std::pair<LLMKV, LLMKV>>& kv_state,
     for (int i = 0; i < tensorNBytes; ++i) {
       if ((reinterpret_cast<uint8_t*>(kv_state[index].first.data))[i] !=
           (static_cast<uint8_t>(token)) + i + index) {
-        LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes
+        VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes
                   << " layer:" << index;
-        LOG(INFO) << "key_state[" << i << "]: "
+        VLOG(100) << "key_state[" << i << "]: "
                   << (reinterpret_cast<uint8_t*>(kv_state[index].first.data))[i]
                   << ". But is should be "
                   << (static_cast<uint8_t>(token)) + i + index;
         throw std::runtime_error("key_state error!");
       }
       if (reinterpret_cast<uint8_t*>(kv_state[index].second.data)[i] !=
           (static_cast<uint8_t>(token)) + i + index) {
-        LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes
+        VLOG(100) << "token:" << token << " tensorNBytes" << tensorNBytes
                   << " layer:" << index;
-        LOG(INFO) << "value_state[" << i << "]: "
+        VLOG(100) << "value_state[" << i << "]: "
                   << (reinterpret_cast<uint8_t*>(
                          kv_state[index].second.data))[i]
                   << ". But is should be "

diff --git a/python/vineyard/llm/cache.py b/python/vineyard/llm/cache.py
@@ -78,7 +78,6 @@ def __init__(
         """
         import vineyard
 
-        self.socket = socket
         self.block_size = block_size
         self.sync_interval = sync_interval
         self.llm_cache_sync_lock = llm_cache_sync_lock
@@ -91,7 +90,7 @@ def __init__(
     def __repr__(self):
         return (
             f'VineyardCacheConfig('
-            f'socket={self.socket}, '
+            f'ipc_client={self.ipc_client}, '
             f'block_size={self.block_size}, '
             f'sync_interval={self.sync_interval}, '
             f'llm_cache_sync_lock={self.llm_cache_sync_lock}, '

diff --git a/python/vineyard/llm/tests/test_llm.py b/python/vineyard/llm/tests/test_llm.py
@@ -20,8 +20,8 @@
 
 from vineyard.llm import KVCache
 from vineyard.llm import KVTensor
-from vineyard.llm.config import FileCacheConfig
-from vineyard.llm.config import VineyardCacheConfig
+from vineyard.llm.cache import FileCacheConfig
+from vineyard.llm.cache import VineyardCacheConfig
 
 
 def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
@@ -75,7 +75,7 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
             ]
         )
 
-    matched = cache.query(tokens, kv_tensors_to_query)
+    matched = cache.query(None, tokens, kv_tensors_to_query)
     kv_tensors_from_cache = kv_tensors_to_query[:matched]
     assert matched == len(tokens)
 
@@ -151,7 +151,7 @@ def test_kv_cache_update_and_query_on_fs():
                 for _ in range(cache.layer)
             ]
         )
-    matched = cache.query(tokens, kv_tensors)
+    matched = cache.query(None, tokens, kv_tensors)
     assert matched == len(tokens)
 
     assert len(kv_tensors) == len(kv_tensors_from_cache)