diff --git a/.gitignore b/.gitignore index b014a4e..77e8eca 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ **/.ipynb_checkpoints/** **.tbl **/.vscode -**/DeepMapping.egg-info \ No newline at end of file +**/DeepMapping.egg-info +**.7z +**.onnx \ No newline at end of file diff --git a/DeepMapping/DeepMapping/byte_dictionary_compression.py b/DeepMapping/DeepMapping/byte_dictionary_compression.py index f5b7ce3..d93265e 100644 --- a/DeepMapping/DeepMapping/byte_dictionary_compression.py +++ b/DeepMapping/DeepMapping/byte_dictionary_compression.py @@ -1,8 +1,9 @@ -import pandas as pd -import numpy as np -import sys +import gc import math +import numpy as np import os +import pandas as pd +import sys from DeepMapping import ndb_utils from sklearn import preprocessing from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk @@ -55,7 +56,7 @@ def dict_compression(df): return dict_comp_data, list_encoder def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -70,10 +71,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -81,12 +78,14 @@ def measure_latency(df, data_ori, task_name, sample_size, search_algo : str search algorithm that applied to search entry in each partition """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() dict_comp_data, dict_encoder = dict_compression(df) list_type = [] @@ -132,200 +131,135 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + exp_data_dict['dict_compressor'] = dict_comp_data, dict_encoder + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_type = exp_data_dict['list_type'] + dict_comp_data, dict_encoder = exp_data_dict['dict_compressor'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) list_sample_index = 
ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = Timer() - timer_decomp = Timer() - timer_sort = Timer() - timer_lookup = Timer() - timer_locate_part = Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + + timer_total = Timer() + timer_decomp = Timer() + timer_lookup = Timer() + timer_total = Timer() + timer_sort = Timer() + timer_locate_part = Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(dict_comp_data)): - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - col_name = data_ori.dtype.names[i] - - if dict_encoder[i] is not None: - fun_a = lambda x: dict_encoder[i].classes_[x] - curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) - else: - curr_decomp_block[col_name] = block_data - - current_memory += curr_decomp_block.nbytes - decomp_block = curr_decomp_block - num_decomp += 1 - - if current_memory > peak_memory: - peak_memory = current_memory - - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - # ----- - - t_decomp += timer_decomp.toc() - # ----- - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = 
np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + # ----- + decomp_memory = 0 + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - if latency_optimized: - timer_total = Timer() - timer_decomp = Timer() - timer_lookup = Timer() - timer_total = Timer() - timer_sort = Timer() - timer_locate_part = Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - # ----- - decomp_memory = 0 - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + for i in range(1, len(dict_comp_data)): + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) block_bytes = read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + decomp_memory += sys.getsizeof(block_bytes) + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) + col_name = data_ori.dtype.names[i] + + if dict_encoder[i] is not None: + # encoded col + fun_a = lambda x: dict_encoder[i].classes_[x] + curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) + else: + curr_decomp_block[col_name] = block_data - for i in range(1, len(dict_comp_data)): - file_name = os.path.join(comp_data_dir, str(part_idx) + 
'-{}.data'.format(i)) - block_bytes = read_bytes_from_disk(file_name) - decomp_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - col_name = data_ori.dtype.names[i] - - if dict_encoder[i] is not None: - # encoded col - fun_a = lambda x: dict_encoder[i].classes_[x] - curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) - else: - curr_decomp_block[col_name] = block_data - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - result_idx += 1 - - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - return_latency = None + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time #TODO this is required for build hash table, current is no needed, use binary search instead + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: return_latency = latency_optimized_latency.reshape((1,-1)) diff --git a/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py b/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py new file mode 100644 index 0000000..c043266 --- /dev/null +++ b/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py @@ -0,0 +1,25 @@ +import os +import shutil +import tensorflow as tf + +"""This script is used to convert the h5 model into onnx format, +if you want to use onnxrunutime as backend. + +You are required to install tf2onnx by using pip. 
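+
+For reference, the conversion that the loop below performs for every .h5 file
+is equivalent to the following commands (the model path is illustrative):
+
+    pip install tf2onnx
+    python -m tf2onnx.convert --saved-model models/nas/tpch-s1/<model_name> --output models/nas/tpch-s1/<model_name>.onnx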
+""" + +for root, dirs, files in os.walk("models/nas/tpch-s1/", topdown=False): + for name in files: + if '.h5' in name: + model_name = name.split('.')[0] + # h5 file + model_path = os.path.join(root, name) + + model = tf.keras.models.load_model(model_path, compile=False) + # save in pb + model.save(os.path.join(root, model_name)) + cmd = "python -m tf2onnx.convert --saved-model {} --output {}.onnx".format(os.path.join(root, model_name), + os.path.join(root, model_name)) + os.system(cmd) + shutil.rmtree(os.path.join(root, model_name)) + print(root, name, cmd) diff --git a/DeepMapping/DeepMapping/deepmapping.py b/DeepMapping/DeepMapping/deepmapping.py index e7fa1c4..9b68057 100644 --- a/DeepMapping/DeepMapping/deepmapping.py +++ b/DeepMapping/DeepMapping/deepmapping.py @@ -1,17 +1,24 @@ -import pandas as pd +import ctypes +import gc +import math import numpy as np +import onnx +import onnxruntime as ort +import os +import pandas as pd import sys +import tensorflow as tf import zstd -import math -import os + +from bitarray import bitarray +from collections import defaultdict from DeepMapping import ndb_utils -import tensorflow as tf +from onnx_opcounter import calculate_params + +from sklearn import preprocessing from tensorflow.keras import layers, regularizers from tensorflow import keras -import ctypes -from sklearn import preprocessing -from bitarray import bitarray -from more_itertools import run_length + from tqdm.auto import tqdm @@ -78,7 +85,7 @@ def __getitem__(self, index): X = create_features_c_multi_thread(shared_utils, data_x, num_record, max_len) return X, Y - + def on_epoch_end(self): """Updates indexes after each epoch @@ -87,6 +94,38 @@ def on_epoch_end(self): if self.shuffle == True: np.random.shuffle(self.indexes) +class InferenceDataGenerator(tf.keras.utils.Sequence): + def __init__(self, x, batch_size, max_len, shuffle=False): + self.x = x + self.batch_size = batch_size + self.max_len = max_len + self.shuffle = shuffle + + def __len__(self): + return int(np.ceil(len(self.x) / self.batch_size)) + + def __getitem__(self, index): + idx_start = index*self.batch_size + idx_end = (index+1)*self.batch_size + data_x = self.x[idx_start:idx_end] + + num_record = len(data_x) + max_len = self.max_len + + shared_utils.create_fetures.restype = ctypes.POINTER(ctypes.c_bool * (num_record * max_len * 10)) + shared_utils.create_fetures_mutlt_thread_mgr.restype = ctypes.POINTER(ctypes.c_bool * (num_record * max_len * 10)) + + X = create_features_c_multi_thread(shared_utils, data_x, num_record, max_len) + + return X + + + def on_epoch_end(self): + """Updates indexes after each epoch + """ + self.indexes = np.arange(len(self.x)) + + def build_model(num_in, model_sturcture, list_num_out): x = tf.keras.Input(shape=(num_in,1)) flatten = tf.keras.layers.Flatten(input_shape=(num_in,1), name='in')(x) @@ -174,12 +213,51 @@ def compress_data(df, model_sturcture, batch_size=1024, num_epochs=500, train_ve else: return model, train_generator +def finetune_model(df, model_path, batch_size=1024, num_epochs=500, train_verbose=1, train=True): + df_key = [df.columns[0]] + list_y_encoded = [] + list_y_encoder = [] + + for col in df.columns: + if col not in df_key: + encoded_val, encoder = encode_label(df[col]) + list_y_encoded.append(encoded_val) + list_y_encoder.append(encoder) + num_tasks = len(list_y_encoded) + num_tasks + + for encoder in list_y_encoder: + print(len(encoder.classes_)) + + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + print('MAX LEN', max_len) + list_num_out 
= [len(encoder.classes_) for encoder in list_y_encoder] + strategy = tf.distribute.MirroredStrategy() + print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + with strategy.scope(): + # Everything that creates variables should be under the strategy scope. + # In general this is only model construction & `compile()`. + # model = build_model(max_len*10, model_sturcture, list_num_out) + # x = tf.keras.Input(shape=(num_in,1)) + model = tf.keras.models.load_model(model_path) + model = tf.keras.models.clone_model(model, tf.keras.Input(shape=(max_len*10,1))) + + opt = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=1e-3/1000) + model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=["accuracy"]) + train_generator = DataGenerator(x, list_y_encoded, batch_size, max_len) + + if train == True: + train_history = model.fit(train_generator, epochs=num_epochs, verbose=train_verbose, callbacks=[SOMT(model, 1)]) + return model, train_history + else: + return model, train_generator + def measure_latency_any(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', path_to_model=None, block_size=1024*1024): - # TODO add support of hash to run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -193,10 +271,6 @@ def measure_latency_any(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -206,6 +280,8 @@ def measure_latency_any(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + backend = os.environ['BACKEND'] + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None @@ -213,20 +289,25 @@ def measure_latency_any(df, data_ori, task_name, sample_size, memory_optimized_result = None latency_optimized_result = None + + root_path = 'temp' + folder_name = 'deepmapping' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + print('[Generate File Path]: {}'.format(comp_data_dir)) + df_key = [df.columns[0]] list_y_encoded = [] list_y_encoder = [] size_encoder = 0 - for col in df.columns: - if col not in df_key: - encoded_val, encoder = encode_label(df[col]) - list_y_encoded.append(encoded_val) - list_y_encoder.append(encoder) - size_encoder += encoder.classes_.nbytes - num_tasks = len(list_y_encoded) - - for encoder in list_y_encoder: - print(len(encoder.classes_)) + exist_bit_arr = None + num_record_per_part = None + data_comp_size = None + max_len = None + x_start = None + x_end = None + num_tasks = None shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, @@ -242,45 +323,80 @@ def measure_latency_any(df, data_ori, task_name, sample_size, num_threads = 8 - x = df[df_key[0]].values.astype(np.int32) - max_len = len(str(np.max(x))) - y = np.array(list_y_encoded).T.astype(np.int32) - data = 
np.concatenate((x.reshape(-1,1), y), axis=1, dtype=np.int32) - print(data.nbytes/1024/1024) - if path_to_model is None: - model = tf.keras.models.load_model('models/nas/{}.h5'.format(task_name), compile=False) + if backend == 'tf': + model = tf.keras.models.load_model('models/nas/{}.h5'.format(task_name), compile=False) + model_size = model.count_params()*4/1024/1024 + elif backend == 'onnx': + model = ort.InferenceSession('models/nas/{}.onnx'.format(task_name), providers=['CUDAExecutionProvider']) + input_name = model.get_inputs()[0].name + model_size = calculate_params(onnx.load_model('models/nas/{}.onnx'.format(task_name)))*4/1024/1024 else: - model = tf.keras.models.load_model(path_to_model, compile=False) - train_generator = DataGenerator(x, list_y_encoded, 1024*2**4, max_len) + if backend == 'tf': + model = tf.keras.models.load_model(path_to_model, compile=False) + model_size = model.count_params()*4/1024/1024 + elif backend == 'onnx': + model = ort.InferenceSession(path_to_model, providers=['CUDAExecutionProvider']) + input_name = model.get_inputs()[0].name + model_size = calculate_params(onnx.load_model(path_to_model))*4/1024/1024 - # exist_bitarray - x_start = np.min(x) - x_end = np.max(x) - exist_bit_arr = bitarray('0')*(x_end - x_start + 1) - - for val in x: - exist_bit_arr[val-x_start] = 1 - print(sys.getsizeof(exist_bit_arr)/1024/1024) - - root_path = 'temp' - folder_name = 'ours-any' - comp_data_dir = os.path.join(root_path, task_name, folder_name) - print('[Generate File Path]: {}'.format(comp_data_dir)) # generate file if generate_file: ndb_utils.recreate_temp_dir(comp_data_dir) + exp_data_dict = dict() + + + for col in df.columns: + if col not in df_key: + encoded_val, encoder = encode_label(df[col]) + list_y_encoded.append(encoded_val) + list_y_encoder.append(encoder) + size_encoder += encoder.classes_.nbytes / 1024 / 1024 + num_tasks = len(list_y_encoded) + + exp_data_dict['list_y_encoder'] = list_y_encoder + exp_data_dict['num_tasks'] = num_tasks + exp_data_dict['size_encoder'] = size_encoder + + for encoder in list_y_encoder: + print(len(encoder.classes_)) + + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + exp_data_dict['max_len'] = max_len + y = np.array(list_y_encoded).T.astype(np.int32) + data = np.concatenate((x.reshape(-1,1), y), axis=1, dtype=np.int32) + print(data.nbytes/1024/1024) + + train_generator = DataGenerator(x, list_y_encoded, 1024*2**4, max_len) + + # exist_bitarray + x_start = np.min(x) + x_end = np.max(x) + exist_bit_arr = bitarray('0')*(x_end - x_start + 1) + + for val in x: + exist_bit_arr[val-x_start] = 1 + print(sys.getsizeof(exist_bit_arr)/1024/1024) + misclassified_index = [] for idx, (x_sub,y_sub) in tqdm(enumerate(train_generator), total=len(train_generator)): y_sub = list(y_sub.values()) - y_sub_pred = model(x_sub) + if backend == 'tf': + y_sub_pred = model(x_sub) + elif backend == 'onnx': + y_sub_pred = model.run(None, {input_name: np.expand_dims(x_sub, -1).astype(np.float32)}) mis_pred = [] for i in range(num_tasks): if num_tasks == 1: - mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) + if backend == 'tf': + mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) + if backend == 'onnx': + mis_pred.append(y_sub[i] != np.argmax(y_sub_pred[0], axis=1)) + # mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) else: mis_pred.append(y_sub[i] != np.argmax(y_sub_pred[i], axis=1)) @@ -302,8 +418,6 @@ def measure_latency_any(df, data_ori, task_name, sample_size, if len(misclassified_data) == 0: 
misclassified_data = np.zeros((1,2)) record_size = misclassified_data[0].nbytes - # block_size = 1024 * 1024 - # block_size = 1024 * 512 num_record_per_part = np.floor(block_size / record_size) x_start = np.min(misclassified_data[:,0]) @@ -314,297 +428,262 @@ def measure_latency_any(df, data_ori, task_name, sample_size, list_comp_aux_blocks = [] comp_zstd_size = 0 + data_partition_idx = (misclassified_data[:, 0] - x_start) // num_record_per_part for block_idx in tqdm(range(num_partition)): - val_start, val_end = x_start + block_idx*num_record_per_part, x_start + (block_idx+1)*num_record_per_part - data_idx = np.logical_and(misclassified_data[:, 0] >= val_start, misclassified_data[:, 0] < val_end) + # val_start, val_end = x_start + block_idx*num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = data_partition_idx == block_idx + # data_idx = np.logical_and(misclassified_data[:, 0] >= val_start, misclassified_data[:, 0] < val_end) data_part = misclassified_data[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[:, 0], order='F').astype(np.int32) + if len(data_part) == 0: continue data_bytes = data_part.tobytes() - data_zstd_comp = zstd.compress(data_bytes,1) + data_zstd_comp = zstd.compress(data_bytes) list_comp_aux_blocks.append(data_zstd_comp) comp_zstd_size += sys.getsizeof(data_zstd_comp)/1024/1024 file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') ndb_utils.save_byte_to_disk(file_name, data_zstd_comp) data_ori_size = data_ori.nbytes/1024/1024 - data_comp_size = [size_encoder, comp_zstd_size, model.count_params()*4/1024/1024, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] + data_comp_size = [size_encoder, comp_zstd_size, model_size, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_comp_size)) - np.save(os.path.join(comp_data_dir, 'num_record_per_part'), num_record_per_part) + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + x_start = np.min(x) + x_end = np.max(x) + + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = [size_encoder, comp_zstd_size, model_size, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] + exp_data_dict['max_len'] = max_len + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_byte_to_disk(os.path.join(comp_data_dir, 'exist_bit_arr.data'), zstd.compress(exist_bit_arr.tobytes())) + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: - num_record_per_part = np.load(os.path.join(comp_data_dir, 'num_record_per_part.npy')) + exist_bit_arr = bitarray() + exist_bit_arr.frombytes(zstd.decompress(ndb_utils.read_bytes_from_disk(os.path.join(comp_data_dir, 'exist_bit_arr.data')))) + + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + max_len = exp_data_dict['max_len'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_y_encoder = exp_data_dict['list_y_encoder'] + num_tasks = exp_data_dict['num_tasks'] + size_encoder = 
exp_data_dict['size_encoder'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) - x = df[df_key[0]].values.astype(np.int32) - max_len = len(str(np.max(x))) - x_start = np.min(x) - x_end = np.max(x) + data_ori = data_ori[:2] + del df + gc.collect() shared_utils.create_fetures.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_long, ctypes.c_int] shared_utils.create_fetures_mutlt_thread_mgr.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_long, ctypes.c_int32, ctypes.c_int32] shared_utils.create_fetures_mutlt_thread_mgr.restype = ctypes.POINTER(ctypes.c_bool * (sample_size * max_len * 10)) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimzed strategy - if memory_optimized: - timer_creatfeatures = ndb_utils.Timer() - timer_nn = ndb_utils.Timer() - timer_aux_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_exist_lookup = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_remap = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_remap = 0 - t_locate_part = 0 - t_decomp = 0 - t_createfeatures = 0 - t_aux_lookup = 0 - t_nn = 0 - t_exist_lookup = 0 - t_total = 0 - t_sort = 0 - peak_memory = -1 - block_bytes_size = 0 - - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_aux_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = None - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - timer_creatfeatures.tic() - result = np.recarray((sample_size, ), dtype=data_ori.dtype) - result[df_key[0]] = sample_index + + timer_creatfeatures = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_nn = ndb_utils.Timer() + timer_aux_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_exist_lookup = ndb_utils.Timer() + timer_remap = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_remap = 0 + t_decomp = 0 + t_createfeatures = 0 + t_aux_lookup = 0 + t_nn = 0 + t_exist_lookup = 0 + t_total = 0 + t_sort = 0 + t_locate_part = 0 + t_build_index = 0 + block_bytes_size = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_aux_block = dict() + num_decomp = 0 + count_nonexist = 0 + peak_memory = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size, ), dtype=data_ori.dtype) + result[df_key[0]] = sample_index + if mode == 'edge': + edge_batch_size = 5000 + timer_creatfeatures.tic() + inference_generator = InferenceDataGenerator(sample_index, edge_batch_size, max_len) + for idx, (x_sub) in enumerate(inference_generator): + t_createfeatures += timer_creatfeatures.toc() + timer_nn.tic() + if backend == 'tf': + 
y_nn_pred = model(x_sub) + elif backend == 'onnx': + y_nn_pred = model.run(None, {input_name: np.expand_dims(x_sub, -1).astype(np.float32)}) + + for i in range(num_tasks): + if num_tasks == 1 and backend == 'onnx': + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred[0], axis=1) + elif num_tasks == 1 and backend == 'tf': + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred, axis=1) + else: + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred[i], axis=1) + t_nn += timer_nn.toc() + timer_creatfeatures.tic() + + else: + timer_creatfeatures.tic() + x_features_arr = np.zeros(sample_size * max_len * 10, dtype=bool) x_features_arr_ptr = shared_utils.create_fetures_mutlt_thread_mgr( x_features_arr, sample_index, sample_size, max_len, num_threads) sampled_features = np.frombuffer( x_features_arr_ptr.contents, dtype=bool).reshape(sample_size, -1) + # sampled_features = ndb_utils.create_features(sample_index, max_len)[0] t_createfeatures += timer_creatfeatures.toc() - # --------- timer_nn.tic() - y_nn_pred = model(sampled_features) + if backend == 'tf': + y_nn_pred = model(sampled_features) + elif backend == 'onnx': + y_nn_pred = model.run(None, {input_name: np.expand_dims(sampled_features, -1).astype(np.float32)}) + for i in range(num_tasks): - if num_tasks == 1: + if num_tasks == 1 and backend == 'onnx': + col_name = data_ori.dtype.names[i+1] + result[col_name] = np.argmax(y_nn_pred[0], axis=1) + elif num_tasks == 1 and backend == 'tf': col_name = data_ori.dtype.names[i+1] result[col_name] = np.argmax(y_nn_pred, axis=1) else: col_name = data_ori.dtype.names[i+1] result[col_name] = np.argmax(y_nn_pred[i], axis=1) t_nn += timer_nn.toc() - for idx, val in enumerate(sample_index_sorted): - # ------ non exist look up - timer_exist_lookup.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - exist_flag = exist_bit_arr[query_key-x_start] == 1 + + for idx, val in enumerate(sample_index): + # ------ non exist look up + timer_exist_lookup.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + exist_flag = exist_bit_arr[query_key-x_start] == 1 + + if not exist_flag: + result[query_key_index_in_old] = -1 + count_nonexist += 1 + t_exist_lookup += timer_exist_lookup.toc() + else: + # misclassified lookup t_exist_lookup += timer_exist_lookup.toc() - if not exist_flag: - result[idx] = -1 - count_nonexist += 1 - t_exist_lookup += timer_exist_lookup.toc() + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_aux_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_aux_block = ndb_utils.evict_unused_partition(decomp_aux_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] = 1 + + + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + + if not os.path.exists(file_name): + continue + block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) + data_uncomp = np.frombuffer( + # zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') + zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1) + # 
decomp_aux_block[part_idx] = data_uncomp + try: + decomp_aux_block[part_idx] = data_uncomp + except: + decomp_aux_block = dict() + decomp_aux_block[part_idx] = data_uncomp + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_zstd_comp) + prev_part_idx = part_idx + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(data_uncomp)): + data_entry_key = data_uncomp[block_data_idx, 0] + # print(data_entry_key) + data_entry_val = data_uncomp[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += data_uncomp.nbytes else: - # misclassified lookup - t_exist_lookup += timer_exist_lookup.toc() - timer_locate_part.tic() - part_idx = int((query_key - x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - if not os.path.exists(file_name): - continue - block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) - current_memory = sys.getsizeof(block_zstd_comp) - data_uncomp = np.frombuffer(zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') - - decomp_aux_block = data_uncomp - num_decomp += 1 - current_memory += data_uncomp.nbytes - prev_part_idx = part_idx - if current_memory > peak_memory: - peak_memory = current_memory - + data_uncomp = decomp_aux_block[part_idx] + partition_hit[part_idx] +=1 + t_decomp += timer_decomp.toc() + timer_aux_lookup.tic() + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(data_uncomp[:,0], query_key, len(data_uncomp)) + if data_idx != -1: + result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) else: - data_uncomp = decomp_aux_block - - t_decomp += timer_decomp.toc() - timer_aux_lookup.tic() + count_nonexist += 1 + elif search_algo == 'binary_c': data_idx = shared_utils.aux_look_up_bin(data_uncomp[:,0], query_key, len(data_uncomp)) if data_idx != -1: result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - t_aux_lookup += timer_aux_lookup.toc() - - timer_remap.tic() - for i in range(num_tasks): - col_name = data_ori.dtype.names[i+1] - fun_a = lambda x: list_y_encoder[i].classes_[x] - result[col_name] = fun_a(result[col_name].astype(np.int32)) - t_remap += timer_remap.toc() - t_total += timer_total.toc() - - - peak_memory += exist_bit_arr.nbytes - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, - t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model.count_params()*4/1024/1024)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_creatfeatures = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - timer_nn = ndb_utils.Timer() - timer_aux_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_exist_lookup = ndb_utils.Timer() - timer_remap = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - t_remap = 0 - t_decomp = 0 - t_createfeatures = 0 - t_aux_lookup = 0 - t_nn = 0 - t_exist_lookup = 0 - 
t_total = 0 - t_sort = 0 - t_locate_part = 0 - t_build_index = 0 - block_bytes_size = 0 - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_aux_block = dict() - num_decomp = 0 - count_nonexist = 0 - peak_memory = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - timer_creatfeatures.tic() - result = np.recarray((sample_size, ), dtype=data_ori.dtype) - result[df_key[0]] = sample_index - x_features_arr = np.zeros(sample_size * max_len * 10, dtype=bool) - x_features_arr_ptr = shared_utils.create_fetures_mutlt_thread_mgr( - x_features_arr, sample_index, sample_size, max_len, num_threads) - sampled_features = np.frombuffer( - x_features_arr_ptr.contents, dtype=bool).reshape(sample_size, -1) - # sampled_features = ndb_utils.create_features(sample_index, max_len)[0] - - t_createfeatures += timer_creatfeatures.toc() - timer_nn.tic() - y_nn_pred = model(sampled_features) - - for i in range(num_tasks): - if num_tasks == 1: - col_name = data_ori.dtype.names[i+1] - result[col_name] = np.argmax(y_nn_pred, axis=1) - else: - col_name = data_ori.dtype.names[i+1] - result[col_name] = np.argmax(y_nn_pred[i], axis=1) - t_nn += timer_nn.toc() - - for idx, val in enumerate(sample_index): - # ------ non exist look up - timer_exist_lookup.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - exist_flag = exist_bit_arr[query_key-x_start] == 1 - - if not exist_flag: - result[query_key_index_in_old] = -1 - count_nonexist += 1 - t_exist_lookup += timer_exist_lookup.toc() - else: - # misclassified lookup - t_exist_lookup += timer_exist_lookup.toc() - timer_locate_part.tic() - part_idx = int((query_key - x_start) // num_record_per_part) - - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx not in decomp_aux_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - if not os.path.exists(file_name): - continue - block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) - data_uncomp = np.frombuffer( - zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') - decomp_aux_block[part_idx] = data_uncomp - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_zstd_comp) - prev_part_idx = part_idx - - # TODO add size computation for hash approach - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(data_uncomp)): - data_entry_key = data_uncomp[block_data_idx, 0] - # print(data_entry_key) - data_entry_val = data_uncomp[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += data_uncomp.nbytes else: - data_uncomp = decomp_aux_block[part_idx] - t_decomp += timer_decomp.toc() - timer_aux_lookup.tic() - if search_algo == 'binary': - # TODO code can be optimized at revision stage - data_idx = ndb_utils.binary_search(data_uncomp[:,0], query_key, len(data_uncomp)) - if data_idx != -1: - result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - else: - count_nonexist += 1 - elif search_algo == 'binary_c': - data_idx = shared_utils.aux_look_up_bin(data_uncomp[:,0], query_key, 
len(data_uncomp)) - if data_idx != -1: - result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - else: - count_nonexist += 1 - elif search_algo == 'hash': - if query_key in data_hash.keys(): - result[query_key_index_in_old] = tuple(data_hash[query_key]) - - t_aux_lookup += timer_aux_lookup.toc() - - if cache_block_memory + block_bytes_size > peak_memory: - peak_memory = cache_block_memory + block_bytes_size - - timer_remap.tic() - for i in range(num_tasks): - col_name = data_ori.dtype.names[i+1] - fun_a = lambda x: list_y_encoder[i].classes_[x] - result[col_name] = fun_a(result[col_name].astype(np.int32)) - t_remap += timer_remap.toc() - t_total += timer_total.toc() - - peak_memory += exist_bit_arr.nbytes + count_nonexist += 1 + elif search_algo == 'hash': + if query_key in data_hash.keys(): + result[query_key_index_in_old] = tuple(data_hash[query_key]) + + t_aux_lookup += timer_aux_lookup.toc() + + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + + timer_remap.tic() + for i in range(num_tasks): + col_name = data_ori.dtype.names[i+1] + fun_a = lambda x: list_y_encoder[i].classes_[x] + result[col_name] = fun_a(result[col_name].astype(np.int32)) + t_remap += timer_remap.toc() + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model.count_params()*4/1024/1024)).T + del result + gc.collect() + peak_memory += exist_bit_arr.nbytes + latency_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model_size)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/delta_compression.py b/DeepMapping/DeepMapping/delta_compression.py index 18e8856..03c1fde 100644 --- a/DeepMapping/DeepMapping/delta_compression.py +++ b/DeepMapping/DeepMapping/delta_compression.py @@ -1,15 +1,17 @@ -import pandas as pd +import gc import numpy as np -import sys import math import os +import pandas as pd +import sys +import warnings from DeepMapping import ndb_utils from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -24,10 +26,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -37,17 +35,19 @@ def 
measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() list_type = [] for col in data_ori.dtype.names: - if data_ori[col].dtype == object: - list_type.append({'names': [col], 'formats': ['O'], 'offsets': [0], 'itemsize': 8}) + if data_ori[col].dtype == 'S8': + list_type.append((col, 'S8')) elif data_ori[col].dtype == np.float64: list_type.append(np.float64) else: @@ -129,235 +129,145 @@ def measure_latency(df, data_ori, task_name, sample_size, data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) np.save(os.path.join(comp_data_dir, 'list_delta_enabled'), list_delta_enabled) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + exp_data_dict['diff_dtype'] = diff_dtype + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) else: list_delta_enabled = np.load(os.path.join(comp_data_dir, 'list_delta_enabled.npy')) - - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + diff_dtype = exp_data_dict['diff_dtype'] + list_type = exp_data_dict['list_type'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - 
part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - curr_block = [] - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - current_memory += sys.getsizeof(block_bytes) - current_memory += block_data.nbytes - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(list_delta_enabled)): - col_name = data_ori.dtype.names[i] - - if list_delta_enabled[i] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - - curr_decomp_block[col_name] = block_data - current_memory += sys.getsizeof(block_bytes) - current_memory += block_data.nbytes - else: - # delta - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) - init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) - col_value = np.zeros(len(delta_data)+1, dtype=np.int32) - curr_value = init_value[0] - - for i in range(0, len(delta_data)): - col_value[i] = curr_value - curr_value += delta_data[i] - - curr_decomp_block[col_name] = col_value - current_memory += delta_data.nbytes - current_memory += init_value.nbytes - current_memory += col_value.nbytes - - decomp_block = curr_decomp_block - current_memory += decomp_block.nbytes - num_decomp += 1 - - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + decomp_memory = 0 + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 
1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - decomp_memory = 0 - - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(list_delta_enabled)): - col_name = data_ori.dtype.names[i] - - if list_delta_enabled[i] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - decomp_memory += sys.getsizeof(block_bytes) + for i in range(1, len(list_delta_enabled)): + col_name = data_ori.dtype.names[i] - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] + if list_delta_enabled[i] == False: + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[col_name] = block_data + if list_type[i] == np.int32 or list_type[i] == np.float64: + block_data = np.frombuffer(block_bytes, 
dtype=list_type[i]) else: - # delta - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) - init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) - col_value = np.zeros(len(delta_data)+1, dtype=np.int32) - curr_value = init_value[0] - - for i in range(0, len(delta_data)): - col_value[i] = curr_value - curr_value += delta_data[i] - - curr_decomp_block[col_name] = col_value - decomp_memory += delta_data.nbytes - decomp_memory += init_value.nbytes - decomp_memory += col_value.nbytes - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() + block_data = np.frombuffer(block_bytes, dtype=list_type[i])[col_name] + + curr_decomp_block[col_name] = block_data + else: + # delta + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) + init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) + col_value = np.zeros(len(delta_data)+1, dtype=np.int32) + curr_value = init_value[0] + + for i in range(0, len(delta_data)): + col_value[i] = curr_value + curr_value += delta_data[i] + + curr_decomp_block[col_name] = col_value + decomp_memory += delta_data.nbytes + decomp_memory += init_value.nbytes + decomp_memory += col_value.nbytes + + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key + t_decomp += timer_decomp.toc() + timer_lookup.tic() - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + 
latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/dgpe_compression.py b/DeepMapping/DeepMapping/dgpe_compression.py index 6f25507..22608b9 100644 --- a/DeepMapping/DeepMapping/dgpe_compression.py +++ b/DeepMapping/DeepMapping/dgpe_compression.py @@ -79,7 +79,7 @@ def dge_decompression(dge_comp_data, required_bits, dge_col_min_vals): def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -94,10 +94,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -181,193 +177,99 @@ def measure_latency(df, data_ori, task_name, sample_size, list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - current_memory = 0 - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') - block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) - current_memory += sys.getsizeof(block_dge_bytes) - data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) - data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) - current_memory += data_int_part.nbytes - curr_decomp_block = np.recarray((len(data_dge_part),), 
dtype=data_ori.dtype) - - if len(non_int_data) != 0: - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') - block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) - data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) - - for i in range(len(non_int_cols)): - curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - - current_memory += sys.getsizeof(block_nonint_bytes) - - for i in range(len(int_cols)): - curr_decomp_block[int_cols[i]] = data_int_part[:, i] - - num_decomp += 1 - current_memory += curr_decomp_block.nbytes - - if current_memory > peak_memory: - peak_memory = current_memory - - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + current_memory = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + # ----- + decomp_memory = 0 + if part_idx not in decomp_block: + file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') + block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) + decomp_memory += sys.getsizeof(block_dge_bytes) + data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) + data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) + decomp_memory += data_int_part.nbytes + curr_decomp_block = np.recarray((len(data_dge_part),), dtype=data_ori.dtype) - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) 
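Both the removed memory-optimized path and the retained latency-optimized path in these hunks locate a record's partition arithmetically rather than through an index: the key space is split into fixed ranges of num_record_per_part keys starting at x_start, so part_idx = int((query_key - x_start) // num_record_per_part). A tiny illustration with hypothetical numbers:

# hypothetical layout: keys start at 1000, 256 records per partition
x_start, num_record_per_part = 1000, 256

def locate_partition(query_key):
    # same arithmetic as in the hunks: offset from the smallest key,
    # integer-divided by the partition width
    return int((query_key - x_start) // num_record_per_part)

print(locate_partition(1000))  # 0 (first partition)
print(locate_partition(1300))  # 1 (offset 300 falls in the second 256-key range)
print(locate_partition(1999))  # 3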
- t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - current_memory = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - # ----- - decomp_memory = 0 - if part_idx not in decomp_block: - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') - block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) - decomp_memory += sys.getsizeof(block_dge_bytes) - data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) - data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) - decomp_memory += data_int_part.nbytes - curr_decomp_block = np.recarray((len(data_dge_part),), dtype=data_ori.dtype) - - if len(non_int_data) != 0: - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') - block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) - decomp_memory += sys.getsizeof(block_nonint_bytes) - data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) - current_memory += sys.getsizeof(data_non_int_part) - for i in range(len(non_int_cols)): - curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - - peak_memory += data_non_int_part.nbytes - - for i in range(len(int_cols)): - curr_decomp_block[int_cols[i]] = data_int_part[:, i] - + if len(non_int_data) != 0: + file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') + block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) + decomp_memory += sys.getsizeof(block_nonint_bytes) + data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) + current_memory += sys.getsizeof(data_non_int_part) for i in range(len(non_int_cols)): curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 + peak_memory += data_non_int_part.nbytes + + for i in range(len(int_cols)): + curr_decomp_block[int_cols[i]] = data_int_part[:, i] + + for i in range(len(non_int_cols)): + curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + cache_block_memory += curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + 
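The per-partition lookup that follows calls ndb_utils.binary_search(sorted_keys, query_key, length) and treats a negative return value as "key not present". ndb_utils itself is not part of this diff, so the sketch below is only an assumption about that helper's contract, written to match the data_idx >= 0 checks used throughout these modules:

import numpy as np

def binary_search(arr, target, length):
    # Classic binary search over the sorted key column arr[:length].
    # Returns the matching index, or -1 when the key is absent.
    low, high = 0, length - 1
    while low <= high:
        mid = (low + high) // 2
        if arr[mid] == target:
            return mid
        if arr[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return -1

keys = np.array([2, 5, 9, 12], dtype=np.int32)
print(binary_search(keys, 9, len(keys)))   # 2
print(binary_search(keys, 7, len(keys)))   # -1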
timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = curr_decomp_block[data_idx] + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/generate_sample_index.py b/DeepMapping/DeepMapping/generate_sample_index.py new file mode 100644 index 0000000..fa0abb4 --- /dev/null +++ b/DeepMapping/DeepMapping/generate_sample_index.py @@ -0,0 +1,43 @@ +import os +import itertools +from DeepMapping import ndb_utils + +# Generate the sample index for TPC-H, SF=1 experiments for re-use in non_generate_file mode + +list_dataset = ['tpch-s1/customer', 'tpch-s1/lineitem', 'tpch-s1/orders', 'tpch-s1/part', 'tpch-s1/supplier'] + +for dataset in list_dataset: + path_to_meta = os.path.join('temp', dataset, 'uncompress/extra_meta.data') + print('[INFO] Generating sample index for', dataset) + extra_meta = ndb_utils.load_obj_from_disk_with_pickle(path_to_meta) + x_start = extra_meta['x_start'] + x_end = extra_meta['x_end'] + num_query = 5 + for sample_size in [1000, 10000, 100000]: + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + + ndb_utils.save_obj_to_disk_with_pickle(os.path.join('temp', dataset, 'sample_index_{}.data'.format(sample_size)), + list_sample_index) + + + +# Generate the sample index for data manipulation experiments for re-use in non_generate_file mode + +list_dataset = ['data_manipulation/single_column_low_corr_100m', + 'data_manipulation/single_column_high_corr_100m', + 'data_manipulation/multi_column_low_corr_100m', + 'data_manipulation/multi_column_high_corr_100m'] + + +for dataset in list_dataset: + path_to_meta = os.path.join('temp', dataset, 'uncompress', 'Default', 'extra_meta.data') + print('[INFO] Generating sample index for', dataset) + extra_meta = ndb_utils.load_obj_from_disk_with_pickle(path_to_meta) + x_start = extra_meta['x_start'] + x_end = extra_meta['x_end'] + num_query = 5 + for sample_size in [1000, 10000, 100000]: + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + + ndb_utils.save_obj_to_disk_with_pickle(os.path.join('temp', dataset, 'sample_index_{}.data'.format(sample_size)), + list_sample_index) \ No newline at end of file diff --git a/DeepMapping/DeepMapping/hashtable.py b/DeepMapping/DeepMapping/hashtable.py new file mode 100644 index 0000000..280a62d --- /dev/null +++ b/DeepMapping/DeepMapping/hashtable.py @@ -0,0 +1,229 @@ +import ctypes +import gc +import math +import numpy as np +import os +import pandas as pd +import sys +from DeepMapping import ndb_utils +from tqdm.auto import tqdm + +ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, + ndim=1, + 
flags="C") +ND_POINTER_2 = np.ctypeslib.ndpointer(dtype=np.int32, + ndim=1, + flags="C") + +shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file +shared_utils.aux_look_up_bin.argtypes = [ND_POINTER_2, ctypes.c_int, ctypes.c_long] +shared_utils.aux_look_up_bin.restype = ctypes.c_long + +def measure_latency(df, data_ori, task_name, sample_size, + generate_file=True, + num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): + """Measure the end-end latency of data query + + Args: + df : dataframe + dataset in pd.dataframe format + data_ori : np.record + dataset in np.record format + task_name : str + task name + sample_size : int + number of queried data per query + generate_file : bool + whether need to store the data to disk + num_loop : int + number of loops to run for measuring the latency + num_query : int + number of queries + search_algo : str + search algorithm that applied to search entry in each partition, available strategy: naive, binary, hash + path_to_model : str + load model from custom path + """ + mode = os.environ['MODE'] + data_ori_size = 0 + data_comp_size = 0 + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None + exp_data_dict = dict() + + key = df.columns[0] + record_size = data_ori[0].nbytes + num_record_per_part = np.floor(block_size / record_size) + x = data_ori[key] + x_start = np.min(x) + x_end = np.max(x) + x_range = x_end - x_start + num_partition = int(math.ceil(x_range / num_record_per_part)) + print('[Partition] Size {} Per Partition, # Partition: {}'.format(record_size*num_record_per_part/1024, num_partition)) + root_path = 'temp' + task_name = task_name + folder_name = 'hashtable' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + + print('[Generate File Path]: {}'.format(comp_data_dir)) + + dict_contigous_key = dict() + + # generate file + if generate_file: + ndb_utils.recreate_temp_dir(comp_data_dir) + data_size = 0 + + for block_idx in tqdm(range(num_partition)): + val_start, val_end = x_start + block_idx * \ + num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = np.logical_and(x >= val_start, x < val_end) + data_part = data_ori[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) + + if len(data_part) == 0: + continue + data_part_hash_table = dict() + for data_idx in range(len(data_part)): + data_part_hash_table[data_part[key][data_idx]] = data_part[data_idx] + + file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') + ndb_utils.save_hashtable_to_disk(file_name, data_part_hash_table) + data_size += os.path.getsize(file_name) + + data_ori_size = data_ori.nbytes/1024/1024 + data_comp_size = data_size/1024/1024 + print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = 
ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + # if search_algo == 'hash': + # data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + curr_decomp_block = ndb_utils.load_hashtable_from_disk(file_name) + try: + decomp_block[part_idx] = curr_decomp_block + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + block_bytes_size = ndb_utils.get_nested_dict_size(curr_decomp_block) + + cache_block_memory += block_bytes_size + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + pass + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + + data_idx = query_key in curr_decomp_block.keys() + if data_idx == True: + result[query_key_index_in_old] = tuple(curr_decomp_block[query_key]) + else: + count_nonexist += 1 + + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory > peak_memory: + peak_memory = cache_block_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + del result + gc.collect() + print('[DEBUG] number of decompressed partition', len(decomp_block)) + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup 
/ num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None + if memory_optimized_latency is None and latency_optimized_latency is not None: + return_latency = latency_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is None: + return_latency = memory_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is not None: + return_latency = np.vstack((memory_optimized_latency, latency_optimized_latency)) + + return data_ori_size, data_comp_size, [memory_optimized_result, latency_optimized_result], return_latency diff --git a/DeepMapping/DeepMapping/hashtable_with_compression.py b/DeepMapping/DeepMapping/hashtable_with_compression.py new file mode 100644 index 0000000..0969a48 --- /dev/null +++ b/DeepMapping/DeepMapping/hashtable_with_compression.py @@ -0,0 +1,238 @@ +import ctypes +import gc +import json +import math +import numpy as np +import os +import pandas as pd +import pickle +import sys +import zstd +from DeepMapping import ndb_utils +from tqdm.auto import tqdm + +ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, + ndim=1, + flags="C") +ND_POINTER_2 = np.ctypeslib.ndpointer(dtype=np.int32, + ndim=1, + flags="C") + +shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file +shared_utils.aux_look_up_bin.argtypes = [ND_POINTER_2, ctypes.c_int, ctypes.c_long] +shared_utils.aux_look_up_bin.restype = ctypes.c_long + +def measure_latency(df, data_ori, task_name, sample_size, + generate_file=True, + num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): + """Measure the end-end latency of data query + + Args: + df : dataframe + dataset in pd.dataframe format + data_ori : np.record + dataset in np.record format + task_name : str + task name + sample_size : int + number of queried data per query + generate_file : bool + whether need to store the data to disk + num_loop : int + number of loops to run for measuring the latency + num_query : int + number of queries + search_algo : str + search algorithm that applied to search entry in each partition, available strategy: naive, binary, hash + path_to_model : str + load model from custom path + """ + mode = os.environ['MODE'] + data_ori_size = 0 + data_comp_size = 0 + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None + exp_data_dict = dict() + + key = df.columns[0] + record_size = data_ori[0].nbytes + num_record_per_part = np.floor(block_size / record_size) + x = data_ori[key] + x_start = np.min(x) + x_end = np.max(x) + x_range = x_end - x_start + num_partition = int(math.ceil(x_range / num_record_per_part)) + print('[Partition] Size {} Per Partition, # Partition: {}'.format(record_size*num_record_per_part/1024, num_partition)) + root_path = 'temp' + task_name = task_name + folder_name = 'hashtable_with_compression' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + + print('[Generate File Path]: {}'.format(comp_data_dir)) + + dict_contigous_key = dict() + + # generate file + if generate_file: + ndb_utils.recreate_temp_dir(comp_data_dir) + data_size = 0 + + for block_idx in tqdm(range(num_partition)): + val_start, val_end = x_start + block_idx * \ + num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = np.logical_and(x >= val_start, x < 
val_end) + data_part = data_ori[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) + + if len(data_part) == 0: + continue + data_part_hash_table = dict() + for data_idx in range(len(data_part)): + data_part_hash_table[data_part[key][data_idx]] = data_part[data_idx] + + data_part_hash_table_bytes = pickle.dumps(data_part_hash_table) + # data_size += sys.getsizeof(data_bytes) + file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') + + # data_part_hash_table_bytes = zstd.compress(json.dumps(data_part_hash_table).encode('utf-8')) + # ndb_utils.save_hashtable_to_disk(file_name, data_part_hash_table) + ndb_utils.save_byte_to_disk(file_name, zstd.compress(data_part_hash_table_bytes)) + data_size += os.path.getsize(file_name) + + data_ori_size = data_ori.nbytes/1024/1024 + data_comp_size = data_size/1024/1024 + print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + # if search_algo == 'hash': + # data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + # part_idx = int((query_key-x_start) // num_record_per_part) + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + 
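hashtable_with_compression.py stores each partition as a plain Python dict keyed by the record key, pickled and then zstd-compressed; the query path reverses this with pickle.loads(zstd.uncompress(...)), and the edge-mode check here evicts cold partitions (see the call on the next line) before loading another one when less than roughly 100 MB of memory is available. A minimal round trip of that storage format, using the same zstd and pickle calls as the hunk; the helper names and the file path are hypothetical:

import pickle
import zstd

def save_partition_hashtable(file_name, table):
    # pickle the per-partition dict, then zstd-compress the bytes before writing
    with open(file_name, 'wb') as f:
        f.write(zstd.compress(pickle.dumps(table)))

def load_partition_hashtable(file_name):
    # reverse the steps: read, zstd-uncompress, unpickle
    with open(file_name, 'rb') as f:
        return pickle.loads(zstd.uncompress(f.read()))

# tiny hypothetical partition keyed by record id
part = {42: ('item-42', 3.14), 43: ('item-43', 2.72)}
save_partition_hashtable('0.data', part)
print(load_partition_hashtable('0.data')[42])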
decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = pickle.loads(zstd.uncompress(block_bytes)) + # curr_decomp_block = ndb_utils.load_hashtable_from_disk(file_name) + try: + decomp_block[part_idx] = curr_decomp_block + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + load_block_bytes = sys.getsizeof(block_bytes) + block_bytes_size = ndb_utils.get_nested_dict_size(curr_decomp_block) + + cache_block_memory += block_bytes_size + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + pass + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] += 1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + data_idx = query_key in curr_decomp_block.keys() + if data_idx == True: + result[query_key_index_in_old] = tuple(curr_decomp_block[query_key]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + load_block_bytes > peak_memory: + peak_memory = cache_block_memory + load_block_bytes + t_total += timer_total.toc() + latency_optimized_result = result.copy() + del result + gc.collect() + print('[DEBUG] number of decompressed partition', len(decomp_block)) + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None + if memory_optimized_latency is None and latency_optimized_latency is not None: + return_latency = latency_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is None: + return_latency = memory_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is not None: + return_latency = np.vstack((memory_optimized_latency, latency_optimized_latency)) + + return data_ori_size, data_comp_size, [memory_optimized_result, latency_optimized_result], return_latency diff --git a/DeepMapping/DeepMapping/lzo_compression.py b/DeepMapping/DeepMapping/lzo_compression.py index 90b48fc..ce0ea2a 100644 --- a/DeepMapping/DeepMapping/lzo_compression.py +++ b/DeepMapping/DeepMapping/lzo_compression.py @@ -1,16 +1,17 @@ -import pandas as pd -import numpy as np -import sys +import gc +import lzo import math +import numpy as np import os -import lzo +import pandas as pd +import sys from DeepMapping import ndb_utils from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -25,10 +26,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency 
num_query : int @@ -38,12 +35,18 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None key = df.columns[0] block_size = 1024 * 1024 record_size = data_ori[0].nbytes @@ -83,155 +86,118 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 - peak_memory = 0 - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(lzo.decompress(block_bytes), dtype=data_ori.dtype) - # curr_decomp_block = np.frombuffer(block_bytes, dtype=np.int32).reshape(-1, num_cols) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - 
curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(lzo.decompress(block_bytes), dtype=data_ori.dtype) - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - cache_block_memory += curr_decomp_block.nbytes - block_bytes_size = sys.getsizeof(block_bytes) - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - result_idx += 1 - if cache_block_memory + block_bytes_size > peak_memory: - peak_memory = cache_block_memory + block_bytes_size - t_total += timer_total.toc() + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + 
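partition_hit, initialized just below, counts how many times each cached partition is looked up; in edge mode the branches added across this diff (byte-dictionary, lzo, rle, and both hashtable variants) pass it to ndb_utils.evict_unused_partition together with the partition cache whenever less than about 100 MB of memory remains. That helper is not included in this diff; one plausible sketch, assuming it drops the least-hit partitions until roughly free_memory bytes of cached blocks have been released:

import sys

def evict_unused_partition(decomp_block, partition_hit, free_memory=1024 * 1024 * 100):
    # Hypothetical sketch: evict cold partitions (lowest hit count first)
    # until roughly free_memory bytes worth of cached blocks are dropped.
    freed = 0
    for part_idx in sorted(decomp_block, key=lambda p: partition_hit.get(p, 0)):
        block = decomp_block.pop(part_idx)
        freed += getattr(block, 'nbytes', sys.getsizeof(block))
        if freed >= free_memory:
            break
    return decomp_block

The call sites reassign the return value, so returning the (now smaller) cache dict keeps the sketch compatible with how the hunks use it.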
partition_hit = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + # part_idx = int((query_key-x_start) // num_record_per_part) + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + curr_decomp_block = ndb_utils.evict_unused_partition(curr_decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(lzo.decompress(block_bytes), dtype=data_ori.dtype) + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + cache_block_memory += curr_decomp_block.nbytes + block_bytes_size = sys.getsizeof(block_bytes) + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/rle_compression.py b/DeepMapping/DeepMapping/rle_compression.py index 2aeb0bc..0415ce0 100644 --- a/DeepMapping/DeepMapping/rle_compression.py +++ b/DeepMapping/DeepMapping/rle_compression.py @@ -1,15 +1,16 @@ -import pandas as pd -import numpy as np -import sys +import gc import math 
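In the lzo hunk above, each cached partition is rebuilt with np.frombuffer(lzo.decompress(block_bytes), dtype=data_ori.dtype); the generate-file side (unchanged in this diff) presumably stores lzo.compress of the partition's raw record bytes. A minimal round trip under that assumption, with a hypothetical two-column record layout:

import lzo
import numpy as np

# hypothetical record layout: (int32 key, float64 value)
rec_dtype = np.dtype([('key', np.int32), ('val', np.float64)])
part = np.zeros(4, dtype=rec_dtype)
part['key'] = np.arange(4, dtype=np.int32)
part['val'] = [0.5, 1.5, 2.5, 3.5]

block_bytes = lzo.compress(part.tobytes())                               # what the writer would store
restored = np.frombuffer(lzo.decompress(block_bytes), dtype=rec_dtype)   # what the query path does
assert (restored['key'] == part['key']).all()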
+import numpy as np import os +import pandas as pd +import sys from DeepMapping import ndb_utils from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk from more_itertools import run_length from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -24,10 +25,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -37,12 +34,14 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] block_size = 1024 * 1024 record_size = data_ori[0].nbytes @@ -62,8 +61,8 @@ def measure_latency(df, data_ori, task_name, sample_size, list_type = [] for col in data_ori.dtype.names: - if data_ori[col].dtype == object: - list_type.append({'names': [col], 'formats': ['O'], 'offsets': [0], 'itemsize': 8}) + if data_ori[col].dtype == 'S8': + list_type.append('S8') elif data_ori[col].dtype == np.int32: list_type.append(np.int32) elif data_ori[col].dtype == np.float64: @@ -109,7 +108,7 @@ def measure_latency(df, data_ori, task_name, sample_size, temp_dtype = {'names': [data_ori.dtype.names[col_idx]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} else: temp_dtype = list_type[col_idx] - a = np.recarray((len(col_val_rle_encode),), dtype=temp_dtype) + a = np.ndarray((len(col_val_rle_encode),), dtype=temp_dtype) b = np.zeros(len(col_val_rle_encode), np.int32) for idx, val in enumerate(col_val_rle_encode): a[idx] = val[0] @@ -124,228 +123,143 @@ def measure_latency(df, data_ori, task_name, sample_size, data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) np.save(os.path.join(comp_data_dir, 'list_rle_enabled'), list_rle_enabled) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) else: list_rle_enabled = np.load(os.path.join(comp_data_dir, 'list_rle_enabled.npy')) + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_type = exp_data_dict['list_type'] + list_sample_index = 
ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 - peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - if part_idx != prev_part_idx: - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - current_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, df.shape[1]): - col_name = data_ori.dtype.names[i] + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() - if list_rle_enabled[i + part_idx*df.shape[1]] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + decomp_memory = 0 + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = 
np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - curr_decomp_block[col_name] = block_data + partition_hit[part_idx] =1 + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + + for i in range(1, df.shape[1]): + col_name = data_ori.dtype.names[i] + if list_rle_enabled[i + part_idx*df.shape[1]] == False: + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + if list_type[i] == np.int32 or list_type[i] == np.float64: + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) else: - # rle decode - if list_type[i] == np.int32: - temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} - else: - temp_dtype = list_type[i] - - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) - val_data = np.rec.array(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) - num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) - temp_col_decode_data = [] - - for val, num in zip(val_data, num_data): - temp_col_decode_data.extend([val[0]]*num) - curr_decomp_block[col_name] = temp_col_decode_data + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) + curr_decomp_block[col_name] = block_data + decomp_memory += sys.getsizeof(block_bytes) + else: + # rle decode + if list_type[i] == np.int32: + temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} + else: + temp_dtype = list_type[i] - current_memory += val_data.nbytes - current_memory += num_data.nbytes - - current_memory += curr_decomp_block.nbytes - - decomp_block = curr_decomp_block - num_decomp += 1 - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block + file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) + file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) + val_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) + num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) + temp_col_decode_data = [] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, 
count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - decomp_memory = 0 - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + for val, num in zip(val_data, num_data): + temp_col_decode_data.extend([val[0]]*num) + curr_decomp_block[col_name] = temp_col_decode_data + decomp_memory += val_data.nbytes + decomp_memory += num_data.nbytes - for i in range(1, df.shape[1]): - col_name = data_ori.dtype.names[i] - if list_rle_enabled[i + part_idx*df.shape[1]] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - curr_decomp_block[col_name] = block_data - decomp_memory += sys.getsizeof(block_bytes) - else: - # rle decode - if list_type[i] == np.int32: - temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} - else: - temp_dtype = list_type[i] - - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) - val_data = np.rec.array(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) - num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) - temp_col_decode_data = [] - - for val, num in zip(val_data, num_data): - temp_col_decode_data.extend([val[0]]*num) - curr_decomp_block[col_name] = temp_col_decode_data - decomp_memory += val_data.nbytes - decomp_memory += num_data.nbytes - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - # ----- - timer_lookup.tic() + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + 
partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + # ----- + timer_lookup.tic() - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/uncompress.py b/DeepMapping/DeepMapping/uncompress.py index e792fd0..b668ae2 100644 --- a/DeepMapping/DeepMapping/uncompress.py +++ b/DeepMapping/DeepMapping/uncompress.py @@ -1,10 +1,12 @@ -import pandas as pd -import numpy as np -import sys +import ctypes +import gc import math +import numpy as np +import pandas as pd import os -import ctypes +import sys from DeepMapping import ndb_utils +from collections import defaultdict # from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk # from more_itertools import run_length from tqdm.auto import tqdm @@ -21,10 +23,8 @@ shared_utils.aux_look_up_bin.restype = ctypes.c_long def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): - # TODO add support of hash to run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -38,10 +38,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop 
: int number of loops to run for measuring the latency num_query : int @@ -51,12 +47,14 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] # block_size = 1024 * 1024 @@ -73,6 +71,9 @@ def measure_latency(df, data_ori, task_name, sample_size, task_name = task_name folder_name = 'uncompress' comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + print('[Generate File Path]: {}'.format(comp_data_dir)) dict_contigous_key = dict() @@ -88,7 +89,6 @@ def measure_latency(df, data_ori, task_name, sample_size, data_idx = np.logical_and(x >= val_start, x < val_end) data_part = data_ori[data_idx] if search_algo == 'binary_c': - # FIXME temporary workaround to avoid the overhead of converting to contiguous array dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) if len(data_part) == 0: @@ -101,178 +101,137 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - 
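For readers following the latency arrays built above: every measure_latency variant now reports the same 13-column row, with byte_dictionary padding the build-index slot with 0 so its layout matches uncompress and zstd. A minimal sketch of unpacking such a row into named fields (the column names are descriptive labels inferred from the expressions in the array, and label_latency_row is a hypothetical helper, not part of the repository):

import numpy as np

LATENCY_COLUMNS = [
    'data_ori_size_mb', 'data_comp_size_mb', 'sample_size', 'strategy_flag',
    'peak_memory_mb', 't_sort', 't_locate_part', 't_decomp', 't_build_index',
    't_lookup', 't_total', 'num_decomp', 'count_nonexist',
]

def label_latency_row(latency_row):
    # map the 13-element row returned by the latency-optimized path
    # (strategy_flag == 1) onto readable keys
    return dict(zip(LATENCY_COLUMNS, np.asarray(latency_row).ravel()))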
timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(block_bytes, dtype=data_ori.dtype) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - t_build_index = 0 - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(block_bytes, dtype=data_ori.dtype) + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + + for query_idx in range(num_query): + 
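The generate_file branch above persists partition metadata to extra_meta.data so that a later run with generate_file=False can skip file generation and simply reload it together with the pre-generated sample index. A minimal sketch of the save/load pair it relies on, assuming ndb_utils.save_obj_to_disk_with_pickle and load_obj_from_disk_with_pickle are thin pickle wrappers (an assumption; the real helpers live in DeepMapping/ndb_utils.py):

import os
import pickle

def save_obj_to_disk_with_pickle(path, obj):
    # persist e.g. exp_data_dict (num_record_per_part, sizes, x_start, x_end)
    # next to the generated partition files
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_obj_from_disk_with_pickle(path):
    # restore the object on a generate_file=False run
    with open(path, 'rb') as f:
        return pickle.load(f)

With these in place, a reuse run only needs extra_meta.data plus the sample_index_<sample_size>.data file produced beforehand by generate_sample_index.py.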
sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not enough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(block_bytes, dtype=data_ori.dtype) + try: decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_bytes) - - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(curr_decomp_block)): - data_entry_key = curr_decomp_block[key][block_data_idx] - # print(data_entry_key) - data_entry_val = curr_decomp_block[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'binary_c': - arr_contigous_arr = dict_contigous_key[part_idx] - data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - elif search_algo == 'hash': - data_idx = query_key in data_hash.keys() - - if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - elif search_algo == 'hash' and data_idx == True: - result[query_key_index_in_old] = data_hash[query_key] + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_bytes) + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(curr_decomp_block)): + data_entry_key = curr_decomp_block[key][block_data_idx] + # print(data_entry_key) + data_entry_val = curr_decomp_block[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += 
curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'binary_c': + arr_contigous_arr = dict_contigous_key[part_idx] + data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + elif search_algo == 'hash': + data_idx = query_key in data_hash.keys() + + if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + elif search_algo == 'hash' and data_idx == True: + result[query_key_index_in_old] = tuple(data_hash[query_key]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/zstd_compression.py b/DeepMapping/DeepMapping/zstd_compression.py index 117ea92..368b72d 100644 --- a/DeepMapping/DeepMapping/zstd_compression.py +++ b/DeepMapping/DeepMapping/zstd_compression.py @@ -1,9 +1,11 @@ +import ctypes +import gc +import math import numpy as np +import os import sys import zstd -import math -import os -import ctypes +from collections import defaultdict from DeepMapping import ndb_utils from tqdm.auto import tqdm @@ -19,10 +21,8 @@ shared_utils.aux_look_up_bin.restype = ctypes.c_long def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): - # TODO add support for run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -36,10 +36,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -49,15 +45,15 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None 
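In 'edge' mode the uncompress.py loop above (and the zstd path below) checks ndb_utils.get_available_memory() before caching a new partition and, when less than 100MB is free, calls ndb_utils.evict_unused_partition with the partition_hit counters. The exact policy lives in ndb_utils; the following is only a plausible least-hit sketch consistent with the call site (signature and behavior here are assumptions, not the actual implementation):

def evict_unused_partition(decomp_block, partition_hit, free_memory=1024 * 1024 * 100):
    # drop the least-hit cached partitions until roughly `free_memory` bytes
    # of cached blocks have been released, then return the trimmed cache
    freed = 0
    for part_idx in sorted(decomp_block, key=lambda p: partition_hit.get(p, 0)):
        freed += decomp_block[part_idx].nbytes
        del decomp_block[part_idx]
        partition_hit.pop(part_idx, None)
        if freed >= free_memory:
            break
    return decomp_block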
latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] - # block_size = 1024 * 1024 - # block_size = 1024 * 512 record_size = data_ori[0].nbytes num_record_per_part = np.floor(block_size / record_size) x = data_ori[key] @@ -70,6 +66,9 @@ def measure_latency(df, data_ori, task_name, sample_size, task_name = task_name folder_name = 'zstd' comp_data_dir = os.path.join(root_path, task_name, folder_name) + + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) print('[Generate File Path]: {}'.format(comp_data_dir)) dict_contigous_key = dict() @@ -86,7 +85,6 @@ def measure_latency(df, data_ori, task_name, sample_size, data_part = data_ori[data_idx] if search_algo == 'binary_c': - # FIXME temporary workaround to avoid the overhead of converting to contiguous array dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) if len(data_part) == 0: @@ -99,177 +97,137 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) - - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in 
range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(zstd.decompress(block_bytes), dtype=data_ori.dtype) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - t_build_index = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(zstd.decompress(block_bytes), dtype=data_ori.dtype) + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = 
(sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not enough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(zstd.decompress(block_bytes), dtype=data_ori.dtype) + try: decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_bytes) - - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(curr_decomp_block)): - data_entry_key = curr_decomp_block[key][block_data_idx] - # print(data_entry_key) - data_entry_val = curr_decomp_block[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'binary_c': - arr_contigous_arr = dict_contigous_key[part_idx] - data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - elif search_algo == 'hash': - data_idx = query_key in data_hash.keys() - - if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - elif search_algo == 'hash' and data_idx == True: - result[query_key_index_in_old] = data_hash[query_key] + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_bytes) + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(curr_decomp_block)): + data_entry_key = curr_decomp_block[key][block_data_idx] + # print(data_entry_key) + data_entry_val = curr_decomp_block[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] += 1 + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = 
ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'binary_c': + arr_contigous_arr = dict_contigous_key[part_idx] + data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + elif search_algo == 'hash': + data_idx = query_key in data_hash.keys() + + if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + elif search_algo == 'hash' and data_idx == True: + result[query_key_index_in_old] = tuple(data_hash[query_key]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + del decomp_block + del partition_hit + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/run_benchmark_data_manipulation.py b/run_benchmark_data_manipulation.py index b32bb3b..a673817 100644 --- a/run_benchmark_data_manipulation.py +++ b/run_benchmark_data_manipulation.py @@ -12,24 +12,38 @@ 'data_manipulation/multi_column_low_corr_100m', 'data_manipulation/multi_column_high_corr_100m'] list_benchmark = ['uncompress', 'zstd', 'deepmapping'] -list_sample_size = [10000] +list_sample_size = [1000, 100000] list_ops = ['Default', 'Insert', 'Update', 'Delete'] list_run_config = list(itertools.product(list_dataset, list_benchmark, list_sample_size, list_ops)) print('[Config]: \n\t Dataset: {} \n\t Benchmark: {} \n\t Sample Size: {}'.format(list_dataset, list_benchmark, list_sample_size)) -memory_optimized = False -latency_optimized = True num_loop = 100 num_query = 5 search_algo = 'binary' file_name = 'benchmark_data_manipulation.csv' +# The following flag is used to indicate whether you can re-use the existing disk +# files (stored in temp dir) saved from a fresh run. Usually, you can start a +# fresh run and then change this flag to False. Also, if you set this flag to False, +# please make sure you also run generate_sample_index.py under the DeepMapping +# folder to pre-generate the query index before your next run. +generate_file = True +# specify your deep learning model backend; Keras h5 and ONNX models are currently +# supported. There is a utility under DeepMapping to convert a h5 model into onnx format. +os.environ['BACKEND'] = 'onnx' +# Run the benchmark in the specified mode. full mode: assume memory is sufficient to cache +# all the data; edge mode: try to cache all data within the available memory but reserve +# some free memory for the underlying processes, current value: 100MB.
Once the memory +# is insufficient, it will try to evict the least used partition to free the memory. +os.environ['MODE'] = 'full' for run_config in tqdm(list_run_config): print('[STATUS] Current config: {}'.format(run_config)) task_name, benchmark, sample_size, data_ops = run_config generate_file = True - - df = pd.read_csv('dataset/{}.csv'.format(task_name)) + if generate_file: + df = pd.read_csv('dataset/{}.csv'.format(task_name)) + else: + df = pd.read_csv('dataset/{}.csv'.format(task_name), nrows=2) df, data_ori = df_preprocess(df, benchmark) # perform data manipulation to the data df, data_ori = data_manipulation(df, data_ops) @@ -38,8 +52,7 @@ try: data_ori_size, data_comp_size, result, latency = function_call(df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size, - generate_file=generate_file, memory_optimized=memory_optimized, - latency_optimized=latency_optimized, num_loop=num_loop, + generate_file=generate_file, num_loop=num_loop, num_query=num_query, search_algo=search_algo) result_df = pd.DataFrame(latency) result_df['config'] = str(run_config) diff --git a/run_benchmark_data_query.py b/run_benchmark_data_query.py index 624f846..c8213ae 100644 --- a/run_benchmark_data_query.py +++ b/run_benchmark_data_query.py @@ -8,32 +8,44 @@ from DeepMapping.benchmark_utils import benchmark_handler list_dataset = ['tpch-s1/customer', 'tpch-s1/lineitem', 'tpch-s1/orders', 'tpch-s1/part', 'tpch-s1/supplier'] -list_benchmark = ['uncompress', 'dgpe', 'delta', 'byte_dictionary', 'lzo', 'zstd', 'rle', 'deepmapping'] -list_sample_size = [1000, 10000] +list_benchmark = ['uncompress', 'zstd', 'deepmapping', 'hashtable', 'hashtable_with_compression'] +list_sample_size = [1000, 100000] list_run_config = list(itertools.product(list_dataset, list_benchmark, list_sample_size)) print('[Config]: \n\t Dataset: {} \n\t Benchmark: {} \n\t Sample Size: {}'.format(list_dataset, list_benchmark, list_sample_size)) -memory_optimized = True # whether measure the latency for memory optimized strategy -latency_optimized = True # whether measure teh latency for latency optimized strategy num_loop = 100 num_query = 5 search_algo = 'binary' file_name = 'benchmark_data_query.csv' +# The following flag is used to indicate whether you can re-use the existing disk +# files (stored in temp dir) saved from a fresh run. Usually, you can start a +# fresh run and then change this flag to False. Also, if you set this flag a False +# please make sure, you also run the generate_sample_index.py under DeepMapping +# folder to pre-generate the query index before your next run. +generate_file = True +# specificy your deep learning model backend, current support keras h5 model and onnx +# model. There is a utility under DeepMapping to convert a h5 model into onnx format. +os.environ['BACKEND'] = 'onnx' +# Run the benchmark with the specified mode. full mode: assume memory is sufficient to cache +# all the data; edge mode: try to cache all data within the available memory but reserve +# a number of free memory for underlying process, current value: 100MB. Once the memory +# is insufficient, it will try to evict the least used partition to free the memory. 
+os.environ['MODE'] = 'full' -# pre_generated_files = defaultdict(bool) for run_config in tqdm(list_run_config): print('[STATUS] Current config: {}'.format(run_config)) task_name, benchmark, sample_size = run_config - generate_file = True - df = pd.read_csv('dataset/{}.csv'.format(task_name)) + if generate_file: + df = pd.read_csv('dataset/{}.csv'.format(task_name)) + else: + df = pd.read_csv('dataset/{}.csv'.format(task_name), nrows=2) df, data_ori = df_preprocess(df, benchmark) function_call = benchmark_handler(benchmark) try: data_ori_size, data_comp_size, result, latency = function_call(df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size, - generate_file=generate_file, memory_optimized=memory_optimized, - latency_optimized=latency_optimized, num_loop=num_loop, + generate_file=generate_file, num_loop=num_loop, num_query=num_query, search_algo=search_algo) result_df = pd.DataFrame(latency) result_df['config'] = str(run_config)
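Putting the driver scripts together, a single benchmark configuration boils down to roughly the following call path (a sketch only; the df_preprocess import line is abbreviated, and the values are examples taken from the config lists above):

import os
import pandas as pd
from DeepMapping.benchmark_utils import benchmark_handler
# df_preprocess is the same preprocessing helper the driver scripts import
# from the DeepMapping package (exact import path omitted here)

os.environ['BACKEND'] = 'onnx'   # onnx or the keras h5 backend
os.environ['MODE'] = 'full'      # 'edge' enables the partition-eviction path

task_name, benchmark, sample_size = 'tpch-s1/customer', 'zstd', 1000
df = pd.read_csv('dataset/{}.csv'.format(task_name))
df, data_ori = df_preprocess(df, benchmark)
measure = benchmark_handler(benchmark)
data_ori_size, data_comp_size, result, latency = measure(
    df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size,
    generate_file=True, num_loop=100, num_query=5, search_algo='binary')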