diff --git a/.gitignore b/.gitignore index b014a4e..77e8eca 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ **/.ipynb_checkpoints/** **.tbl **/.vscode -**/DeepMapping.egg-info \ No newline at end of file +**/DeepMapping.egg-info +**.7z +**.onnx \ No newline at end of file diff --git a/DeepMapping/DeepMapping/byte_dictionary_compression.py b/DeepMapping/DeepMapping/byte_dictionary_compression.py index f5b7ce3..d93265e 100644 --- a/DeepMapping/DeepMapping/byte_dictionary_compression.py +++ b/DeepMapping/DeepMapping/byte_dictionary_compression.py @@ -1,8 +1,9 @@ -import pandas as pd -import numpy as np -import sys +import gc import math +import numpy as np import os +import pandas as pd +import sys from DeepMapping import ndb_utils from sklearn import preprocessing from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk @@ -55,7 +56,7 @@ def dict_compression(df): return dict_comp_data, list_encoder def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -70,10 +71,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -81,12 +78,14 @@ def measure_latency(df, data_ori, task_name, sample_size, search_algo : str search algorithm that applied to search entry in each partition """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() dict_comp_data, dict_encoder = dict_compression(df) list_type = [] @@ -132,200 +131,135 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + exp_data_dict['dict_compressor'] = dict_comp_data, dict_encoder + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_type = exp_data_dict['list_type'] + dict_comp_data, dict_encoder = exp_data_dict['dict_compressor'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) list_sample_index = 
ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = Timer() - timer_decomp = Timer() - timer_sort = Timer() - timer_lookup = Timer() - timer_locate_part = Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + + timer_total = Timer() + timer_decomp = Timer() + timer_lookup = Timer() + timer_total = Timer() + timer_sort = Timer() + timer_locate_part = Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(dict_comp_data)): - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - col_name = data_ori.dtype.names[i] - - if dict_encoder[i] is not None: - fun_a = lambda x: dict_encoder[i].classes_[x] - curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) - else: - curr_decomp_block[col_name] = block_data - - current_memory += curr_decomp_block.nbytes - decomp_block = curr_decomp_block - num_decomp += 1 - - if current_memory > peak_memory: - peak_memory = current_memory - - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - # ----- - - t_decomp += timer_decomp.toc() - # ----- - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = 
np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + # ----- + decomp_memory = 0 + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - if latency_optimized: - timer_total = Timer() - timer_decomp = Timer() - timer_lookup = Timer() - timer_total = Timer() - timer_sort = Timer() - timer_locate_part = Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - # ----- - decomp_memory = 0 - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + for i in range(1, len(dict_comp_data)): + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) block_bytes = read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + decomp_memory += sys.getsizeof(block_bytes) + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) + col_name = data_ori.dtype.names[i] + + if dict_encoder[i] is not None: + # encoded col + fun_a = lambda x: dict_encoder[i].classes_[x] + curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) + else: + curr_decomp_block[col_name] = block_data - for i in range(1, len(dict_comp_data)): - file_name = os.path.join(comp_data_dir, str(part_idx) + 
'-{}.data'.format(i)) - block_bytes = read_bytes_from_disk(file_name) - decomp_memory += sys.getsizeof(block_bytes) - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - col_name = data_ori.dtype.names[i] - - if dict_encoder[i] is not None: - # encoded col - fun_a = lambda x: dict_encoder[i].classes_[x] - curr_decomp_block[col_name] = fun_a(block_data.astype(np.int32)) - else: - curr_decomp_block[col_name] = block_data - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - result_idx += 1 - - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - return_latency = None + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time #TODO this is required for build hash table, current is no needed, use binary search instead + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: return_latency = latency_optimized_latency.reshape((1,-1)) diff --git a/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py b/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py new file mode 100644 index 0000000..c043266 --- /dev/null +++ b/DeepMapping/DeepMapping/convert_model_from_h5_to_onnx.py @@ -0,0 +1,25 @@ +import os +import shutil +import tensorflow as tf + +"""This script is used to convert the h5 model into onnx format, +if you want to use onnxrunutime as backend. + +You are required to install tf2onnx by using pip. 
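+
+For reference, the conversion that the loop below performs for every .h5 file
+is equivalent to the following commands (the model path is illustrative):
+
+    pip install tf2onnx
+    python -m tf2onnx.convert --saved-model models/nas/tpch-s1/<model_name> --output models/nas/tpch-s1/<model_name>.onnx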
+""" + +for root, dirs, files in os.walk("models/nas/tpch-s1/", topdown=False): + for name in files: + if '.h5' in name: + model_name = name.split('.')[0] + # h5 file + model_path = os.path.join(root, name) + + model = tf.keras.models.load_model(model_path, compile=False) + # save in pb + model.save(os.path.join(root, model_name)) + cmd = "python -m tf2onnx.convert --saved-model {} --output {}.onnx".format(os.path.join(root, model_name), + os.path.join(root, model_name)) + os.system(cmd) + shutil.rmtree(os.path.join(root, model_name)) + print(root, name, cmd) diff --git a/DeepMapping/DeepMapping/deepmapping.py b/DeepMapping/DeepMapping/deepmapping.py index e7fa1c4..9b68057 100644 --- a/DeepMapping/DeepMapping/deepmapping.py +++ b/DeepMapping/DeepMapping/deepmapping.py @@ -1,17 +1,24 @@ -import pandas as pd +import ctypes +import gc +import math import numpy as np +import onnx +import onnxruntime as ort +import os +import pandas as pd import sys +import tensorflow as tf import zstd -import math -import os + +from bitarray import bitarray +from collections import defaultdict from DeepMapping import ndb_utils -import tensorflow as tf +from onnx_opcounter import calculate_params + +from sklearn import preprocessing from tensorflow.keras import layers, regularizers from tensorflow import keras -import ctypes -from sklearn import preprocessing -from bitarray import bitarray -from more_itertools import run_length + from tqdm.auto import tqdm @@ -78,7 +85,7 @@ def __getitem__(self, index): X = create_features_c_multi_thread(shared_utils, data_x, num_record, max_len) return X, Y - + def on_epoch_end(self): """Updates indexes after each epoch @@ -87,6 +94,38 @@ def on_epoch_end(self): if self.shuffle == True: np.random.shuffle(self.indexes) +class InferenceDataGenerator(tf.keras.utils.Sequence): + def __init__(self, x, batch_size, max_len, shuffle=False): + self.x = x + self.batch_size = batch_size + self.max_len = max_len + self.shuffle = shuffle + + def __len__(self): + return int(np.ceil(len(self.x) / self.batch_size)) + + def __getitem__(self, index): + idx_start = index*self.batch_size + idx_end = (index+1)*self.batch_size + data_x = self.x[idx_start:idx_end] + + num_record = len(data_x) + max_len = self.max_len + + shared_utils.create_fetures.restype = ctypes.POINTER(ctypes.c_bool * (num_record * max_len * 10)) + shared_utils.create_fetures_mutlt_thread_mgr.restype = ctypes.POINTER(ctypes.c_bool * (num_record * max_len * 10)) + + X = create_features_c_multi_thread(shared_utils, data_x, num_record, max_len) + + return X + + + def on_epoch_end(self): + """Updates indexes after each epoch + """ + self.indexes = np.arange(len(self.x)) + + def build_model(num_in, model_sturcture, list_num_out): x = tf.keras.Input(shape=(num_in,1)) flatten = tf.keras.layers.Flatten(input_shape=(num_in,1), name='in')(x) @@ -174,12 +213,51 @@ def compress_data(df, model_sturcture, batch_size=1024, num_epochs=500, train_ve else: return model, train_generator +def finetune_model(df, model_path, batch_size=1024, num_epochs=500, train_verbose=1, train=True): + df_key = [df.columns[0]] + list_y_encoded = [] + list_y_encoder = [] + + for col in df.columns: + if col not in df_key: + encoded_val, encoder = encode_label(df[col]) + list_y_encoded.append(encoded_val) + list_y_encoder.append(encoder) + num_tasks = len(list_y_encoded) + num_tasks + + for encoder in list_y_encoder: + print(len(encoder.classes_)) + + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + print('MAX LEN', max_len) + list_num_out 
= [len(encoder.classes_) for encoder in list_y_encoder] + strategy = tf.distribute.MirroredStrategy() + print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + with strategy.scope(): + # Everything that creates variables should be under the strategy scope. + # In general this is only model construction & `compile()`. + # model = build_model(max_len*10, model_sturcture, list_num_out) + # x = tf.keras.Input(shape=(num_in,1)) + model = tf.keras.models.load_model(model_path) + model = tf.keras.models.clone_model(model, tf.keras.Input(shape=(max_len*10,1))) + + opt = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=1e-3/1000) + model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=["accuracy"]) + train_generator = DataGenerator(x, list_y_encoded, batch_size, max_len) + + if train == True: + train_history = model.fit(train_generator, epochs=num_epochs, verbose=train_verbose, callbacks=[SOMT(model, 1)]) + return model, train_history + else: + return model, train_generator + def measure_latency_any(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', path_to_model=None, block_size=1024*1024): - # TODO add support of hash to run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -193,10 +271,6 @@ def measure_latency_any(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -206,6 +280,8 @@ def measure_latency_any(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + backend = os.environ['BACKEND'] + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None @@ -213,20 +289,25 @@ def measure_latency_any(df, data_ori, task_name, sample_size, memory_optimized_result = None latency_optimized_result = None + + root_path = 'temp' + folder_name = 'deepmapping' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + print('[Generate File Path]: {}'.format(comp_data_dir)) + df_key = [df.columns[0]] list_y_encoded = [] list_y_encoder = [] size_encoder = 0 - for col in df.columns: - if col not in df_key: - encoded_val, encoder = encode_label(df[col]) - list_y_encoded.append(encoded_val) - list_y_encoder.append(encoder) - size_encoder += encoder.classes_.nbytes - num_tasks = len(list_y_encoded) - - for encoder in list_y_encoder: - print(len(encoder.classes_)) + exist_bit_arr = None + num_record_per_part = None + data_comp_size = None + max_len = None + x_start = None + x_end = None + num_tasks = None shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, @@ -242,45 +323,80 @@ def measure_latency_any(df, data_ori, task_name, sample_size, num_threads = 8 - x = df[df_key[0]].values.astype(np.int32) - max_len = len(str(np.max(x))) - y = np.array(list_y_encoded).T.astype(np.int32) - data = 
np.concatenate((x.reshape(-1,1), y), axis=1, dtype=np.int32) - print(data.nbytes/1024/1024) - if path_to_model is None: - model = tf.keras.models.load_model('models/nas/{}.h5'.format(task_name), compile=False) + if backend == 'tf': + model = tf.keras.models.load_model('models/nas/{}.h5'.format(task_name), compile=False) + model_size = model.count_params()*4/1024/1024 + elif backend == 'onnx': + model = ort.InferenceSession('models/nas/{}.onnx'.format(task_name), providers=['CUDAExecutionProvider']) + input_name = model.get_inputs()[0].name + model_size = calculate_params(onnx.load_model('models/nas/{}.onnx'.format(task_name)))*4/1024/1024 else: - model = tf.keras.models.load_model(path_to_model, compile=False) - train_generator = DataGenerator(x, list_y_encoded, 1024*2**4, max_len) + if backend == 'tf': + model = tf.keras.models.load_model(path_to_model, compile=False) + model_size = model.count_params()*4/1024/1024 + elif backend == 'onnx': + model = ort.InferenceSession(path_to_model, providers=['CUDAExecutionProvider']) + input_name = model.get_inputs()[0].name + model_size = calculate_params(onnx.load_model(path_to_model))*4/1024/1024 - # exist_bitarray - x_start = np.min(x) - x_end = np.max(x) - exist_bit_arr = bitarray('0')*(x_end - x_start + 1) - - for val in x: - exist_bit_arr[val-x_start] = 1 - print(sys.getsizeof(exist_bit_arr)/1024/1024) - - root_path = 'temp' - folder_name = 'ours-any' - comp_data_dir = os.path.join(root_path, task_name, folder_name) - print('[Generate File Path]: {}'.format(comp_data_dir)) # generate file if generate_file: ndb_utils.recreate_temp_dir(comp_data_dir) + exp_data_dict = dict() + + + for col in df.columns: + if col not in df_key: + encoded_val, encoder = encode_label(df[col]) + list_y_encoded.append(encoded_val) + list_y_encoder.append(encoder) + size_encoder += encoder.classes_.nbytes / 1024 / 1024 + num_tasks = len(list_y_encoded) + + exp_data_dict['list_y_encoder'] = list_y_encoder + exp_data_dict['num_tasks'] = num_tasks + exp_data_dict['size_encoder'] = size_encoder + + for encoder in list_y_encoder: + print(len(encoder.classes_)) + + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + exp_data_dict['max_len'] = max_len + y = np.array(list_y_encoded).T.astype(np.int32) + data = np.concatenate((x.reshape(-1,1), y), axis=1, dtype=np.int32) + print(data.nbytes/1024/1024) + + train_generator = DataGenerator(x, list_y_encoded, 1024*2**4, max_len) + + # exist_bitarray + x_start = np.min(x) + x_end = np.max(x) + exist_bit_arr = bitarray('0')*(x_end - x_start + 1) + + for val in x: + exist_bit_arr[val-x_start] = 1 + print(sys.getsizeof(exist_bit_arr)/1024/1024) + misclassified_index = [] for idx, (x_sub,y_sub) in tqdm(enumerate(train_generator), total=len(train_generator)): y_sub = list(y_sub.values()) - y_sub_pred = model(x_sub) + if backend == 'tf': + y_sub_pred = model(x_sub) + elif backend == 'onnx': + y_sub_pred = model.run(None, {input_name: np.expand_dims(x_sub, -1).astype(np.float32)}) mis_pred = [] for i in range(num_tasks): if num_tasks == 1: - mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) + if backend == 'tf': + mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) + if backend == 'onnx': + mis_pred.append(y_sub[i] != np.argmax(y_sub_pred[0], axis=1)) + # mis_pred.append(y_sub[i] != np.argmax(y_sub_pred, axis=1)) else: mis_pred.append(y_sub[i] != np.argmax(y_sub_pred[i], axis=1)) @@ -302,8 +418,6 @@ def measure_latency_any(df, data_ori, task_name, sample_size, if len(misclassified_data) == 0: 
misclassified_data = np.zeros((1,2)) record_size = misclassified_data[0].nbytes - # block_size = 1024 * 1024 - # block_size = 1024 * 512 num_record_per_part = np.floor(block_size / record_size) x_start = np.min(misclassified_data[:,0]) @@ -314,297 +428,262 @@ def measure_latency_any(df, data_ori, task_name, sample_size, list_comp_aux_blocks = [] comp_zstd_size = 0 + data_partition_idx = (misclassified_data[:, 0] - x_start) // num_record_per_part for block_idx in tqdm(range(num_partition)): - val_start, val_end = x_start + block_idx*num_record_per_part, x_start + (block_idx+1)*num_record_per_part - data_idx = np.logical_and(misclassified_data[:, 0] >= val_start, misclassified_data[:, 0] < val_end) + # val_start, val_end = x_start + block_idx*num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = data_partition_idx == block_idx + # data_idx = np.logical_and(misclassified_data[:, 0] >= val_start, misclassified_data[:, 0] < val_end) data_part = misclassified_data[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[:, 0], order='F').astype(np.int32) + if len(data_part) == 0: continue data_bytes = data_part.tobytes() - data_zstd_comp = zstd.compress(data_bytes,1) + data_zstd_comp = zstd.compress(data_bytes) list_comp_aux_blocks.append(data_zstd_comp) comp_zstd_size += sys.getsizeof(data_zstd_comp)/1024/1024 file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') ndb_utils.save_byte_to_disk(file_name, data_zstd_comp) data_ori_size = data_ori.nbytes/1024/1024 - data_comp_size = [size_encoder, comp_zstd_size, model.count_params()*4/1024/1024, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] + data_comp_size = [size_encoder, comp_zstd_size, model_size, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_comp_size)) - np.save(os.path.join(comp_data_dir, 'num_record_per_part'), num_record_per_part) + x = df[df_key[0]].values.astype(np.int32) + max_len = len(str(np.max(x))) + x_start = np.min(x) + x_end = np.max(x) + + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = [size_encoder, comp_zstd_size, model_size, sys.getsizeof(zstd.compress(exist_bit_arr.tobytes()))/1024/1024] + exp_data_dict['max_len'] = max_len + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_byte_to_disk(os.path.join(comp_data_dir, 'exist_bit_arr.data'), zstd.compress(exist_bit_arr.tobytes())) + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: - num_record_per_part = np.load(os.path.join(comp_data_dir, 'num_record_per_part.npy')) + exist_bit_arr = bitarray() + exist_bit_arr.frombytes(zstd.decompress(ndb_utils.read_bytes_from_disk(os.path.join(comp_data_dir, 'exist_bit_arr.data')))) + + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + max_len = exp_data_dict['max_len'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_y_encoder = exp_data_dict['list_y_encoder'] + num_tasks = exp_data_dict['num_tasks'] + size_encoder = 
exp_data_dict['size_encoder'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) - x = df[df_key[0]].values.astype(np.int32) - max_len = len(str(np.max(x))) - x_start = np.min(x) - x_end = np.max(x) + data_ori = data_ori[:2] + del df + gc.collect() shared_utils.create_fetures.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_long, ctypes.c_int] shared_utils.create_fetures_mutlt_thread_mgr.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_long, ctypes.c_int32, ctypes.c_int32] shared_utils.create_fetures_mutlt_thread_mgr.restype = ctypes.POINTER(ctypes.c_bool * (sample_size * max_len * 10)) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimzed strategy - if memory_optimized: - timer_creatfeatures = ndb_utils.Timer() - timer_nn = ndb_utils.Timer() - timer_aux_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_exist_lookup = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_remap = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_remap = 0 - t_locate_part = 0 - t_decomp = 0 - t_createfeatures = 0 - t_aux_lookup = 0 - t_nn = 0 - t_exist_lookup = 0 - t_total = 0 - t_sort = 0 - peak_memory = -1 - block_bytes_size = 0 - - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_aux_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = None - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - timer_creatfeatures.tic() - result = np.recarray((sample_size, ), dtype=data_ori.dtype) - result[df_key[0]] = sample_index + + timer_creatfeatures = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_nn = ndb_utils.Timer() + timer_aux_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_exist_lookup = ndb_utils.Timer() + timer_remap = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_remap = 0 + t_decomp = 0 + t_createfeatures = 0 + t_aux_lookup = 0 + t_nn = 0 + t_exist_lookup = 0 + t_total = 0 + t_sort = 0 + t_locate_part = 0 + t_build_index = 0 + block_bytes_size = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_aux_block = dict() + num_decomp = 0 + count_nonexist = 0 + peak_memory = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size, ), dtype=data_ori.dtype) + result[df_key[0]] = sample_index + if mode == 'edge': + edge_batch_size = 5000 + timer_creatfeatures.tic() + inference_generator = InferenceDataGenerator(sample_index, edge_batch_size, max_len) + for idx, (x_sub) in enumerate(inference_generator): + t_createfeatures += timer_creatfeatures.toc() + timer_nn.tic() + if backend == 'tf': + 
y_nn_pred = model(x_sub) + elif backend == 'onnx': + y_nn_pred = model.run(None, {input_name: np.expand_dims(x_sub, -1).astype(np.float32)}) + + for i in range(num_tasks): + if num_tasks == 1 and backend == 'onnx': + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred[0], axis=1) + elif num_tasks == 1 and backend == 'tf': + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred, axis=1) + else: + col_name = data_ori.dtype.names[i+1] + result[col_name][idx*edge_batch_size:(idx+1)*edge_batch_size] = np.argmax(y_nn_pred[i], axis=1) + t_nn += timer_nn.toc() + timer_creatfeatures.tic() + + else: + timer_creatfeatures.tic() + x_features_arr = np.zeros(sample_size * max_len * 10, dtype=bool) x_features_arr_ptr = shared_utils.create_fetures_mutlt_thread_mgr( x_features_arr, sample_index, sample_size, max_len, num_threads) sampled_features = np.frombuffer( x_features_arr_ptr.contents, dtype=bool).reshape(sample_size, -1) + # sampled_features = ndb_utils.create_features(sample_index, max_len)[0] t_createfeatures += timer_creatfeatures.toc() - # --------- timer_nn.tic() - y_nn_pred = model(sampled_features) + if backend == 'tf': + y_nn_pred = model(sampled_features) + elif backend == 'onnx': + y_nn_pred = model.run(None, {input_name: np.expand_dims(sampled_features, -1).astype(np.float32)}) + for i in range(num_tasks): - if num_tasks == 1: + if num_tasks == 1 and backend == 'onnx': + col_name = data_ori.dtype.names[i+1] + result[col_name] = np.argmax(y_nn_pred[0], axis=1) + elif num_tasks == 1 and backend == 'tf': col_name = data_ori.dtype.names[i+1] result[col_name] = np.argmax(y_nn_pred, axis=1) else: col_name = data_ori.dtype.names[i+1] result[col_name] = np.argmax(y_nn_pred[i], axis=1) t_nn += timer_nn.toc() - for idx, val in enumerate(sample_index_sorted): - # ------ non exist look up - timer_exist_lookup.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - exist_flag = exist_bit_arr[query_key-x_start] == 1 + + for idx, val in enumerate(sample_index): + # ------ non exist look up + timer_exist_lookup.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + exist_flag = exist_bit_arr[query_key-x_start] == 1 + + if not exist_flag: + result[query_key_index_in_old] = -1 + count_nonexist += 1 + t_exist_lookup += timer_exist_lookup.toc() + else: + # misclassified lookup t_exist_lookup += timer_exist_lookup.toc() - if not exist_flag: - result[idx] = -1 - count_nonexist += 1 - t_exist_lookup += timer_exist_lookup.toc() + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_aux_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_aux_block = ndb_utils.evict_unused_partition(decomp_aux_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] = 1 + + + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + + if not os.path.exists(file_name): + continue + block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) + data_uncomp = np.frombuffer( + # zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') + zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1) + # 
decomp_aux_block[part_idx] = data_uncomp + try: + decomp_aux_block[part_idx] = data_uncomp + except: + decomp_aux_block = dict() + decomp_aux_block[part_idx] = data_uncomp + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_zstd_comp) + prev_part_idx = part_idx + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(data_uncomp)): + data_entry_key = data_uncomp[block_data_idx, 0] + # print(data_entry_key) + data_entry_val = data_uncomp[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += data_uncomp.nbytes else: - # misclassified lookup - t_exist_lookup += timer_exist_lookup.toc() - timer_locate_part.tic() - part_idx = int((query_key - x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - if not os.path.exists(file_name): - continue - block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) - current_memory = sys.getsizeof(block_zstd_comp) - data_uncomp = np.frombuffer(zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') - - decomp_aux_block = data_uncomp - num_decomp += 1 - current_memory += data_uncomp.nbytes - prev_part_idx = part_idx - if current_memory > peak_memory: - peak_memory = current_memory - + data_uncomp = decomp_aux_block[part_idx] + partition_hit[part_idx] +=1 + t_decomp += timer_decomp.toc() + timer_aux_lookup.tic() + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(data_uncomp[:,0], query_key, len(data_uncomp)) + if data_idx != -1: + result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) else: - data_uncomp = decomp_aux_block - - t_decomp += timer_decomp.toc() - timer_aux_lookup.tic() + count_nonexist += 1 + elif search_algo == 'binary_c': data_idx = shared_utils.aux_look_up_bin(data_uncomp[:,0], query_key, len(data_uncomp)) if data_idx != -1: result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - t_aux_lookup += timer_aux_lookup.toc() - - timer_remap.tic() - for i in range(num_tasks): - col_name = data_ori.dtype.names[i+1] - fun_a = lambda x: list_y_encoder[i].classes_[x] - result[col_name] = fun_a(result[col_name].astype(np.int32)) - t_remap += timer_remap.toc() - t_total += timer_total.toc() - - - peak_memory += exist_bit_arr.nbytes - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, - t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model.count_params()*4/1024/1024)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_creatfeatures = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - timer_nn = ndb_utils.Timer() - timer_aux_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_exist_lookup = ndb_utils.Timer() - timer_remap = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - t_remap = 0 - t_decomp = 0 - t_createfeatures = 0 - t_aux_lookup = 0 - t_nn = 0 - t_exist_lookup = 0 - 
t_total = 0 - t_sort = 0 - t_locate_part = 0 - t_build_index = 0 - block_bytes_size = 0 - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_aux_block = dict() - num_decomp = 0 - count_nonexist = 0 - peak_memory = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - timer_creatfeatures.tic() - result = np.recarray((sample_size, ), dtype=data_ori.dtype) - result[df_key[0]] = sample_index - x_features_arr = np.zeros(sample_size * max_len * 10, dtype=bool) - x_features_arr_ptr = shared_utils.create_fetures_mutlt_thread_mgr( - x_features_arr, sample_index, sample_size, max_len, num_threads) - sampled_features = np.frombuffer( - x_features_arr_ptr.contents, dtype=bool).reshape(sample_size, -1) - # sampled_features = ndb_utils.create_features(sample_index, max_len)[0] - - t_createfeatures += timer_creatfeatures.toc() - timer_nn.tic() - y_nn_pred = model(sampled_features) - - for i in range(num_tasks): - if num_tasks == 1: - col_name = data_ori.dtype.names[i+1] - result[col_name] = np.argmax(y_nn_pred, axis=1) - else: - col_name = data_ori.dtype.names[i+1] - result[col_name] = np.argmax(y_nn_pred[i], axis=1) - t_nn += timer_nn.toc() - - for idx, val in enumerate(sample_index): - # ------ non exist look up - timer_exist_lookup.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - exist_flag = exist_bit_arr[query_key-x_start] == 1 - - if not exist_flag: - result[query_key_index_in_old] = -1 - count_nonexist += 1 - t_exist_lookup += timer_exist_lookup.toc() - else: - # misclassified lookup - t_exist_lookup += timer_exist_lookup.toc() - timer_locate_part.tic() - part_idx = int((query_key - x_start) // num_record_per_part) - - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx not in decomp_aux_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - if not os.path.exists(file_name): - continue - block_zstd_comp = ndb_utils.read_bytes_from_disk(file_name) - data_uncomp = np.frombuffer( - zstd.decompress(block_zstd_comp), dtype=np.int32).reshape(-1, num_tasks+1).copy(order='F') - decomp_aux_block[part_idx] = data_uncomp - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_zstd_comp) - prev_part_idx = part_idx - - # TODO add size computation for hash approach - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(data_uncomp)): - data_entry_key = data_uncomp[block_data_idx, 0] - # print(data_entry_key) - data_entry_val = data_uncomp[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += data_uncomp.nbytes else: - data_uncomp = decomp_aux_block[part_idx] - t_decomp += timer_decomp.toc() - timer_aux_lookup.tic() - if search_algo == 'binary': - # TODO code can be optimized at revision stage - data_idx = ndb_utils.binary_search(data_uncomp[:,0], query_key, len(data_uncomp)) - if data_idx != -1: - result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - else: - count_nonexist += 1 - elif search_algo == 'binary_c': - data_idx = shared_utils.aux_look_up_bin(data_uncomp[:,0], query_key, 
len(data_uncomp)) - if data_idx != -1: - result[query_key_index_in_old] = tuple(data_uncomp[data_idx]) - else: - count_nonexist += 1 - elif search_algo == 'hash': - if query_key in data_hash.keys(): - result[query_key_index_in_old] = tuple(data_hash[query_key]) - - t_aux_lookup += timer_aux_lookup.toc() - - if cache_block_memory + block_bytes_size > peak_memory: - peak_memory = cache_block_memory + block_bytes_size - - timer_remap.tic() - for i in range(num_tasks): - col_name = data_ori.dtype.names[i+1] - fun_a = lambda x: list_y_encoder[i].classes_[x] - result[col_name] = fun_a(result[col_name].astype(np.int32)) - t_remap += timer_remap.toc() - t_total += timer_total.toc() - - peak_memory += exist_bit_arr.nbytes + count_nonexist += 1 + elif search_algo == 'hash': + if query_key in data_hash.keys(): + result[query_key_index_in_old] = tuple(data_hash[query_key]) + + t_aux_lookup += timer_aux_lookup.toc() + + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + + timer_remap.tic() + for i in range(num_tasks): + col_name = data_ori.dtype.names[i+1] + fun_a = lambda x: list_y_encoder[i].classes_[x] + result[col_name] = fun_a(result[col_name].astype(np.int32)) + t_remap += timer_remap.toc() + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model.count_params()*4/1024/1024)).T + del result + gc.collect() + peak_memory += exist_bit_arr.nbytes + latency_optimized_latency = np.array((data_ori_size, np.sum(data_comp_size), sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, t_createfeatures / num_loop, t_nn / num_loop, t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_aux_lookup / num_loop, t_exist_lookup / num_loop, t_remap / num_loop, t_total / num_loop, num_decomp, count_nonexist, exist_bit_arr.nbytes/1024/1024, model_size)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/delta_compression.py b/DeepMapping/DeepMapping/delta_compression.py index 18e8856..03c1fde 100644 --- a/DeepMapping/DeepMapping/delta_compression.py +++ b/DeepMapping/DeepMapping/delta_compression.py @@ -1,15 +1,17 @@ -import pandas as pd +import gc import numpy as np -import sys import math import os +import pandas as pd +import sys +import warnings from DeepMapping import ndb_utils from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -24,10 +26,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -37,17 +35,19 @@ def 
measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() list_type = [] for col in data_ori.dtype.names: - if data_ori[col].dtype == object: - list_type.append({'names': [col], 'formats': ['O'], 'offsets': [0], 'itemsize': 8}) + if data_ori[col].dtype == 'S8': + list_type.append((col, 'S8')) elif data_ori[col].dtype == np.float64: list_type.append(np.float64) else: @@ -129,235 +129,145 @@ def measure_latency(df, data_ori, task_name, sample_size, data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) np.save(os.path.join(comp_data_dir, 'list_delta_enabled'), list_delta_enabled) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + exp_data_dict['diff_dtype'] = diff_dtype + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) else: list_delta_enabled = np.load(os.path.join(comp_data_dir, 'list_delta_enabled.npy')) - - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + diff_dtype = exp_data_dict['diff_dtype'] + list_type = exp_data_dict['list_type'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - 
part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - curr_block = [] - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - current_memory += sys.getsizeof(block_bytes) - current_memory += block_data.nbytes - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(list_delta_enabled)): - col_name = data_ori.dtype.names[i] - - if list_delta_enabled[i] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - - curr_decomp_block[col_name] = block_data - current_memory += sys.getsizeof(block_bytes) - current_memory += block_data.nbytes - else: - # delta - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) - init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) - col_value = np.zeros(len(delta_data)+1, dtype=np.int32) - curr_value = init_value[0] - - for i in range(0, len(delta_data)): - col_value[i] = curr_value - curr_value += delta_data[i] - - curr_decomp_block[col_name] = col_value - current_memory += delta_data.nbytes - current_memory += init_value.nbytes - current_memory += col_value.nbytes - - decomp_block = curr_decomp_block - current_memory += decomp_block.nbytes - num_decomp += 1 - - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + decomp_memory = 0 + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 
1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - decomp_memory = 0 - - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, len(list_delta_enabled)): - col_name = data_ori.dtype.names[i] - - if list_delta_enabled[i] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - decomp_memory += sys.getsizeof(block_bytes) + for i in range(1, len(list_delta_enabled)): + col_name = data_ori.dtype.names[i] - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] + if list_delta_enabled[i] == False: + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[col_name] = block_data + if list_type[i] == np.int32 or list_type[i] == np.float64: + block_data = np.frombuffer(block_bytes, 
dtype=list_type[i]) else: - # delta - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) - init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) - col_value = np.zeros(len(delta_data)+1, dtype=np.int32) - curr_value = init_value[0] - - for i in range(0, len(delta_data)): - col_value[i] = curr_value - curr_value += delta_data[i] - - curr_decomp_block[col_name] = col_value - decomp_memory += delta_data.nbytes - decomp_memory += init_value.nbytes - decomp_memory += col_value.nbytes - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() + block_data = np.frombuffer(block_bytes, dtype=list_type[i])[col_name] + + curr_decomp_block[col_name] = block_data + else: + # delta + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + delta_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=diff_dtype) + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}-init.data'.format(i)) + init_value = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name), dtype=np.int32) + col_value = np.zeros(len(delta_data)+1, dtype=np.int32) + curr_value = init_value[0] + + for i in range(0, len(delta_data)): + col_value[i] = curr_value + curr_value += delta_data[i] + + curr_decomp_block[col_name] = col_value + decomp_memory += delta_data.nbytes + decomp_memory += init_value.nbytes + decomp_memory += col_value.nbytes + + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key + t_decomp += timer_decomp.toc() + timer_lookup.tic() - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + 
latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/dgpe_compression.py b/DeepMapping/DeepMapping/dgpe_compression.py index 6f25507..22608b9 100644 --- a/DeepMapping/DeepMapping/dgpe_compression.py +++ b/DeepMapping/DeepMapping/dgpe_compression.py @@ -79,7 +79,7 @@ def dge_decompression(dge_comp_data, required_bits, dge_col_min_vals): def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -94,10 +94,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -181,193 +177,99 @@ def measure_latency(df, data_ori, task_name, sample_size, list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - current_memory = 0 - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') - block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) - current_memory += sys.getsizeof(block_dge_bytes) - data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) - data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) - current_memory += data_int_part.nbytes - curr_decomp_block = np.recarray((len(data_dge_part),), 
dtype=data_ori.dtype) - - if len(non_int_data) != 0: - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') - block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) - data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) - - for i in range(len(non_int_cols)): - curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - - current_memory += sys.getsizeof(block_nonint_bytes) - - for i in range(len(int_cols)): - curr_decomp_block[int_cols[i]] = data_int_part[:, i] - - num_decomp += 1 - current_memory += curr_decomp_block.nbytes - - if current_memory > peak_memory: - peak_memory = current_memory - - prev_part_idx = part_idx - decomp_block = curr_decomp_block - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + current_memory = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + # ----- + decomp_memory = 0 + if part_idx not in decomp_block: + file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') + block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) + decomp_memory += sys.getsizeof(block_dge_bytes) + data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) + data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) + decomp_memory += data_int_part.nbytes + curr_decomp_block = np.recarray((len(data_dge_part),), dtype=data_ori.dtype) - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) 
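Both the removed memory-optimized path and the retained latency-optimized path in these hunks locate a record's partition arithmetically rather than through an index: the key space is split into fixed ranges of num_record_per_part keys starting at x_start, so part_idx = int((query_key - x_start) // num_record_per_part). A tiny illustration with hypothetical numbers:

# hypothetical layout: keys start at 1000, 256 records per partition
x_start, num_record_per_part = 1000, 256

def locate_partition(query_key):
    # same arithmetic as in the hunks: offset from the smallest key,
    # integer-divided by the partition width
    return int((query_key - x_start) // num_record_per_part)

print(locate_partition(1000))  # 0 (first partition)
print(locate_partition(1300))  # 1 (offset 300 falls in the second 256-key range)
print(locate_partition(1999))  # 3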
- t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - current_memory = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - # ----- - decomp_memory = 0 - if part_idx not in decomp_block: - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-dge.data') - block_dge_bytes = ndb_utils.read_bytes_from_disk(file_name2) - decomp_memory += sys.getsizeof(block_dge_bytes) - data_dge_part = np.frombuffer(block_dge_bytes, dtype=dge_dtype) - data_int_part = dge_decompression(data_dge_part, required_bits, dge_col_min_vals) - decomp_memory += data_int_part.nbytes - curr_decomp_block = np.recarray((len(data_dge_part),), dtype=data_ori.dtype) - - if len(non_int_data) != 0: - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') - block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) - decomp_memory += sys.getsizeof(block_nonint_bytes) - data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) - current_memory += sys.getsizeof(data_non_int_part) - for i in range(len(non_int_cols)): - curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - - peak_memory += data_non_int_part.nbytes - - for i in range(len(int_cols)): - curr_decomp_block[int_cols[i]] = data_int_part[:, i] - + if len(non_int_data) != 0: + file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-nonint.data') + block_nonint_bytes = ndb_utils.read_bytes_from_disk(file_name1) + decomp_memory += sys.getsizeof(block_nonint_bytes) + data_non_int_part = np.rec.array(block_nonint_bytes, dtype=non_int_data.dtype) + current_memory += sys.getsizeof(data_non_int_part) for i in range(len(non_int_cols)): curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 + peak_memory += data_non_int_part.nbytes + + for i in range(len(int_cols)): + curr_decomp_block[int_cols[i]] = data_int_part[:, i] + + for i in range(len(non_int_cols)): + curr_decomp_block[non_int_cols[i]] = data_non_int_part[non_int_cols[i]] - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + cache_block_memory += curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + 
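The per-partition lookup that follows calls ndb_utils.binary_search(sorted_keys, query_key, length) and treats a negative return value as "key not present". ndb_utils itself is not part of this diff, so the sketch below is only an assumption about that helper's contract, written to match the data_idx >= 0 checks used throughout these modules:

import numpy as np

def binary_search(arr, target, length):
    # Classic binary search over the sorted key column arr[:length].
    # Returns the matching index, or -1 when the key is absent.
    low, high = 0, length - 1
    while low <= high:
        mid = (low + high) // 2
        if arr[mid] == target:
            return mid
        if arr[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return -1

keys = np.array([2, 5, 9, 12], dtype=np.int32)
print(binary_search(keys, 9, len(keys)))   # 2
print(binary_search(keys, 7, len(keys)))   # -1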
timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = curr_decomp_block[data_idx] + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/generate_sample_index.py b/DeepMapping/DeepMapping/generate_sample_index.py new file mode 100644 index 0000000..fa0abb4 --- /dev/null +++ b/DeepMapping/DeepMapping/generate_sample_index.py @@ -0,0 +1,43 @@ +import os +import itertools +from DeepMapping import ndb_utils + +# Generate the sample index for TPC-H, SF=1 experiments for re-use in non_generate_file mode + +list_dataset = ['tpch-s1/customer', 'tpch-s1/lineitem', 'tpch-s1/orders', 'tpch-s1/part', 'tpch-s1/supplier'] + +for dataset in list_dataset: + path_to_meta = os.path.join('temp', dataset, 'uncompress/extra_meta.data') + print('[INFO] Generating sample index for', dataset) + extra_meta = ndb_utils.load_obj_from_disk_with_pickle(path_to_meta) + x_start = extra_meta['x_start'] + x_end = extra_meta['x_end'] + num_query = 5 + for sample_size in [1000, 10000, 100000]: + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + + ndb_utils.save_obj_to_disk_with_pickle(os.path.join('temp', dataset, 'sample_index_{}.data'.format(sample_size)), + list_sample_index) + + + +# Generate the sample index for data manipulation experiments for re-use in non_generate_file mode + +list_dataset = ['data_manipulation/single_column_low_corr_100m', + 'data_manipulation/single_column_high_corr_100m', + 'data_manipulation/multi_column_low_corr_100m', + 'data_manipulation/multi_column_high_corr_100m'] + + +for dataset in list_dataset: + path_to_meta = os.path.join('temp', dataset, 'uncompress', 'Default', 'extra_meta.data') + print('[INFO] Generating sample index for', dataset) + extra_meta = ndb_utils.load_obj_from_disk_with_pickle(path_to_meta) + x_start = extra_meta['x_start'] + x_end = extra_meta['x_end'] + num_query = 5 + for sample_size in [1000, 10000, 100000]: + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + + ndb_utils.save_obj_to_disk_with_pickle(os.path.join('temp', dataset, 'sample_index_{}.data'.format(sample_size)), + list_sample_index) \ No newline at end of file diff --git a/DeepMapping/DeepMapping/hashtable.py b/DeepMapping/DeepMapping/hashtable.py new file mode 100644 index 0000000..280a62d --- /dev/null +++ b/DeepMapping/DeepMapping/hashtable.py @@ -0,0 +1,229 @@ +import ctypes +import gc +import math +import numpy as np +import os +import pandas as pd +import sys +from DeepMapping import ndb_utils +from tqdm.auto import tqdm + +ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, + ndim=1, + 
flags="C") +ND_POINTER_2 = np.ctypeslib.ndpointer(dtype=np.int32, + ndim=1, + flags="C") + +shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file +shared_utils.aux_look_up_bin.argtypes = [ND_POINTER_2, ctypes.c_int, ctypes.c_long] +shared_utils.aux_look_up_bin.restype = ctypes.c_long + +def measure_latency(df, data_ori, task_name, sample_size, + generate_file=True, + num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): + """Measure the end-end latency of data query + + Args: + df : dataframe + dataset in pd.dataframe format + data_ori : np.record + dataset in np.record format + task_name : str + task name + sample_size : int + number of queried data per query + generate_file : bool + whether need to store the data to disk + num_loop : int + number of loops to run for measuring the latency + num_query : int + number of queries + search_algo : str + search algorithm that applied to search entry in each partition, available strategy: naive, binary, hash + path_to_model : str + load model from custom path + """ + mode = os.environ['MODE'] + data_ori_size = 0 + data_comp_size = 0 + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None + exp_data_dict = dict() + + key = df.columns[0] + record_size = data_ori[0].nbytes + num_record_per_part = np.floor(block_size / record_size) + x = data_ori[key] + x_start = np.min(x) + x_end = np.max(x) + x_range = x_end - x_start + num_partition = int(math.ceil(x_range / num_record_per_part)) + print('[Partition] Size {} Per Partition, # Partition: {}'.format(record_size*num_record_per_part/1024, num_partition)) + root_path = 'temp' + task_name = task_name + folder_name = 'hashtable' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + + print('[Generate File Path]: {}'.format(comp_data_dir)) + + dict_contigous_key = dict() + + # generate file + if generate_file: + ndb_utils.recreate_temp_dir(comp_data_dir) + data_size = 0 + + for block_idx in tqdm(range(num_partition)): + val_start, val_end = x_start + block_idx * \ + num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = np.logical_and(x >= val_start, x < val_end) + data_part = data_ori[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) + + if len(data_part) == 0: + continue + data_part_hash_table = dict() + for data_idx in range(len(data_part)): + data_part_hash_table[data_part[key][data_idx]] = data_part[data_idx] + + file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') + ndb_utils.save_hashtable_to_disk(file_name, data_part_hash_table) + data_size += os.path.getsize(file_name) + + data_ori_size = data_ori.nbytes/1024/1024 + data_comp_size = data_size/1024/1024 + print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = 
ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + # if search_algo == 'hash': + # data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + curr_decomp_block = ndb_utils.load_hashtable_from_disk(file_name) + try: + decomp_block[part_idx] = curr_decomp_block + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + block_bytes_size = ndb_utils.get_nested_dict_size(curr_decomp_block) + + cache_block_memory += block_bytes_size + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + pass + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + + data_idx = query_key in curr_decomp_block.keys() + if data_idx == True: + result[query_key_index_in_old] = tuple(curr_decomp_block[query_key]) + else: + count_nonexist += 1 + + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory > peak_memory: + peak_memory = cache_block_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + del result + gc.collect() + print('[DEBUG] number of decompressed partition', len(decomp_block)) + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup 
/ num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None + if memory_optimized_latency is None and latency_optimized_latency is not None: + return_latency = latency_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is None: + return_latency = memory_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is not None: + return_latency = np.vstack((memory_optimized_latency, latency_optimized_latency)) + + return data_ori_size, data_comp_size, [memory_optimized_result, latency_optimized_result], return_latency diff --git a/DeepMapping/DeepMapping/hashtable_with_compression.py b/DeepMapping/DeepMapping/hashtable_with_compression.py new file mode 100644 index 0000000..0969a48 --- /dev/null +++ b/DeepMapping/DeepMapping/hashtable_with_compression.py @@ -0,0 +1,238 @@ +import ctypes +import gc +import json +import math +import numpy as np +import os +import pandas as pd +import pickle +import sys +import zstd +from DeepMapping import ndb_utils +from tqdm.auto import tqdm + +ND_POINTER_1 = np.ctypeslib.ndpointer(dtype=np.bool_, + ndim=1, + flags="C") +ND_POINTER_2 = np.ctypeslib.ndpointer(dtype=np.int32, + ndim=1, + flags="C") + +shared_utils = ctypes.CDLL(os.path.abspath("shared_utils.so")) # Or full path to file +shared_utils.aux_look_up_bin.argtypes = [ND_POINTER_2, ctypes.c_int, ctypes.c_long] +shared_utils.aux_look_up_bin.restype = ctypes.c_long + +def measure_latency(df, data_ori, task_name, sample_size, + generate_file=True, + num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): + """Measure the end-end latency of data query + + Args: + df : dataframe + dataset in pd.dataframe format + data_ori : np.record + dataset in np.record format + task_name : str + task name + sample_size : int + number of queried data per query + generate_file : bool + whether need to store the data to disk + num_loop : int + number of loops to run for measuring the latency + num_query : int + number of queries + search_algo : str + search algorithm that applied to search entry in each partition, available strategy: naive, binary, hash + path_to_model : str + load model from custom path + """ + mode = os.environ['MODE'] + data_ori_size = 0 + data_comp_size = 0 + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None + exp_data_dict = dict() + + key = df.columns[0] + record_size = data_ori[0].nbytes + num_record_per_part = np.floor(block_size / record_size) + x = data_ori[key] + x_start = np.min(x) + x_end = np.max(x) + x_range = x_end - x_start + num_partition = int(math.ceil(x_range / num_record_per_part)) + print('[Partition] Size {} Per Partition, # Partition: {}'.format(record_size*num_record_per_part/1024, num_partition)) + root_path = 'temp' + task_name = task_name + folder_name = 'hashtable_with_compression' + comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + + print('[Generate File Path]: {}'.format(comp_data_dir)) + + dict_contigous_key = dict() + + # generate file + if generate_file: + ndb_utils.recreate_temp_dir(comp_data_dir) + data_size = 0 + + for block_idx in tqdm(range(num_partition)): + val_start, val_end = x_start + block_idx * \ + num_record_per_part, x_start + (block_idx+1)*num_record_per_part + data_idx = np.logical_and(x >= val_start, x < 
val_end) + data_part = data_ori[data_idx] + if search_algo == 'binary_c': + dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) + + if len(data_part) == 0: + continue + data_part_hash_table = dict() + for data_idx in range(len(data_part)): + data_part_hash_table[data_part[key][data_idx]] = data_part[data_idx] + + data_part_hash_table_bytes = pickle.dumps(data_part_hash_table) + # data_size += sys.getsizeof(data_bytes) + file_name = os.path.join(comp_data_dir, str(block_idx) + '.data') + + # data_part_hash_table_bytes = zstd.compress(json.dumps(data_part_hash_table).encode('utf-8')) + # ndb_utils.save_hashtable_to_disk(file_name, data_part_hash_table) + ndb_utils.save_byte_to_disk(file_name, zstd.compress(data_part_hash_table_bytes)) + data_size += os.path.getsize(file_name) + + data_ori_size = data_ori.nbytes/1024/1024 + data_comp_size = data_size/1024/1024 + print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + # if search_algo == 'hash': + # data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.recarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + # part_idx = int((query_key-x_start) // num_record_per_part) + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + 
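hashtable_with_compression.py stores each partition as a plain Python dict keyed by the record key, pickled and then zstd-compressed; the query path reverses this with pickle.loads(zstd.uncompress(...)), and the edge-mode check here evicts cold partitions (see the call on the next line) before loading another one when less than roughly 100 MB of memory is available. A minimal round trip of that storage format, using the same zstd and pickle calls as the hunk; the helper names and the file path are hypothetical:

import pickle
import zstd

def save_partition_hashtable(file_name, table):
    # pickle the per-partition dict, then zstd-compress the bytes before writing
    with open(file_name, 'wb') as f:
        f.write(zstd.compress(pickle.dumps(table)))

def load_partition_hashtable(file_name):
    # reverse the steps: read, zstd-uncompress, unpickle
    with open(file_name, 'rb') as f:
        return pickle.loads(zstd.uncompress(f.read()))

# tiny hypothetical partition keyed by record id
part = {42: ('item-42', 3.14), 43: ('item-43', 2.72)}
save_partition_hashtable('0.data', part)
print(load_partition_hashtable('0.data')[42])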
decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = pickle.loads(zstd.uncompress(block_bytes)) + # curr_decomp_block = ndb_utils.load_hashtable_from_disk(file_name) + try: + decomp_block[part_idx] = curr_decomp_block + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + load_block_bytes = sys.getsizeof(block_bytes) + block_bytes_size = ndb_utils.get_nested_dict_size(curr_decomp_block) + + cache_block_memory += block_bytes_size + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + pass + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] += 1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + data_idx = query_key in curr_decomp_block.keys() + if data_idx == True: + result[query_key_index_in_old] = tuple(curr_decomp_block[query_key]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + load_block_bytes > peak_memory: + peak_memory = cache_block_memory + load_block_bytes + t_total += timer_total.toc() + latency_optimized_result = result.copy() + del result + gc.collect() + print('[DEBUG] number of decompressed partition', len(decomp_block)) + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + + return_latency = None + if memory_optimized_latency is None and latency_optimized_latency is not None: + return_latency = latency_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is None: + return_latency = memory_optimized_latency.reshape((1,-1)) + elif memory_optimized_latency is not None and latency_optimized_latency is not None: + return_latency = np.vstack((memory_optimized_latency, latency_optimized_latency)) + + return data_ori_size, data_comp_size, [memory_optimized_result, latency_optimized_result], return_latency diff --git a/DeepMapping/DeepMapping/lzo_compression.py b/DeepMapping/DeepMapping/lzo_compression.py index 90b48fc..ce0ea2a 100644 --- a/DeepMapping/DeepMapping/lzo_compression.py +++ b/DeepMapping/DeepMapping/lzo_compression.py @@ -1,16 +1,17 @@ -import pandas as pd -import numpy as np -import sys +import gc +import lzo import math +import numpy as np import os -import lzo +import pandas as pd +import sys from DeepMapping import ndb_utils from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -25,10 +26,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency 
num_query : int @@ -38,12 +35,18 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() + memory_optimized_latency = None + latency_optimized_latency = None + memory_optimized_result = None + latency_optimized_result = None key = df.columns[0] block_size = 1024 * 1024 record_size = data_ori[0].nbytes @@ -83,155 +86,118 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 - peak_memory = 0 - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(lzo.decompress(block_bytes), dtype=data_ori.dtype) - # curr_decomp_block = np.frombuffer(block_bytes, dtype=np.int32).reshape(-1, num_cols) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - 
curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(lzo.decompress(block_bytes), dtype=data_ori.dtype) - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - cache_block_memory += curr_decomp_block.nbytes - block_bytes_size = sys.getsizeof(block_bytes) - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - result_idx += 1 - if cache_block_memory + block_bytes_size > peak_memory: - peak_memory = cache_block_memory + block_bytes_size - t_total += timer_total.toc() + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + 
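partition_hit, initialized just below, counts how many times each cached partition is looked up; in edge mode the branches added across this diff (byte-dictionary, lzo, rle, and both hashtable variants) pass it to ndb_utils.evict_unused_partition together with the partition cache whenever less than about 100 MB of memory remains. That helper is not included in this diff; one plausible sketch, assuming it drops the least-hit partitions until roughly free_memory bytes of cached blocks have been released:

import sys

def evict_unused_partition(decomp_block, partition_hit, free_memory=1024 * 1024 * 100):
    # Hypothetical sketch: evict cold partitions (lowest hit count first)
    # until roughly free_memory bytes worth of cached blocks are dropped.
    freed = 0
    for part_idx in sorted(decomp_block, key=lambda p: partition_hit.get(p, 0)):
        block = decomp_block.pop(part_idx)
        freed += getattr(block, 'nbytes', sys.getsizeof(block))
        if freed >= free_memory:
            break
    return decomp_block

The call sites reassign the return value, so returning the (now smaller) cache dict keeps the sketch compatible with how the hunks use it.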
partition_hit = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + # part_idx = int((query_key-x_start) // num_record_per_part) + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + curr_decomp_block = ndb_utils.evict_unused_partition(curr_decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] =1 + + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(lzo.decompress(block_bytes), dtype=data_ori.dtype) + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + cache_block_memory += curr_decomp_block.nbytes + block_bytes_size = sys.getsizeof(block_bytes) + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/rle_compression.py b/DeepMapping/DeepMapping/rle_compression.py index 2aeb0bc..0415ce0 100644 --- a/DeepMapping/DeepMapping/rle_compression.py +++ b/DeepMapping/DeepMapping/rle_compression.py @@ -1,15 +1,16 @@ -import pandas as pd -import numpy as np -import sys +import gc import math 
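In the lzo hunk above, each cached partition is rebuilt with np.frombuffer(lzo.decompress(block_bytes), dtype=data_ori.dtype); the generate-file side (unchanged in this diff) presumably stores lzo.compress of the partition's raw record bytes. A minimal round trip under that assumption, with a hypothetical two-column record layout:

import lzo
import numpy as np

# hypothetical record layout: (int32 key, float64 value)
rec_dtype = np.dtype([('key', np.int32), ('val', np.float64)])
part = np.zeros(4, dtype=rec_dtype)
part['key'] = np.arange(4, dtype=np.int32)
part['val'] = [0.5, 1.5, 2.5, 3.5]

block_bytes = lzo.compress(part.tobytes())                               # what the writer would store
restored = np.frombuffer(lzo.decompress(block_bytes), dtype=rec_dtype)   # what the query path does
assert (restored['key'] == part['key']).all()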
+import numpy as np import os +import pandas as pd +import sys from DeepMapping import ndb_utils from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk from more_itertools import run_length from tqdm.auto import tqdm def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary'): """Measure the end-end latency of data query @@ -24,10 +25,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -37,12 +34,14 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] block_size = 1024 * 1024 record_size = data_ori[0].nbytes @@ -62,8 +61,8 @@ def measure_latency(df, data_ori, task_name, sample_size, list_type = [] for col in data_ori.dtype.names: - if data_ori[col].dtype == object: - list_type.append({'names': [col], 'formats': ['O'], 'offsets': [0], 'itemsize': 8}) + if data_ori[col].dtype == 'S8': + list_type.append('S8') elif data_ori[col].dtype == np.int32: list_type.append(np.int32) elif data_ori[col].dtype == np.float64: @@ -109,7 +108,7 @@ def measure_latency(df, data_ori, task_name, sample_size, temp_dtype = {'names': [data_ori.dtype.names[col_idx]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} else: temp_dtype = list_type[col_idx] - a = np.recarray((len(col_val_rle_encode),), dtype=temp_dtype) + a = np.ndarray((len(col_val_rle_encode),), dtype=temp_dtype) b = np.zeros(len(col_val_rle_encode), np.int32) for idx, val in enumerate(col_val_rle_encode): a[idx] = val[0] @@ -124,228 +123,143 @@ def measure_latency(df, data_ori, task_name, sample_size, data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) np.save(os.path.join(comp_data_dir, 'list_rle_enabled'), list_rle_enabled) + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + exp_data_dict['list_type'] = list_type + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) else: list_rle_enabled = np.load(os.path.join(comp_data_dir, 'list_rle_enabled.npy')) + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_type = exp_data_dict['list_type'] + list_sample_index = 
ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 - peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + timer_total.tic() - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) + for _ in tqdm(range(num_loop)): + partition_hit = dict() + decomp_block = dict() + peak_memory = 0 + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - if part_idx != prev_part_idx: - current_memory = 0 - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - current_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data - - for i in range(1, df.shape[1]): - col_name = data_ori.dtype.names[i] + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + timer_locate_part.tic() + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + t_locate_part += timer_locate_part.toc() - if list_rle_enabled[i + part_idx*df.shape[1]] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - current_memory += sys.getsizeof(block_bytes) + part_idx = int((query_key-x_start) // num_record_per_part) + timer_decomp.tic() + decomp_memory = 0 + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not eneough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = 
np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - curr_decomp_block[col_name] = block_data + partition_hit[part_idx] =1 + # decompress index first + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + block_data = np.frombuffer(block_bytes, dtype=list_type[0]) + curr_decomp_block = np.ndarray((len(block_data),), dtype=data_ori.dtype) + decomp_memory += sys.getsizeof(block_bytes) + curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + + for i in range(1, df.shape[1]): + col_name = data_ori.dtype.names[i] + if list_rle_enabled[i + part_idx*df.shape[1]] == False: + file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + if list_type[i] == np.int32 or list_type[i] == np.float64: + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) else: - # rle decode - if list_type[i] == np.int32: - temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} - else: - temp_dtype = list_type[i] - - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) - val_data = np.rec.array(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) - num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) - temp_col_decode_data = [] - - for val, num in zip(val_data, num_data): - temp_col_decode_data.extend([val[0]]*num) - curr_decomp_block[col_name] = temp_col_decode_data + block_data = np.frombuffer(block_bytes, dtype=list_type[i]) + curr_decomp_block[col_name] = block_data + decomp_memory += sys.getsizeof(block_bytes) + else: + # rle decode + if list_type[i] == np.int32: + temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} + else: + temp_dtype = list_type[i] - current_memory += val_data.nbytes - current_memory += num_data.nbytes - - current_memory += curr_decomp_block.nbytes - - decomp_block = curr_decomp_block - num_decomp += 1 - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block + file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) + file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) + val_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) + num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) + temp_col_decode_data = [] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, 
count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - decomp_memory = 0 - if part_idx not in decomp_block: - # decompress index first - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(0)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - block_data = np.frombuffer(block_bytes, dtype=list_type[0]) - curr_decomp_block = np.recarray((len(block_data),), dtype=data_ori.dtype) - decomp_memory += sys.getsizeof(block_bytes) - curr_decomp_block[curr_decomp_block.dtype.names[0]] = block_data + for val, num in zip(val_data, num_data): + temp_col_decode_data.extend([val[0]]*num) + curr_decomp_block[col_name] = temp_col_decode_data + decomp_memory += val_data.nbytes + decomp_memory += num_data.nbytes - for i in range(1, df.shape[1]): - col_name = data_ori.dtype.names[i] - if list_rle_enabled[i + part_idx*df.shape[1]] == False: - file_name = os.path.join(comp_data_dir, str(part_idx) + '-{}.data'.format(i)) - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - if list_type[i] == np.int32 or list_type[i] == np.float64: - block_data = np.frombuffer(block_bytes, dtype=list_type[i]) - else: - block_data = np.rec.array(block_bytes, dtype=list_type[i])[col_name] - curr_decomp_block[col_name] = block_data - decomp_memory += sys.getsizeof(block_bytes) - else: - # rle decode - if list_type[i] == np.int32: - temp_dtype = {'names': [data_ori.dtype.names[i]], 'formats': [np.int32], 'offsets': [0], 'itemsize': 4} - else: - temp_dtype = list_type[i] - - file_name1 = os.path.join(comp_data_dir, str(part_idx) + '-{}-val.data'.format(i)) - file_name2 = os.path.join(comp_data_dir, str(part_idx) + '-{}-num.data'.format(i)) - val_data = np.rec.array(ndb_utils.read_bytes_from_disk(file_name1), dtype=temp_dtype) - num_data = np.frombuffer(ndb_utils.read_bytes_from_disk(file_name2), dtype=np.int32) - temp_col_decode_data = [] - - for val, num in zip(val_data, num_data): - temp_col_decode_data.extend([val[0]]*num) - curr_decomp_block[col_name] = temp_col_decode_data - decomp_memory += val_data.nbytes - decomp_memory += num_data.nbytes - - cache_block_memory += curr_decomp_block.nbytes - decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - # ----- - timer_lookup.tic() + cache_block_memory += curr_decomp_block.nbytes + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + else: + 
partition_hit[part_idx] += 1 + curr_decomp_block = decomp_block[part_idx] + t_decomp += timer_decomp.toc() + # ----- + timer_lookup.tic() - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - t_lookup += timer_lookup.toc() - result_idx += 1 - if cache_block_memory + decomp_memory > peak_memory: - peak_memory = cache_block_memory + decomp_memory - t_total += timer_total.toc() - latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + decomp_memory > peak_memory: + peak_memory = cache_block_memory + decomp_memory + t_total += timer_total.toc() + latency_optimized_result = result.copy() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, 0 / num_loop, # build_index time + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/uncompress.py b/DeepMapping/DeepMapping/uncompress.py index e792fd0..b668ae2 100644 --- a/DeepMapping/DeepMapping/uncompress.py +++ b/DeepMapping/DeepMapping/uncompress.py @@ -1,10 +1,12 @@ -import pandas as pd -import numpy as np -import sys +import ctypes +import gc import math +import numpy as np +import pandas as pd import os -import ctypes +import sys from DeepMapping import ndb_utils +from collections import defaultdict # from DeepMapping.ndb_utils import Timer, recreate_temp_dir, save_byte_to_disk, read_bytes_from_disk # from more_itertools import run_length from tqdm.auto import tqdm @@ -21,10 +23,8 @@ shared_utils.aux_look_up_bin.restype = ctypes.c_long def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): - # TODO add support of hash to run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -38,10 +38,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop 
: int number of loops to run for measuring the latency num_query : int @@ -51,12 +47,14 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] # block_size = 1024 * 1024 @@ -73,6 +71,9 @@ def measure_latency(df, data_ori, task_name, sample_size, task_name = task_name folder_name = 'uncompress' comp_data_dir = os.path.join(root_path, task_name, folder_name) + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) + print('[Generate File Path]: {}'.format(comp_data_dir)) dict_contigous_key = dict() @@ -88,7 +89,6 @@ def measure_latency(df, data_ori, task_name, sample_size, data_idx = np.logical_and(x >= val_start, x < val_end) data_part = data_ori[data_idx] if search_algo == 'binary_c': - # FIXME temporary workaround to avoid the overhead of converting to contiguous array dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) if len(data_part) == 0: @@ -101,178 +101,137 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - 
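For readers following the latency arrays built above: every measure_latency variant now reports the same 13-column row, with byte_dictionary padding the build-index slot with 0 so its layout matches uncompress and zstd. A minimal sketch of unpacking such a row into named fields (the column names are descriptive labels inferred from the expressions in the array, and label_latency_row is a hypothetical helper, not part of the repository):

import numpy as np

LATENCY_COLUMNS = [
    'data_ori_size_mb', 'data_comp_size_mb', 'sample_size', 'strategy_flag',
    'peak_memory_mb', 't_sort', 't_locate_part', 't_decomp', 't_build_index',
    't_lookup', 't_total', 'num_decomp', 'count_nonexist',
]

def label_latency_row(latency_row):
    # map the 13-element row returned by the latency-optimized path
    # (strategy_flag == 1) onto readable keys
    return dict(zip(LATENCY_COLUMNS, np.asarray(latency_row).ravel()))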
timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(block_bytes, dtype=data_ori.dtype) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - t_build_index = 0 - timer_total.tic() - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(block_bytes, dtype=data_ori.dtype) + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + + for query_idx in range(num_query): + 
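The generate_file branch above persists partition metadata to extra_meta.data so that a later run with generate_file=False can skip file generation and simply reload it together with the pre-generated sample index. A minimal sketch of the save/load pair it relies on, assuming ndb_utils.save_obj_to_disk_with_pickle and load_obj_from_disk_with_pickle are thin pickle wrappers (an assumption; the real helpers live in DeepMapping/ndb_utils.py):

import os
import pickle

def save_obj_to_disk_with_pickle(path, obj):
    # persist e.g. exp_data_dict (num_record_per_part, sizes, x_start, x_end)
    # next to the generated partition files
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_obj_from_disk_with_pickle(path):
    # restore the object on a generate_file=False run
    with open(path, 'rb') as f:
        return pickle.load(f)

With these in place, a reuse run only needs extra_meta.data plus the sample_index_<sample_size>.data file produced beforehand by generate_sample_index.py.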
sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = (sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not enough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(block_bytes, dtype=data_ori.dtype) + try: decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_bytes) - - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(curr_decomp_block)): - data_entry_key = curr_decomp_block[key][block_data_idx] - # print(data_entry_key) - data_entry_val = curr_decomp_block[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'binary_c': - arr_contigous_arr = dict_contigous_key[part_idx] - data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - elif search_algo == 'hash': - data_idx = query_key in data_hash.keys() - - if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - elif search_algo == 'hash' and data_idx == True: - result[query_key_index_in_old] = data_hash[query_key] + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_bytes) + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(curr_decomp_block)): + data_entry_key = curr_decomp_block[key][block_data_idx] + # print(data_entry_key) + data_entry_val = curr_decomp_block[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += 
curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] +=1 + + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'binary_c': + arr_contigous_arr = dict_contigous_key[part_idx] + data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + elif search_algo == 'hash': + data_idx = query_key in data_hash.keys() + + if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + elif search_algo == 'hash' and data_idx == True: + result[query_key_index_in_old] = tuple(data_hash[query_key]) + else: + count_nonexist += 1 + + t_lookup += timer_lookup.toc() + result_idx += 1 + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/DeepMapping/DeepMapping/zstd_compression.py b/DeepMapping/DeepMapping/zstd_compression.py index 117ea92..368b72d 100644 --- a/DeepMapping/DeepMapping/zstd_compression.py +++ b/DeepMapping/DeepMapping/zstd_compression.py @@ -1,9 +1,11 @@ +import ctypes +import gc +import math import numpy as np +import os import sys import zstd -import math -import os -import ctypes +from collections import defaultdict from DeepMapping import ndb_utils from tqdm.auto import tqdm @@ -19,10 +21,8 @@ shared_utils.aux_look_up_bin.restype = ctypes.c_long def measure_latency(df, data_ori, task_name, sample_size, - generate_file=True, memory_optimized=True, latency_optimized=True, + generate_file=True, num_loop=10, num_query=5, search_algo='binary', block_size=1024*1024): - # TODO add support for run-time memory optimized strategy - # TODO add support of binary_c to run-time memory optimized strategy """Measure the end-end latency of data query Args: @@ -36,10 +36,6 @@ def measure_latency(df, data_ori, task_name, sample_size, number of queried data per query generate_file : bool whether need to store the data to disk - memory_optimized : bool - whether measure the end-end latency with the run-time memory optimized strategy - latency_optimized : bool - whether measure the end-end latency with the latency optimized strategy num_loop : int number of loops to run for measuring the latency num_query : int @@ -49,15 +45,15 @@ def measure_latency(df, data_ori, task_name, sample_size, path_to_model : str load model from custom path """ + mode = os.environ['MODE'] data_ori_size = 0 data_comp_size = 0 memory_optimized_latency = None 
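In 'edge' mode the uncompress.py loop above (and the zstd path below) checks ndb_utils.get_available_memory() before caching a new partition and, when less than 100MB is free, calls ndb_utils.evict_unused_partition with the partition_hit counters. The exact policy lives in ndb_utils; the following is only a plausible least-hit sketch consistent with the call site (signature and behavior here are assumptions, not the actual implementation):

def evict_unused_partition(decomp_block, partition_hit, free_memory=1024 * 1024 * 100):
    # drop the least-hit cached partitions until roughly `free_memory` bytes
    # of cached blocks have been released, then return the trimmed cache
    freed = 0
    for part_idx in sorted(decomp_block, key=lambda p: partition_hit.get(p, 0)):
        freed += decomp_block[part_idx].nbytes
        del decomp_block[part_idx]
        partition_hit.pop(part_idx, None)
        if freed >= free_memory:
            break
    return decomp_block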
latency_optimized_latency = None memory_optimized_result = None latency_optimized_result = None + exp_data_dict = dict() key = df.columns[0] - # block_size = 1024 * 1024 - # block_size = 1024 * 512 record_size = data_ori[0].nbytes num_record_per_part = np.floor(block_size / record_size) x = data_ori[key] @@ -70,6 +66,9 @@ def measure_latency(df, data_ori, task_name, sample_size, task_name = task_name folder_name = 'zstd' comp_data_dir = os.path.join(root_path, task_name, folder_name) + + if 'DATA_OPS' in os.environ: + comp_data_dir = os.path.join(comp_data_dir, os.environ['DATA_OPS']) print('[Generate File Path]: {}'.format(comp_data_dir)) dict_contigous_key = dict() @@ -86,7 +85,6 @@ def measure_latency(df, data_ori, task_name, sample_size, data_part = data_ori[data_idx] if search_algo == 'binary_c': - # FIXME temporary workaround to avoid the overhead of converting to contiguous array dict_contigous_key[block_idx] = np.array(data_part[key], order='F').astype(np.int32) if len(data_part) == 0: @@ -99,177 +97,137 @@ def measure_latency(df, data_ori, task_name, sample_size, data_ori_size = data_ori.nbytes/1024/1024 data_comp_size = data_size/1024/1024 print('Ori Size: {}, Curr Size: {}'.format(data_ori.nbytes/1024/1024, data_size/1024/1024)) - - list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) - - # Measure latency for run-time memory optimized strategy - if memory_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - t_total = 0 - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_locate_part = 0 + exp_data_dict['num_record_per_part'] = num_record_per_part + exp_data_dict['data_ori_size'] = data_ori_size + exp_data_dict['data_comp_size'] = data_comp_size + exp_data_dict['x_start'] = x_start + exp_data_dict['x_end'] = x_end + ndb_utils.save_obj_to_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data'), exp_data_dict) + list_sample_index = ndb_utils.generate_query(x_start, x_end, num_query=num_query, sample_size=sample_size) + else: + exp_data_dict = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(comp_data_dir, 'extra_meta.data')) + num_record_per_part = exp_data_dict['num_record_per_part'] + data_ori_size = exp_data_dict['data_ori_size'] + data_comp_size = exp_data_dict['data_comp_size'] + x_start = exp_data_dict['x_start'] + x_end = exp_data_dict['x_end'] + list_sample_index = ndb_utils.load_obj_from_disk_with_pickle(os.path.join(root_path, task_name, 'sample_index_{}.data'.format(sample_size))) + + timer_total = ndb_utils.Timer() + timer_decomp = ndb_utils.Timer() + timer_lookup = ndb_utils.Timer() + timer_total = ndb_utils.Timer() + timer_sort = ndb_utils.Timer() + timer_locate_part = ndb_utils.Timer() + timer_build_index = ndb_utils.Timer() + t_decomp = 0 + t_lookup = 0 + t_sort = 0 + t_total = 0 + t_locate_part = 0 + t_build_index = 0 + timer_total.tic() + + for _ in tqdm(range(num_loop)): + decomp_block = dict() + partition_hit = dict() peak_memory = 0 - - for _ in tqdm(range(num_loop)): - decomp_block = None - num_decomp = 0 - count_nonexist = 0 - prev_part_idx = -1 - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - - for idx in 
range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - part_idx = int((query_key-x_start) // num_record_per_part) - t_locate_part += timer_locate_part.toc() - timer_decomp.tic() - - if part_idx != prev_part_idx: - # new block to decompress - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(zstd.decompress(block_bytes), dtype=data_ori.dtype) - decomp_block = curr_decomp_block - num_decomp += 1 - current_memory = sys.getsizeof(block_bytes) - current_memory += curr_decomp_block.nbytes - if current_memory > peak_memory: - peak_memory = current_memory - prev_part_idx = part_idx - else: - curr_decomp_block = decomp_block - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - - if (search_algo == 'binary' and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - else: - count_nonexist += 1 - - t_lookup += timer_lookup.toc() - - t_total += timer_total.toc() - memory_optimized_result = result.copy() - memory_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 0, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T - - # Measure latency for end-end latency optimzed strategy - if latency_optimized: - timer_total = ndb_utils.Timer() - timer_decomp = ndb_utils.Timer() - timer_lookup = ndb_utils.Timer() - timer_total = ndb_utils.Timer() - timer_sort = ndb_utils.Timer() - timer_locate_part = ndb_utils.Timer() - timer_build_index = ndb_utils.Timer() - t_decomp = 0 - t_lookup = 0 - t_sort = 0 - t_total = 0 - t_locate_part = 0 - t_build_index = 0 - timer_total.tic() - - for _ in tqdm(range(num_loop)): - decomp_block = dict() - peak_memory = 0 - num_decomp = 0 - count_nonexist = 0 - cache_block_memory = 0 - - # build hash table - if search_algo == 'hash': - data_hash = dict() - - for query_idx in range(num_query): - sample_index = list_sample_index[query_idx] - timer_total.tic() - timer_sort.tic() - sample_index_sorted = np.sort(sample_index) - sample_index_argsort = np.argsort(sample_index) - t_sort += timer_sort.toc() - result = np.recarray((sample_size,), dtype=data_ori.dtype) - result_idx = 0 - for idx in range(sample_size): - timer_locate_part.tic() - query_key = sample_index_sorted[idx] - query_key_index_in_old = sample_index_argsort[idx] - t_locate_part += timer_locate_part.toc() - part_idx = int((query_key-x_start) // num_record_per_part) - timer_decomp.tic() - - if part_idx not in decomp_block: - file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') - block_bytes = ndb_utils.read_bytes_from_disk(file_name) - curr_decomp_block = np.rec.array(zstd.decompress(block_bytes), dtype=data_ori.dtype) + num_decomp = 0 + count_nonexist = 0 + cache_block_memory = 0 + gc.collect() + + # build hash table + if search_algo == 'hash': + data_hash = dict() + + for query_idx in range(num_query): + sample_index = list_sample_index[query_idx] + timer_total.tic() + timer_sort.tic() + sample_index_sorted = np.sort(sample_index) + sample_index_argsort = np.argsort(sample_index) + sample_index_partition = 
(sample_index_sorted - x_start) // num_record_per_part + sample_index_partition = sample_index_partition.astype(np.int32) + t_sort += timer_sort.toc() + result = np.ndarray((sample_size,), dtype=data_ori.dtype) + result_idx = 0 + for idx in range(sample_size): + query_key = sample_index_sorted[idx] + query_key_index_in_old = sample_index_argsort[idx] + timer_locate_part.tic() + part_idx = sample_index_partition[idx] + t_locate_part += timer_locate_part.toc() + timer_decomp.tic() + + if part_idx not in decomp_block: + if mode == 'edge': + available_memory = ndb_utils.get_available_memory() + if available_memory < 1024*1024*100: + # memory not enough, free some memory + decomp_block = ndb_utils.evict_unused_partition(decomp_block, partition_hit, free_memory=1024*1024*100) + + partition_hit[part_idx] = 1 + file_name = os.path.join(comp_data_dir, str(part_idx) + '.data') + block_bytes = ndb_utils.read_bytes_from_disk(file_name) + curr_decomp_block = np.frombuffer(zstd.decompress(block_bytes), dtype=data_ori.dtype) + try: decomp_block[part_idx] = curr_decomp_block - num_decomp += 1 - block_bytes_size = sys.getsizeof(block_bytes) - - if search_algo == 'hash': - t_decomp += timer_decomp.toc() - timer_build_index.tic() - for block_data_idx in range(len(curr_decomp_block)): - data_entry_key = curr_decomp_block[key][block_data_idx] - # print(data_entry_key) - data_entry_val = curr_decomp_block[block_data_idx] - data_hash[data_entry_key] = data_entry_val - cache_block_memory = sys.getsizeof(data_hash) - t_build_index += timer_build_index.toc() - timer_decomp.tic() - else: - cache_block_memory += curr_decomp_block.nbytes - else: - curr_decomp_block = decomp_block[part_idx] - t_decomp += timer_decomp.toc() - timer_lookup.tic() - - if search_algo == 'binary': - data_idx = ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) - elif search_algo == 'binary_c': - arr_contigous_arr = dict_contigous_key[part_idx] - data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) - elif search_algo == 'naive': - data_idx = curr_decomp_block[key] == query_key - elif search_algo == 'hash': - data_idx = query_key in data_hash.keys() - - if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): - result[query_key_index_in_old] = curr_decomp_block[data_idx] - elif search_algo == 'hash' and data_idx == True: - result[query_key_index_in_old] = data_hash[query_key] + except: + decomp_block = dict() + decomp_block[part_idx] = curr_decomp_block + num_decomp += 1 + block_bytes_size = sys.getsizeof(block_bytes) + + if search_algo == 'hash': + t_decomp += timer_decomp.toc() + timer_build_index.tic() + for block_data_idx in range(len(curr_decomp_block)): + data_entry_key = curr_decomp_block[key][block_data_idx] + # print(data_entry_key) + data_entry_val = curr_decomp_block[block_data_idx] + data_hash[data_entry_key] = data_entry_val + cache_block_memory = sys.getsizeof(data_hash) + t_build_index += timer_build_index.toc() + timer_decomp.tic() + else: + cache_block_memory += curr_decomp_block.nbytes + else: + curr_decomp_block = decomp_block[part_idx] + partition_hit[part_idx] += 1 + t_decomp += timer_decomp.toc() + timer_lookup.tic() + + if search_algo == 'binary': + data_idx = 
ndb_utils.binary_search(curr_decomp_block[key], query_key, len(curr_decomp_block)) + elif search_algo == 'binary_c': + arr_contigous_arr = dict_contigous_key[part_idx] + data_idx = shared_utils.aux_look_up_bin(arr_contigous_arr, query_key, len(curr_decomp_block)) + elif search_algo == 'naive': + data_idx = curr_decomp_block[key] == query_key + elif search_algo == 'hash': + data_idx = query_key in data_hash.keys() + + if ((search_algo == 'binary' or search_algo =='binary_c') and data_idx >= 0) or (search_algo == 'naive' and np.sum(data_idx) > 0): + result[query_key_index_in_old] = tuple(curr_decomp_block[data_idx]) + elif search_algo == 'hash' and data_idx == True: + result[query_key_index_in_old] = tuple(data_hash[query_key]) + else: + count_nonexist += 1 + t_lookup += timer_lookup.toc() + result_idx += 1 + + if cache_block_memory + block_bytes_size > peak_memory: + peak_memory = cache_block_memory + block_bytes_size + t_total += timer_total.toc() latency_optimized_result = result.copy() - latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, - t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, - t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T + del result + del decomp_block + del partition_hit + gc.collect() + latency_optimized_latency = np.array((data_ori_size, data_comp_size, sample_size, 1, peak_memory/1024/1024, t_sort / num_loop, + t_locate_part / num_loop, t_decomp / num_loop, t_build_index / num_loop, + t_lookup / num_loop, t_total / num_loop, num_decomp, count_nonexist)).T return_latency = None if memory_optimized_latency is None and latency_optimized_latency is not None: diff --git a/run_benchmark_data_manipulation.py b/run_benchmark_data_manipulation.py index b32bb3b..a673817 100644 --- a/run_benchmark_data_manipulation.py +++ b/run_benchmark_data_manipulation.py @@ -12,24 +12,38 @@ 'data_manipulation/multi_column_low_corr_100m', 'data_manipulation/multi_column_high_corr_100m'] list_benchmark = ['uncompress', 'zstd', 'deepmapping'] -list_sample_size = [10000] +list_sample_size = [1000, 100000] list_ops = ['Default', 'Insert', 'Update', 'Delete'] list_run_config = list(itertools.product(list_dataset, list_benchmark, list_sample_size, list_ops)) print('[Config]: \n\t Dataset: {} \n\t Benchmark: {} \n\t Sample Size: {}'.format(list_dataset, list_benchmark, list_sample_size)) -memory_optimized = False -latency_optimized = True num_loop = 100 num_query = 5 search_algo = 'binary' file_name = 'benchmark_data_manipulation.csv' +# The following flag is used to indicate whether you can re-use the existing disk +# files (stored in temp dir) saved from a fresh run. Usually, you can start a +# fresh run and then change this flag to False. Also, if you set this flag to False, +# please make sure you also run generate_sample_index.py under the DeepMapping +# folder to pre-generate the query index before your next run. +generate_file = True +# specify your deep learning model backend; Keras h5 and ONNX models are currently +# supported. There is a utility under DeepMapping to convert a h5 model into onnx format. +os.environ['BACKEND'] = 'onnx' +# Run the benchmark in the specified mode. full mode: assume memory is sufficient to cache +# all the data; edge mode: try to cache all data within the available memory but reserve +# some free memory for the underlying processes, current value: 100MB.
Once the memory +# is insufficient, it will try to evict the least used partition to free the memory. +os.environ['MODE'] = 'full' for run_config in tqdm(list_run_config): print('[STATUS] Current config: {}'.format(run_config)) task_name, benchmark, sample_size, data_ops = run_config generate_file = True - - df = pd.read_csv('dataset/{}.csv'.format(task_name)) + if generate_file: + df = pd.read_csv('dataset/{}.csv'.format(task_name)) + else: + df = pd.read_csv('dataset/{}.csv'.format(task_name), nrows=2) df, data_ori = df_preprocess(df, benchmark) # perform data manipulation to the data df, data_ori = data_manipulation(df, data_ops) @@ -38,8 +52,7 @@ try: data_ori_size, data_comp_size, result, latency = function_call(df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size, - generate_file=generate_file, memory_optimized=memory_optimized, - latency_optimized=latency_optimized, num_loop=num_loop, + generate_file=generate_file, num_loop=num_loop, num_query=num_query, search_algo=search_algo) result_df = pd.DataFrame(latency) result_df['config'] = str(run_config) diff --git a/run_benchmark_data_query.py b/run_benchmark_data_query.py index 624f846..c8213ae 100644 --- a/run_benchmark_data_query.py +++ b/run_benchmark_data_query.py @@ -8,32 +8,44 @@ from DeepMapping.benchmark_utils import benchmark_handler list_dataset = ['tpch-s1/customer', 'tpch-s1/lineitem', 'tpch-s1/orders', 'tpch-s1/part', 'tpch-s1/supplier'] -list_benchmark = ['uncompress', 'dgpe', 'delta', 'byte_dictionary', 'lzo', 'zstd', 'rle', 'deepmapping'] -list_sample_size = [1000, 10000] +list_benchmark = ['uncompress', 'zstd', 'deepmapping', 'hashtable', 'hashtable_with_compression'] +list_sample_size = [1000, 100000] list_run_config = list(itertools.product(list_dataset, list_benchmark, list_sample_size)) print('[Config]: \n\t Dataset: {} \n\t Benchmark: {} \n\t Sample Size: {}'.format(list_dataset, list_benchmark, list_sample_size)) -memory_optimized = True # whether measure the latency for memory optimized strategy -latency_optimized = True # whether measure teh latency for latency optimized strategy num_loop = 100 num_query = 5 search_algo = 'binary' file_name = 'benchmark_data_query.csv' +# The following flag is used to indicate whether you can re-use the existing disk +# files (stored in temp dir) saved from a fresh run. Usually, you can start a +# fresh run and then change this flag to False. Also, if you set this flag a False +# please make sure, you also run the generate_sample_index.py under DeepMapping +# folder to pre-generate the query index before your next run. +generate_file = True +# specificy your deep learning model backend, current support keras h5 model and onnx +# model. There is a utility under DeepMapping to convert a h5 model into onnx format. +os.environ['BACKEND'] = 'onnx' +# Run the benchmark with the specified mode. full mode: assume memory is sufficient to cache +# all the data; edge mode: try to cache all data within the available memory but reserve +# a number of free memory for underlying process, current value: 100MB. Once the memory +# is insufficient, it will try to evict the least used partition to free the memory. 
+os.environ['MODE'] = 'full' -# pre_generated_files = defaultdict(bool) for run_config in tqdm(list_run_config): print('[STATUS] Current config: {}'.format(run_config)) task_name, benchmark, sample_size = run_config - generate_file = True - df = pd.read_csv('dataset/{}.csv'.format(task_name)) + if generate_file: + df = pd.read_csv('dataset/{}.csv'.format(task_name)) + else: + df = pd.read_csv('dataset/{}.csv'.format(task_name), nrows=2) df, data_ori = df_preprocess(df, benchmark) function_call = benchmark_handler(benchmark) try: data_ori_size, data_comp_size, result, latency = function_call(df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size, - generate_file=generate_file, memory_optimized=memory_optimized, - latency_optimized=latency_optimized, num_loop=num_loop, + generate_file=generate_file, num_loop=num_loop, num_query=num_query, search_algo=search_algo) result_df = pd.DataFrame(latency) result_df['config'] = str(run_config)
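Putting the driver scripts together, a single benchmark configuration boils down to roughly the following call path (a sketch only; the df_preprocess import line is abbreviated, and the values are examples taken from the config lists above):

import os
import pandas as pd
from DeepMapping.benchmark_utils import benchmark_handler
# df_preprocess is the same preprocessing helper the driver scripts import
# from the DeepMapping package (exact import path omitted here)

os.environ['BACKEND'] = 'onnx'   # onnx or the keras h5 backend
os.environ['MODE'] = 'full'      # 'edge' enables the partition-eviction path

task_name, benchmark, sample_size = 'tpch-s1/customer', 'zstd', 1000
df = pd.read_csv('dataset/{}.csv'.format(task_name))
df, data_ori = df_preprocess(df, benchmark)
measure = benchmark_handler(benchmark)
data_ori_size, data_comp_size, result, latency = measure(
    df=df, data_ori=data_ori, task_name=task_name, sample_size=sample_size,
    generate_file=True, num_loop=100, num_query=5, search_algo='binary')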