Merge pull request #5 from asu-cactus/revision
update util code, readme, prereqs
lixi-zhou authored Sep 1, 2023
2 parents 226fb4f + 3061e07 commit 51fd6b9
Showing 6 changed files with 103 additions and 42 deletions.
93 changes: 87 additions & 6 deletions DeepMapping/DeepMapping/ndb_utils.py
@@ -1,12 +1,30 @@
import gc
import numpy as np
import pandas as pd
import time
import os
import shutil
import pandas as pd
import pickle
import psutil
import shutil
import time
import sys
from heapq import nsmallest
from tqdm.auto import tqdm

def df_preprocess(df, benchmark = None):
def recarr_to_ndarray(rec_arr):
type_list = []
for i in range(len(rec_arr.dtype)):
col_name = rec_arr.dtype.names[i]
col_type = rec_arr.dtype[i]

if col_type == 'O':
col_type = 'S8'
type_list.append((col_name, col_type))

ndarray = np.array(rec_arr, dtype=type_list)
del rec_arr
return ndarray
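
For illustration, a minimal usage sketch of `recarr_to_ndarray`; the DataFrame and column names below are hypothetical, and note that the `'S8'` cast truncates strings longer than eight bytes:

```python
import pandas as pd

# Hypothetical two-column table with an object (string) column.
df = pd.DataFrame({'ORDERKEY': [1, 2, 3], 'STATUS': ['F', 'O', 'P']})
rec_arr = df.to_records(index=False)   # record array; 'STATUS' has dtype 'O'
nd_arr = recarr_to_ndarray(rec_arr)    # object columns become fixed-width 'S8'
print(nd_arr.dtype)                    # e.g. [('ORDERKEY', '<i8'), ('STATUS', 'S8')]
```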

def df_preprocess(df, benchmark = None, to_ndarray=True):
if 'PARTKEY' in df.columns and 'SUPPKEY' in df.columns:
df.reset_index(inplace=True)
df.drop(['PARTKEY', 'SUPPKEY', 'QUANTITY'], axis=1, inplace=True)
@@ -43,9 +61,11 @@ def df_preprocess(df, benchmark = None):
key = df.columns[0]
df.sort_values(by=key, inplace=True)
data_ori = df.to_records(index=False)
if to_ndarray:
data_ori = recarr_to_ndarray(data_ori)
return df, data_ori
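
A sketch of how `df_preprocess` might be called; the CSV path is hypothetical, and the accepted `benchmark` values are defined elsewhere in this file (not shown in this hunk):

```python
df = pd.read_csv('dataset/example_table.csv')   # hypothetical path
df, data_ori = df_preprocess(df, benchmark=None, to_ndarray=True)
# With to_ndarray=True, data_ori is a fixed-width structured ndarray
# rather than a record array containing object columns.
print(data_ori.dtype)
```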

def data_manipulation(df, ops='None'):
def data_manipulation(df, ops='None', to_ndarray=True):
# default use 90% data
if ops == 'None':
pass
@@ -84,6 +104,8 @@ def data_manipulation(df, ops='None'):
key = df.columns[0]
df.sort_values(by=key, inplace=True)
data_ori = df.to_records(index=False)
if to_ndarray:
data_ori = recarr_to_ndarray(data_ori)
return df, data_ori

def process_df_for_synthetic_data(df):
@@ -204,14 +226,42 @@ def recreate_temp_dir(comp_data_dir):
else:
shutil.rmtree(comp_data_dir)
os.makedirs(comp_data_dir)

def save_byte_to_disk(file_name, f_bytes):
with open(file_name, "wb") as binary_file:
binary_file.write(f_bytes)

def read_bytes_from_disk(file_name):
with open(file_name, "rb") as f:
bytes_read = f.read()
return bytes_read

def save_recarray_to_disk(file_name, rec_arr):
np.save(file_name, rec_arr, allow_pickle=True)

def load_recarray_from_disk(file_name):
return np.load(file_name, allow_pickle=True)

def save_obj_to_disk_with_pickle(file_name, data):
with open(file_name, "wb") as f:
pickle.dump(data, f)

def load_obj_from_disk_with_pickle(file_name):
with open(file_name, "rb") as f:
obj = pickle.load(f)
return obj

def save_hashtable_to_disk(file_name, hashtable):
with open(file_name, "wb") as f:
pickle.dump(hashtable, f)

def load_hashtable_from_disk(file_name):
with open(file_name, 'rb') as f:
hash_table = pickle.load(f)
return hash_table

def get_size_of_file(file_name):
return os.path.getsize(file_name)
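
A round-trip sketch for the on-disk helpers above (all file names are illustrative, and `data_ori` is assumed to come from `df_preprocess`):

```python
save_byte_to_disk('block_0.bin', b'\x00\x01\x02')
assert read_bytes_from_disk('block_0.bin') == b'\x00\x01\x02'

save_obj_to_disk_with_pickle('meta.pkl', {'partition': 0, 'num_rows': 1024})
meta = load_obj_from_disk_with_pickle('meta.pkl')

save_recarray_to_disk('data_ori.npy', data_ori)
restored = load_recarray_from_disk('data_ori.npy')

print(get_size_of_file('data_ori.npy'))   # on-disk size in bytes
```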

class Timer(object):
"""A convenient class to measure the running time of a program
@@ -251,4 +301,35 @@ def binary_search(x, val, num_record, search_larger_value=False):
if search_larger_value:
return low
else:
return -1
return -1
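
A lookup sketch using `binary_search` over the sorted key column of `data_ori`; the column name is hypothetical, and the behavior described for a miss is inferred from the branch shown above:

```python
keys = data_ori['ORDERKEY']                 # hypothetical key column, already sorted
idx = binary_search(keys, 1000, len(keys))  # exact match: index of the record, or -1
if idx != -1:
    record = data_ori[idx]

# On a miss with search_larger_value=True, the shown branch returns the
# insertion point `low` (first position with a larger key) instead of -1.
pos = binary_search(keys, 1000, len(keys), search_larger_value=True)
```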

def get_available_memory():
return psutil.virtual_memory()[1]

def evict_unused_partition(decomp_block_dict, partition_hit, free_memory):
    """Evict the least-hit partitions from the decompressed-block cache until at
    least `free_memory` bytes of RAM are available, or the retry budget runs out."""
    if decomp_block_dict is None:
        return dict()
    max_try = 10
    curr_try = 0
    while get_available_memory() < free_memory:
        if curr_try > max_try:
            # Give up after max_try rounds, but still hand back the shrunken cache.
            return decomp_block_dict
        curr_try += 1
        # Drop up to 100 of the least frequently hit partitions per round.
        list_least_used_partition_id = nsmallest(100, partition_hit, key=partition_hit.get)
        for least_used_partition_id in list_least_used_partition_id:
            del decomp_block_dict[least_used_partition_id]
            del partition_hit[least_used_partition_id]
        # Rebuild the dict so the caller picks up the compacted copy via the return value.
        decomp_block_dict = dict(decomp_block_dict)
    return decomp_block_dict
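
A sketch of how the eviction helper might be driven from a scan loop; the cache variables, the `decompress_partition` helper, and the 2 GB threshold are all hypothetical:

```python
decomp_block_dict = {}               # partition_id -> decompressed block
partition_hit = {}                   # partition_id -> hit count
FREE_MEMORY_TARGET = 2 * 1024 ** 3   # keep roughly 2 GB of RAM available

for partition_id in partitions_to_scan:                                        # hypothetical iterable
    if partition_id not in decomp_block_dict:
        decomp_block_dict[partition_id] = decompress_partition(partition_id)   # hypothetical helper
    partition_hit[partition_id] = partition_hit.get(partition_id, 0) + 1
    # Pick up the (possibly compacted) cache via the return value.
    decomp_block_dict = evict_unused_partition(decomp_block_dict, partition_hit,
                                               FREE_MEMORY_TARGET)
```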

def get_nested_dict_size(d):
total_size = sys.getsizeof(d)
for value in d.values():
if isinstance(value, dict):
total_size += get_nested_dict_size(value)
else:
total_size += sys.getsizeof(value)
return total_size
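
A quick comparison of the recursive size estimate against the shallow `sys.getsizeof`; the cache contents are illustrative:

```python
import sys

cache = {'p0': {'rows': list(range(1000))},
         'p1': {'rows': list(range(1000))}}
print(sys.getsizeof(cache))          # shallow: counts only the outer dict object
print(get_nested_dict_size(cache))   # recurses into nested dicts and adds the
                                     # (shallow) size of each leaf value
```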
45 changes: 10 additions & 35 deletions README.md
@@ -5,23 +5,13 @@ Resources for SIGMOD 2024 Submission
<!-- TOC -->

- [DeepMapping: The Case for Learned Data Mapping for Compression and Efficient Query Processing](#deepmapping-the-case-for-learned-data-mapping-for-compression-and-efficient-query-processing)
- [Set-Up](#set-up)
- [Dataset](#dataset)
- [Model Searching](#model-searching)
- [Benchmark](#benchmark)
- [Task: Data Query](#task-data-query)
- [Task: Data Manipulation](#task-data-manipulation)
- [Supplemental Material](#supplemental-material)
- [Comparison of end-end latency using hashing and binary search](#comparison-of-end-end-latency-using-hashing-and-binary-search)
- [Comparison of end-end latency for running model in CPU/GPU](#comparison-of-end-end-latency-for-running-model-in-cpugpu)
- [Comparison of tunning the partition size](#comparison-of-tunning-the-partition-size)
- [Dataset](#dataset)
- [Model Searching](#model-searching)
- [Benchmark](#benchmark)
- [Task: Data Query](#task-data-query)
- [Task: Data Manipulation](#task-data-manipulation)

<!-- /TOC -->

## Set-Up

1. Please install the required dependencies via `pip install -r requirements.txt`

2. DeepMapping is packaged as a Python library; please run the following command to install it.

```
@@ -37,9 +27,9 @@ Resources for SIGMOD 2024 Submission
## Dataset
Our experiments covered synthetic datasets, low/high-correlation datasets at different scales (100MB, 1GB, 10GB), and the TPC-H and TPC-DS benchmark datasets with scale factors of 1 and 10. We removed all string/continuous columns and uploaded our pre-generated datasets to [**HERE**](https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc):
Our experiments covered synthetic datasets, low/high-correlation datasets at different scales (100MB, 1GB, 10GB), and the TPC-H and TPC-DS benchmark datasets with scale factors of 1 and 10. We removed all string/continuous columns and uploaded our pre-generated datasets to [**HERE**](#FIXME):
[**DATA LINK: https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc**](https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc)
[**DATA LINK: Uploading...**](#FIXME)
After downloading it, please unzip it into the **root** folder of this GitHub repository. You will then see a **dataset** folder there.
@@ -58,9 +48,9 @@ List of datasets:
## Benchmark
We provide example models for the following two tasks. Please go [**HERE**](https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s) to download them:
We provide example models for the following two tasks. Please go [**HERE**](#FIXME) to download them:
[**MODEL LINK: https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s**](https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s)
[**MODEL LINK: Uploading...**](#FIXME)
After downloading it, please unzip it into the **root** folder of this GitHub repository. You will then see a **models** folder there.
@@ -74,19 +64,4 @@ Run `python run_benchmark_data_query.py` to benchmark. To benchmark with differe
### Task: Data Manipulation
These experiments measured the overall storage overhead and end-to-end query latency for the synthetic datasets under data manipulation, i.e., INSERT/UPDATE/DELETE. Run `python run_benchmark_data_manipulation.py` to benchmark it. To benchmark with a different dataset, modify the file accordingly by following the instructions provided in the Python file.
## Supplemental Material
All latency values are measured in ms.
### Comparison of end-to-end latency using hashing and binary search
![tab-binary-vs-hash](./imgs/tab-binary-vs-hash.png)
### Comparison of end-to-end latency for running the model on CPU/GPU
![tab-cpu-vs-gpu](./imgs/tab-cpu-vs-gpu.png)
### Comparison of tuning the partition size
Experimental results are measured on TPC-H, SF=10, the `orders` table, with B=100,000.
![tab-partition-size](./imgs/tab-partition-size.png)
These experiments measured the overall storage overhead and end-to-end query latency for the synthetic datasets under data manipulation, i.e., INSERT/UPDATE/DELETE. Run `python run_benchmark_data_manipulation.py` to benchmark it. To benchmark with a different dataset, modify the file accordingly by following the instructions provided in the Python file.
Binary file removed imgs/tab-binary-vs-hash.png
Binary file removed imgs/tab-cpu-vs-gpu.png
Binary file removed imgs/tab-partition-size.png
7 changes: 6 additions & 1 deletion requirements.txt
@@ -1,8 +1,12 @@
bitarray==2.6.2
matplotlib==3.7.0
more_itertools==9.1.0
more_itertools==10.1.0
numpy==1.24.1
onnx==1.14.0
onnx_opcounter==0.0.3
onnxruntime==1.15.1
pandas==1.5.3
psutil==5.9.4
python_lzo==1.14
scikit_learn==1.2.1
setuptools==65.5.0
@@ -11,3 +15,4 @@ torch==2.0.0.dev20230202+cu116
torchmetrics==0.9.3
tqdm==4.64.1
zstd==1.5.2.6
protobuf==3.20.0
