Merge pull request #5 from asu-cactus/revision
update util code, readme, prereqs
lixi-zhou authored Sep 1, 2023
2 parents 226fb4f + 3061e07 commit 51fd6b9
Showing 6 changed files with 103 additions and 42 deletions.
93 changes: 87 additions & 6 deletions DeepMapping/DeepMapping/ndb_utils.py
@@ -1,12 +1,30 @@
import gc
import numpy as np
import pandas as pd
import time
import os
import shutil
import pandas as pd
import pickle
import psutil
import shutil
import time
import sys
from heapq import nsmallest
from tqdm.auto import tqdm

def df_preprocess(df, benchmark = None):
def recarr_to_ndarray(rec_arr):
type_list = []
for i in range(len(rec_arr.dtype)):
col_name = rec_arr.dtype.names[i]
col_type = rec_arr.dtype[i]

if col_type == 'O':
col_type = 'S8'
type_list.append((col_name, col_type))

ndarray = np.array(rec_arr, dtype=type_list)
del rec_arr
return ndarray
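
For illustration, a minimal usage sketch of `recarr_to_ndarray`; the DataFrame and column names below are hypothetical, and note that the `'S8'` cast truncates strings longer than eight bytes:

```python
import pandas as pd

# Hypothetical two-column table with an object (string) column.
df = pd.DataFrame({'ORDERKEY': [1, 2, 3], 'STATUS': ['F', 'O', 'P']})
rec_arr = df.to_records(index=False)   # record array; 'STATUS' has dtype 'O'
nd_arr = recarr_to_ndarray(rec_arr)    # object columns become fixed-width 'S8'
print(nd_arr.dtype)                    # e.g. [('ORDERKEY', '<i8'), ('STATUS', 'S8')]
```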

def df_preprocess(df, benchmark = None, to_ndarray=True):
if 'PARTKEY' in df.columns and 'SUPPKEY' in df.columns:
df.reset_index(inplace=True)
df.drop(['PARTKEY', 'SUPPKEY', 'QUANTITY'], axis=1, inplace=True)
@@ -43,9 +61,11 @@ def df_preprocess(df, benchmark = None):
key = df.columns[0]
df.sort_values(by=key, inplace=True)
data_ori = df.to_records(index=False)
if to_ndarray:
data_ori = recarr_to_ndarray(data_ori)
return df, data_ori
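
A sketch of how `df_preprocess` might be called; the CSV path is hypothetical, and the accepted `benchmark` values are defined elsewhere in this file (not shown in this hunk):

```python
df = pd.read_csv('dataset/example_table.csv')   # hypothetical path
df, data_ori = df_preprocess(df, benchmark=None, to_ndarray=True)
# With to_ndarray=True, data_ori is a fixed-width structured ndarray
# rather than a record array containing object columns.
print(data_ori.dtype)
```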

def data_manipulation(df, ops='None'):
def data_manipulation(df, ops='None', to_ndarray=True):
# default use 90% data
if ops == 'None':
pass
@@ -84,6 +104,8 @@ def data_manipulation(df, ops='None'):
key = df.columns[0]
df.sort_values(by=key, inplace=True)
data_ori = df.to_records(index=False)
if to_ndarray:
data_ori = recarr_to_ndarray(data_ori)
return df, data_ori

def process_df_for_synthetic_data(df):
@@ -204,14 +226,42 @@ def recreate_temp_dir(comp_data_dir):
else:
shutil.rmtree(comp_data_dir)
os.makedirs(comp_data_dir)

def save_byte_to_disk(file_name, f_bytes):
with open(file_name, "wb") as binary_file:
binary_file.write(f_bytes)

def read_bytes_from_disk(file_name):
with open(file_name, "rb") as f:
bytes_read = f.read()
return bytes_read

def save_recarray_to_disk(file_name, rec_arr):
np.save(file_name, rec_arr, allow_pickle=True)

def load_recarray_from_disk(file_name):
return np.load(file_name, allow_pickle=True)

def save_obj_to_disk_with_pickle(file_name, data):
with open(file_name, "wb") as f:
pickle.dump(data, f)

def load_obj_from_disk_with_pickle(file_name):
with open(file_name, "rb") as f:
obj = pickle.load(f)
return obj

def save_hashtable_to_disk(file_name, hashtable):
with open(file_name, "wb") as f:
pickle.dump(hashtable, f)

def load_hashtable_from_disk(file_name):
with open(file_name, 'rb') as f:
hash_table = pickle.load(f)
return hash_table

def get_size_of_file(file_name):
return os.path.getsize(file_name)
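
A round-trip sketch for the on-disk helpers above (all file names are illustrative, and `data_ori` is assumed to come from `df_preprocess`):

```python
save_byte_to_disk('block_0.bin', b'\x00\x01\x02')
assert read_bytes_from_disk('block_0.bin') == b'\x00\x01\x02'

save_obj_to_disk_with_pickle('meta.pkl', {'partition': 0, 'num_rows': 1024})
meta = load_obj_from_disk_with_pickle('meta.pkl')

save_recarray_to_disk('data_ori.npy', data_ori)
restored = load_recarray_from_disk('data_ori.npy')

print(get_size_of_file('data_ori.npy'))   # on-disk size in bytes
```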

class Timer(object):
"""A convenient class to measure the running time of a program
@@ -251,4 +301,35 @@ def binary_search(x, val, num_record, search_larger_value=False):
if search_larger_value:
return low
else:
return -1
return -1
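
A lookup sketch using `binary_search` over the sorted key column of `data_ori`; the column name is hypothetical, and the behavior described for a miss is inferred from the branch shown above:

```python
keys = data_ori['ORDERKEY']                 # hypothetical key column, already sorted
idx = binary_search(keys, 1000, len(keys))  # exact match: index of the record, or -1
if idx != -1:
    record = data_ori[idx]

# On a miss with search_larger_value=True, the shown branch returns the
# insertion point `low` (first position with a larger key) instead of -1.
pos = binary_search(keys, 1000, len(keys), search_larger_value=True)
```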

def get_available_memory():
return psutil.virtual_memory()[1]

def evict_unused_partition(decomp_block_dict, partition_hit, free_memory):
    """Evict the least-hit partitions from the decompressed-block cache until at
    least `free_memory` bytes of RAM are available, or the retry budget runs out."""
    if decomp_block_dict is None:
        return dict()
    max_try = 10
    curr_try = 0
    while get_available_memory() < free_memory:
        if curr_try > max_try:
            # Give up after max_try rounds, but still hand back the shrunken cache.
            return decomp_block_dict
        curr_try += 1
        # Drop up to 100 of the least frequently hit partitions per round.
        list_least_used_partition_id = nsmallest(100, partition_hit, key=partition_hit.get)
        for least_used_partition_id in list_least_used_partition_id:
            del decomp_block_dict[least_used_partition_id]
            del partition_hit[least_used_partition_id]
        # Rebuild the dict so the caller picks up the compacted copy via the return value.
        decomp_block_dict = dict(decomp_block_dict)
    return decomp_block_dict
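
A sketch of how the eviction helper might be driven from a scan loop; the cache variables, the `decompress_partition` helper, and the 2 GB threshold are all hypothetical:

```python
decomp_block_dict = {}               # partition_id -> decompressed block
partition_hit = {}                   # partition_id -> hit count
FREE_MEMORY_TARGET = 2 * 1024 ** 3   # keep roughly 2 GB of RAM available

for partition_id in partitions_to_scan:                                        # hypothetical iterable
    if partition_id not in decomp_block_dict:
        decomp_block_dict[partition_id] = decompress_partition(partition_id)   # hypothetical helper
    partition_hit[partition_id] = partition_hit.get(partition_id, 0) + 1
    # Pick up the (possibly compacted) cache via the return value.
    decomp_block_dict = evict_unused_partition(decomp_block_dict, partition_hit,
                                               FREE_MEMORY_TARGET)
```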

def get_nested_dict_size(d):
total_size = sys.getsizeof(d)
for value in d.values():
if isinstance(value, dict):
total_size += get_nested_dict_size(value)
else:
total_size += sys.getsizeof(value)
return total_size
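
A quick comparison of the recursive size estimate against the shallow `sys.getsizeof`; the cache contents are illustrative:

```python
import sys

cache = {'p0': {'rows': list(range(1000))},
         'p1': {'rows': list(range(1000))}}
print(sys.getsizeof(cache))          # shallow: counts only the outer dict object
print(get_nested_dict_size(cache))   # recurses into nested dicts and adds the
                                     # (shallow) size of each leaf value
```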
45 changes: 10 additions & 35 deletions README.md
@@ -5,23 +5,13 @@ Resources for SIGMOD 2024 Submission
<!-- TOC -->

- [DeepMapping: The Case for Learned Data Mapping for Compression and Efficient Query Processing](#deepmapping-the-case-for-learned-data-mapping-for-compression-and-efficient-query-processing)
- [Set-Up](#set-up)
- [Dataset](#dataset)
- [Model Searching](#model-searching)
- [Benchmark](#benchmark)
- [Task: Data Query](#task-data-query)
- [Task: Data Manipulation](#task-data-manipulation)
- [Supplemental Material](#supplemental-material)
- [Comparison of end-end latency using hashing and binary search](#comparison-of-end-end-latency-using-hashing-and-binary-search)
- [Comparison of end-end latency for running model in CPU/GPU](#comparison-of-end-end-latency-for-running-model-in-cpugpu)
- [Comparison of tunning the partition size](#comparison-of-tunning-the-partition-size)
- [Dataset](#dataset)
- [Model Searching](#model-searching)
- [Benchmark](#benchmark)
- [Task: Data Query](#task-data-query)
- [Task: Data Manipulation](#task-data-manipulation)

<!-- /TOC -->

## Set-Up

1. Please install the required dependencies via `pip install -r requirements.txt`

2. DeepMapping is packaged as a Python library; please run the following command to install it.

```
@@ -37,9 +27,9 @@ Resources for SIGMOD 2024 Submission
## Dataset
Our experiments covered synthetic datasets, low/high-correlation datasets at different scales (100MB, 1GB, 10GB), and the TPC-H and TPC-DS benchmark datasets with scale factors of 1 and 10. We removed all string/continuous columns and uploaded our pre-generated datasets to [**HERE**](https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc):
Our experiments covered synthetic datasets, low/high-correlation datasets at different scales (100MB, 1GB, 10GB), and the TPC-H and TPC-DS benchmark datasets with scale factors of 1 and 10. We removed all string/continuous columns and uploaded our pre-generated datasets to [**HERE**](#FIXME):
[**DATA LINK: https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc**](https://mega.nz/file/aUREBDQI#vW-rUQOTOr0B7uN9XhcOFXd2dqfe5yA18-Mk3xn-Dvc)
[**DATA LINK: Uploading...**](#FIXME)
After downloading it, please unzip it into the **root** folder of this GitHub repository. You will then see a **dataset** folder there.
@@ -58,9 +48,9 @@ List of datasets:
## Benchmark
We provide example models for the following two tasks. Please go [**HERE**](https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s) to download them:
We provide example models for the following two tasks. Please go [**HERE**](#FIXME) to download them:
[**MODEL LINK: https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s**](https://mega.nz/file/SdYWHAzZ#AAuYAz_-UmHXWUixHGOzzBJN0NTmwY6N66da3UyRS9s)
[**MODEL LINK: Uploading...**](#FIXME)
After downloading it, please unzip it into the **root** folder of this GitHub repository. You will then see a **models** folder there.
@@ -74,19 +64,4 @@ Run `python run_benchmark_data_query.py` to benchmark. To benchmark with differe
### Task: Data Manipulation
These experiments measured the overall storage overhead and end-to-end query latency for the synthetic datasets under data manipulation, i.e., INSERT/UPDATE/DELETE. Run `python run_benchmark_data_manipulation.py` to benchmark it. To benchmark with a different dataset, modify the file accordingly by following the instructions provided in the Python file.
## Supplemental Material
All latency values are measured in ms.
### Comparison of end-to-end latency using hashing and binary search
![tab-binary-vs-hash](./imgs/tab-binary-vs-hash.png)
### Comparison of end-to-end latency for running the model on CPU/GPU
![tab-cpu-vs-gpu](./imgs/tab-cpu-vs-gpu.png)
### Comparison of tuning the partition size
Experimental results are measured on TPC-H, SF=10, the `orders` table, with B=100,000.
![tab-partition-size](./imgs/tab-partition-size.png)
These experiments measured the overall storage overhead and end-to-end query latency for the synthetic datasets under data manipulation, i.e., INSERT/UPDATE/DELETE. Run `python run_benchmark_data_manipulation.py` to benchmark it. To benchmark with a different dataset, modify the file accordingly by following the instructions provided in the Python file.
Binary file removed imgs/tab-binary-vs-hash.png
Binary file removed imgs/tab-cpu-vs-gpu.png
Binary file removed imgs/tab-partition-size.png
7 changes: 6 additions & 1 deletion requirements.txt
@@ -1,8 +1,12 @@
bitarray==2.6.2
matplotlib==3.7.0
more_itertools==9.1.0
more_itertools==10.1.0
numpy==1.24.1
onnx==1.14.0
onnx_opcounter==0.0.3
onnxruntime==1.15.1
pandas==1.5.3
psutil==5.9.4
python_lzo==1.14
scikit_learn==1.2.1
setuptools==65.5.0
@@ -11,3 +15,4 @@ torch==2.0.0.dev20230202+cu116
torchmetrics==0.9.3
tqdm==4.64.1
zstd==1.5.2.6
protobuf==3.20.0
