From f90fdb4435e07ba4b70864087c2f0783fabf8500 Mon Sep 17 00:00:00 2001 From: RoyYang0714 Date: Fri, 23 Aug 2024 21:19:02 +0200 Subject: [PATCH] fix: Separate to hdf5 script. --- tests/data/io/to_hdf5_test.py | 2 +- vis4d/data/io/hdf5.py | 62 ---------------------------- vis4d/data/io/to_hdf5.py | 76 +++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 63 deletions(-) create mode 100644 vis4d/data/io/to_hdf5.py diff --git a/tests/data/io/to_hdf5_test.py b/tests/data/io/to_hdf5_test.py index 5e469746..caf61923 100644 --- a/tests/data/io/to_hdf5_test.py +++ b/tests/data/io/to_hdf5_test.py @@ -4,7 +4,7 @@ import unittest from tests.util import get_test_data -from vis4d.data.io.hdf5 import convert_dataset +from vis4d.data.io.to_hdf5 import convert_dataset class TestHDF5(unittest.TestCase): diff --git a/vis4d/data/io/hdf5.py b/vis4d/data/io/hdf5.py index 15107770..3d6b594e 100644 --- a/vis4d/data/io/hdf5.py +++ b/vis4d/data/io/hdf5.py @@ -6,12 +6,10 @@ from __future__ import annotations -import argparse import os from typing import Literal import numpy as np -from tqdm import tqdm from vis4d.common.imports import H5PY_AVAILABLE @@ -242,63 +240,3 @@ def close(self) -> None: for client, _ in self.db_cache.values(): client.close() self.db_cache.clear() - - -def convert_dataset(source_dir: str) -> None: - """Convert a dataset to HDF5 format. - - This function converts an arbitary dictionary to an HDF5 file. The keys - inside the HDF5 file preserve the directory structure of the original. - - As an example, if you convert "/path/to/dataset" to HDF5, the resulting - file will be: "/path/to/dataset.hdf5". The file "relative/path/to/file" - will be stored at "relative/path/to/file" inside /path/to/dataset.hdf5. - - Args: - source_dir (str): The path to the dataset to convert. 
- """ - if not os.path.exists(source_dir): - raise FileNotFoundError(f"No such file or directory: {source_dir}") - - source_dir = os.path.join(source_dir, "") # must end with trailing slash - hdf5_path = source_dir.rstrip("/") + ".hdf5" - if os.path.exists(hdf5_path): - print(f"File {hdf5_path} already exists! Skipping {source_dir}") - return - - print(f"Converting dataset at: {source_dir}") - hdf5_file = h5py.File(hdf5_path, mode="w") - sub_dirs = list(os.walk(source_dir)) - file_count = sum(len(files) for (_, _, files) in sub_dirs) - - with tqdm(total=file_count) as pbar: - for root, _, files in sub_dirs: - g_name = root.replace(source_dir, "") - g = hdf5_file.create_group(g_name) if g_name else hdf5_file - for f in files: - filepath = os.path.join(root, f) - if os.path.isfile(filepath): - with open(filepath, "rb") as fp: - file_content = fp.read() - g.create_dataset( - f, data=np.frombuffer(file_content, dtype="uint8") - ) - pbar.update() - - hdf5_file.close() - print("done.") - - -if __name__ == "__main__": # pragma: no cover - parser = argparse.ArgumentParser( - description="Converts a dataset at the specified path to hdf5. The " - "local directory structure is preserved in the hdf5 file." - ) - parser.add_argument( - "-p", - "--path", - required=True, - help="path to the root folder of a specific dataset to convert", - ) - args = parser.parse_args() - convert_dataset(args.path) diff --git a/vis4d/data/io/to_hdf5.py b/vis4d/data/io/to_hdf5.py new file mode 100644 index 00000000..4a2161a5 --- /dev/null +++ b/vis4d/data/io/to_hdf5.py @@ -0,0 +1,76 @@ +"""Script to convert a dataset to hdf5 format.""" + +from __future__ import annotations + +import argparse +import os + +import numpy as np +from tqdm import tqdm + +from vis4d.common.imports import H5PY_AVAILABLE + +if H5PY_AVAILABLE: + import h5py +else: + raise ImportError("Please install h5py to enable HDF5Backend.") + + +def convert_dataset(source_dir: str) -> None: + """Convert a dataset to HDF5 format. 
+ + This function converts an arbitrary dictionary to an HDF5 file. The keys + inside the HDF5 file preserve the directory structure of the original. + + As an example, if you convert "/path/to/dataset" to HDF5, the resulting + file will be: "/path/to/dataset.hdf5". The file "relative/path/to/file" + will be stored at "relative/path/to/file" inside /path/to/dataset.hdf5. + + Args: + source_dir (str): The path to the dataset to convert. + """ + if not os.path.exists(source_dir): + raise FileNotFoundError(f"No such file or directory: {source_dir}") + + source_dir = os.path.join(source_dir, "") # must end with trailing slash + hdf5_path = source_dir.rstrip("/") + ".hdf5" + if os.path.exists(hdf5_path): + print(f"File {hdf5_path} already exists! Skipping {source_dir}") + return + + print(f"Converting dataset at: {source_dir}") + hdf5_file = h5py.File(hdf5_path, mode="w") + sub_dirs = list(os.walk(source_dir)) + file_count = sum(len(files) for (_, _, files) in sub_dirs) + + with tqdm(total=file_count) as pbar: + for root, _, files in sub_dirs: + g_name = root.replace(source_dir, "") + g = hdf5_file.create_group(g_name) if g_name else hdf5_file + for f in files: + filepath = os.path.join(root, f) + if os.path.isfile(filepath): + with open(filepath, "rb") as fp: + file_content = fp.read() + g.create_dataset( + f, data=np.frombuffer(file_content, dtype="uint8") + ) + pbar.update() + + hdf5_file.close() + print("done.") + + +if __name__ == "__main__": # pragma: no cover + parser = argparse.ArgumentParser( + description="Converts a dataset at the specified path to hdf5. The " + "local directory structure is preserved in the hdf5 file." + ) + parser.add_argument( + "-p", + "--path", + required=True, + help="path to the root folder of a specific dataset to convert", + ) + args = parser.parse_args() + convert_dataset(args.path)