fix: Separate to hdf5 script.
RoyYang0714 committed Aug 23, 2024
1 parent 17c4e02 commit f90fdb4
Showing 3 changed files with 77 additions and 63 deletions.
2 changes: 1 addition & 1 deletion tests/data/io/to_hdf5_test.py
@@ -4,7 +4,7 @@
import unittest

from tests.util import get_test_data
-from vis4d.data.io.hdf5 import convert_dataset
+from vis4d.data.io.to_hdf5 import convert_dataset


class TestHDF5(unittest.TestCase):
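The test now imports convert_dataset from the new module. As a minimal, self-contained sketch of how that relocated import could be exercised (the temporary-directory fixture and assertions below are illustrative assumptions, not the repository's actual test body):

import os
import tempfile
import unittest

import h5py

from vis4d.data.io.to_hdf5 import convert_dataset


class ConvertDatasetSketch(unittest.TestCase):
    """Illustrative only: convert a tiny directory and read it back."""

    def test_roundtrip(self) -> None:
        """Converted archive should mirror the source directory layout."""
        with tempfile.TemporaryDirectory() as tmp:
            src = os.path.join(tmp, "dataset")
            os.makedirs(os.path.join(src, "images"))
            with open(os.path.join(src, "images", "a.bin"), "wb") as fp:
                fp.write(b"hello")

            convert_dataset(src)

            # The archive is written next to the source directory.
            with h5py.File(src + ".hdf5", "r") as archive:
                stored = archive["images"]["a.bin"][()].tobytes()
            self.assertEqual(stored, b"hello")

The round trip relies only on behavior visible in the diff: the archive is written next to the source directory, and each file's bytes are stored as a uint8 dataset under its relative path.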
62 changes: 0 additions & 62 deletions vis4d/data/io/hdf5.py
@@ -6,12 +6,10 @@

from __future__ import annotations

-import argparse
import os
from typing import Literal

import numpy as np
-from tqdm import tqdm

from vis4d.common.imports import H5PY_AVAILABLE

@@ -242,63 +240,3 @@ def close(self) -> None:
        for client, _ in self.db_cache.values():
            client.close()
        self.db_cache.clear()


def convert_dataset(source_dir: str) -> None:
"""Convert a dataset to HDF5 format.
This function converts an arbitary dictionary to an HDF5 file. The keys
inside the HDF5 file preserve the directory structure of the original.
As an example, if you convert "/path/to/dataset" to HDF5, the resulting
file will be: "/path/to/dataset.hdf5". The file "relative/path/to/file"
will be stored at "relative/path/to/file" inside /path/to/dataset.hdf5.
Args:
source_dir (str): The path to the dataset to convert.
"""
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"No such file or directory: {source_dir}")

    source_dir = os.path.join(source_dir, "")  # must end with trailing slash
    hdf5_path = source_dir.rstrip("/") + ".hdf5"
    if os.path.exists(hdf5_path):
        print(f"File {hdf5_path} already exists! Skipping {source_dir}")
        return

    print(f"Converting dataset at: {source_dir}")
    hdf5_file = h5py.File(hdf5_path, mode="w")
    sub_dirs = list(os.walk(source_dir))
    file_count = sum(len(files) for (_, _, files) in sub_dirs)

    with tqdm(total=file_count) as pbar:
        for root, _, files in sub_dirs:
            g_name = root.replace(source_dir, "")
            g = hdf5_file.create_group(g_name) if g_name else hdf5_file
            for f in files:
                filepath = os.path.join(root, f)
                if os.path.isfile(filepath):
                    with open(filepath, "rb") as fp:
                        file_content = fp.read()
                    g.create_dataset(
                        f, data=np.frombuffer(file_content, dtype="uint8")
                    )
                pbar.update()

    hdf5_file.close()
    print("done.")


if __name__ == "__main__": # pragma: no cover
    parser = argparse.ArgumentParser(
        description="Converts a dataset at the specified path to hdf5. The "
        "local directory structure is preserved in the hdf5 file."
    )
    parser.add_argument(
        "-p",
        "--path",
        required=True,
        help="path to the root folder of a specific dataset to convert",
    )
    args = parser.parse_args()
    convert_dataset(args.path)
76 changes: 76 additions & 0 deletions vis4d/data/io/to_hdf5.py
@@ -0,0 +1,76 @@
"""Script to convert a dataset to hdf5 format."""

from __future__ import annotations

import argparse
import os

import numpy as np
from tqdm import tqdm

from vis4d.common.imports import H5PY_AVAILABLE

if H5PY_AVAILABLE:
    import h5py
else:
    raise ImportError("Please install h5py to enable HDF5Backend.")


def convert_dataset(source_dir: str) -> None:
"""Convert a dataset to HDF5 format.
This function converts an arbitary dictionary to an HDF5 file. The keys
inside the HDF5 file preserve the directory structure of the original.
As an example, if you convert "/path/to/dataset" to HDF5, the resulting
file will be: "/path/to/dataset.hdf5". The file "relative/path/to/file"
will be stored at "relative/path/to/file" inside /path/to/dataset.hdf5.
Args:
source_dir (str): The path to the dataset to convert.
"""
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"No such file or directory: {source_dir}")

    source_dir = os.path.join(source_dir, "")  # must end with trailing slash
    hdf5_path = source_dir.rstrip("/") + ".hdf5"
    if os.path.exists(hdf5_path):
        print(f"File {hdf5_path} already exists! Skipping {source_dir}")
        return

    print(f"Converting dataset at: {source_dir}")
    hdf5_file = h5py.File(hdf5_path, mode="w")
    sub_dirs = list(os.walk(source_dir))
    file_count = sum(len(files) for (_, _, files) in sub_dirs)

    with tqdm(total=file_count) as pbar:
        for root, _, files in sub_dirs:
            g_name = root.replace(source_dir, "")
            g = hdf5_file.create_group(g_name) if g_name else hdf5_file
            for f in files:
                filepath = os.path.join(root, f)
                if os.path.isfile(filepath):
                    with open(filepath, "rb") as fp:
                        file_content = fp.read()
                    g.create_dataset(
                        f, data=np.frombuffer(file_content, dtype="uint8")
                    )
                pbar.update()

    hdf5_file.close()
    print("done.")


if __name__ == "__main__": # pragma: no cover
    parser = argparse.ArgumentParser(
        description="Converts a dataset at the specified path to hdf5. The "
        "local directory structure is preserved in the hdf5 file."
    )
    parser.add_argument(
        "-p",
        "--path",
        required=True,
        help="path to the root folder of a specific dataset to convert",
    )
    args = parser.parse_args()
    convert_dataset(args.path)
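
Because convert_dataset keeps each file's relative path as its HDF5 key, a converted archive can be sanity-checked with plain h5py. The snippet below is a sketch under that assumption; the paths reuse the hypothetical examples from the docstring, and the python -m invocation assumes the installed vis4d package is importable in the current environment:

import h5py

# Hypothetical paths, matching the docstring's example. The archive is
# produced next to the source directory, e.g. by running:
#   python -m vis4d.data.io.to_hdf5 --path /path/to/dataset
with h5py.File("/path/to/dataset.hdf5", "r") as archive:
    # Keys mirror the original directory layout.
    raw = archive["relative/path/to/file"][()]  # uint8 array of the raw bytes
    file_bytes = raw.tobytes()
    print(len(file_bytes), "bytes recovered")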
