Skip to content

Commit

Permalink
[Enhance] Support download dataset from openxlab (#223)
Browse files Browse the repository at this point in the history
  • Loading branch information
HAOCHENYE authored Aug 28, 2023
1 parent 53536fc commit 30dc4fa
Showing 1 changed file with 23 additions and 10 deletions.
33 changes: 23 additions & 10 deletions mim/commands/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,38 +210,51 @@ def _download_dataset(package: str, dataset: str, dest_root: str) -> None:
f'already updated it and still get this error, please report an '
f'issue to {package}')
with open(dataset_index_path) as f:
datasets_meta = yaml.load(f, Loader=yaml.SafeLoader)
dataset_metas = yaml.load(f, Loader=yaml.SafeLoader)

if dataset not in datasets_meta:
if dataset not in dataset_metas:
raise KeyError(f'Cannot find {dataset} in {dataset_index_path}. '
'here are the available datasets: '
'{}'.format('\n'.join(datasets_meta.keys())))
dataset_meta = datasets_meta[dataset]

'{}'.format('\n'.join(dataset_metas.keys())))
dataset_meta = dataset_metas[dataset]
# OpenMMLab repo will define the `dataset-index.yml` like this:
# openxlab: true
# voc2007:
# dataset: PASCAL_VOC2007
# download_root: data
# data_root: data
# script: tools/dataset_converters/scripts/preprocess_voc2007.sh

# In this case, the top level key "voc2007" means the "Dataset Name" passed
# In this case:
# `openxlab: true` means the dataset is downloaded with the `openxlab` CLI;
# otherwise the `odl` CLI is used. Although the `odl` CLI will no longer be
# maintained, it is kept here for backward compatibility.

# The top level key "voc2007" means the "Dataset Name" passed
# to `mim download --dataset {Dataset Name}`

# The nested field "dataset" is the argument passed to the download command
# (`openxlab dataset get` or `odl get`). If the value of "dataset" is the
# same as the "Dataset Name", downstream repos can skip defining "dataset",
# and the "Dataset Name" will be passed to the download command instead.
src_name = dataset_meta.get('dataset', dataset)

use_openxlab = dataset_metas.get('openxlab', False)
src_name = dataset_meta.get('dataset', dataset)
# The download command (`openxlab dataset get` or `odl get`) will download the
# raw dataset to `download_root`, and the script will then process the raw
# data and put the prepared data in `data_root`
download_root = dataset_meta['download_root']
os.makedirs(download_root, exist_ok=True)

color_echo(f'Start downloading {dataset} to {download_root}...', 'blue')
subprocess.check_call(['odl', 'get', src_name, '-d', download_root],
stdin=sys.stdin,
stdout=sys.stdout)
if use_openxlab:
subprocess.check_call(
['openxlab', 'dataset', 'get', src_name, '-d', download_root],
stdin=sys.stdin,
stdout=sys.stdout)
else:
subprocess.check_call(['odl', 'get', src_name, '-d', download_root],
stdin=sys.stdin,
stdout=sys.stdout)

if not osp.exists(download_root):
return
Expand Down

0 comments on commit 30dc4fa

Please sign in to comment.