From 2f38f8e5193f9871f46490dc2a1dab2d6b8764f6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Nov 2023 15:42:51 -0500 Subject: [PATCH 1/4] Replace magic string arguments with enums --- dandi/cli/cmd_digest.py | 4 +- dandi/cli/cmd_download.py | 33 ++-- dandi/cli/cmd_move.py | 9 +- dandi/cli/cmd_organize.py | 29 ++-- dandi/cli/cmd_upload.py | 32 ++-- dandi/cli/tests/test_download.py | 43 ++--- dandi/consts.py | 11 -- dandi/download.py | 77 ++++++--- dandi/files/bases.py | 11 +- dandi/move.py | 63 +++++-- dandi/organize.py | 172 +++++++++++-------- dandi/tests/fixtures.py | 26 +-- dandi/tests/test_download.py | 54 ++++-- dandi/tests/test_move.py | 272 +++++++++++++++++++++---------- dandi/tests/test_organize.py | 63 +++---- dandi/tests/test_upload.py | 51 +++--- dandi/upload.py | 50 ++++-- dandi/utils.py | 4 +- setup.cfg | 2 +- 19 files changed, 619 insertions(+), 387 deletions(-) diff --git a/dandi/cli/cmd_digest.py b/dandi/cli/cmd_digest.py index bd6af1ff3..7bcceff22 100644 --- a/dandi/cli/cmd_digest.py +++ b/dandi/cli/cmd_digest.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import click from .base import map_to_click_exceptions @@ -18,7 +20,7 @@ ) @click.argument("paths", nargs=-1, type=click.Path(exists=True)) @map_to_click_exceptions -def digest(paths, digest_alg): +def digest(paths: tuple[str, ...], digest_alg: str) -> None: """Calculate file digests""" from ..support.digests import get_digest diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py index 3ac975e4d..9c8c6f6d6 100644 --- a/dandi/cli/cmd_download.py +++ b/dandi/cli/cmd_download.py @@ -1,9 +1,13 @@ +from __future__ import annotations + +from collections.abc import Sequence import os import click from .base import ChoiceList, IntColonInt, instance_option, map_to_click_exceptions from ..dandiarchive import _dandi_url_parser, parse_dandi_url +from ..download import DownloadExisting, DownloadFormat, PathType from ..utils import get_instance @@ -28,9 +32,8 @@ @click.option( "-e", "--existing", - type=click.Choice( - ["error", "skip", "overwrite", "overwrite-different", "refresh"] - ), # TODO: verify-reupload (to become default) + type=click.Choice(list(DownloadExisting)), + # TODO: verify-reupload (to become default) help="What to do if a file found existing locally. 'refresh': verify " "that according to the size and mtime, it is the same file, if not - " "download and overwrite.", @@ -41,12 +44,12 @@ "-f", "--format", help="Choose the format/frontend for output. TODO: support all of the ls", - type=click.Choice(["pyout", "debug"]), + type=click.Choice(list(DownloadFormat)), default="pyout", ) @click.option( "--path-type", - type=click.Choice(["exact", "glob"]), + type=click.Choice(list(PathType)), default="exact", help="Whether to interpret asset paths in URLs as exact matches or glob patterns", show_default=True, @@ -96,16 +99,16 @@ @click.argument("url", nargs=-1) @map_to_click_exceptions def download( - url, - output_dir, - existing, - jobs, - format, - download_types, - sync, - dandi_instance, - path_type, -): + url: Sequence[str], + output_dir: str, + existing: DownloadExisting, + jobs: tuple[int, int], + format: DownloadFormat, + download_types: set[str], + sync: bool, + dandi_instance: str, + path_type: PathType, +) -> None: # We need to import the download module rather than the download function # so that the tests can properly patch the function with a mock. from .. 
import download diff --git a/dandi/cli/cmd_move.py b/dandi/cli/cmd_move.py index 491905606..6a8898f9a 100644 --- a/dandi/cli/cmd_move.py +++ b/dandi/cli/cmd_move.py @@ -3,6 +3,7 @@ import click from .base import devel_debug_option, instance_option, map_to_click_exceptions +from ..move import MoveExisting, MoveWorkOn @click.command() @@ -15,7 +16,7 @@ @click.option( "-e", "--existing", - type=click.Choice(["error", "skip", "overwrite"]), + type=click.Choice(list(MoveExisting)), default="error", help="How to handle assets that would be moved to a destination that already exists", show_default=True, @@ -29,7 +30,7 @@ @click.option( "-w", "--work-on", - type=click.Choice(["auto", "both", "local", "remote"]), + type=click.Choice(list(MoveWorkOn)), default="auto", help=( "Whether to operate on the local Dandiset, remote Dandiset, or both;" @@ -47,10 +48,10 @@ def move( paths: tuple[str, ...], dandiset: str | None, dry_run: bool, - existing: str, + existing: MoveExisting, jobs: int | None, regex: bool, - work_on: str, + work_on: MoveWorkOn, dandi_instance: str, devel_debug: bool = False, ) -> None: diff --git a/dandi/cli/cmd_organize.py b/dandi/cli/cmd_organize.py index a45765ec6..c51f2fa11 100644 --- a/dandi/cli/cmd_organize.py +++ b/dandi/cli/cmd_organize.py @@ -3,7 +3,8 @@ import click from .base import dandiset_path_option, devel_debug_option, map_to_click_exceptions -from ..consts import dandi_layout_fields, file_operation_modes +from ..consts import dandi_layout_fields +from ..organize import CopyMode, FileOperationMode, OrganizeInvalid @click.command() @@ -16,7 +17,7 @@ @click.option( "--invalid", help="What to do if files without sufficient metadata are encountered.", - type=click.Choice(["fail", "warn"]), + type=click.Choice(list(OrganizeInvalid)), default="fail", show_default=True, ) @@ -29,7 +30,7 @@ "If 'auto' - whichever of symlink, hardlink, copy is allowed by system. " "The other modes (copy, move, symlink, hardlink) define how data files " "should be made available.", - type=click.Choice(file_operation_modes), + type=click.Choice(list(FileOperationMode)), default="auto", show_default=True, ) @@ -44,7 +45,7 @@ ) @click.option( "--media-files-mode", - type=click.Choice(["copy", "move", "symlink", "hardlink"]), + type=click.Choice(list(CopyMode)), default=None, help="How to relocate video files referenced by NWB files", ) @@ -63,16 +64,16 @@ @devel_debug_option() @map_to_click_exceptions def organize( - paths, - required_fields, - dandiset_path=None, - invalid="fail", - files_mode="auto", - devel_debug=False, - update_external_file_paths=False, - media_files_mode=None, - jobs=None, -): + paths: tuple[str, ...], + required_fields: tuple[str, ...], + dandiset_path: str | None, + invalid: OrganizeInvalid, + files_mode: FileOperationMode, + media_files_mode: CopyMode | None, + update_external_file_paths: bool, + jobs: int | None, + devel_debug: bool = False, +) -> None: """(Re)organize files according to the metadata. 
The purpose of this command is to take advantage of metadata contained in diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 343df995e..a17a69c3c 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import click from .base import ( @@ -7,13 +9,14 @@ instance_option, map_to_click_exceptions, ) +from ..upload import UploadExisting, UploadValidation @click.command() @click.option( "-e", "--existing", - type=click.Choice(["error", "skip", "force", "overwrite", "refresh"]), + type=click.Choice(list(UploadExisting)), help="What to do if a file found existing on the server. 'skip' would skip" "the file, 'force' - force reupload, 'overwrite' - force upload if " "either size or modification time differs; 'refresh' - upload only if " @@ -24,6 +27,7 @@ @click.option( "-J", "--jobs", + "jobs_pair", type=IntColonInt(), help=( "Number of assets to upload in parallel and, optionally, number of" @@ -36,7 +40,7 @@ @click.option( "--validation", help="Data must pass validation before the upload. Use of this option is highly discouraged.", - type=click.Choice(["require", "skip", "ignore"]), + type=click.Choice(list(UploadValidation)), default="require", show_default=True, ) @@ -61,17 +65,17 @@ @devel_debug_option() @map_to_click_exceptions def upload( - paths, - jobs, - sync, - dandi_instance, - existing="refresh", - validation="require", + paths: tuple[str, ...], + jobs_pair: tuple[int, int] | None, + sync: bool, + dandi_instance: str, + existing: UploadExisting, + validation: UploadValidation, # Development options should come as kwargs - allow_any_path=False, - upload_dandiset_metadata=False, - devel_debug=False, -): + allow_any_path: bool = False, + upload_dandiset_metadata: bool = False, + devel_debug: bool = False, +) -> None: """ Upload Dandiset files to DANDI Archive. 
@@ -89,11 +93,11 @@ def upload( """ from ..upload import upload - if jobs is None: + if jobs_pair is None: jobs = None jobs_per_file = None else: - jobs, jobs_per_file = jobs + jobs, jobs_per_file = jobs_pair upload( paths, diff --git a/dandi/cli/tests/test_download.py b/dandi/cli/tests/test_download.py index 164860c46..0a8131cff 100644 --- a/dandi/cli/tests/test_download.py +++ b/dandi/cli/tests/test_download.py @@ -7,6 +7,7 @@ from ..cmd_download import download from ...consts import dandiset_metadata_file, known_instances +from ...download import DownloadExisting, DownloadFormat, PathType def test_download_defaults(mocker): @@ -16,14 +17,14 @@ def test_download_defaults(mocker): mock_download.assert_called_once_with( (), os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -34,14 +35,14 @@ def test_download_all_types(mocker): mock_download.assert_called_once_with( (), os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -52,14 +53,14 @@ def test_download_metadata_only(mocker): mock_download.assert_called_once_with( (), os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=False, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -70,14 +71,14 @@ def test_download_assets_only(mocker): mock_download.assert_called_once_with( (), os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=False, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -103,14 +104,14 @@ def test_download_gui_instance_in_dandiset(mocker): mock_download.assert_called_once_with( ["https://dandiarchive.org/#/dandiset/123456/draft"], os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -128,14 +129,14 @@ def test_download_api_instance_in_dandiset(mocker): mock_download.assert_called_once_with( ["http://localhost:8000/api/dandisets/123456/"], os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) @@ -153,14 +154,14 @@ def test_download_url_instance_match(mocker): mock_download.assert_called_once_with( ("http://localhost:8000/api/dandisets/123456/",), os.curdir, - existing="error", - format="pyout", + existing=DownloadExisting.ERROR, + format=DownloadFormat.PYOUT, jobs=6, jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, - path_type="exact", + path_type=PathType.EXACT, ) diff --git a/dandi/consts.py b/dandi/consts.py index d8e47350d..7a0a8522e 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -138,17 +138,6 @@ def urls(self) -> Iterator[str]: vv: k for k, v in known_instances.items() for vv in v.urls() if vv } -file_operation_modes = [ - "dry", - "simulate", - "copy", - "move", - "hardlink", - 
"symlink", - "auto", -] - - # Download (upload?) specific constants #: Chunk size when iterating a download (and upload) body. Taken from girder-cli diff --git a/dandi/download.py b/dandi/download.py index 339dce3be..2becc6041 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -13,7 +13,6 @@ from pathlib import Path import random from shutil import rmtree -import sys from threading import Lock import time from types import TracebackType @@ -50,18 +49,45 @@ lgr = get_logger() +class DownloadExisting(str, Enum): + ERROR = "error" + SKIP = "skip" + OVERWRITE = "overwrite" + OVERWRITE_DIFFERENT = "overwrite-different" + REFRESH = "refresh" + + def __str__(self) -> str: + return self.value + + +class DownloadFormat(str, Enum): + PYOUT = "pyout" + DEBUG = "debug" + + def __str__(self) -> str: + return self.value + + +class PathType(str, Enum): + EXACT = "exact" + GLOB = "glob" + + def __str__(self) -> str: + return self.value + + def download( urls: str | Sequence[str], output_dir: str | Path, *, - format: str = "pyout", - existing: str = "error", + format: DownloadFormat = DownloadFormat.PYOUT, + existing: DownloadExisting = DownloadExisting.ERROR, jobs: int = 1, jobs_per_zarr: int | None = None, get_metadata: bool = True, get_assets: bool = True, sync: bool = False, - path_type: str = "exact", + path_type: PathType = PathType.EXACT, ) -> None: # TODO: unduplicate with upload. For now stole from that one # We will again use pyout to provide a neat table summarizing our progress @@ -75,7 +101,7 @@ def download( # on which instance it exists! Thus ATM we would do nothing but crash raise NotImplementedError("No URLs were provided. Cannot download anything") - parsed_urls = [parse_dandi_url(u, glob=path_type == "glob") for u in urls] + parsed_urls = [parse_dandi_url(u, glob=path_type is PathType.GLOB) for u in urls] # dandi.cli.formatters are used in cmd_ls to provide switchable pyout_style = pyouts.get_style(hide_if_missing=False) @@ -93,14 +119,14 @@ def download( # TODO: redo kw = dict(assets_it=out_helper.it) if jobs > 1: - if format == "pyout": + if format is DownloadFormat.PYOUT: # It could handle delegated to generator downloads kw["yield_generator_for_fields"] = rec_fields[1:] # all but path else: lgr.warning( "Parallel downloads are not yet implemented for non-pyout format=%r. " "Download will proceed serially.", - format, + str(format), ) downloaders = [ @@ -111,7 +137,7 @@ def download( get_metadata=get_metadata, get_assets=get_assets, jobs_per_zarr=jobs_per_zarr, - on_error="yield" if format == "pyout" else "raise", + on_error="yield" if format is DownloadFormat.PYOUT else "raise", **kw, ) for purl in parsed_urls @@ -125,16 +151,15 @@ def download( # has failed to download. If any was: exception should probably be # raised. 
API discussion for Python side of API: # - if format == "debug": + if format is DownloadFormat.DEBUG: for rec in gen_: - print(rec) - sys.stdout.flush() - elif format == "pyout": + print(rec, flush=True) + elif format is DownloadFormat.PYOUT: with out: for rec in gen_: out(rec) else: - raise ValueError(format) + raise AssertionError(f"Unhandled DownloadFormat member: {format!r}") if sync: to_delete = [p for dl in downloaders for p in dl.delete_for_sync()] @@ -168,7 +193,7 @@ class Downloader: output_dir: InitVar[str | Path] output_prefix: Path = field(init=False) output_path: Path = field(init=False) - existing: str + existing: DownloadExisting get_metadata: bool get_assets: bool jobs_per_zarr: int | None @@ -448,7 +473,7 @@ def _skip_file(msg: Any) -> dict: def _populate_dandiset_yaml( - dandiset_path: str | Path, dandiset: RemoteDandiset, existing: str + dandiset_path: str | Path, dandiset: RemoteDandiset, existing: DownloadExisting ) -> Iterator[dict]: metadata = dandiset.get_raw_metadata() if not metadata: @@ -465,15 +490,15 @@ def _populate_dandiset_yaml( if yaml_load(fp, typ="safe") == metadata: yield _skip_file("no change") return - if existing == "error": + if existing is DownloadExisting.ERROR: yield {"status": "error", "message": "already exists"} return - elif existing == "refresh" and op.lexists( + elif existing is DownloadExisting.REFRESH and op.lexists( op.join(dandiset_path, ".git", "annex") ): raise RuntimeError("Not refreshing path in git annex repository") - elif existing == "skip" or ( - existing == "refresh" + elif existing is DownloadExisting.SKIP or ( + existing is DownloadExisting.REFRESH and os.lstat(dandiset_yaml).st_mtime >= mtime.timestamp() ): yield _skip_file("already exists") @@ -496,7 +521,7 @@ def _download_file( lock: Lock, size: int | None = None, mtime: datetime | None = None, - existing: str = "error", + existing: DownloadExisting = DownloadExisting.ERROR, digests: dict[str, str] | None = None, digest_callback: Callable[[str, str], Any] | None = None, ) -> Iterator[dict]: @@ -529,14 +554,14 @@ def _download_file( """ if op.lexists(path): annex_path = op.join(toplevel_path, ".git", "annex") - if existing == "error": + if existing is DownloadExisting.ERROR: raise FileExistsError(f"File {path!r} already exists") - elif existing == "skip": + elif existing is DownloadExisting.SKIP: yield _skip_file("already exists") return - elif existing == "overwrite": + elif existing is DownloadExisting.OVERWRITE: pass - elif existing == "overwrite-different": + elif existing is DownloadExisting.OVERWRITE_DIFFERENT: realpath = op.realpath(path) key_parts = op.basename(realpath).split("-") if size is not None and os.stat(realpath).st_size != size: @@ -578,7 +603,7 @@ def _download_file( lgr.debug( "Etag of %s does not match etag on server; redownloading", path ) - elif existing == "refresh": + elif existing is DownloadExisting.REFRESH: if op.lexists(annex_path): raise RuntimeError("Not refreshing path in git annex repository") if mtime is None: @@ -825,7 +850,7 @@ def _download_zarr( asset: BaseRemoteZarrAsset, download_path: Path, toplevel_path: str | Path, - existing: str, + existing: DownloadExisting, lock: Lock, jobs: int | None = None, ) -> Iterator[dict]: diff --git a/dandi/files/bases.py b/dandi/files/bases.py index 93dfa3233..efd32606d 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -19,8 +19,6 @@ from dandischema.models import Dandiset as DandisetMeta from dandischema.models import get_schema_version from etelemetry import get_project -from 
nwbinspector import Importance, inspect_nwbfile, load_config -from nwbinspector.utils import get_package_version from packaging.version import Version from pydantic import ValidationError import requests @@ -29,7 +27,6 @@ from dandi.dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient from dandi.metadata import get_default_metadata, nwb2asset from dandi.misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, P -from dandi.pynwb_utils import validate as pynwb_validate from dandi.support.digests import get_dandietag, get_digest from dandi.utils import yaml_load from dandi.validate_types import Scope, Severity, ValidationOrigin, ValidationResult @@ -40,7 +37,7 @@ _required_dandiset_metadata_fields = ["identifier", "name", "description"] -NWBI_IMPORTANCE_TO_DANDI_SEVERITY: dict[Importance.name, Severity] = { +NWBI_IMPORTANCE_TO_DANDI_SEVERITY: dict[str, Severity] = { "ERROR": Severity.ERROR, "PYNWB_VALIDATION": Severity.ERROR, "CRITICAL": Severity.ERROR, # when using --config dandi @@ -497,6 +494,10 @@ def get_validation_errors( If ``schema_version`` was provided, we only validate basic metadata, and completely skip validation using nwbinspector.inspect_nwbfile """ + from nwbinspector import Importance, inspect_nwbfile, load_config + + from dandi.pynwb_utils import validate as pynwb_validate + errors: list[ValidationResult] = pynwb_validate( self.filepath, devel_debug=devel_debug ) @@ -710,6 +711,8 @@ def _check_required_fields( def _get_nwb_inspector_version(): + from nwbinspector.utils import get_package_version + global _current_nwbinspector_version if _current_nwbinspector_version is not None: return _current_nwbinspector_version diff --git a/dandi/move.py b/dandi/move.py index 7463beb5f..b98c5fb19 100644 --- a/dandi/move.py +++ b/dandi/move.py @@ -4,6 +4,7 @@ from collections.abc import Iterator from contextlib import ExitStack from dataclasses import dataclass, field +from enum import Enum from itertools import zip_longest import os.path from pathlib import Path, PurePosixPath @@ -21,6 +22,26 @@ lgr = get_logger() + +class MoveExisting(str, Enum): + ERROR = "error" + SKIP = "skip" + OVERWRITE = "overwrite" + + def __str__(self) -> str: + return self.value + + +class MoveWorkOn(str, Enum): + AUTO = "auto" + BOTH = "both" + LOCAL = "local" + REMOTE = "remote" + + def __str__(self) -> str: + return self.value + + #: A /-separated path to an asset, relative to the root of the Dandiset AssetPath = NewType("AssetPath", str) @@ -84,7 +105,9 @@ def status_field(self) -> str: ... 
@abstractmethod - def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement]: + def calculate_moves( + self, *srcs: str, dest: str, existing: MoveExisting + ) -> list[Movement]: """ Given a sequence of input source paths and a destination path, return a sorted list of all assets that will be moved/renamed @@ -93,7 +116,7 @@ def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement @abstractmethod def calculate_moves_by_regex( - self, find: str, replace: str, existing: str + self, find: str, replace: str, existing: MoveExisting ) -> list[Movement]: """ Given a regular expression and a replacement string, return a sorted @@ -212,7 +235,9 @@ def resolve(self, path: str) -> tuple[AssetPath, bool]: raise ValueError(f"{path!r} is outside of Dandiset") return (AssetPath(str(p)), path.endswith("/")) - def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement]: + def calculate_moves( + self, *srcs: str, dest: str, existing: MoveExisting + ) -> list[Movement]: """ Given a sequence of input source paths and a destination path, return a sorted list of all assets that will be moved/renamed @@ -285,7 +310,7 @@ def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement return self.compile_moves(moves, existing) def calculate_moves_by_regex( - self, find: str, replace: str, existing: str + self, find: str, replace: str, existing: MoveExisting ) -> list[Movement]: """ Given a regular expression and a replacement string, return a sorted @@ -327,7 +352,7 @@ def calculate_moves_by_regex( return self.compile_moves(moves, existing) def compile_moves( - self, moves: dict[AssetPath, AssetPath], existing: str + self, moves: dict[AssetPath, AssetPath], existing: MoveExisting ) -> list[Movement]: """ Given a `dict` mapping source paths to destination paths, produce a @@ -341,9 +366,9 @@ def compile_moves( " destination is a directory" ) elif self.is_file(dest): - if existing == "overwrite": + if existing is MoveExisting.OVERWRITE: motions.append(Movement(src, dest, delete=True)) - elif existing == "skip": + elif existing is MoveExisting.SKIP: motions.append(Movement(src, dest, skip=True)) else: raise ValueError( @@ -685,7 +710,9 @@ def status_field(self) -> str: """ return "both" - def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement]: + def calculate_moves( + self, *srcs: str, dest: str, existing: MoveExisting + ) -> list[Movement]: """ Given a sequence of input source paths and a destination path, return a sorted list of all assets that will be moved/renamed @@ -696,7 +723,7 @@ def calculate_moves(self, *srcs: str, dest: str, existing: str) -> list[Movement return local_moves def calculate_moves_by_regex( - self, find: str, replace: str, existing: str + self, find: str, replace: str, existing: MoveExisting ) -> list[Movement]: """ Given a regular expression and a replacement string, return a sorted @@ -760,10 +787,10 @@ def move( *srcs: str, dest: str, regex: bool = False, - existing: str = "error", + existing: MoveExisting = MoveExisting.ERROR, dandi_instance: str | DandiInstance = "dandi", dandiset: Path | str | None = None, - work_on: str = "auto", + work_on: MoveWorkOn = MoveWorkOn.AUTO, devel_debug: bool = False, jobs: int | None = None, dry_run: bool = False, @@ -775,9 +802,11 @@ def move( with ExitStack() as stack: mover: Mover client: DandiAPIClient | None = None - if work_on == "auto": - work_on = "remote" if isinstance(dandiset, str) else "both" - if work_on == "both": + if work_on is 
MoveWorkOn.AUTO: + work_on = ( + MoveWorkOn.REMOTE if isinstance(dandiset, str) else MoveWorkOn.BOTH + ) + if work_on is MoveWorkOn.BOTH: if isinstance(dandiset, str): raise TypeError("`dandiset` must be a Path when work_on='both'") local_ds, subpath = find_dandiset_and_subpath(dandiset) @@ -797,7 +826,7 @@ def move( local_dandiset_path=Path(local_ds.path), ), ) - elif work_on == "remote": + elif work_on is MoveWorkOn.REMOTE: if isinstance(dandiset, str): url = parse_dandi_url(dandiset) if not isinstance(url, DandisetURL): @@ -816,13 +845,13 @@ def move( local_ds.identifier, version_id="draft", lazy=False ) mover = RemoteMover(dandiset=remote_ds, subpath=subpath) - elif work_on == "local": + elif work_on is MoveWorkOn.LOCAL: if isinstance(dandiset, str): raise TypeError("`dandiset` must be a Path when work_on='both'") local_ds, subpath = find_dandiset_and_subpath(dandiset) mover = LocalMover(dandiset_path=Path(local_ds.path), subpath=subpath) else: - raise ValueError(f"Invalid work_on value: {work_on!r}") + raise AssertionError(f"Unexpected value for 'work_on': {work_on!r}") if regex: try: (find,) = srcs diff --git a/dandi/organize.py b/dandi/organize.py index bac665e13..02879a3e2 100644 --- a/dandi/organize.py +++ b/dandi/organize.py @@ -8,26 +8,20 @@ from collections import Counter from collections.abc import Sequence from copy import deepcopy +from enum import Enum import os import os.path as op from pathlib import Path, PurePosixPath import re import uuid -import numpy as np - from . import __version__, get_logger from .consts import dandi_layout_fields from .dandiset import Dandiset from .exceptions import OrganizeImpossibleError from .metadata import get_metadata -from .pynwb_utils import ( - get_neurodata_types_to_modalities_map, - get_object_id, - ignore_benign_pynwb_warnings, - rename_nwb_external_files, -) from .utils import ( + AnyPath, Parallel, copy_file, delayed, @@ -43,6 +37,54 @@ lgr = get_logger() + +class FileOperationMode(str, Enum): + DRY = "dry" + SIMULATE = "simulate" + COPY = "copy" + MOVE = "move" + HARDLINK = "hardlink" + SYMLINK = "symlink" + AUTO = "auto" + + def __str__(self) -> str: + return self.value + + def as_copy_mode(self) -> CopyMode: + # Raises ValueError if the mode can't be mapped to a CopyMode + return CopyMode(self.value) + + +class CopyMode(str, Enum): + SYMLINK = "symlink" + COPY = "copy" + MOVE = "move" + HARDLINK = "hardlink" + + def __str__(self) -> str: + return self.value + + def copy(self, old_path: AnyPath, new_path: AnyPath) -> None: + if self is CopyMode.SYMLINK: + os.symlink(old_path, new_path) + elif self is CopyMode.HARDLINK: + os.link(old_path, new_path) + elif self is CopyMode.COPY: + copy_file(old_path, new_path) + elif self is CopyMode.MOVE: + move_file(old_path, new_path) + else: + raise AssertionError(f"Unhandled CopyMode member: {self!r}") + + +class OrganizeInvalid(str, Enum): + FAIL = "fail" + WARN = "warn" + + def __str__(self) -> str: + return self.value + + dandi_path = op.join("sub-{subject_id}", "{dandi_filename}") @@ -227,7 +269,7 @@ def _create_external_file_names(metadata: list[dict]) -> list[dict]: def organize_external_files( - metadata: list[dict], dandiset_path: str, files_mode: str + metadata: list[dict], dandiset_path: str, copy_mode: CopyMode ) -> None: """Organizes the external_files into the new Dandiset folder structure. 
@@ -237,9 +279,7 @@ def organize_external_files( list of metadata dictionaries created during the call to pynwb_utils._get_pynwb_metadata dandiset_path: str full path of the main dandiset folder. - files_mode: str - one of "symlink", "copy", "move", "hardlink" - + copy_mode: CopyMode """ for e in metadata: for ext_file_dict in e["external_file_objects"]: @@ -259,19 +299,12 @@ def organize_external_files( lgr.error("%s does not exist", name_old_str) raise FileNotFoundError(f"{name_old_str} does not exist") os.makedirs(op.dirname(new_path), exist_ok=True) - if files_mode == "symlink": - os.symlink(name_old_str, new_path) - elif files_mode == "hardlink": - os.link(name_old_str, new_path) - elif files_mode == "copy": - copy_file(name_old_str, new_path) - elif files_mode == "move": - move_file(name_old_str, new_path) - else: - raise NotImplementedError(files_mode) + copy_mode.copy(name_old_str, new_path) def _assign_obj_id(metadata, non_unique): + from .pynwb_utils import get_object_id + msg = "%d out of %d paths are not unique" % (len(non_unique), len(metadata)) lgr.info(msg + ". We will try adding _obj- based on crc32 of object_id") @@ -339,6 +372,8 @@ def _get_unique_values_among_non_unique(metadata, non_unique_paths, field): def get_obj_id(object_id): """Given full object_id, get its shortened version""" + import numpy as np + return np.base_repr(binascii.crc32(object_id.encode("ascii")), 36).lower() @@ -401,6 +436,8 @@ def _sanitize_value(value, field): def _populate_modalities(metadata): + from .pynwb_utils import get_neurodata_types_to_modalities_map + ndtypes_to_modalities = get_neurodata_types_to_modalities_map() ndtypes_unassigned = set() for r in metadata: @@ -677,7 +714,7 @@ def _get_non_unique_paths(metadata): return non_unique -def detect_link_type(srcfile, destdir): +def detect_link_type(srcfile: AnyPath, destdir: AnyPath) -> FileOperationMode: """ Determine what type of links the filesystem will let us make from the file ``srcfile`` to the directory ``destdir``. 
If symlinks are allowed, returns @@ -695,33 +732,30 @@ def detect_link_type(srcfile, destdir): lgr.info( "Symlink and hardlink tests both failed; setting files_mode='copy'" ) - return "copy" + return FileOperationMode.COPY else: lgr.info( "Hard link support autodetected; setting files_mode='hardlink'" ) - return "hardlink" + return FileOperationMode.HARDLINK else: lgr.info("Symlink support autodetected; setting files_mode='symlink'") - return "symlink" + return FileOperationMode.SYMLINK finally: - try: - destfile.unlink() - except FileNotFoundError: - pass + destfile.unlink(missing_ok=True) def organize( - paths, - dandiset_path=None, - invalid="fail", - files_mode="auto", - devel_debug=False, - update_external_file_paths=False, - media_files_mode=None, - required_fields=None, - jobs=None, -): + paths: Sequence[str], + dandiset_path: str | None = None, + invalid: OrganizeInvalid = OrganizeInvalid.FAIL, + files_mode: FileOperationMode = FileOperationMode.AUTO, + devel_debug: bool = False, + update_external_file_paths: bool = False, + media_files_mode: CopyMode | None = None, + required_fields: Sequence[str] | None = None, + jobs: int | None = None, +) -> None: in_place = False # If we deduce that we are organizing in-place jobs = jobs or -1 @@ -729,7 +763,7 @@ def organize( def dry_print(msg): print(f"DRY: {msg}") - if files_mode == "dry": + if files_mode is FileOperationMode.DRY: def act(func, *args, **kwargs): dry_print(f"{func.__name__} {args}, {kwargs}") @@ -740,7 +774,10 @@ def act(func, *args, **kwargs): lgr.debug("%s %s %s", func.__name__, args, kwargs) return func(*args, **kwargs) - if update_external_file_paths and files_mode not in ["copy", "move"]: + if update_external_file_paths and files_mode not in [ + FileOperationMode.COPY, + FileOperationMode.MOVE, + ]: raise ValueError( "--files-mode needs to be one of 'copy/move' for the rewrite option to work" ) @@ -756,7 +793,7 @@ def act(func, *args, **kwargs): del dandiset # Early checks to not wait to fail - if files_mode == "simulate": + if files_mode is FileOperationMode.SIMULATE: # in this mode we will demand the entire output folder to be absent if op.lexists(dandiset_path): # TODO: RF away @@ -765,6 +802,8 @@ def act(func, *args, **kwargs): % dandiset_path ) + from .pynwb_utils import ignore_benign_pynwb_warnings + ignore_benign_pynwb_warnings() if not paths: @@ -776,7 +815,7 @@ def act(func, *args, **kwargs): f"No dandiset was found at {dandiset_path}, and no " f"paths were provided" ) - if files_mode not in ("dry", "move"): + if files_mode not in (FileOperationMode.DRY, FileOperationMode.MOVE): raise ValueError( "Only 'dry' or 'move' mode could be used to operate in-place " "within a dandiset (no paths were provided)" @@ -844,17 +883,17 @@ def _get_metadata(path): ", ".join(m["path"] for m in skip_invalid), ) ) - if invalid == "fail": + if invalid is OrganizeInvalid.FAIL: raise ValueError(msg) - elif invalid == "warn": + elif invalid is OrganizeInvalid.WARN: lgr.warning(msg + " They will be skipped") else: - raise ValueError(f"invalid has an invalid value {invalid}") + raise AssertionError(f"Unhandled OrganizeInvalid member: {invalid!r}") if not op.lexists(dandiset_path): act(os.makedirs, dandiset_path) - if files_mode == "auto": + if files_mode is FileOperationMode.AUTO: files_mode = detect_link_type(link_test_file, dandiset_path) metadata = create_unique_filenames_from_metadata( @@ -885,13 +924,13 @@ def _get_metadata(path): ) if update_external_file_paths and media_files_mode is None: - media_files_mode = "symlink" + 
media_files_mode = CopyMode.SYMLINK lgr.warning( - "--media-files-mode not specified, setting to recommended mode: 'symlink' " + "--media-files-mode not specified, setting to recommended mode: 'symlink'" ) # look for multiple nwbfiles linking to one video: - if media_files_mode == "move": + if media_files_mode is CopyMode.MOVE: videos_list = [] for meta in metadata: for ext_ob in meta["external_file_objects"]: @@ -955,16 +994,18 @@ def _get_metadata(path): e_abs_path = op.abspath(e_path) if use_abs_paths: e_path = e_abs_path - elif files_mode == "symlink": # path should be relative to the target + elif ( + files_mode is FileOperationMode.SYMLINK + ): # path should be relative to the target e_path = op.relpath(e_abs_path, dandi_dirpath) if dandi_abs_fullpath == e_abs_path: lgr.debug("Skipping %s since the same in source/destination", e_path) skip_same.append(e) continue - elif files_mode == "symlink" and op.realpath(dandi_abs_fullpath) == op.realpath( - e_abs_path - ): + elif files_mode is FileOperationMode.SYMLINK and op.realpath( + dandi_abs_fullpath + ) == op.realpath(e_abs_path): lgr.debug( "Skipping %s since mode is symlink and both resolve to the same path", e_path, @@ -973,27 +1014,17 @@ def _get_metadata(path): continue if ( - files_mode == "dry" + files_mode is FileOperationMode.DRY ): # TODO: this is actually a files_mode on top of modes!!!? dry_print(f"{e_path} -> {dandi_path}") else: if not op.lexists(dandi_dirpath): os.makedirs(dandi_dirpath) - if files_mode == "simulate": + if files_mode is FileOperationMode.SIMULATE: os.symlink(e_path, dandi_fullpath) - continue - # - if files_mode == "symlink": - os.symlink(e_path, dandi_fullpath) - elif files_mode == "hardlink": - os.link(e_path, dandi_fullpath) - elif files_mode == "copy": - copy_file(e_path, dandi_fullpath) - elif files_mode == "move": - move_file(e_path, dandi_fullpath) else: - raise NotImplementedError(files_mode) - acted_upon.append(e) + files_mode.as_copy_mode().copy(e_path, dandi_fullpath) + acted_upon.append(e) if acted_upon and in_place: # We might need to cleanup a bit - e.g. 
prune empty directories left @@ -1009,7 +1040,10 @@ def _get_metadata(path): # create video file name and re write nwb file external files: if update_external_file_paths: + from .pynwb_utils import rename_nwb_external_files + rename_nwb_external_files(metadata, dandiset_path) + assert media_files_mode is not None organize_external_files(metadata, dandiset_path, media_files_mode) def msg_(msg, n, cond=None): diff --git a/dandi/tests/fixtures.py b/dandi/tests/fixtures.py index 6b3d85983..01612b826 100644 --- a/dandi/tests/fixtures.py +++ b/dandi/tests/fixtures.py @@ -660,9 +660,9 @@ def video_files(tmp_path: Path) -> list[tuple[Path, Path]]: video_paths = [] video_path = tmp_path / "video_files" video_path.mkdir() - for no in range(2): - movie_file1 = video_path / f"test1_{no}.avi" - movie_file2 = video_path / f"test2_{no}.avi" + for i in range(2): + movie_file1 = video_path / f"test1_{i}.avi" + movie_file2 = video_path / f"test2_{i}.avi" (nf, nx, ny) = (2, 2, 2) writer1 = cv2.VideoWriter( filename=str(movie_file1), @@ -687,25 +687,25 @@ def video_files(tmp_path: Path) -> list[tuple[Path, Path]]: return video_paths -def _create_nwb_files(video_list): +def _create_nwb_files(video_list: list[tuple[Path, Path]]) -> Path: base_path = video_list[0][0].parent.parent base_nwb_path = base_path / "nwbfiles" base_nwb_path.mkdir(parents=True, exist_ok=True) - for no, vid_loc in enumerate(video_list): + for i, vid_loc in enumerate(video_list): vid_1 = vid_loc[0] vid_2 = vid_loc[1] - subject_id = f"mouse{no}" - session_id = f"sessionid{no}" + subject_id = f"mouse{i}" + session_id = f"sessionid{i}" subject = Subject( subject_id=subject_id, species="Mus musculus", sex="M", description="lab mouse ", ) - device = Device(f"imaging_device_{no}") - name = f"{vid_1.stem}_{no}" + device = Device(f"imaging_device_{i}") + name = f"{vid_1.stem}_{i}" nwbfile = NWBFile( - f"{name}{no}", + f"{name}{i}", "desc: contains movie for dandi .mp4 storage as external", datetime.now(tzlocal()), experimenter="Experimenter name", @@ -715,7 +715,7 @@ def _create_nwb_files(video_list): ) image_series = ImageSeries( - name=f"MouseWhiskers{no}", + name=f"MouseWhiskers{i}", format="external", external_file=[str(vid_1), str(vid_2)], starting_frame=[0, 2], @@ -731,13 +731,13 @@ def _create_nwb_files(video_list): @pytest.fixture() -def nwbfiles_video_unique(video_files): +def nwbfiles_video_unique(video_files: list[tuple[Path, Path]]) -> Path: """Create nwbfiles linked with unique set of videos.""" return _create_nwb_files(video_files) @pytest.fixture() -def nwbfiles_video_common(video_files): +def nwbfiles_video_common(video_files: list[tuple[Path, Path]]) -> Path: """Create nwbfiles sharing video files.""" video_list = [video_files[0], video_files[0]] return _create_nwb_files(video_list) diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 9196ef9a9..492dae3d8 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -20,7 +20,15 @@ from .test_helpers import assert_dirtrees_eq from ..consts import DRAFT, dandiset_metadata_file from ..dandiarchive import DandisetURL -from ..download import Downloader, ProgressCombiner, PYOUTHelper, download +from ..download import ( + Downloader, + DownloadExisting, + DownloadFormat, + PathType, + ProgressCombiner, + PYOUTHelper, + download, +) from ..exceptions import NotFoundError from ..utils import list_paths @@ -57,16 +65,20 @@ def test_download_000027( # redownload - since already exist there should be an exception if we are # not using pyout with 
pytest.raises(FileExistsError): - download(url, tmp_path, format="debug") + download(url, tmp_path, format=DownloadFormat.DEBUG) assert "FileExistsError" not in capsys.readouterr().out # but no exception is raised, and rather it gets output to pyout otherwise download(url, tmp_path) assert "FileExistsError" in capsys.readouterr().out # TODO: somehow get that status report about what was downloaded and what not - download(url, tmp_path, existing="skip") # TODO: check that skipped - download(url, tmp_path, existing="overwrite") # TODO: check that redownloaded - download(url, tmp_path, existing="refresh") # TODO: check that skipped (the same) + download(url, tmp_path, existing=DownloadExisting.SKIP) # TODO: check that skipped + download( + url, tmp_path, existing=DownloadExisting.OVERWRITE + ) # TODO: check that redownloaded + download( + url, tmp_path, existing=DownloadExisting.REFRESH + ) # TODO: check that skipped (the same) @mark.skipif_no_network @@ -88,7 +100,7 @@ def test_download_000027_metadata_only(url: str, tmp_path: Path) -> None: @mark.skipif_no_network @pytest.mark.parametrize( "url", - [ # Should go through API + [ # Should go through AP "https://dandiarchive.org/dandiset/000027/0.210831.2033", # Drafts do not go through API ATM, but that should not be visible to user "https://dandiarchive.org/dandiset/000027/draft", @@ -211,7 +223,7 @@ def test_download_sync( download( f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, - existing="overwrite", + existing=DownloadExisting.OVERWRITE, sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") @@ -230,7 +242,7 @@ def test_download_sync_folder( download( f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}/subdir2/", text_dandiset.dspath, - existing="overwrite", + existing=DownloadExisting.OVERWRITE, sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") @@ -251,7 +263,7 @@ def test_download_sync_list( download( f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, - existing="overwrite", + existing=DownloadExisting.OVERWRITE, sync=True, ) assert not (dspath / "file.txt").exists() @@ -272,7 +284,7 @@ def test_download_sync_zarr( download( zarr_dandiset.dandiset.version_api_url, tmp_path, - existing="overwrite", + existing=DownloadExisting.OVERWRITE, sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") @@ -306,7 +318,7 @@ def test_download_metadata404(text_dandiset: SampleDandiset, tmp_path: Path) -> version_id=text_dandiset.dandiset.version_id, ), output_dir=tmp_path, - existing="error", + existing=DownloadExisting.ERROR, get_metadata=True, get_assets=True, jobs_per_zarr=None, @@ -344,7 +356,9 @@ def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) dd.mkdir() zarr.save(dd / "sample.zarr", np.eye(5)) download( - zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + zarr_dandiset.dandiset.version_api_url, + tmp_path, + existing=DownloadExisting.OVERWRITE_DIFFERENT, ) assert_dirtrees_eq( zarr_dandiset.dspath / "sample.zarr", @@ -367,7 +381,9 @@ def test_download_different_zarr_onto_excluded_dotfiles( (zarr_path / "arr_0").mkdir() (zarr_path / "arr_0" / ".gitmodules").touch() download( - zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + zarr_dandiset.dandiset.version_api_url, + tmp_path, + existing=DownloadExisting.OVERWRITE_DIFFERENT, ) assert list_paths(zarr_path, 
dirs=True, exclude_vcs=False) == [ zarr_path / ".dandi", @@ -398,7 +414,7 @@ def test_download_different_zarr_delete_dir( dd.mkdir(parents=True, exist_ok=True) zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) assert any(p.is_dir() for p in (dd / "sample.zarr").iterdir()) - download(d.version_api_url, tmp_path, existing="overwrite-different") + download(d.version_api_url, tmp_path, existing=DownloadExisting.OVERWRITE_DIFFERENT) assert_dirtrees_eq(dspath / "sample.zarr", dd / "sample.zarr") @@ -409,7 +425,9 @@ def test_download_zarr_to_nonzarr_path( dd.mkdir() (dd / "sample.zarr").write_text("This is not a Zarr.\n") download( - zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + zarr_dandiset.dandiset.version_api_url, + tmp_path, + existing=DownloadExisting.OVERWRITE_DIFFERENT, ) assert_dirtrees_eq( zarr_dandiset.dspath / "sample.zarr", @@ -426,7 +444,7 @@ def test_download_nonzarr_to_zarr_path( dd = tmp_path / d.identifier dd.mkdir(parents=True, exist_ok=True) zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) - download(d.version_api_url, tmp_path, existing="overwrite-different") + download(d.version_api_url, tmp_path, existing=DownloadExisting.OVERWRITE_DIFFERENT) assert (dd / "sample.zarr").is_file() assert (dd / "sample.zarr").read_text() == "This is not a Zarr.\n" @@ -805,7 +823,7 @@ def test_download_glob_option(text_dandiset: SampleDandiset, tmp_path: Path) -> download( f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/s*.Txt", tmp_path, - path_type="glob", + path_type=PathType.GLOB, ) assert list_paths(tmp_path, dirs=True) == [ tmp_path / "subdir1", @@ -842,7 +860,7 @@ def test_download_sync_glob( download( f"{text_dandiset.dandiset.version_api_url}assets/?glob=s*.Txt", text_dandiset.dspath, - existing="overwrite", + existing=DownloadExisting.OVERWRITE, sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") diff --git a/dandi/tests/test_move.py b/dandi/tests/test_move.py index 1dae78341..3f1eb5e0d 100644 --- a/dandi/tests/test_move.py +++ b/dandi/tests/test_move.py @@ -2,13 +2,14 @@ import logging from pathlib import Path +from typing import Any import pytest from .fixtures import SampleDandiset from ..dandiapi import RemoteAsset from ..exceptions import NotFoundError -from ..move import AssetMismatchError, move +from ..move import AssetMismatchError, MoveExisting, MoveWorkOn, move @pytest.fixture() @@ -35,14 +36,14 @@ def moving_dandiset(new_dandiset: SampleDandiset) -> SampleDandiset: def check_assets( sample_dandiset: SampleDandiset, starting_assets: list[RemoteAsset], - work_on: str, + work_on: MoveWorkOn, remapping: dict[str, str | None], ) -> None: for asset in starting_assets: if asset.path in remapping and remapping[asset.path] is None: # Asset was overwritten continue - if work_on in ("local", "both") and asset.path in remapping: + if work_on in (MoveWorkOn.LOCAL, MoveWorkOn.BOTH) and asset.path in remapping: assert not (sample_dandiset.dspath / asset.path).exists() remapped = remapping[asset.path] assert isinstance(remapped, str) @@ -51,7 +52,7 @@ def check_assets( assert ( sample_dandiset.dspath / asset.path ).read_text() == f"{asset.path}\n" - if work_on in ("remote", "both") and asset.path in remapping: + if work_on in (MoveWorkOn.REMOTE, MoveWorkOn.BOTH) and asset.path in remapping: remapped = remapping[asset.path] assert isinstance(remapped, str) with pytest.raises(NotFoundError): @@ -157,7 +158,9 @@ def check_assets( ), ], ) 
-@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move( monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, @@ -165,7 +168,7 @@ def test_move( dest: str, regex: bool, remapping: dict[str, str | None], - work_on: str, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -181,9 +184,13 @@ def test_move( check_assets(moving_dandiset, starting_assets, work_on, remapping) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_skip( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -193,7 +200,7 @@ def test_move_skip( "subdir4/foo.json", dest="subdir5", work_on=work_on, - existing="skip", + existing=MoveExisting.SKIP, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, ) @@ -202,13 +209,15 @@ def test_move_skip( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) -@pytest.mark.parametrize("kwargs", [{"existing": "error"}, {}]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) +@pytest.mark.parametrize("kwargs", [{"existing": MoveExisting.ERROR}, {}]) def test_move_error( monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, - work_on: str, - kwargs: dict[str, str], + work_on: MoveWorkOn, + kwargs: dict[str, Any], ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -224,14 +233,18 @@ def test_move_error( ) assert ( str(excinfo.value) == "Cannot move 'subdir4/foo.json' to 'subdir5/foo.json', as" - f" {'remote' if work_on == 'remote' else 'local'} destination already exists" + f" {'remote' if work_on is MoveWorkOn.REMOTE else 'local'} destination already exists" ) check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_overwrite( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -241,7 +254,7 @@ def test_move_overwrite( "subdir4/foo.json", dest="subdir5", work_on=work_on, - existing="overwrite", + existing=MoveExisting.OVERWRITE, devel_debug=True, dandi_instance=moving_dandiset.api.instance_id, ) @@ -266,11 +279,11 @@ def test_move_no_srcs( with pytest.raises(ValueError) as excinfo: move( dest="nowhere", - work_on="both", + work_on=MoveWorkOn.BOTH, dandi_instance=moving_dandiset.api.instance_id, ) assert str(excinfo.value) == "No source paths given" - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, MoveWorkOn.BOTH, {}) def test_move_regex_multisrcs( @@ -285,18 +298,22 @@ def test_move_regex_multisrcs( r"\.dat", dest=".blob", regex=True, - work_on="both", + work_on=MoveWorkOn.BOTH, 
dandi_instance=moving_dandiset.api.instance_id, ) assert ( str(excinfo.value) == "Cannot take multiple source paths when `regex` is True" ) - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, MoveWorkOn.BOTH, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_multisrcs_file_dest( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -316,9 +333,13 @@ def test_move_multisrcs_file_dest( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_folder_src_file_dest( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -334,9 +355,13 @@ def test_move_folder_src_file_dest( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_nonexistent_src( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -356,9 +381,13 @@ def test_move_nonexistent_src( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_file_slash_src( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -378,9 +407,13 @@ def test_move_file_slash_src( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_file_slash_dest( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -410,14 +443,14 @@ def test_move_regex_no_match( "no-match", dest="nowhere", regex=True, - work_on="both", + work_on=MoveWorkOn.BOTH, dandi_instance=moving_dandiset.api.instance_id, ) assert ( str(excinfo.value) == "Regular expression 'no-match' did not match any local paths" ) - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, 
MoveWorkOn.BOTH, {}) def test_move_regex_collision( @@ -431,7 +464,7 @@ def test_move_regex_collision( r"^\w+/foo\.json$", dest="data/data.json", regex=True, - work_on="both", + work_on=MoveWorkOn.BOTH, dandi_instance=moving_dandiset.api.instance_id, ) assert ( @@ -439,15 +472,17 @@ def test_move_regex_collision( == "Local assets 'subdir4/foo.json' and 'subdir5/foo.json' would both" " be moved to 'data/data.json'" ) - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, MoveWorkOn.BOTH, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_regex_some_to_self( caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, - work_on: str, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -461,7 +496,11 @@ def test_move_regex_some_to_self( devel_debug=True, ) for path in ["subdir3/red.dat", "subdir3/green.dat", "subdir3/blue.dat"]: - for where in ["local", "remote"] if work_on == "both" else [work_on]: + for where in ( + [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE] + if work_on is MoveWorkOn.BOTH + else [work_on] + ): assert ( "dandi", logging.DEBUG, @@ -479,9 +518,13 @@ def test_move_regex_some_to_self( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_from_subdir( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -505,9 +548,13 @@ def test_move_from_subdir( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_in_subdir( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -527,9 +574,13 @@ def test_move_in_subdir( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_from_subdir_abspaths( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -549,9 +600,13 @@ def test_move_from_subdir_abspaths( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_from_subdir_as_dot( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = 
list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -568,9 +623,13 @@ def test_move_from_subdir_as_dot( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_from_subdir_regex( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -591,12 +650,14 @@ def test_move_from_subdir_regex( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_from_subdir_regex_no_changes( caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, - work_on: str, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath / "subdir1") @@ -613,12 +674,14 @@ def test_move_from_subdir_regex_no_changes( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_dandiset_path( monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, tmp_path: Path, - work_on: str, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(tmp_path) @@ -643,12 +706,12 @@ def test_move_dandiset_path( ) -@pytest.mark.parametrize("work_on", ["auto", "remote"]) +@pytest.mark.parametrize("work_on", [MoveWorkOn.AUTO, MoveWorkOn.REMOTE]) def test_move_dandiset_url( monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, tmp_path: Path, - work_on: str, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(tmp_path) @@ -664,7 +727,7 @@ def test_move_dandiset_url( check_assets( moving_dandiset, starting_assets, - "remote", + MoveWorkOn.REMOTE, { "file.txt": "subdir1/file.txt", "subdir2/banana.txt": "subdir1/banana.txt", @@ -682,14 +745,14 @@ def test_move_work_on_auto( "file.txt", "subdir2/banana.txt", dest="subdir1", - work_on="auto", + work_on=MoveWorkOn.AUTO, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, ) check_assets( moving_dandiset, starting_assets, - "both", + MoveWorkOn.BOTH, { "file.txt": "subdir1/file.txt", "subdir2/banana.txt": "subdir1/banana.txt", @@ -697,9 +760,11 @@ def test_move_work_on_auto( ) -@pytest.mark.parametrize("work_on", ["auto", "both", "local", "remote"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.AUTO, MoveWorkOn.BOTH, MoveWorkOn.LOCAL, MoveWorkOn.REMOTE] +) def test_move_not_dandiset( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path, work_on: str + monkeypatch: pytest.MonkeyPatch, tmp_path: Path, work_on: MoveWorkOn ) -> None: monkeypatch.chdir(tmp_path) with pytest.raises(ValueError) as excinfo: @@ -718,13 +783,13 @@ def test_move_local_delete_empty_dirs( "../subdir2/banana.txt", "foo.json", dest="../subdir3", - work_on="local", + work_on=MoveWorkOn.LOCAL, devel_debug=True, ) check_assets( moving_dandiset, starting_assets, - "local", + MoveWorkOn.LOCAL, { 
"subdir1/apple.txt": "subdir3/apple.txt", "subdir2/banana.txt": "subdir3/banana.txt", @@ -747,7 +812,7 @@ def test_move_both_src_path_not_in_local( move( "subdir2", dest="subdir3", - work_on="both", + work_on=MoveWorkOn.BOTH, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, ) @@ -756,7 +821,9 @@ def test_move_both_src_path_not_in_local( "- Asset 'subdir2/banana.txt' only exists remotely\n" "- Asset 'subdir2/coconut.txt' only exists remotely" ) - check_assets(moving_dandiset, starting_assets, "both", {"subdir2/banana.txt": None}) + check_assets( + moving_dandiset, starting_assets, MoveWorkOn.BOTH, {"subdir2/banana.txt": None} + ) def test_move_both_src_path_not_in_remote( @@ -770,7 +837,7 @@ def test_move_both_src_path_not_in_remote( move( "subdir2", dest="subdir3", - work_on="both", + work_on=MoveWorkOn.BOTH, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, ) @@ -778,12 +845,14 @@ def test_move_both_src_path_not_in_remote( str(excinfo.value) == "Mismatch between local and remote Dandisets:\n" "- Asset 'subdir2/mango.txt' only exists locally" ) - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, MoveWorkOn.BOTH, {}) -@pytest.mark.parametrize("existing", ["skip", "overwrite"]) +@pytest.mark.parametrize("existing", [MoveExisting.SKIP, MoveExisting.OVERWRITE]) def test_move_both_dest_path_not_in_remote( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, existing: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + existing: MoveExisting, ) -> None: (moving_dandiset.dspath / "subdir2" / "file.txt").write_text("This is a file.\n") starting_assets = list(moving_dandiset.dandiset.get_assets()) @@ -793,7 +862,7 @@ def test_move_both_dest_path_not_in_remote( move( "file.txt", dest="subdir2", - work_on="both", + work_on=MoveWorkOn.BOTH, existing=existing, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, @@ -803,12 +872,14 @@ def test_move_both_dest_path_not_in_remote( "- Asset 'file.txt' would be moved to 'subdir2/file.txt', which exists" " locally but not remotely" ) - check_assets(moving_dandiset, starting_assets, "both", {}) + check_assets(moving_dandiset, starting_assets, MoveWorkOn.BOTH, {}) -@pytest.mark.parametrize("existing", ["skip", "overwrite"]) +@pytest.mark.parametrize("existing", [MoveExisting.SKIP, MoveExisting.OVERWRITE]) def test_move_both_dest_path_not_in_local( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, existing: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + existing: MoveExisting, ) -> None: (moving_dandiset.dspath / "subdir2" / "banana.txt").unlink() starting_assets = list(moving_dandiset.dandiset.get_assets()) @@ -818,7 +889,7 @@ def test_move_both_dest_path_not_in_local( move( "file.txt", dest="subdir2/banana.txt", - work_on="both", + work_on=MoveWorkOn.BOTH, existing=existing, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, @@ -829,7 +900,9 @@ def test_move_both_dest_path_not_in_local( " would be moved to 'subdir2/banana.txt', which exists remotely but" " not locally" ) - check_assets(moving_dandiset, starting_assets, "both", {"subdir2/banana.txt": None}) + check_assets( + moving_dandiset, starting_assets, MoveWorkOn.BOTH, {"subdir2/banana.txt": None} + ) def test_move_both_dest_mismatch( @@ -845,8 +918,8 @@ def test_move_both_dest_mismatch( move( "file.txt", dest="subdir1/apple.txt", - work_on="both", - existing="overwrite", + work_on=MoveWorkOn.BOTH, + 
existing=MoveExisting.OVERWRITE, dandi_instance=moving_dandiset.api.instance_id, devel_debug=True, ) @@ -855,12 +928,18 @@ def test_move_both_dest_mismatch( "- Asset 'file.txt' would be moved to 'subdir1/apple.txt/file.txt'" " locally but to 'subdir1/apple.txt' remotely" ) - check_assets(moving_dandiset, starting_assets, "both", {"subdir1/apple.txt": None}) + check_assets( + moving_dandiset, starting_assets, MoveWorkOn.BOTH, {"subdir1/apple.txt": None} + ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_pyout( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -870,7 +949,7 @@ def test_move_pyout( "subdir4/foo.json", dest="subdir5", work_on=work_on, - existing="overwrite", + existing=MoveExisting.OVERWRITE, devel_debug=False, dandi_instance=moving_dandiset.api.instance_id, ) @@ -886,9 +965,13 @@ def test_move_pyout( ) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_pyout_dry_run( - monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, work_on: str + monkeypatch: pytest.MonkeyPatch, + moving_dandiset: SampleDandiset, + work_on: MoveWorkOn, ) -> None: starting_assets = list(moving_dandiset.dandiset.get_assets()) monkeypatch.chdir(moving_dandiset.dspath) @@ -898,7 +981,7 @@ def test_move_pyout_dry_run( "subdir4/foo.json", dest="subdir5", work_on=work_on, - existing="overwrite", + existing=MoveExisting.OVERWRITE, devel_debug=False, dry_run=True, dandi_instance=moving_dandiset.api.instance_id, @@ -906,12 +989,14 @@ def test_move_pyout_dry_run( check_assets(moving_dandiset, starting_assets, work_on, {}) -@pytest.mark.parametrize("work_on", ["local", "remote", "both"]) +@pytest.mark.parametrize( + "work_on", [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE, MoveWorkOn.BOTH] +) def test_move_path_to_self( caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch, moving_dandiset: SampleDandiset, - work_on: str, + work_on: MoveWorkOn, ) -> None: (moving_dandiset.dspath / "newdir").mkdir() starting_assets = list(moving_dandiset.dandiset.get_assets()) @@ -924,7 +1009,11 @@ def test_move_path_to_self( devel_debug=True, dandi_instance=moving_dandiset.api.instance_id, ) - for where in ["local", "remote"] if work_on == "both" else [work_on]: + for where in ( + [MoveWorkOn.LOCAL, MoveWorkOn.REMOTE] + if work_on is MoveWorkOn.BOTH + else [work_on] + ): assert ( "dandi", logging.DEBUG, @@ -944,11 +1033,13 @@ def test_move_remote_dest_is_local_dir_sans_slash( move( "file.txt", dest="newdir", - work_on="remote", + work_on=MoveWorkOn.REMOTE, devel_debug=True, dandi_instance=moving_dandiset.api.instance_id, ) - check_assets(moving_dandiset, starting_assets, "remote", {"file.txt": "newdir"}) + check_assets( + moving_dandiset, starting_assets, MoveWorkOn.REMOTE, {"file.txt": "newdir"} + ) def test_move_both_dest_is_local_dir_sans_slash( @@ -961,10 +1052,13 @@ def test_move_both_dest_is_local_dir_sans_slash( move( "file.txt", dest="newdir", - work_on="both", + work_on=MoveWorkOn.BOTH, devel_debug=True, dandi_instance=moving_dandiset.api.instance_id, ) check_assets( - moving_dandiset, starting_assets, "both", 
{"file.txt": "newdir/file.txt"} + moving_dandiset, + starting_assets, + MoveWorkOn.BOTH, + {"file.txt": "newdir/file.txt"}, ) diff --git a/dandi/tests/test_organize.py b/dandi/tests/test_organize.py index 8556d953d..495db55a0 100644 --- a/dandi/tests/test_organize.py +++ b/dandi/tests/test_organize.py @@ -11,8 +11,10 @@ import ruamel.yaml from ..cli.cmd_organize import organize -from ..consts import dandiset_metadata_file, file_operation_modes +from ..consts import dandiset_metadata_file from ..organize import ( + CopyMode, + FileOperationMode, _sanitize_value, create_dataset_yml_template, create_unique_filenames_from_metadata, @@ -104,8 +106,8 @@ def c() -> Any: # shortcut # do not test 'move' - would need a dedicated handling since it would # really move data away and break testing of other modes -no_move_modes = file_operation_modes[:] -no_move_modes.remove("move") +no_move_modes: list[FileOperationMode | str] = list(FileOperationMode) +no_move_modes.remove(FileOperationMode.MOVE) if not on_windows: # github workflows start whining about cross-drives links no_move_modes.append("symlink-relative") @@ -115,7 +117,7 @@ def c() -> Any: # shortcut @pytest.mark.parametrize("mode", no_move_modes) @pytest.mark.parametrize("jobs", (1, -1)) def test_organize_nwb_test_data( - nwb_test_data: Path, tmp_path: Path, mode: str, jobs: int + nwb_test_data: Path, tmp_path: Path, mode: FileOperationMode | str, jobs: int ) -> None: outdir = tmp_path / "organized" @@ -123,7 +125,7 @@ def test_organize_nwb_test_data( if mode == "symlink-relative": # Force relative paths, as if e.g. user did provide relative = True - mode = "symlink" + mode = FileOperationMode.SYMLINK # all paths will be relative to the curdir, which should cause # organize also organize using relative paths in case of 'symlink' # mode @@ -131,6 +133,8 @@ def test_organize_nwb_test_data( nwb_test_data = Path(op.relpath(nwb_test_data, cwd)) outdir = Path(op.relpath(outdir, cwd)) + assert isinstance(mode, FileOperationMode) + src = tmp_path / "src" src.touch() dest = tmp_path / "dest" @@ -140,10 +144,7 @@ def test_organize_nwb_test_data( symlinks_work = False else: symlinks_work = True - try: - dest.unlink() - except FileNotFoundError: - pass + dest.unlink(missing_ok=True) try: os.link(src, dest) except OSError: @@ -151,9 +152,12 @@ def test_organize_nwb_test_data( else: hardlinks_work = True - if mode in ("simulate", "symlink") and not symlinks_work: + if ( + mode in (FileOperationMode.SIMULATE, FileOperationMode.SYMLINK) + and not symlinks_work + ): pytest.skip("Symlinks not supported") - elif mode == "hardlink" and not hardlinks_work: + elif mode is FileOperationMode.HARDLINK and not hardlinks_work: pytest.skip("Hard links not supported") input_files = nwb_test_data / "v2.0.1" @@ -162,7 +166,7 @@ def test_organize_nwb_test_data( "-d", str(outdir), "--files-mode", - mode, + str(mode), str(input_files), "--jobs", str(jobs), @@ -182,7 +186,7 @@ def test_organize_nwb_test_data( produced_paths = sorted(find_files(".*", paths=outdir)) produced_nwb_paths = sorted(find_files(r"\.nwb\Z", paths=outdir)) produced_relpaths = [op.relpath(p, outdir) for p in produced_paths] - if mode == "dry": + if mode is FileOperationMode.DRY: assert produced_relpaths == [] else: assert produced_relpaths == [ @@ -192,9 +196,11 @@ def test_organize_nwb_test_data( # symlinks) assert all(map(op.exists, produced_paths)) - if mode == "simulate": + if mode is FileOperationMode.SIMULATE: assert all((op.isabs(p) != relative) for p in produced_paths) - elif mode == "symlink" or 
(mode == "auto" and symlinks_work): + elif mode is FileOperationMode.SYMLINK or ( + mode is FileOperationMode.AUTO and symlinks_work + ): assert all(op.islink(p) for p in produced_nwb_paths) else: assert not any(op.islink(p) for p in produced_paths) @@ -295,16 +301,18 @@ def error_link(src: Any, dest: Any) -> NoReturn: assert detect_link_type(p, tmp_path) == result -@pytest.mark.parametrize("mode", ["copy", "move"]) -@pytest.mark.parametrize("video_mode", ["copy", "move", "symlink", "hardlink"]) -def test_video_organize(video_mode, mode, nwbfiles_video_unique): +@pytest.mark.parametrize("mode", [FileOperationMode.COPY, FileOperationMode.MOVE]) +@pytest.mark.parametrize("video_mode", list(CopyMode)) +def test_video_organize( + video_mode: CopyMode, mode: FileOperationMode, nwbfiles_video_unique: Path +) -> None: dandi_organize_path = nwbfiles_video_unique.parent / "dandi_organized" cmd = [ "--files-mode", - mode, + str(mode), "--update-external-file-paths", "--media-files-mode", - video_mode, + str(video_mode), "-d", str(dandi_organize_path), str(nwbfiles_video_unique), @@ -334,26 +342,23 @@ def test_video_organize(video_mode, mode, nwbfiles_video_unique): assert len(video_files_list) == len(video_files_organized) -@pytest.mark.parametrize("video_mode", ["copy", "move"]) -def test_video_organize_common(video_mode, nwbfiles_video_common): +@pytest.mark.parametrize("video_mode,rc", [(CopyMode.COPY, 0), (CopyMode.MOVE, 1)]) +def test_video_organize_common( + video_mode: CopyMode, rc: int, nwbfiles_video_common: Path +) -> None: dandi_organize_path = nwbfiles_video_common.parent / "dandi_organized" cmd = [ "--files-mode", "move", "--update-external-file-paths", "--media-files-mode", - video_mode, + str(video_mode), "-d", str(dandi_organize_path), str(nwbfiles_video_common), ] r = CliRunner().invoke(organize, cmd) - if video_mode == "move": - assert r.exit_code == 1 - print(r.exception) - else: - assert r.exit_code == 0 - print(r.stdout) + assert r.exit_code == rc @pytest.mark.parametrize( diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 9326b3ad4..2ba5bf35f 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -20,6 +20,7 @@ from ..exceptions import NotFoundError, UploadError from ..files import LocalFileAsset from ..pynwb_utils import make_nwb_file +from ..upload import UploadExisting, UploadValidation from ..utils import list_paths, yaml_dump @@ -66,7 +67,7 @@ def test_upload_extant_existing( ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") with pytest.raises(FileExistsError): - text_dandiset.upload(existing="error") + text_dandiset.upload(existing=UploadExisting.ERROR) iter_upload_spy.assert_not_called() @@ -74,22 +75,25 @@ def test_upload_extant_skip( mocker: MockerFixture, text_dandiset: SampleDandiset ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset.upload(existing="skip") + text_dandiset.upload(existing=UploadExisting.SKIP) iter_upload_spy.assert_not_called() -@pytest.mark.parametrize("existing", ["overwrite", "refresh"]) +@pytest.mark.parametrize("existing", [UploadExisting.OVERWRITE, UploadExisting.REFRESH]) def test_upload_extant_eq_overwrite( - existing: str, mocker: MockerFixture, text_dandiset: SampleDandiset + existing: UploadExisting, mocker: MockerFixture, text_dandiset: SampleDandiset ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset.upload(existing=existing) iter_upload_spy.assert_not_called() -@pytest.mark.parametrize("existing", 
["overwrite", "refresh"]) +@pytest.mark.parametrize("existing", [UploadExisting.OVERWRITE, UploadExisting.REFRESH]) def test_upload_extant_neq_overwrite( - existing: str, mocker: MockerFixture, text_dandiset: SampleDandiset, tmp_path: Path + existing: UploadExisting, + mocker: MockerFixture, + text_dandiset: SampleDandiset, + tmp_path: Path, ) -> None: (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") @@ -107,7 +111,7 @@ def test_upload_extant_old_refresh( (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") os.utime(text_dandiset.dspath / "file.txt", times=(0, 0)) iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset.upload(existing="refresh") + text_dandiset.upload(existing=UploadExisting.REFRESH) iter_upload_spy.assert_not_called() @@ -115,19 +119,10 @@ def test_upload_extant_force( mocker: MockerFixture, text_dandiset: SampleDandiset ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset.upload(existing="force") + text_dandiset.upload(existing=UploadExisting.FORCE) iter_upload_spy.assert_called() -def test_upload_extant_bad_existing( - mocker: MockerFixture, text_dandiset: SampleDandiset -) -> None: - iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - with pytest.raises(ValueError): - text_dandiset.upload(existing="foobar") - iter_upload_spy.assert_not_called() - - @pytest.mark.parametrize( "contents", [ @@ -189,10 +184,12 @@ def test_upload_bids_invalid( ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") with pytest.raises(UploadError): - bids_dandiset_invalid.upload(existing="force") + bids_dandiset_invalid.upload(existing=UploadExisting.FORCE) iter_upload_spy.assert_not_called() # Does validation ignoring work? 
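        # Note on the switch from string literals to enum members in these
        # upload() calls: UploadExisting and UploadValidation subclass both
        # str and Enum, so members still compare equal to the old literals
        # (UploadValidation.IGNORE == "ignore" is True) and str() returns the
        # bare value via the __str__ override. Only the spelling at the call
        # sites changes, not the runtime behaviour.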
- bids_dandiset_invalid.upload(existing="force", validation="ignore") + bids_dandiset_invalid.upload( + existing=UploadExisting.FORCE, validation=UploadValidation.IGNORE + ) iter_upload_spy.assert_called() # Check existence of assets: dandiset = bids_dandiset_invalid.dandiset @@ -203,7 +200,9 @@ def test_upload_bids_validation_ignore( mocker: MockerFixture, bids_dandiset: SampleDandiset ) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - bids_dandiset.upload(existing="force", validation="ignore") + bids_dandiset.upload( + existing=UploadExisting.FORCE, validation=UploadValidation.IGNORE + ) # Check whether upload was run iter_upload_spy.assert_called() # Check existence of assets: @@ -219,7 +218,7 @@ def test_upload_bids_validation_ignore( def test_upload_bids_metadata( mocker: MockerFixture, bids_dandiset: SampleDandiset ) -> None: - bids_dandiset.upload(existing="force") + bids_dandiset.upload(existing=UploadExisting.FORCE) dandiset = bids_dandiset.dandiset # Automatically check all files, heuristic should remain very BIDS-stable for asset in dandiset.get_assets(order="path"): @@ -233,7 +232,7 @@ def test_upload_bids_metadata( def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> None: iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - bids_dandiset.upload(existing="force") + bids_dandiset.upload(existing=UploadExisting.FORCE) # Check whether upload was run iter_upload_spy.assert_called() # Check existence of assets: @@ -352,14 +351,14 @@ def test_upload_different_zarr_entry_conflicts( (zf / "changed-size.txt").write_text("This is a test.\n") (zf / "changed-type").mkdir() (zf / "changed-type" / "file.txt").write_text("This is test text.\n") - new_dandiset.upload(validation="skip") + new_dandiset.upload(validation=UploadValidation.SKIP) rmtree(zf) zf.mkdir() (zf / "unchanged.txt").write_text("This is will not change.\n") (zf / "changed-contents.txt").write_text("This is text version #2.\n") (zf / "changed-size.txt").write_text("This is a test of the upload code.\n") (zf / "changed-type").write_text("This is now a file.\n") - new_dandiset.upload(validation="skip") + new_dandiset.upload(validation=UploadValidation.SKIP) download(new_dandiset.dandiset.version_api_url, tmp_path) assert_dirtrees_eq(zf, tmp_path / new_dandiset.dandiset_id / "sample.zarr") @@ -370,12 +369,12 @@ def test_upload_different_zarr_file_to_parent_dir( zf = new_dandiset.dspath / "sample.zarr" zf.mkdir() (zf / "foo").write_text("This is a file.\n") - new_dandiset.upload(validation="skip") + new_dandiset.upload(validation=UploadValidation.SKIP) rmtree(zf) zf.mkdir() (zf / "foo").mkdir() (zf / "foo" / "bar").write_text("This is under what used to be a file.\n") - new_dandiset.upload(validation="skip") + new_dandiset.upload(validation=UploadValidation.SKIP) download(new_dandiset.dandiset.version_api_url, tmp_path) assert_dirtrees_eq(zf, tmp_path / new_dandiset.dandiset_id / "sample.zarr") diff --git a/dandi/upload.py b/dandi/upload.py index bf2e69cd0..bf9eea14b 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -1,8 +1,9 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Iterator, Sequence from contextlib import ExitStack +from enum import Enum from functools import reduce import os.path from pathlib import Path @@ -40,10 +41,30 @@ class Uploaded(TypedDict): errors: list[str] +class UploadExisting(str, Enum): + ERROR = "error" + SKIP = "skip" + FORCE = "force" + OVERWRITE = 
"overwrite" + REFRESH = "refresh" + + def __str__(self) -> str: + return self.value + + +class UploadValidation(str, Enum): + REQUIRE = "require" + SKIP = "skip" + IGNORE = "ignore" + + def __str__(self) -> str: + return self.value + + def upload( - paths: list[str | Path] | None = None, - existing: str = "refresh", - validation: str = "require", + paths: Sequence[str | Path] | None = None, + existing: UploadExisting = UploadExisting.REFRESH, + validation: UploadValidation = UploadValidation.REQUIRE, dandi_instance: str | DandiInstance = "dandi", allow_any_path: bool = False, upload_dandiset_metadata: bool = False, @@ -165,7 +186,10 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: # Validate first, so we do not bother server at all if not kosher # # TODO: enable back validation of dandiset.yaml - if isinstance(dfile, LocalAsset) and validation != "skip": + if ( + isinstance(dfile, LocalAsset) + and validation != UploadValidation.SKIP + ): yield {"status": "pre-validating"} validation_statuses = dfile.get_validation_errors() validation_errors = [ @@ -174,7 +198,7 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: yield {"errors": len(validation_errors)} # TODO: split for dandi, pynwb errors if validation_errors: - if validation == "require": + if validation is UploadValidation.REQUIRE: lgr.warning( "%r had %d validation errors preventing its upload:", strpath, @@ -387,7 +411,7 @@ def upload_agg(*ignored: Any) -> str: def check_replace_asset( local_asset: LocalAsset, remote_asset: RemoteAsset, - existing: str, + existing: UploadExisting, local_etag: Digest | None, ) -> tuple[bool, dict[str, str]]: # Returns a (replace asset, message to yield) tuple @@ -414,24 +438,24 @@ def check_replace_asset( remote_mtime = None remote_file_status = "no mtime" exists_msg = f"exists ({remote_file_status})" - if existing == "error": + if existing is UploadExisting.ERROR: # as promised -- not gentle at all! raise FileExistsError(exists_msg) - if existing == "skip": + if existing is UploadExisting.SKIP: return (False, skip_file(exists_msg)) # Logic below only for overwrite and reupload - if existing == "overwrite": + if existing is UploadExisting.OVERWRITE: if remote_etag == local_etag.value: return (False, skip_file(exists_msg)) - elif existing == "refresh": + elif existing is UploadExisting.REFRESH: if remote_etag == local_etag.value: return (False, skip_file("file exists")) elif remote_mtime is not None and remote_mtime >= local_mtime: return (False, skip_file(exists_msg)) - elif existing == "force": + elif existing is UploadExisting.FORCE: pass else: - raise ValueError(f"invalid value for 'existing': {existing!r}") + raise AssertionError(f"Unhandled UploadExisting member: {existing!r}") return (True, {"message": f"{exists_msg} - reuploading"}) diff --git a/dandi/utils.py b/dandi/utils.py index 6a6c2317c..042554c13 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -255,7 +255,7 @@ def load_jsonl(filename: AnyPath) -> list: def find_files( regex: str, - paths: list[AnyPath] | tuple[AnyPath, ...] 
| set[AnyPath] | AnyPath = os.curdir, + paths: AnyPath | Iterable[AnyPath] = os.curdir, exclude: str | None = None, exclude_dotfiles: bool = True, exclude_dotdirs: bool = True, @@ -302,7 +302,7 @@ def exclude_path(path: str) -> bool: def good_file(path: str) -> bool: return bool(re.search(regex, path)) and not exclude_path(path) - if isinstance(paths, (list, tuple, set)): + if not isinstance(paths, (str, Path)): for path in paths: if op.isdir(path): yield from find_files( diff --git a/setup.cfg b/setup.cfg index c987a034e..673c846c8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ project_urls = python_requires = >=3.8 install_requires = bidsschematools ~= 0.7.0 - click + click >= 7.1 click-didyoumean dandischema ~= 0.8.0 etelemetry >= 0.2.2 From b92c471398363aa441bc2e0b5d190e91881b4718 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Nov 2023 16:40:51 -0500 Subject: [PATCH 2/4] Split up metadata.py So that common functionality can be imported without failing the "no heavy imports" test --- dandi/cli/cmd_service_scripts.py | 2 +- dandi/files/bases.py | 4 +- dandi/files/bids.py | 2 +- dandi/files/zarr.py | 2 +- dandi/metadata/__init__.py | 0 dandi/metadata/core.py | 76 ++++++++ dandi/metadata/nwb.py | 161 +++++++++++++++++ dandi/{metadata.py => metadata/util.py} | 221 +----------------------- dandi/organize.py | 3 +- dandi/tests/test_metadata.py | 7 +- 10 files changed, 250 insertions(+), 228 deletions(-) create mode 100644 dandi/metadata/__init__.py create mode 100644 dandi/metadata/core.py create mode 100644 dandi/metadata/nwb.py rename dandi/{metadata.py => metadata/util.py} (77%) diff --git a/dandi/cli/cmd_service_scripts.py b/dandi/cli/cmd_service_scripts.py index e0b2da107..e8a2c4b11 100644 --- a/dandi/cli/cmd_service_scripts.py +++ b/dandi/cli/cmd_service_scripts.py @@ -47,7 +47,7 @@ def reextract_metadata(url: str, diff: bool, when: str) -> None: Running this command requires the fsspec library to be installed with the `http` extra (e.g., `pip install "fsspec[http]"`). 
""" - from ..metadata import nwb2asset # Avoid heavy import at top level + from ..metadata.nwb import nwb2asset # Avoid heavy import at top level parsed_url = parse_dandi_url(url) if parsed_url.dandiset_id is None: diff --git a/dandi/files/bases.py b/dandi/files/bases.py index efd32606d..48eca8dae 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -25,7 +25,7 @@ import dandi from dandi.dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient -from dandi.metadata import get_default_metadata, nwb2asset +from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, P from dandi.support.digests import get_dandietag, get_digest from dandi.utils import yaml_load @@ -466,6 +466,8 @@ def get_metadata( digest: Digest | None = None, ignore_errors: bool = True, ) -> BareAsset: + from dandi.metadata.nwb import nwb2asset + try: metadata = nwb2asset(self.filepath, digest=digest) except Exception as e: diff --git a/dandi/files/bids.py b/dandi/files/bids.py index 6f19345df..0ab0784f0 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -13,7 +13,7 @@ from .bases import GenericAsset, LocalFileAsset, NWBAsset from .zarr import ZarrAsset from ..consts import ZARR_MIME_TYPE -from ..metadata import add_common_metadata, prepare_metadata +from ..metadata.core import add_common_metadata, prepare_metadata from ..misctypes import Digest from ..validate_types import ValidationResult diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index b675a489c..7acecec42 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -32,7 +32,7 @@ RemoteZarrEntry, RESTFullAPIClient, ) -from dandi.metadata import get_default_metadata +from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ZARR_CHECKSUM, BasePath, Digest from dandi.support.digests import get_digest, get_zarr_checksum, md5file_nocache from dandi.utils import chunked, exclude_from_zarr, pluralize diff --git a/dandi/metadata/__init__.py b/dandi/metadata/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dandi/metadata/core.py b/dandi/metadata/core.py new file mode 100644 index 000000000..3e97dbc24 --- /dev/null +++ b/dandi/metadata/core.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +import re + +from dandischema import models +from pydantic import ByteSize, parse_obj_as + +from .util import extract_model, get_generator +from .. 
import get_logger +from ..misctypes import Digest, LocalReadableFile, Readable +from ..utils import get_mime_type, get_utcnow_datetime + +lgr = get_logger() + + +def get_default_metadata( + path: str | Path | Readable, digest: Digest | None = None +) -> models.BareAsset: + metadata = models.BareAsset.unvalidated() + start_time = end_time = datetime.now().astimezone() + add_common_metadata(metadata, path, start_time, end_time, digest) + return metadata + + +def add_common_metadata( + metadata: models.BareAsset, + path: str | Path | Readable, + start_time: datetime, + end_time: datetime, + digest: Digest | None = None, +) -> None: + """ + Update a `dict` of raw "schemadata" with the fields that are common to both + NWB assets and non-NWB assets + """ + if digest is not None: + metadata.digest = digest.asdict() + else: + metadata.digest = {} + metadata.dateModified = get_utcnow_datetime() + if isinstance(path, Readable): + r = path + else: + r = LocalReadableFile(path) + mtime = r.get_mtime() + if mtime is not None: + metadata.blobDateModified = mtime + if mtime > metadata.dateModified: + lgr.warning("mtime %s of %s is in the future", mtime, r) + size = r.get_size() + if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum: + m = re.fullmatch( + r"(?P[0-9a-f]{32})-(?P[0-9]+)--(?P[0-9]+)", digest.value + ) + if m: + size = int(m["size"]) + metadata.contentSize = parse_obj_as(ByteSize, size) + if metadata.wasGeneratedBy is None: + metadata.wasGeneratedBy = [] + metadata.wasGeneratedBy.append(get_generator(start_time, end_time)) + metadata.encodingFormat = get_mime_type(r.get_filename()) + + +def prepare_metadata(metadata: dict) -> models.BareAsset: + """ + Convert "flatdata" [1]_ for an asset into "schemadata" [2]_ as a + `BareAsset` + + .. [1] a flat `dict` mapping strings to strings & other primitive types; + returned by `get_metadata()` + + .. [2] metadata in the form used by the ``dandischema`` library + """ + return extract_model(models.BareAsset, metadata) diff --git a/dandi/metadata/nwb.py b/dandi/metadata/nwb.py new file mode 100644 index 000000000..fcdd2f04d --- /dev/null +++ b/dandi/metadata/nwb.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from datetime import datetime +import os.path +from pathlib import Path +import re +from typing import Any + +from dandischema import models + +from .core import add_common_metadata, prepare_metadata +from .util import process_ndtypes +from .. 
import get_logger +from ..consts import metadata_all_fields +from ..misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, Readable +from ..pynwb_utils import ( + _get_pynwb_metadata, + get_neurodata_types, + get_nwb_version, + ignore_benign_pynwb_warnings, + metadata_cache, + nwb_has_external_links, +) +from ..utils import find_parent_directory_containing + +lgr = get_logger() + + +# Disable this for clean hacking +@metadata_cache.memoize_path +def get_metadata( + path: str | Path | Readable, digest: Digest | None = None +) -> dict | None: + """ + Get "flatdata" from a .nwb file + + Parameters + ---------- + path: str, Path, or Readable + + Returns + ------- + dict + """ + + from .files import bids, dandi_file, find_bids_dataset_description + + # when we run in parallel, these annoying warnings appear + ignore_benign_pynwb_warnings() + + if isinstance(path, Readable): + r = path + else: + r = LocalReadableFile(os.path.abspath(path)) + + meta: dict[str, Any] = {} + + if isinstance(r, LocalReadableFile): + # Is the data BIDS (as defined by the presence of a BIDS dataset descriptor) + bids_dataset_description = find_bids_dataset_description(r.filepath) + if bids_dataset_description: + dandiset_path = find_parent_directory_containing( + "dandiset.yaml", r.filepath + ) + df = dandi_file( + r.filepath, + dandiset_path, + bids_dataset_description=bids_dataset_description, + ) + assert isinstance(df, bids.BIDSAsset) + if not digest: + digest = DUMMY_DANDI_ETAG + path_metadata = df.get_metadata(digest=digest) + meta["bids_version"] = df.get_validation_bids_version() + # there might be a more elegant way to do this: + if path_metadata.wasAttributedTo is not None: + attributed = path_metadata.wasAttributedTo[0] + for key in metadata_all_fields: + try: + value = getattr(attributed, key) + except AttributeError: + pass + else: + meta[key] = value + + if r.get_filename().endswith((".NWB", ".nwb")): + if nwb_has_external_links(r): + raise NotImplementedError( + f"NWB files with external links are not supported: {r}" + ) + + # First read out possibly available versions of specifications for NWB(:N) + meta["nwb_version"] = get_nwb_version(r) + + # PyNWB might fail to load because of missing extensions. + # There is a new initiative of establishing registry of such extensions. + # Not yet sure if PyNWB is going to provide "native" support for needed + # functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143 + # So meanwhile, hard-coded workaround for data types we care about + ndtypes_registry = { + "AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb", + "ndx-labmetadata-abf": "ndx_dandi_icephys", + } + tried_imports = set() + while True: + try: + meta.update(_get_pynwb_metadata(r)) + break + except KeyError as exc: # ATM there is + lgr.debug("Failed to read %s: %s", r, exc) + res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc)) + if not res: + raise + ndtype = res.groups()[0] + if ndtype not in ndtypes_registry: + raise ValueError( + "We do not know which extension provides %s. " + "Original exception was: %s. 
" % (ndtype, exc) + ) + import_mod = ndtypes_registry[ndtype] + lgr.debug("Importing %r which should provide %r", import_mod, ndtype) + if import_mod in tried_imports: + raise RuntimeError( + "We already tried importing %s to provide %s, but it seems it didn't help" + % (import_mod, ndtype) + ) + tried_imports.add(import_mod) + __import__(import_mod) + + meta["nd_types"] = get_neurodata_types(r) + if not meta: + raise RuntimeError( + f"Unable to get metadata from non-BIDS, non-NWB asset: `{path}`." + ) + return meta + + +def nwb2asset( + nwb_path: str | Path | Readable, + digest: Digest | None = None, + schema_version: str | None = None, +) -> models.BareAsset: + if schema_version is not None: + current_version = models.get_schema_version() + if schema_version != current_version: + raise ValueError( + f"Unsupported schema version: {schema_version}; expected {current_version}" + ) + start_time = datetime.now().astimezone() + metadata = get_metadata(nwb_path) + asset_md = prepare_metadata(metadata) + process_ndtypes(asset_md, metadata["nd_types"]) + end_time = datetime.now().astimezone() + add_common_metadata(asset_md, nwb_path, start_time, end_time, digest) + asset_md.encodingFormat = "application/x-nwb" + # This gets overwritten with a better value by the caller: + if isinstance(nwb_path, Readable): + asset_md.path = nwb_path.get_filename() + else: + asset_md.path = str(nwb_path) + return asset_md diff --git a/dandi/metadata.py b/dandi/metadata/util.py similarity index 77% rename from dandi/metadata.py rename to dandi/metadata/util.py index 5bbd05f59..cfc57bb49 100644 --- a/dandi/metadata.py +++ b/dandi/metadata/util.py @@ -3,147 +3,17 @@ from collections.abc import Callable, Iterable from datetime import datetime, timedelta from functools import lru_cache -import os -import os.path -from pathlib import Path import re from typing import Any, TypedDict, TypeVar from uuid import uuid4 from xml.dom.minidom import parseString from dandischema import models -from pydantic import ByteSize, parse_obj_as import requests import tenacity -from . 
import __version__, get_logger -from .consts import metadata_all_fields -from .misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, Readable -from .pynwb_utils import ( - _get_pynwb_metadata, - get_neurodata_types, - get_nwb_version, - ignore_benign_pynwb_warnings, - metadata_cache, - nwb_has_external_links, -) -from .utils import ( - ensure_datetime, - find_parent_directory_containing, - get_mime_type, - get_utcnow_datetime, -) - -lgr = get_logger() - - -# Disable this for clean hacking -@metadata_cache.memoize_path -def get_metadata( - path: str | Path | Readable, digest: Digest | None = None -) -> dict | None: - """ - Get "flatdata" from a .nwb file - - Parameters - ---------- - path: str, Path, or Readable - - Returns - ------- - dict - """ - - from .files import bids, dandi_file, find_bids_dataset_description - - # when we run in parallel, these annoying warnings appear - ignore_benign_pynwb_warnings() - - if isinstance(path, Readable): - r = path - else: - r = LocalReadableFile(os.path.abspath(path)) - - meta: dict[str, Any] = {} - - if isinstance(r, LocalReadableFile): - # Is the data BIDS (as defined by the presence of a BIDS dataset descriptor) - bids_dataset_description = find_bids_dataset_description(r.filepath) - if bids_dataset_description: - dandiset_path = find_parent_directory_containing( - "dandiset.yaml", r.filepath - ) - df = dandi_file( - r.filepath, - dandiset_path, - bids_dataset_description=bids_dataset_description, - ) - assert isinstance(df, bids.BIDSAsset) - if not digest: - digest = DUMMY_DANDI_ETAG - path_metadata = df.get_metadata(digest=digest) - meta["bids_version"] = df.get_validation_bids_version() - # there might be a more elegant way to do this: - if path_metadata.wasAttributedTo is not None: - attributed = path_metadata.wasAttributedTo[0] - for key in metadata_all_fields: - try: - value = getattr(attributed, key) - except AttributeError: - pass - else: - meta[key] = value - - if r.get_filename().endswith((".NWB", ".nwb")): - if nwb_has_external_links(r): - raise NotImplementedError( - f"NWB files with external links are not supported: {r}" - ) - - # First read out possibly available versions of specifications for NWB(:N) - meta["nwb_version"] = get_nwb_version(r) - - # PyNWB might fail to load because of missing extensions. - # There is a new initiative of establishing registry of such extensions. - # Not yet sure if PyNWB is going to provide "native" support for needed - # functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143 - # So meanwhile, hard-coded workaround for data types we care about - ndtypes_registry = { - "AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb", - "ndx-labmetadata-abf": "ndx_dandi_icephys", - } - tried_imports = set() - while True: - try: - meta.update(_get_pynwb_metadata(r)) - break - except KeyError as exc: # ATM there is - lgr.debug("Failed to read %s: %s", r, exc) - res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc)) - if not res: - raise - ndtype = res.groups()[0] - if ndtype not in ndtypes_registry: - raise ValueError( - "We do not know which extension provides %s. " - "Original exception was: %s. 
" % (ndtype, exc) - ) - import_mod = ndtypes_registry[ndtype] - lgr.debug("Importing %r which should provide %r", import_mod, ndtype) - if import_mod in tried_imports: - raise RuntimeError( - "We already tried importing %s to provide %s, but it seems it didn't help" - % (import_mod, ndtype) - ) - tried_imports.add(import_mod) - __import__(import_mod) - - meta["nd_types"] = get_neurodata_types(r) - if not meta: - raise RuntimeError( - f"Unable to get metadata from non-BIDS, non-NWB asset: `{path}`." - ) - return meta +from .. import __version__ +from ..utils import ensure_datetime def _parse_iso8601(age: str) -> list[str]: @@ -945,80 +815,6 @@ def process_ndtypes(metadata: models.BareAsset, nd_types: Iterable[str]) -> None metadata.variableMeasured = [models.PropertyValue(value=val) for val in variables] -def nwb2asset( - nwb_path: str | Path | Readable, - digest: Digest | None = None, - schema_version: str | None = None, -) -> models.BareAsset: - if schema_version is not None: - current_version = models.get_schema_version() - if schema_version != current_version: - raise ValueError( - f"Unsupported schema version: {schema_version}; expected {current_version}" - ) - start_time = datetime.now().astimezone() - metadata = get_metadata(nwb_path) - asset_md = prepare_metadata(metadata) - process_ndtypes(asset_md, metadata["nd_types"]) - end_time = datetime.now().astimezone() - add_common_metadata(asset_md, nwb_path, start_time, end_time, digest) - asset_md.encodingFormat = "application/x-nwb" - # This gets overwritten with a better value by the caller: - if isinstance(nwb_path, Readable): - asset_md.path = nwb_path.get_filename() - else: - asset_md.path = str(nwb_path) - return asset_md - - -def get_default_metadata( - path: str | Path | Readable, digest: Digest | None = None -) -> models.BareAsset: - metadata = models.BareAsset.unvalidated() - start_time = end_time = datetime.now().astimezone() - add_common_metadata(metadata, path, start_time, end_time, digest) - return metadata - - -def add_common_metadata( - metadata: models.BareAsset, - path: str | Path | Readable, - start_time: datetime, - end_time: datetime, - digest: Digest | None = None, -) -> None: - """ - Update a `dict` of raw "schemadata" with the fields that are common to both - NWB assets and non-NWB assets - """ - if digest is not None: - metadata.digest = digest.asdict() - else: - metadata.digest = {} - metadata.dateModified = get_utcnow_datetime() - if isinstance(path, Readable): - r = path - else: - r = LocalReadableFile(path) - mtime = r.get_mtime() - if mtime is not None: - metadata.blobDateModified = mtime - if mtime > metadata.dateModified: - lgr.warning("mtime %s of %s is in the future", mtime, r) - size = r.get_size() - if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum: - m = re.fullmatch( - r"(?P[0-9a-f]{32})-(?P[0-9]+)--(?P[0-9]+)", digest.value - ) - if m: - size = int(m["size"]) - metadata.contentSize = parse_obj_as(ByteSize, size) - if metadata.wasGeneratedBy is None: - metadata.wasGeneratedBy = [] - metadata.wasGeneratedBy.append(get_generator(start_time, end_time)) - metadata.encodingFormat = get_mime_type(r.get_filename()) - - def get_generator(start_time: datetime, end_time: datetime) -> models.Activity: return models.Activity( id=uuid4().urn, @@ -1036,16 +832,3 @@ def get_generator(start_time: datetime, end_time: datetime) -> models.Activity: startDate=start_time, endDate=end_time, ) - - -def prepare_metadata(metadata: dict) -> models.BareAsset: - """ - Convert "flatdata" [1]_ 
for an asset into "schemadata" [2]_ as a - `BareAsset` - - .. [1] a flat `dict` mapping strings to strings & other primitive types; - returned by `get_metadata()` - - .. [2] metadata in the form used by the ``dandischema`` library - """ - return extract_model(models.BareAsset, metadata) diff --git a/dandi/organize.py b/dandi/organize.py index 02879a3e2..73d5fcc30 100644 --- a/dandi/organize.py +++ b/dandi/organize.py @@ -19,7 +19,6 @@ from .consts import dandi_layout_fields from .dandiset import Dandiset from .exceptions import OrganizeImpossibleError -from .metadata import get_metadata from .utils import ( AnyPath, Parallel, @@ -841,6 +840,8 @@ def act(func, *args, **kwargs): failed = [] def _get_metadata(path): + from .metadata.nwb import get_metadata + try: meta = get_metadata(path) except Exception as exc: diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py index ca52467c1..4c0e3bf0b 100644 --- a/dandi/tests/test_metadata.py +++ b/dandi/tests/test_metadata.py @@ -35,15 +35,14 @@ from .. import __version__ from ..consts import metadata_nwb_subject_fields from ..dandiapi import RemoteBlobAsset -from ..metadata import ( +from ..metadata.core import prepare_metadata +from ..metadata.nwb import get_metadata, nwb2asset +from ..metadata.util import ( extract_age, extract_cellLine, extract_species, - get_metadata, - nwb2asset, parse_age, parse_purlobourl, - prepare_metadata, process_ndtypes, species_map, timedelta2duration, From 8ab9d0569db4974942cd29d2cb3e97643de1c4f9 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Nov 2023 17:05:06 -0500 Subject: [PATCH 3/4] Avoid importing numpy --- dandi/download.py | 5 ++++- dandi/files/bases.py | 5 ++++- dandi/files/zarr.py | 20 +++++++++++++++++--- dandi/support/digests.py | 9 ++++++++- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 2becc6041..f9cd05167 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -31,7 +31,6 @@ from .dandiset import Dandiset from .exceptions import NotFoundError from .files import LocalAsset, find_dandi_files -from .support.digests import get_digest, get_zarr_checksum from .support.iterators import IteratorWithAggregation from .support.pyout import naturalsize from .utils import ( @@ -552,6 +551,8 @@ def _download_file( possible checksums or other digests provided for the file. 
Only one will be used to verify download """ + from .support.digests import get_digest + if op.lexists(path): annex_path = op.join(toplevel_path, ".git", "annex") if existing is DownloadExisting.ERROR: @@ -854,6 +855,8 @@ def _download_zarr( lock: Lock, jobs: int | None = None, ) -> Iterator[dict]: + from .support.digests import get_zarr_checksum + download_gens = {} entries = list(asset.iterfiles()) digests = {} diff --git a/dandi/files/bases.py b/dandi/files/bases.py index 48eca8dae..10f887563 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -27,7 +27,6 @@ from dandi.dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, P -from dandi.support.digests import get_dandietag, get_digest from dandi.utils import yaml_load from dandi.validate_types import Scope, Severity, ValidationOrigin, ValidationResult @@ -300,6 +299,8 @@ def get_metadata( def get_digest(self) -> Digest: """Calculate a dandi-etag digest for the asset""" + from dandi.support.digests import get_digest + value = get_digest(self.filepath, digest="dandi-etag") return Digest.dandi_etag(value) @@ -330,6 +331,8 @@ def iter_upload( ``"done"`` and an ``"asset"`` key containing the resulting `RemoteAsset`. """ + from dandi.support.digests import get_dandietag + asset_path = metadata.setdefault("path", self.path) client = dandiset.client yield {"status": "calculating etag"} diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index 7acecec42..48d8d3b97 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -15,8 +15,6 @@ from dandischema.digests.zarr import get_checksum from dandischema.models import BareAsset, DigestType import requests -import zarr -from zarr_checksum import ZarrChecksumTree from dandi import get_logger from dandi.consts import ( @@ -34,7 +32,6 @@ ) from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ZARR_CHECKSUM, BasePath, Digest -from dandi.support.digests import get_digest, get_zarr_checksum, md5file_nocache from dandi.utils import chunked, exclude_from_zarr, pluralize from .bases import LocalDirectoryAsset @@ -95,6 +92,9 @@ def get_digest(self) -> Digest: directory, the algorithm will be the Dandi Zarr checksum algorithm; if it is a file, it will be MD5. """ + + from dandi.support.digests import get_digest, get_zarr_checksum + if self.is_dir(): return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) else: @@ -151,6 +151,8 @@ def stat(self) -> ZarrStat: """Return various details about the Zarr asset""" def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: + from dandi.support.digests import md5file_nocache + size = 0 dir_md5s = {} file_md5s = {} @@ -175,6 +177,8 @@ def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: def get_digest(self) -> Digest: """Calculate a dandi-zarr-checksum digest for the asset""" + from dandi.support.digests import get_zarr_checksum + return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) def get_metadata( @@ -192,6 +196,8 @@ def get_validation_errors( schema_version: str | None = None, devel_debug: bool = False, ) -> list[ValidationResult]: + import zarr + errors: list[ValidationResult] = [] try: data = zarr.open(str(self.filepath)) @@ -281,6 +287,10 @@ def iter_upload( ``"done"`` and an ``"asset"`` key containing the resulting `RemoteAsset`. 
""" + # Importing zarr_checksum leads to importing numpy, which we want to + # avoid unless necessary + from zarr_checksum import ZarrChecksumTree + # So that older clients don't get away with doing the wrong thing once # Zarr upload to embargoed Dandisets is implemented in the API: if dandiset.embargo_status is EmbargoStatus.EMBARGOED: @@ -584,6 +594,8 @@ def register(self, e: LocalZarrEntry, digest: str | None = None) -> None: @staticmethod def _mkitem(e: LocalZarrEntry) -> UploadItem: + from dandi.support.digests import md5file_nocache + digest = md5file_nocache(e.filepath) return UploadItem.from_entry(e, digest) @@ -634,6 +646,8 @@ def upload_request(self) -> dict[str, str]: def _cmp_digests( asset_path: str, local_entry: LocalZarrEntry, remote_digest: str ) -> tuple[LocalZarrEntry, str, bool]: + from dandi.support.digests import md5file_nocache + local_digest = md5file_nocache(local_entry.filepath) if local_digest != remote_digest: lgr.debug( diff --git a/dandi/support/digests.py b/dandi/support/digests.py index 99e01f1d3..1748ad5cd 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -9,6 +9,10 @@ """Provides helper to compute digests (md5 etc) on files """ +# Importing this module imports fscacher, which imports joblib, which imports +# numpy, which is a "heavy" import, so avoid importing this module at the top +# level of a module. + from __future__ import annotations from collections.abc import Callable @@ -20,7 +24,6 @@ from dandischema.digests.dandietag import DandiETag from fscacher import PersistentCache -from zarr_checksum import ZarrChecksumTree from .threaded_walk import threaded_walk from ..utils import Hasher, exclude_from_zarr @@ -101,6 +104,10 @@ def get_zarr_checksum(path: Path, known: dict[str, str] | None = None) -> str: passed in the ``known`` argument, which must be a `dict` mapping slash-separated paths relative to the root of the Zarr to hex digests. """ + # Importing zarr_checksum leads to importing numpy, which we want to avoid + # unless necessary + from zarr_checksum import ZarrChecksumTree + if path.is_file(): s = get_digest(path, "md5") assert isinstance(s, str) From 68ba57c1170c0bdc900f40c91977eed156110836 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Nov 2023 17:10:26 -0500 Subject: [PATCH 4/4] Fixes --- dandi/cli/cmd_ls.py | 2 +- dandi/metadata/nwb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py index 7241dd48b..0cfacaeb3 100644 --- a/dandi/cli/cmd_ls.py +++ b/dandi/cli/cmd_ls.py @@ -331,7 +331,7 @@ def get_metadata_ls( path, keys, errors, flatten=False, schema=None, use_fake_digest=False ): from ..dandiset import Dandiset - from ..metadata import get_metadata, nwb2asset + from ..metadata.nwb import get_metadata, nwb2asset from ..pynwb_utils import get_nwb_version, ignore_benign_pynwb_warnings from ..support.digests import get_digest diff --git a/dandi/metadata/nwb.py b/dandi/metadata/nwb.py index fcdd2f04d..0c7583922 100644 --- a/dandi/metadata/nwb.py +++ b/dandi/metadata/nwb.py @@ -43,7 +43,7 @@ def get_metadata( dict """ - from .files import bids, dandi_file, find_bids_dataset_description + from ..files import bids, dandi_file, find_bids_dataset_description # when we run in parallel, these annoying warnings appear ignore_benign_pynwb_warnings()