diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py
new file mode 100644
index 000000000..470896739
--- /dev/null
+++ b/src/zarr/core/_info.py
@@ -0,0 +1,135 @@
+import dataclasses
+import textwrap
+from typing import Any, Literal
+
+import numcodecs.abc
+import numpy as np
+
+from zarr.abc.codec import Codec
+from zarr.core.metadata.v3 import DataType
+
+
+@dataclasses.dataclass(kw_only=True)
+class GroupInfo:
+    """
+    Visual summary for a Group.
+
+    Note that this class and its properties are not part of
+    Zarr's public API.
+    """
+
+    _name: str
+    _type: Literal["Group"] = "Group"
+    _zarr_format: Literal[2, 3]
+    _read_only: bool
+    _store_type: str
+    _count_members: int | None = None
+    _count_arrays: int | None = None
+    _count_groups: int | None = None
+
+    def __repr__(self) -> str:
+        template = textwrap.dedent("""\
+        Name : {_name}
+        Type : {_type}
+        Zarr format : {_zarr_format}
+        Read-only : {_read_only}
+        Store type : {_store_type}""")
+
+        if self._count_members is not None:
+            template += "\nNo. members : {_count_members}"
+        if self._count_arrays is not None:
+            template += "\nNo. arrays : {_count_arrays}"
+        if self._count_groups is not None:
+            template += "\nNo. groups : {_count_groups}"
+        return template.format(**dataclasses.asdict(self))
+
+
+def human_readable_size(size: int) -> str:
+    if size < 2**10:
+        return f"{size}"
+    elif size < 2**20:
+        return f"{size / float(2**10):.1f}K"
+    elif size < 2**30:
+        return f"{size / float(2**20):.1f}M"
+    elif size < 2**40:
+        return f"{size / float(2**30):.1f}G"
+    elif size < 2**50:
+        return f"{size / float(2**40):.1f}T"
+    else:
+        return f"{size / float(2**50):.1f}P"
+
+
+def byte_info(size: int) -> str:
+    if size < 2**10:
+        return str(size)
+    else:
+        return f"{size} ({human_readable_size(size)})"
+
+
+@dataclasses.dataclass(kw_only=True)
+class ArrayInfo:
+    """
+    Visual summary for an Array.
+
+    Note that this class and its properties are not part of
+    Zarr's public API.
+    """
+
+    _type: Literal["Array"] = "Array"
+    _zarr_format: Literal[2, 3]
+    _data_type: np.dtype[Any] | DataType
+    _shape: tuple[int, ...]
+    _chunk_shape: tuple[int, ...] | None = None
+    _order: Literal["C", "F"]
+    _read_only: bool
+    _store_type: str
+    _compressor: numcodecs.abc.Codec | None = None
+    _filters: tuple[numcodecs.abc.Codec, ...] | None = None
+    _codecs: list[Codec] | None = None
+    _count_bytes: int | None = None
+    _count_bytes_stored: int | None = None
+    _count_chunks_initialized: int | None = None
+
+    def __repr__(self) -> str:
+        template = textwrap.dedent("""\
+        Type : {_type}
+        Zarr format : {_zarr_format}
+        Data type : {_data_type}
+        Shape : {_shape}
+        Chunk shape : {_chunk_shape}
+        Order : {_order}
+        Read-only : {_read_only}
+        Store type : {_store_type}""")
+
+        kwargs = dataclasses.asdict(self)
+        if self._chunk_shape is None:
+            # for non-regular chunk grids
+            kwargs["_chunk_shape"] = ""
+        if self._compressor is not None:
+            template += "\nCompressor : {_compressor}"
+
+        if self._filters is not None:
+            template += "\nFilters : {_filters}"
+
+        if self._codecs is not None:
+            template += "\nCodecs : {_codecs}"
+
+        if self._count_bytes is not None:
+            template += "\nNo. bytes : {_count_bytes}"
+            kwargs["_count_bytes"] = byte_info(self._count_bytes)
+
+        if self._count_bytes_stored is not None:
+            template += "\nNo. bytes stored : {_count_bytes_stored}"
+            kwargs["_count_bytes_stored"] = byte_info(self._count_bytes_stored)
+
+        if (
+            self._count_bytes is not None
+            and self._count_bytes_stored is not None
+            and self._count_bytes_stored > 0
+        ):
+            template += "\nStorage ratio : {_storage_ratio}"
+            kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}"
+
+        if self._count_chunks_initialized is not None:
+            template += "\nChunks Initialized : {_count_chunks_initialized}"
+        return template.format(**kwargs)
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 78e496818..1808a8630 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -14,6 +14,7 @@
 from zarr.abc.store import Store, set_or_delete
 from zarr.codecs import _get_default_array_bytes_codec
 from zarr.codecs._v2 import V2Codec
+from zarr.core._info import ArrayInfo
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import (
     BufferPrototype,
@@ -1332,9 +1333,63 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
     def __repr__(self) -> str:
         return f""
 
-    async def info(self) -> None:
-        raise NotImplementedError
+    @property
+    def info(self) -> Any:
+        """
+        Return the statically known information for an array.
+
+        Returns
+        -------
+        ArrayInfo
+
+        See Also
+        --------
+        AsyncArray.info_complete
+            All information about an array, including dynamic information
+            like the number of bytes and chunks written.
+        """
+        return self._info()
+
+    async def info_complete(self) -> Any:
+        # TODO: get the size of the object from the store.
+        extra = {
+            "_count_chunks_initialized": await self.nchunks_initialized(),
+            # count_bytes_stored isn't yet implemented.
+        }
+        return self._info(extra=extra)
 
+    def _info(self, extra: dict[str, int] | None = None) -> Any:
+        kwargs: dict[str, Any] = dict(extra) if extra is not None else {}
+        if self.metadata.zarr_format == 2:
+            assert isinstance(self.metadata, ArrayV2Metadata)
+            if self.metadata.compressor is not None:
+                kwargs["_compressor"] = self.metadata.compressor
+            if self.metadata.filters is not None:
+                kwargs["_filters"] = self.metadata.filters
+            kwargs["_data_type"] = self.metadata.dtype
+            kwargs["_chunk_shape"] = self.metadata.chunks
+        else:
+            kwargs["_codecs"] = self.metadata.codecs
+            kwargs["_data_type"] = self.metadata.data_type
+            # just regular?
+            chunk_grid = self.metadata.chunk_grid
+            if isinstance(chunk_grid, RegularChunkGrid):
+                kwargs["_chunk_shape"] = chunk_grid.chunk_shape
+            else:
+                raise NotImplementedError(
+                    f"'info' is not yet implemented for chunk grids of type {type(self.metadata.chunk_grid)}"
+                )
+
+        return ArrayInfo(
+            _zarr_format=self.metadata.zarr_format,
+            _shape=self.shape,
+            _order=self.order,
+            _read_only=self.store_path.store.mode.readonly,
+            _store_type=type(self.store_path.store).__name__,
+            _count_bytes=self.dtype.itemsize * self.size,
+            **kwargs,
+        )
+
 
 # TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
 @dataclass(frozen=False)
@@ -3099,10 +3154,58 @@ def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
     def __repr__(self) -> str:
         return f""
 
-    def info(self) -> None:
-        return sync(
-            self._async_array.info(),
-        )
+    @property
+    def info(self) -> Any:
+        """
+        Return the statically known information for an array.
+
+        Returns
+        -------
+        ArrayInfo
+
+        See Also
+        --------
+        Array.info_complete
+            All information about an array, including dynamic information
+            like the number of bytes and chunks written.
+
+        Examples
+        --------
+        >>> arr = zarr.create(shape=(10,), chunks=(2,), dtype="float32")
+        >>> arr.info
+        Type : Array
+        Zarr format : 3
+        Data type : DataType.float32
+        Shape : (10,)
+        Chunk shape : (2,)
+        Order : C
+        Read-only : False
+        Store type : MemoryStore
+        Codecs : [BytesCodec(endian=)]
+        No. bytes : 40
+        """
+        return self._async_array.info
+
+    def info_complete(self) -> Any:
+        """
+        Returns all the information about an array, including information from the Store.
+
+        In addition to the statically known information like ``name`` and ``zarr_format``,
+        this includes additional information like the size of the array in bytes and
+        the number of chunks written.
+
+        Note that this method will need to read metadata from the store.
+
+        Returns
+        -------
+        ArrayInfo
+
+        See Also
+        --------
+        Array.info
+            The statically known subset of metadata about an array.
+        """
+        return sync(self._async_array.info_complete())
 
 
 async def chunks_initialized(
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index 9a54b346b..160306049 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -17,6 +17,7 @@
 from zarr._compat import _deprecate_positional_args
 from zarr.abc.metadata import Metadata
 from zarr.abc.store import Store, set_or_delete
+from zarr.core._info import GroupInfo
 from zarr.core.array import Array, AsyncArray, _build_parents
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import default_buffer_prototype
@@ -805,8 +806,72 @@ def attrs(self) -> dict[str, Any]:
         return self.metadata.attributes
 
     @property
-    def info(self) -> None:
-        raise NotImplementedError
+    def info(self) -> Any:
+        """
+        Return a visual representation of the statically known information about a group.
+
+        Note that this doesn't include dynamic information, like the number of child
+        Groups or Arrays.
+
+        Returns
+        -------
+        GroupInfo
+
+        See Also
+        --------
+        AsyncGroup.info_complete
+            All information about a group, including dynamic information
+        """
+
+        if self.metadata.consolidated_metadata:
+            members = list(self.metadata.consolidated_metadata.flattened_metadata.values())
+        else:
+            members = None
+        return self._info(members=members)
+
+    async def info_complete(self) -> Any:
+        """
+        Return all the information for a group.
+
+        This includes dynamic information like the number
+        of child Groups or Arrays. If this group doesn't contain consolidated
+        metadata then this will need to read from the backing Store.
+
+        Returns
+        -------
+        GroupInfo
+
+        See Also
+        --------
+        AsyncGroup.info
+        """
+        members = [x[1].metadata async for x in self.members(max_depth=None)]
+        return self._info(members=members)
+
+    def _info(
+        self, members: list[ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] | None = None
+    ) -> Any:
+        kwargs = {}
+        if members is not None:
+            kwargs["_count_members"] = len(members)
+            count_arrays = 0
+            count_groups = 0
+            for member in members:
+                if isinstance(member, GroupMetadata):
+                    count_groups += 1
+                else:
+                    count_arrays += 1
+            kwargs["_count_arrays"] = count_arrays
+            kwargs["_count_groups"] = count_groups
+
+        return GroupInfo(
+            _name=self.store_path.path,
+            _read_only=self.store_path.store.mode.readonly,
+            _store_type=type(self.store_path.store).__name__,
+            _zarr_format=self.metadata.zarr_format,
+            # maybe do a typeddict
+            **kwargs,  # type: ignore[arg-type]
+        )
 
     @property
     def store(self) -> Store:
@@ -1454,8 +1519,38 @@ def attrs(self) -> Attributes:
         return Attributes(self)
 
     @property
-    def info(self) -> None:
-        raise NotImplementedError
+    def info(self) -> Any:
+        """
+        Return the statically known information for a group.
+
+        Returns
+        -------
+        GroupInfo
+
+        See Also
+        --------
+        Group.info_complete
+            All information about a group, including dynamic information
+            like the child members.
+        """
+        return self._async_group.info
+
+    def info_complete(self) -> Any:
+        """
+        Return information for a group.
+
+        If this group doesn't contain consolidated metadata then
+        this will need to read from the backing Store.
+
+        Returns
+        -------
+        GroupInfo
+
+        See Also
+        --------
+        Group.info
+        """
+        return self._sync(self._async_group.info_complete())
 
     @property
     def store(self) -> Store:
diff --git a/tests/test_array.py b/tests/test_array.py
index b8af26133..5f73f2f44 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -10,12 +10,14 @@
 import zarr.api.asynchronous
 from zarr import Array, AsyncArray, Group
 from zarr.codecs import BytesCodec, VLenBytesCodec
+from zarr.core._info import ArrayInfo
 from zarr.core.array import chunks_initialized
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.buffer.cpu import NDBuffer
 from zarr.core.common import JSON, MemoryOrder, ZarrFormat
 from zarr.core.group import AsyncGroup
 from zarr.core.indexing import ceildiv
+from zarr.core.metadata.v3 import DataType
 from zarr.core.sync import sync
 from zarr.errors import ContainsArrayError, ContainsGroupError
 from zarr.storage import LocalStore, MemoryStore
@@ -418,6 +420,39 @@ def test_update_attrs(zarr_format: int) -> None:
     assert arr2.attrs["foo"] == "bar"
 
 
+class TestInfo:
+    def test_info_v2(self) -> None:
+        arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=2)
+        result = arr.info
+        expected = ArrayInfo(
+            _zarr_format=2,
+            _data_type=np.dtype("float64"),
+            _shape=(4, 4),
+            _chunk_shape=(2, 2),
+            _order="C",
+            _read_only=False,
+            _store_type="MemoryStore",
+            _count_bytes=128,
+        )
+        assert result == expected
+
+    def test_info_v3(self) -> None:
+        arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3)
+        result = arr.info
+        expected = ArrayInfo(
+            _zarr_format=3,
+            _data_type=DataType.parse("float64"),
+            _shape=(4, 4),
+            _chunk_shape=(2, 2),
+            _order="C",
+            _read_only=False,
+            _store_type="MemoryStore",
+            _codecs=[BytesCodec()],
+            _count_bytes=128,
+        )
+        assert result == expected
+
+
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
 @pytest.mark.parametrize("zarr_format", [2, 3])
 def test_resize_1d(store: MemoryStore, zarr_format: int) -> None:
diff --git a/tests/test_group.py b/tests/test_group.py
index 6bacca488..6e185097f 100644
--- a/tests/test_group.py
+++ b/tests/test_group.py
@@ -12,8 +12,10 @@
 import zarr
 import zarr.api.asynchronous
 import zarr.api.synchronous
+import zarr.storage
 from zarr import Array, AsyncArray, AsyncGroup, Group
 from zarr.abc.store import Store
+from zarr.core._info import GroupInfo
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.group import ConsolidatedMetadata, GroupMetadata
 from zarr.core.sync import sync
@@ -792,15 +794,6 @@ async def test_asyncgroup_attrs(store: Store, zarr_format: ZarrFormat) -> None:
     assert agroup.attrs == agroup.metadata.attributes == attributes
 
 
-async def test_asyncgroup_info(store: Store, zarr_format: ZarrFormat) -> None:
-    agroup = await AsyncGroup.from_store(  # noqa: F841
-        store,
-        zarr_format=zarr_format,
-    )
-    pytest.xfail("Info is not implemented for metadata yet")
-    # assert agroup.info == agroup.metadata.info
-
-
 async def test_asyncgroup_open(
     store: Store,
     zarr_format: ZarrFormat,
@@ -1352,6 +1345,37 @@ def test_from_dict_extra_fields(self):
         assert result == expected
 
 
+class TestInfo:
+    def test_info(self):
+        store = zarr.storage.MemoryStore(mode="w")
+        A = zarr.group(store=store, path="A")
+        B = A.create_group(name="B")
+
+        B.create_array(name="x", shape=(1,))
+        B.create_array(name="y", shape=(2,))
+
+        result = A.info
+        expected = GroupInfo(
+            _name="A",
+            _read_only=False,
+            _store_type="MemoryStore",
+            _zarr_format=3,
+        )
+        assert result == expected
+
+        result = A.info_complete()
+        expected = GroupInfo(
+            _name="A",
+            _read_only=False,
+            _store_type="MemoryStore",
+            _zarr_format=3,
+            _count_members=3,
+            _count_arrays=2,
+            _count_groups=1,
+        )
+        assert result == expected
+
+
 def test_update_attrs() -> None:
     # regression test for https://github.com/zarr-developers/zarr-python/issues/2328
     root = Group.from_store(
diff --git a/tests/test_info.py b/tests/test_info.py
new file mode 100644
index 000000000..5d9264aa1
--- /dev/null
+++ b/tests/test_info.py
@@ -0,0 +1,133 @@
+import textwrap
+
+import numpy as np
+import pytest
+
+from zarr.codecs.bytes import BytesCodec
+from zarr.core._info import ArrayInfo, GroupInfo, human_readable_size
+from zarr.core.common import ZarrFormat
+
+ZARR_FORMATS = [2, 3]
+
+
+@pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
+def test_group_info_repr(zarr_format: ZarrFormat) -> None:
+    info = GroupInfo(
+        _name="a", _store_type="MemoryStore", _read_only=False, _zarr_format=zarr_format
+    )
+    result = repr(info)
+    expected = textwrap.dedent(f"""\
+        Name : a
+        Type : Group
+        Zarr format : {zarr_format}
+        Read-only : False
+        Store type : MemoryStore""")
+    assert result == expected
+
+
+@pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
+def test_group_info_complete(zarr_format: ZarrFormat) -> None:
+    info = GroupInfo(
+        _name="a",
+        _store_type="MemoryStore",
+        _zarr_format=zarr_format,
+        _read_only=False,
+        _count_arrays=10,
+        _count_groups=4,
+        _count_members=14,
+    )
+    result = repr(info)
+    expected = textwrap.dedent(f"""\
+        Name : a
+        Type : Group
+        Zarr format : {zarr_format}
+        Read-only : False
+        Store type : MemoryStore
+        No. members : 14
+        No. arrays : 10
+        No. groups : 4""")
+    assert result == expected
+
+
+@pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
+def test_array_info(zarr_format: ZarrFormat) -> None:
+    info = ArrayInfo(
+        _zarr_format=zarr_format,
+        _data_type=np.dtype("int32"),
+        _shape=(100, 100),
+        _chunk_shape=(10, 100),
+        _order="C",
+        _read_only=True,
+        _store_type="MemoryStore",
+        _codecs=[BytesCodec()],
+    )
+    result = repr(info)
+    assert result == textwrap.dedent(f"""\
+        Type : Array
+        Zarr format : {zarr_format}
+        Data type : int32
+        Shape : (100, 100)
+        Chunk shape : (10, 100)
+        Order : C
+        Read-only : True
+        Store type : MemoryStore
+        Codecs : [{{'endian': }}]""")
+
+
+@pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
+@pytest.mark.parametrize("bytes_things", [(1_000_000, "976.6K", 500_000, "488.3K", "2.0", 5)])
+def test_array_info_complete(
+    zarr_format: ZarrFormat, bytes_things: tuple[int, str, int, str, str, int]
+) -> None:
+    (
+        count_bytes,
+        count_bytes_formatted,
+        count_bytes_stored,
+        count_bytes_stored_formatted,
+        storage_ratio_formatted,
+        count_chunks_initialized,
+    ) = bytes_things
+    info = ArrayInfo(
+        _zarr_format=zarr_format,
+        _data_type=np.dtype("int32"),
+        _shape=(100, 100),
+        _chunk_shape=(10, 100),
+        _order="C",
+        _read_only=True,
+        _store_type="MemoryStore",
+        _codecs=[BytesCodec()],
+        _count_bytes=count_bytes,
+        _count_bytes_stored=count_bytes_stored,
+        _count_chunks_initialized=count_chunks_initialized,
+    )
+    result = repr(info)
+    assert result == textwrap.dedent(f"""\
+        Type : Array
+        Zarr format : {zarr_format}
+        Data type : int32
+        Shape : (100, 100)
+        Chunk shape : (10, 100)
+        Order : C
+        Read-only : True
+        Store type : MemoryStore
+        Codecs : [{{'endian': }}]
+        No. bytes : {count_bytes} ({count_bytes_formatted})
+        No. bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted})
+        Storage ratio : {storage_ratio_formatted}
+        Chunks Initialized : 5""")
+
+
+@pytest.mark.parametrize(
+    ("size", "expected"),
+    [
+        (1, "1"),
+        (2**10, "1.0K"),
+        (2**20, "1.0M"),
+        (2**30, "1.0G"),
+        (2**40, "1.0T"),
+        (2**50, "1.0P"),
+    ],
+)
+def test_human_readable_size(size: int, expected: str) -> None:
+    result = human_readable_size(size)
+    assert result == expected