Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added info for Group and Array #2400

Merged
merged 24 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions src/zarr/core/_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import dataclasses
import textwrap
from typing import Any, Literal

import numcodecs.abc
import numpy as np

from zarr.abc.codec import Codec
from zarr.core.metadata.v3 import DataType


@dataclasses.dataclass(kw_only=True)
class GroupInfo:
"""
Visual summary for a Group.

Note that this class and its attributes are not part of
Zarr's public API.
"""

_name: str
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've made all these fields private.

IMO, we should encourage things like group.info.zarr_format. The one place for that information should be group.metadata.zarr_format.

This class's sole user-facing focus should be the repr.

Copy link
Member

@jhamman jhamman Nov 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO, we should encourage things like group.info.zarr_format.

Did you mean:

"... we should NOT encourage ..."

_type: Literal["Group"] = "Group"
_zarr_format: Literal[2, 3]
_read_only: bool
_store_type: str
_count_members: int | None = None
_count_arrays: int | None = None
_count_groups: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Name : {_name}
Type : {_type}
Zarr format : {_zarr_format}
Read-only : {_read_only}
Store type : {_store_type}""")

if self._count_members is not None:
template += "\nNo. members : {_count_members}"
if self._count_arrays is not None:
template += "\nNo. arrays : {_count_arrays}"
if self._count_groups is not None:
template += "\nNo. groups : {_count_groups}"
return template.format(**dataclasses.asdict(self))


def human_readable_size(size: int) -> str:
    """
    Format a byte count with a binary-unit suffix.

    Values below 2**10 are returned as-is with no suffix. Larger values are
    divided by the largest power of 1024 that fits (up to 2**50) and shown
    with one decimal place followed by ``K``, ``M``, ``G``, ``T`` or ``P``.
    """
    if size < 2**10:
        return f"{size}"
    # (exclusive upper bound exponent, suffix); the divisor is one step below.
    for exponent, suffix in ((20, "K"), (30, "M"), (40, "G"), (50, "T")):
        if size < 2**exponent:
            return f"{size / float(2 ** (exponent - 10)):.1f}{suffix}"
    # Anything at or above 2**50 is reported in P, however large.
    return f"{size / float(2**50):.1f}P"


def byte_info(size: int) -> str:
    """
    Format a byte count for display.

    Counts below 2**10 are shown as the bare number; larger counts are shown
    as the exact number followed by the human-readable form in parentheses.
    """
    return str(size) if size < 2**10 else f"{size} ({human_readable_size(size)})"


@dataclasses.dataclass(kw_only=True)
class ArrayInfo:
    """
    Visual summary for an Array.

    Note that this class and its attributes are not part of
    Zarr's public API.
    """

    # Static description of the array; always rendered.
    _type: Literal["Array"] = "Array"
    _zarr_format: Literal[2, 3]
    _data_type: np.dtype[Any] | DataType
    _shape: tuple[int, ...]
    # None is rendered as "<variable>" (non-regular chunk grids).
    _chunk_shape: tuple[int, ...] | None = None
    _order: Literal["C", "F"]
    _read_only: bool
    _store_type: str
    # Optional rows; each is rendered only when it is not None.
    _compressor: numcodecs.abc.Codec | None = None
    _filters: tuple[numcodecs.abc.Codec, ...] | None = None
    _codecs: list[Codec] | None = None
    _count_bytes: int | None = None
    _count_bytes_stored: int | None = None
    _count_chunks_initialized: int | None = None

    def __repr__(self) -> str:
        """
        Render the array summary as one ``label : value`` row per line.

        The type, Zarr format, data type, shape, chunk shape, order,
        read-only flag and store type rows are always emitted. Compressor,
        filters, codecs, byte counts and initialized-chunk rows are appended
        only when the corresponding field is not ``None``. A storage ratio
        row is added when both byte counts are known and the stored count is
        positive.
        """
        template = textwrap.dedent("""\
            Type : {_type}
            Zarr format : {_zarr_format}
            Data type : {_data_type}
            Shape : {_shape}
            Chunk shape : {_chunk_shape}
            Order : {_order}
            Read-only : {_read_only}
            Store type : {_store_type}""")

        # Start from the raw field values, keyed by field name, then override
        # the entries that need a display form. Override keys must match the
        # template placeholders exactly (including the leading underscore),
        # otherwise the raw value is rendered instead.
        kwargs = dataclasses.asdict(self)
        if self._chunk_shape is None:
            # for non-regular chunk grids
            kwargs["_chunk_shape"] = "<variable>"
        if self._compressor is not None:
            template += "\nCompressor : {_compressor}"

        if self._filters is not None:
            template += "\nFilters : {_filters}"

        if self._codecs is not None:
            template += "\nCodecs : {_codecs}"

        if self._count_bytes is not None:
            template += "\nNo. bytes : {_count_bytes}"
            kwargs["_count_bytes"] = byte_info(self._count_bytes)

        if self._count_bytes_stored is not None:
            template += "\nNo. bytes stored : {_count_bytes_stored}"
            kwargs["_count_bytes_stored"] = byte_info(self._count_bytes_stored)

        if (
            self._count_bytes is not None
            and self._count_bytes_stored is not None
            and self._count_bytes_stored > 0  # avoid dividing by zero
        ):
            template += "\nStorage ratio : {_storage_ratio}"
            kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}"

        if self._count_chunks_initialized is not None:
            template += "\nChunks Initialized : {_count_chunks_initialized}"
        return template.format(**kwargs)
115 changes: 110 additions & 5 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Codec
from zarr.core._info import ArrayInfo
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
BufferPrototype,
Expand Down Expand Up @@ -1332,9 +1333,65 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
def __repr__(self) -> str:
return f"<AsyncArray {self.store_path} shape={self.shape} dtype={self.dtype}>"

async def info(self) -> None:
@property
def info(self) -> Any:
    """
    Return the statically known information for an array.

    No extra values are passed to ``_info``, so the summary is built without
    awaiting anything.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    AsyncArray.info_complete
        All information about an array, including dynamic information
        like the number of chunks written.
    """
    summary = self._info()
    return summary

async def info_complete(self) -> Any:
    """
    Return the array summary, including dynamically computed information.

    In addition to the statically known information, this awaits
    ``nchunks_initialized`` and passes the result on to ``_info``.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    AsyncArray.info
        The statically known subset of metadata about an array.
    """
    # TODO: get the size of the object from the store.
    extra = {
        # Named after the ArrayInfo field (leading underscore included) so the
        # value can be forwarded to ArrayInfo as a keyword argument.
        "_count_chunks_initialized": await self.nchunks_initialized(),
        # count_bytes_stored isn't yet implemented.
    }
    return self._info(extra=extra)

raise NotImplementedError

def _info(self, extra: dict[str, int] | None = None) -> Any:
    """
    Build the ``ArrayInfo`` summary for this array.

    Parameters
    ----------
    extra : dict[str, int], optional
        Additional, dynamically computed values to include in the summary
        (for example the number of initialized chunks). Keys name
        ``ArrayInfo`` fields; a missing leading underscore is added, so both
        ``count_chunks_initialized`` and ``_count_chunks_initialized`` are
        accepted.

    Returns
    -------
    ArrayInfo

    Raises
    ------
    NotImplementedError
        If the array is not Zarr format 2 and its chunk grid is not a
        ``RegularChunkGrid``.
    """
    kwargs: dict[str, Any] = {}
    if self.metadata.zarr_format == 2:
        assert isinstance(self.metadata, ArrayV2Metadata)
        if self.metadata.compressor is not None:
            kwargs["_compressor"] = self.metadata.compressor
        if self.metadata.filters is not None:
            kwargs["_filters"] = self.metadata.filters
        kwargs["_data_type"] = self.metadata.dtype
        kwargs["_chunk_shape"] = self.metadata.chunks
    else:
        kwargs["_codecs"] = self.metadata.codecs
        kwargs["_data_type"] = self.metadata.data_type
        # just regular?
        chunk_grid = self.metadata.chunk_grid
        if isinstance(chunk_grid, RegularChunkGrid):
            kwargs["_chunk_shape"] = chunk_grid.chunk_shape
        else:
            raise NotImplementedError(
                f"'info' is not yet implemented for chunk grids of type {type(self.metadata.chunk_grid)}"
            )

    fields: dict[str, Any] = {
        "_zarr_format": self.metadata.zarr_format,
        "_shape": self.shape,
        "_order": self.order,
        "_read_only": self.store_path.store.mode.readonly,
        "_store_type": type(self.store_path.store).__name__,
        "_count_bytes": self.dtype.itemsize * self.size,
    }
    fields.update(kwargs)
    if extra:
        # Merge caller-supplied values last so they reach ArrayInfo instead of
        # being silently dropped; normalise keys to the private field names.
        for key, value in extra.items():
            fields[key if key.startswith("_") else f"_{key}"] = value
    return ArrayInfo(**fields)


# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
Expand Down Expand Up @@ -3099,10 +3156,58 @@ def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
def __repr__(self) -> str:
return f"<Array {self.store_path} shape={self.shape} dtype={self.dtype}>"

def info(self) -> None:
return sync(
self._async_array.info(),
)
@property
def info(self) -> Any:
    """
    Return the statically known information for an array.

    This is the ``info`` property of the wrapped async array.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info_complete
        All information about an array, including dynamic information
        like the number of chunks written.

    Examples
    --------
    >>> arr = zarr.create(shape=(10,), chunks=(2,), dtype="float32")
    >>> arr.info
    Type : Array
    Zarr format : 3
    Data type : DataType.float32
    Shape : (10,)
    Chunk shape : (2,)
    Order : C
    Read-only : False
    Store type : MemoryStore
    Codecs : [BytesCodec(endian=<Endian.little: 'little'>)]
    No. bytes : 40
    """
    async_array = self._async_array
    return async_array.info

def info_complete(self) -> Any:
    """
    Return all the information about an array, including dynamic information.

    This runs the wrapped async array's ``info_complete`` coroutine through
    ``sync`` and returns its result. Besides the statically known
    information, that coroutine reports the number of initialized chunks.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info
        The statically known subset of metadata about an array.
    """
    pending = self._async_array.info_complete()
    return sync(pending)


async def chunks_initialized(
Expand Down
103 changes: 99 additions & 4 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from zarr._compat import _deprecate_positional_args
from zarr.abc.metadata import Metadata
from zarr.abc.store import Store, set_or_delete
from zarr.core._info import GroupInfo
from zarr.core.array import Array, AsyncArray, _build_parents
from zarr.core.attributes import Attributes
from zarr.core.buffer import default_buffer_prototype
Expand Down Expand Up @@ -805,8 +806,72 @@ def attrs(self) -> dict[str, Any]:
return self.metadata.attributes

@property
def info(self) -> None:
raise NotImplementedError
def info(self) -> Any:
    """
    Return a visual representation of the statically known information about a group.

    When the group's metadata carries consolidated metadata, the flattened
    member metadata is passed on so member counts can be reported. Otherwise
    no member information is included.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info_complete
        All information about a group, including dynamic information
    """
    consolidated = self.metadata.consolidated_metadata
    members = list(consolidated.flattened_metadata.values()) if consolidated else None
    return self._info(members=members)

async def info_complete(self) -> Any:
    """
    Return all the information for a group.

    Iterates over ``self.members(max_depth=None)`` and collects the metadata
    of every member, so the summary includes the member, array and group
    counts.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info
    """
    collected = []
    async for entry in self.members(max_depth=None):
        # The second item of each yielded entry is the member node.
        collected.append(entry[1].metadata)
    return self._info(members=collected)

def _info(
    self, members: list[ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] | None = None
) -> Any:
    """
    Build the ``GroupInfo`` summary for this group.

    Parameters
    ----------
    members : list of metadata objects, optional
        Metadata of the group's members. When given, the total number of
        members is reported, split into groups (``GroupMetadata`` instances)
        and arrays (everything else). When ``None``, no counts are included.

    Returns
    -------
    GroupInfo
    """
    counts = {}
    if members is not None:
        n_groups = sum(1 for member in members if isinstance(member, GroupMetadata))
        counts["_count_members"] = len(members)
        # Every member that is not a group is counted as an array.
        counts["_count_arrays"] = len(members) - n_groups
        counts["_count_groups"] = n_groups

    return GroupInfo(
        _name=self.store_path.path,
        _read_only=self.store_path.store.mode.readonly,
        _store_type=type(self.store_path.store).__name__,
        _zarr_format=self.metadata.zarr_format,
        # maybe do a typeddict
        **counts,  # type: ignore[arg-type]
    )

@property
def store(self) -> Store:
Expand Down Expand Up @@ -1454,8 +1519,38 @@ def attrs(self) -> Attributes:
return Attributes(self)

@property
def info(self) -> None:
raise NotImplementedError
def info(self) -> Any:
    """
    Return the statically known information for a group.

    This is the ``info`` property of the wrapped async group.

    Returns
    -------
    GroupInfo

    See Also
    --------
    Group.info_complete
        All information about a group, including dynamic information
        like the children members.
    """
    async_group = self._async_group
    return async_group.info

def info_complete(self) -> Any:
    """
    Return information for a group.

    This runs the wrapped async group's ``info_complete`` coroutine through
    ``self._sync`` and returns its result.

    Returns
    -------
    GroupInfo

    See Also
    --------
    Group.info
    """
    pending = self._async_group.info_complete()
    return self._sync(pending)

@property
def store(self) -> Store:
Expand Down
Loading