From a36726846bf569163d17ace65e85e5db2dd5d622 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 15:44:18 +0200 Subject: [PATCH 01/54] add numcodec protocol --- src/zarr/abc/codec.py | 28 ++++++++++++- src/zarr/api/asynchronous.py | 5 +-- src/zarr/api/synchronous.py | 4 +- src/zarr/codecs/_numcodecs.py | 37 ++++++++++++++++ src/zarr/codecs/_v2.py | 77 ++++++++++++++++++++++++++++------ src/zarr/core/array.py | 16 +++---- tests/test_api.py | 2 +- tests/test_array.py | 2 +- tests/test_codecs/test_vlen.py | 2 +- 9 files changed, 142 insertions(+), 31 deletions(-) create mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..d5c995d2ca 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,14 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar + +from typing_extensions import ReadOnly, TypedDict from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +37,27 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecJSON_V2(TypedDict, Generic[TName]): + """The JSON representation of a codec for Zarr V2""" + + id: ReadOnly[TName] + + +def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: + return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str) + + +CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]] +"""The JSON representation of a codec for Zarr V3.""" + +# The widest type we will *accept* for a 
codec JSON +# This covers v2 and v3 +CodecJSON = str | Mapping[str, object] +"""The widest type of JSON-like input that could specify a codec.""" + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 9a380082b0..044d881c22 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -46,9 +46,8 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc - from zarr.abc.codec import Codec + from zarr.codecs._v2 import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike @@ -871,7 +870,7 @@ async def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 4ce02e7b6d..df667c0b23 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -14,12 +14,12 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc import numpy as np import numpy.typing as npt from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, @@ -609,7 +609,7 @@ def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git 
a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..4a8f43b5c6 --- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,37 @@ +import numcodecs.registry as numcodecs_registry + +from zarr.abc.codec import CodecJSON_V2 +from zarr.codecs._v2 import Numcodec + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. + + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + codec_id = data["id"] + cls = numcodecs_registry.codec_registry.get(codec_id) + if cls is None and data in numcodecs_registry.entries: + cls = numcodecs_registry.entries[data].load() + numcodecs_registry.register_codec(cls, codec_id=data) + if cls is not None: + return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] + raise KeyError(data) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..8deae99a6d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,26 +2,77 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard -import numcodecs import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like +from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: - import numcodecs.abc - from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. 
+ """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return _is_numcodec_cls(type(obj)) + + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): - filters: tuple[numcodecs.abc.Codec, ...] | None - compressor: numcodecs.abc.Codec | None + filters: tuple[Numcodec, ...] 
| None + compressor: Numcodec | None is_fixed_size = False @@ -33,9 +84,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) + chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] else: - chunk = cdata + chunk = cdata # type: ignore[assignment] # apply filters if self.filters: @@ -56,7 +107,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -85,7 +136,7 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) + chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] # check object encoding if ensure_ndarray_like(chunk).dtype == object: @@ -93,9 +144,9 @@ async def _encode_single( # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) + cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] else: - cdata = chunk + cdata = chunk # type: ignore[assignment] cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 260e94bc88..68a4694a55 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -27,7 +27,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec +from zarr.codecs._v2 import Numcodec, V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, 
VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -607,7 +607,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -818,7 +818,7 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLikev2 = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: @@ -856,7 +856,7 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, @@ -3898,7 +3898,7 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec @@ -3911,10 +3911,10 @@ def _build_parents( ) CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4944,7 +4944,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) + compressors = (compressor,) # type: ignore[assignment] elif zarr_format 
== 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/tests/test_api.py b/tests/test_api.py index 01fb40f050..b245685e30 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1282,7 +1282,7 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None: dtype=src.dtype, overwrite=True, zarr_format=zarr_format, - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] ) z[:10, :10] = src[:10, :10] diff --git a/tests/test_array.py b/tests/test_array.py index f672006f9a..6342ce6430 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1684,7 +1684,7 @@ def test_roundtrip_numcodecs() -> None: shape=(720, 1440), chunks=(720, 1440), dtype="float64", - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] filters=filters, fill_value=-9.99, dimension_names=["lat", "lon"], diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 6fe1863464..cf0905daca 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -40,7 +40,7 @@ def test_vlen_string( chunks=data.shape, dtype=data.dtype, fill_value="", - compressors=compressor, + compressors=compressor, # type: ignore[arg-type] ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy From 1d424c0ba32bb535d9f787435980e6e87bc802c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:25:36 +0200 Subject: [PATCH 02/54] add tests for numcodecs compatibility --- tests/test_codecs/test_numcodecs.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/test_codecs/test_numcodecs.py diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py new file mode 100644 index 0000000000..a3824cc386 --- /dev/null +++ b/tests/test_codecs/test_numcodecs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from numcodecs import GZip + +from zarr.codecs._numcodecs import 
get_numcodec +from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls + + +def test_get_numcodec() -> None: + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + + +def test_is_numcodec() -> None: + """ + Test the _is_numcodec function + """ + assert _is_numcodec(GZip()) + + +def test_is_numcodec_cls() -> None: + """ + Test the _is_numcodec_cls function + """ + assert _is_numcodec_cls(GZip) From 41dd6ff53664f7b33db24e2a56562a9ddc53484b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:32:31 +0200 Subject: [PATCH 03/54] changelog --- changes/3318.misc.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3318.misc.rst diff --git a/changes/3318.misc.rst b/changes/3318.misc.rst new file mode 100644 index 0000000000..f8308e6b97 --- /dev/null +++ b/changes/3318.misc.rst @@ -0,0 +1,2 @@ +Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. This is groundwork toward +making ``numcodecs`` an optional dependency for ``zarr-python``. 
\ No newline at end of file From c435a59728b6671250f14e4b512527bcbf329ea6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:43:41 +0200 Subject: [PATCH 04/54] ignore unknown key --- tests/test_codecs/test_numcodecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index a3824cc386..bb381c615a 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -7,7 +7,7 @@ def test_get_numcodec() -> None: - assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] def test_is_numcodec() -> None: From 8e50ef8606bc0ada69127841befe179717d2a0c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 Aug 2025 11:05:43 +0200 Subject: [PATCH 05/54] remove re-implementation of get_codec --- src/zarr/codecs/_numcodecs.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 4a8f43b5c6..b00f258db5 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,3 @@ -import numcodecs.registry as numcodecs_registry - from zarr.abc.codec import CodecJSON_V2 from zarr.codecs._v2 import Numcodec @@ -22,16 +20,11 @@ def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: Examples -------- - >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec = get_codec({'id': 'zlib', 'level': 1}) >>> codec Zlib(level=1) """ - codec_id = data["id"] - cls = numcodecs_registry.codec_registry.get(codec_id) - if cls is None and data in numcodecs_registry.entries: - cls = numcodecs_registry.entries[data].load() - numcodecs_registry.register_codec(cls, codec_id=data) - if cls is not None: - return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] - raise KeyError(data) + 
from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] From 156134f54a6b43a248db474d3e9e38864b5925c3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 16:13:04 +0200 Subject: [PATCH 06/54] add to_json methods to codecs --- src/zarr/codecs/blosc.py | 203 ++++++++++++++++++++++++----------- src/zarr/codecs/bytes.py | 114 ++++++++++++++++---- src/zarr/codecs/crc32c_.py | 68 +++++++++++- src/zarr/codecs/gzip.py | 80 ++++++++++++-- src/zarr/codecs/sharding.py | 147 ++++++++++++++++++++++--- src/zarr/codecs/transpose.py | 90 ++++++++++++++-- src/zarr/codecs/vlen_utf8.py | 101 +++++++++++++++-- src/zarr/codecs/zstd.py | 95 ++++++++++++++-- 8 files changed, 760 insertions(+), 138 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f89f127852..f40993b201 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,18 +1,22 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec @@ -22,39 +26,64 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", 
"bitshuffle") -class BloscShuffle(Enum): - """ - Enum for shuffle filter used by blosc. - """ +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int + typesize: NotRequired[int] -class BloscCname(Enum): +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: int + shuffle: BloscShuffle + blocksize: int + typesize: int + + +class BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V2. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): + """ + The JSON form of the Blosc codec in Zarr V3. 
+ """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"} + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) + + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.") + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -85,31 +114,35 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. 
Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): - """blosc codec""" - is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -120,24 +153,74 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname.value, + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), 
+ ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and blocksize need to be set for encoding.") + if zarr_format == 2: + return { + "id": "blosc", "clevel": self.clevel, - "shuffle": self.shuffle.value, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, - }, - } + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize, + }, + } + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 @@ -147,10 +230,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec @@ -158,15 +238,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 7576119c82..93fe2e034d 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,15 +1,16 @@ from __future__ import annotations import sys +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness from zarr.registry 
import register_codec @@ -28,35 +29,106 @@ class Endian(Enum): little = "little" -default_system_endian = Endian(sys.byteorder) +# TODO: unify with the endianness defined in core.dtype.common +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" + +default_system_endian = sys.byteorder + + +class BytesConfig(TypedDict): + endian: NotRequired[EndiannessStr] + + +class BytesJSON_V2(CodecJSON_V2[Literal["bytes"]], BytesConfig): ... + + +BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] + + +def parse_endianness(data: object) -> EndiannessStr: + if data in ENDIANNESS_STR: + return data # type: ignore [return-value] + raise ValueError(f"Invalid endianness: {data!r}. Expected one of {ENDIANNESS_STR}") + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BytesJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"id", "endian"}, {"id"}) + and data["id"] == "bytes" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BytesJSON_V3]: + return data == "bytes" or ( + ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name"}, {"name", "configuration"}) + and data["name"] == "bytes" + ) + and isinstance(data.get("configuration", {}), Mapping) + and set(data.get("configuration", {}).keys()) in ({"endian"}, set()) + ) @dataclass(frozen=True) class BytesCodec(ArrayBytesCodec): - """bytes codec""" - is_fixed_size = True - endian: Endian | None + endian: EndiannessStr | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: EndiannessStr | str | None = default_system_endian) -> None: + endian_parsed = None if endian is None else parse_endianness(endian) object.__setattr__(self, "endian", endian_parsed) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "bytes", 
require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.endian is None: + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls(endian=data.get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + # Three different representations of the exact same codec... + if data in ("bytes", {"name": "bytes"}, {"name": "bytes", "configuration": {}}): + return cls() + else: + return cls(endian=data["configuration"].get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @overload + def to_json(self, zarr_format: Literal[2]) -> BytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BytesJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BytesJSON_V2 | BytesJSON_V3: + if zarr_format == 2: + if self.endian is not None: + return { + "id": "bytes", + "endian": self.endian, + } + return {"id": "bytes"} + elif zarr_format == 3: + if self.endian is not None: + return { + "name": "bytes", + "configuration": {"endian": self.endian}, + } return {"name": "bytes"} - else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if not isinstance(array_spec.dtype, HasEndianness): @@ -75,9 +147,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None - if isinstance(chunk_spec.dtype, HasEndianness): - dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + endian = self.endian if self.endian is not None else None + if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: + dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] else: dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() @@ -109,7 +181,7 @@ async def _encode_single( ): # type-ignore is a numpy bug # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = chunk_array.dtype.newbyteorder(self.endian) # type: ignore[arg-type] chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index c2e30f689a..81072366e7 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,14 +1,15 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, cast, overload import numpy as np import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec -from zarr.core.common import JSON, parse_named_configuration +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration 
from zarr.registry import register_codec if TYPE_CHECKING: @@ -18,20 +19,77 @@ from zarr.core.buffer import Buffer +class Crc32Config(TypedDict): ... + + +class Crc32cJSON_V2(CodecJSON_V2[Literal["crc32c"]]): ... + + +class Crc32cJSON_V3(NamedConfig[Literal["crc32c"], Crc32Config]): ... + + +def check_json_v2(data: CodecJSON) -> TypeGuard[Crc32cJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()) == {"id"} and data["id"] == "crc32c" + + +def check_json_v3(data: CodecJSON) -> TypeGuard[Crc32cJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name", "configuration"}, {"name"}) + and data["name"] == "crc32c" + and data.get("configuration") in ({}, None) + ) + + @dataclass(frozen=True) class Crc32cCodec(BytesBytesCodec): - """crc32c codec""" - is_fixed_size = True @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) parse_named_configuration(data, "crc32c", require_configuration=False) return cls() + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls() + msg = ( + "Invalid Zarr V2 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('id')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls() + msg = ( + "Invalid Zarr V3 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('name')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) return {"name": "crc32c"} + @overload + def to_json(self, zarr_format: Literal[2]) -> Crc32cJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Crc32cJSON_V3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "crc32c"} + elif zarr_format == 3: + return {"name": "crc32c"} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + async def _decode_single( self, chunk_bytes: Buffer, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 9e6515a4d1..c2d3bbaa55 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -1,14 +1,19 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -28,10 +33,24 @@ def parse_gzip_level(data: JSON) -> int: return data +class GZipConfig(TypedDict): + level: int + + +class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): + """ + The JSON form of the GZip codec in Zarr V2. + """ + + +class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): + """ + The JSON form of the GZip codec in Zarr V3. 
+ """ + + @dataclass(frozen=True) class GzipCodec(BytesBytesCodec): - """gzip codec""" - is_fixed_size = False level: int = 5 @@ -43,11 +62,56 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> GZipJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: + if zarr_format == 2: + return {"id": "gzip", "level": self.level} + elif zarr_format == 3: + return {"name": "gzip", "configuration": {"level": self.level}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "level"} + and data["id"] == "gzip" + and isinstance(data["level"], int) + ) + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "gzip" + and isinstance(data["configuration"], dict) + and "level" in data["configuration"] + and isinstance(data["configuration"]["level"], int) + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls(level=data["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls(level=data["configuration"]["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}") async def _decode_single( self, diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 888d258649..54a3d0bc31 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,11 +1,22 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, NamedTuple, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + NamedTuple, + NotRequired, + Self, + TypedDict, + TypeGuard, + cast, + overload, +) import numpy as np import numpy.typing as npt @@ -15,7 +26,11 @@ ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, + CodecJSON, + CodecJSON_V2, + CodecJSON_V3, CodecPipeline, + CodecValidationError, ) 
from zarr.abc.store import ( ByteGetter, @@ -38,8 +53,8 @@ from zarr.core.common import ( ChunkCoords, ChunkCoordsLike, + NamedRequiredConfig, parse_enum, - parse_named_configuration, parse_shapelike, product, ) @@ -65,6 +80,28 @@ ShardMapping = Mapping[ChunkCoords, Buffer] ShardMutableMapping = MutableMapping[ChunkCoords, Buffer] +IndexLocation = Literal["start", "end"] + + +class ShardingConfigV2(TypedDict): + codecs: tuple[CodecJSON_V2[str], ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V2[str], ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingConfigV3(TypedDict): + codecs: tuple[CodecJSON_V3, ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V3, ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingJSON_V2(CodecJSON_V2[Literal["sharding_indexed"]], ShardingConfigV2): ... + + +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... + class ShardingCodecIndexLocation(Enum): """ @@ -79,6 +116,37 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +def check_json_v2(data: CodecJSON) -> TypeGuard[ShardingJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "codecs", "chunk_shape"} + and data["id"] == "sharding_indexed" + and isinstance(data["chunk_shape"], Sequence) + and not isinstance(data["chunk_shape"], str) + and isinstance(data["codecs"], Sequence) + and not isinstance(data["codecs"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ShardingJSON_V3]: + # TODO: Automate this with a function that does runtime type checking on typeddicts. 
+ return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "sharding_indexed" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"codecs", "chunk_shape", "index_codecs", "index_location"} + and isinstance(data["configuration"]["chunk_shape"], Sequence) + and not isinstance(data["configuration"]["chunk_shape"], str) + and isinstance(data["configuration"]["codecs"], Sequence) + and not isinstance(data["configuration"]["codecs"], str) + and isinstance(data["configuration"]["index_codecs"], Sequence) + and not isinstance(data["configuration"]["index_codecs"], str) + and data["configuration"]["index_location"] in ("start", "end") + ) + + @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): shard_dict: ShardMapping @@ -333,8 +401,6 @@ async def finalize( class ShardingCodec( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin ): - """Sharding codec""" - chunk_shape: ChunkCoords codecs: tuple[Codec, ...] index_codecs: tuple[Codec, ...] @@ -385,23 +451,76 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + codecs=data["codecs"], + index_codecs=data["index_codecs"], + index_location=data["index_location"], + chunk_shape=data["chunk_shape"], + ) + msg = ( + "Invalid Zarr V2 JSON representation of the sharding codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'codecs', 'index_codecs', 'chunk_shape', 'index_location')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + codecs=data["configuration"]["codecs"], + index_codecs=data["configuration"]["index_codecs"], + index_location=data["configuration"]["index_location"], + chunk_shape=data["configuration"]["chunk_shape"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('codecs', 'index_codecs', 'index_location', 'chunk_shape')" + ) + raise CodecValidationError(msg) @property def codec_pipeline(self) -> CodecPipeline: return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: - return { - "name": "sharding_indexed", - "configuration": { + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ShardingJSON_V3: ... 
+ + def to_json(self, zarr_format: int) -> ShardingJSON_V2 | ShardingJSON_V3: + if zarr_format == 2: + return { + "id": "sharding_indexed", + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), "chunk_shape": self.chunk_shape, - "codecs": tuple(s.to_dict() for s in self.codecs), - "index_codecs": tuple(s.to_dict() for s in self.index_codecs), "index_location": self.index_location.value, - }, - } + } + elif zarr_format == 3: + return { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": self.chunk_shape, + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "index_location": self.index_location.value, + }, + } + raise ValueError(f"Unsupported Zarr format {zarr_format}. Expected 2 or 3.") def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index c87804685c..4d976e529b 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -1,14 +1,19 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.abc.codec import ArrayArrayCodec +from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.array_spec import ArraySpec -from zarr.core.common import JSON, ChunkCoordsLike, parse_named_configuration +from zarr.core.common import ( + JSON, + ChunkCoordsLike, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -27,10 
+32,44 @@ def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: return tuple(cast("Iterable[int]", data)) +class TransposeConfig(TypedDict): + order: tuple[int, ...] + + +class TransposeJSON_V2(CodecJSON_V2[Literal["transpose"]], TransposeConfig): + """ + The JSON form of the Transpose codec in Zarr V2. + """ + + +class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): + """ + The JSON form of the Transpose codec in Zarr V3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[TransposeJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "configuration"} + and data["id"] == "transpose" + and isinstance(data["order"], Sequence) + and not isinstance(data["order"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[TransposeJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "transpose" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"order"} + ) + + @dataclass(frozen=True) class TransposeCodec(ArrayArrayCodec): - """Transpose codec""" - is_fixed_size = True order: tuple[int, ...] @@ -42,12 +81,47 @@ def __init__(self, *, order: ChunkCoordsLike) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v2(data): + return cls(order=data["order"]) # type: ignore[arg-type] + msg = ( + "Invalid Zarr V2 JSON representation of the transpose codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'order')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v3(data): + return cls(order=data["configuration"]["order"]) + msg = ( + "Invalid Zarr V3 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('order')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} + @overload + def to_json(self, zarr_format: Literal[2]) -> TransposeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> TransposeJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "transpose", "order": self.order} + elif zarr_format == 3: + return {"name": "transpose", "configuration": {"order": self.order}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + def validate( self, shape: tuple[int, ...], diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 28c64be1c0..4124883ab0 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,14 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -22,12 +22,29 @@ _vlen_bytes_codec = VLenBytes() +class VlenUF8Config(TypedDict): ... + + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): ... + + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... + + +class VLenBytesConfig(TypedDict): ... + + +class VLenBytesJSON_V2(CodecJSON_V2[Literal["vlen-bytes"]]): ... + + +VLenBytesJSON_V3 = NamedConfig[Literal["vlen-bytes"], VLenBytesConfig] | Literal["vlen-bytes"] + + @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): - """Variable-length UTF8 codec""" - @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration( data, "vlen-utf8", require_configuration=False ) @@ -37,6 +54,40 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-utf8", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenUTF8JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenUTF8JSON_V3: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: + if zarr_format == 2: + return {"id": "vlen-utf8"} + else: + return {"name": "vlen-utf8"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V2]: + return data == {"id": "vlen-utf8"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V3]: + return data in ( + {"name": "vlen-utf8"}, + {"name": "vlen-utf8", "configuration": {}}, + "vlen-utf8", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -74,15 +125,45 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "vlen-bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenBytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenBytesJSON_V3: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> VLenBytesJSON_V2 | VLenBytesJSON_V3: + if zarr_format == 2: + return {"id": "vlen-bytes"} + else: + return {"name": "vlen-bytes"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V2]: + return data == {"id": "vlen-bytes"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V3]: + return data in ( + {"name": "vlen-bytes"}, + {"name": "vlen-bytes", "configuration": {}}, + "vlen-bytes", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index ead41e7b5f..f942bd5700 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -1,17 +1,22 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, overload import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -21,6 +26,41 @@ from zarr.core.buffer import Buffer +class ZstdConfig_V2(TypedDict): + 
level: int + + +class ZstdConfig_V3(TypedDict): + level: int + checksum: bool + + +class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdConfig_V2): + """ + The JSON form of the ZStandard codec in Zarr v2. + """ + + +class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): + """ + The JSON form of the ZStandard codec in Zarr v3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[ZstdJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()).issuperset({"id", "level"}) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "zstd" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"level", "checksum"} + ) + + def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): if data >= 23: @@ -37,8 +77,6 @@ def parse_checksum(data: JSON) -> bool: @dataclass(frozen=True) class ZstdCodec(BytesBytesCodec): - """zstd codec""" - is_fixed_size = True level: int = 0 @@ -61,11 +99,52 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "zstd") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + if "checksum" in data: + return cls(level=data["level"], checksum=data["checksum"]) + else: + return cls(level=data["level"]) + + msg = ( + "Invalid Zarr V2 JSON representation of the zstd codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'level')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + level=data["configuration"]["level"], checksum=data["configuration"]["checksum"] + ) + msg = ( + "Invalid Zarr V3 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration') " + "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ZstdJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: + if zarr_format == 2: + return {"id": "zstd", "level": self.level} + else: + return { + "name": "zstd", + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: From 486f8377be38b293b759ca33bda5b9d59ae2e9b4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 16:51:40 +0200 Subject: [PATCH 07/54] add codecvalidationerror --- src/zarr/errors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/zarr/errors.py b/src/zarr/errors.py index 4f972a6703..d859b12e02 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -66,3 +66,8 @@ class NodeTypeValidationError(MetadataValidationError): This can be raised when the value is invalid or unexpected given the context, for example an 'array' node when we expected a 'group'. """ + +class CodecValidationError(BaseZarrError): + """Raised when the Zarr codec metadata is invalid in some way""" + + _msg = "Invalid value for '{}'. Expected '{}'. Got '{}'." 
From dd53981184ba9eb4bb3181787ca5b5c61b3dea89 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 16:52:18 +0200 Subject: [PATCH 08/54] fix v2 codec json models to avoid inheritance --- src/zarr/codecs/blosc.py | 19 +++++++++++++++---- src/zarr/codecs/bytes.py | 9 +++++++-- src/zarr/codecs/crc32c_.py | 4 ++-- src/zarr/codecs/gzip.py | 5 +++-- src/zarr/codecs/sharding.py | 11 +++++++---- src/zarr/codecs/transpose.py | 9 +++++---- src/zarr/codecs/zstd.py | 8 +++++--- 7 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f40993b201..b046ac04e1 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -4,13 +4,23 @@ from collections.abc import Mapping from dataclasses import dataclass, replace from functools import cached_property -from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload +from typing import ( + TYPE_CHECKING, + Final, + Literal, + NotRequired, + TypedDict, + TypeGuard, + overload, +) + +from typing_extensions import ReadOnly import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( JSON, @@ -18,6 +28,7 @@ ZarrFormat, ) from zarr.core.dtype.common import HasItemSize +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -49,11 +60,11 @@ class BloscConfigV3(TypedDict): typesize: int -class BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2): +class BloscJSON_V2(BloscConfigV2): """ The JSON form of the Blosc codec in Zarr V2. 
""" - + id: ReadOnly[Literal["blosc"]] class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 93fe2e034d..6f5a103cf8 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -7,8 +7,9 @@ from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np +from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec, CodecJSON from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness @@ -40,7 +41,11 @@ class BytesConfig(TypedDict): endian: NotRequired[EndiannessStr] -class BytesJSON_V2(CodecJSON_V2[Literal["bytes"]], BytesConfig): ... +class BytesJSON_V2(BytesConfig): + """ + JSON representation of the bytes codec for zarr v2. + """ + id: ReadOnly[Literal["bytes"]] BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 81072366e7..a73275ac14 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -8,10 +8,10 @@ import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec - +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index c2d3bbaa55..fb4dc70209 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from numcodecs.gzip import 
GZip - +from typing_extensions import ReadOnly from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( @@ -37,10 +37,11 @@ class GZipConfig(TypedDict): level: int -class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): +class GZipJSON_V2(GZipConfig): """ The JSON form of the GZip codec in Zarr V2. """ + id: ReadOnly[Literal["gzip"]] class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 54a3d0bc31..3bda7112dd 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -17,7 +17,7 @@ cast, overload, ) - +from typing_extensions import ReadOnly import numpy as np import numpy.typing as npt @@ -30,8 +30,8 @@ CodecJSON_V2, CodecJSON_V3, CodecPipeline, - CodecValidationError, ) +from zarr.errors import CodecValidationError from zarr.abc.store import ( ByteGetter, ByteRequest, @@ -97,8 +97,11 @@ class ShardingConfigV3(TypedDict): index_location: NotRequired[Literal["start", "end"]] -class ShardingJSON_V2(CodecJSON_V2[Literal["sharding_indexed"]], ShardingConfigV2): ... - +class ShardingJSON_V2(ShardingConfigV2): + """ + The JSON form of the sharding codec in Zarr V2. + """ + id: ReadOnly[Literal["sharding_indexed"]] class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... 
diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 4d976e529b..7176521c74 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np - -from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from typing_extensions import ReadOnly +from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2 from zarr.core.array_spec import ArraySpec from zarr.core.common import ( JSON, @@ -14,6 +14,7 @@ NamedRequiredConfig, ZarrFormat, ) +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -36,11 +37,11 @@ class TransposeConfig(TypedDict): order: tuple[int, ...] -class TransposeJSON_V2(CodecJSON_V2[Literal["transpose"]], TransposeConfig): +class TransposeJSON_V2(TransposeConfig): """ The JSON form of the Transpose codec in Zarr V2. """ - + id: ReadOnly[Literal["transpose"]] class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): """ diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index f942bd5700..802b63447e 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -9,8 +9,9 @@ import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version - -from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.errors import CodecValidationError +from typing_extensions import ReadOnly +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( JSON, @@ -35,10 +36,11 @@ class ZstdConfig_V3(TypedDict): checksum: bool -class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdConfig_V2): +class ZstdJSON_V2(ZstdConfig_V2): """ The JSON form of the ZStandard codec in Zarr v2. 
""" + id: ReadOnly[Literal["zstd"]] class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): From 678889aa73b31e0174d3025abb7a0c01117bbd02 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 16:52:33 +0200 Subject: [PATCH 09/54] add blosc json test --- tests/test_codecs/test_blosc.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 6e6e9df383..1abb613e2c 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -8,10 +8,23 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BloscCodec +from zarr.codecs.blosc import BLOSC_CNAME, BLOSC_SHUFFLE, BloscCname, BloscJSON_V2, BloscShuffle from zarr.core.buffer import default_buffer_prototype from zarr.storage import StorePath +@pytest.mark.parametrize('shuffle', BLOSC_SHUFFLE) +@pytest.mark.parametrize('cname', BLOSC_CNAME) +@pytest.mark.parametrize('clevel', [1,2]) +@pytest.mark.parametrize('blocksize', [1,2]) +@pytest.mark.parametrize('typesize', [1,2]) +def test_to_json_v2(cname: BloscCname, shuffle: BloscShuffle, clevel: int, blocksize: int, typesize: int) -> None: + codec= BloscCodec(shuffle=shuffle, cname=cname, clevel=clevel, blocksize=blocksize, typesize=typesize) + expected_v2: BloscJSON_V2 = {"id": "blosc", "cname": cname, "clevel": clevel, "shuffle": BLOSC_SHUFFLE.index(shuffle), "blocksize": blocksize} + expected_v3: BloscJSON_V3 = {"name": "blosc", "configuration": {"cname": cname, "clevel": clevel, "shuffle": shuffle, "blocksize": blocksize, "typesize": typesize}} + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype", ["uint8", "uint16"]) async def test_blosc_evolve(store: Store, dtype: str) -> None: From dfca3ec0b819c42e5d434eb577c23ce06cf7caea Mon Sep 17 00:00:00 2001 From: Davis 
Vann Bennett Date: Mon, 4 Aug 2025 16:53:03 +0200 Subject: [PATCH 10/54] distinguish namedconfig from namedrequiredconfig --- src/zarr/core/common.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 33590c83a5..a199dd4399 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -14,6 +14,7 @@ Final, Generic, Literal, + NotRequired, TypedDict, TypeVar, cast, @@ -52,8 +53,26 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): """ - A typed dictionary representing an object with a name and configuration, where the configuration - is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + A typed dictionary representing an object with a `"name"` and `"configuration"` keys. + + The configuration key is not required. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: NotRequired[ReadOnly[TConfig]] + """The configuration of the object.""" + + +class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): + """ + A typed dictionary representing an object with a `"name"` and `"configuration"` keys. + + The configuration is required. This class is generic with two type parameters: the type of the name (``TName``) and the type of the configuration (``TConfig``). 
From 262e369ecfe4f498a9290529620fe8167d184915 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 16:53:44 +0200 Subject: [PATCH 11/54] lint --- src/zarr/codecs/blosc.py | 7 +++--- src/zarr/codecs/bytes.py | 1 + src/zarr/codecs/crc32c_.py | 3 ++- src/zarr/codecs/gzip.py | 4 +++- src/zarr/codecs/sharding.py | 7 ++++-- src/zarr/codecs/transpose.py | 5 ++++- src/zarr/codecs/zstd.py | 4 +++- src/zarr/errors.py | 1 + tests/test_codecs/test_blosc.py | 38 +++++++++++++++++++++++++-------- 9 files changed, 52 insertions(+), 18 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index b046ac04e1..2bad85257b 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -14,13 +14,12 @@ overload, ) -from typing_extensions import ReadOnly - import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version +from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( JSON, @@ -64,8 +63,10 @@ class BloscJSON_V2(BloscConfigV2): """ The JSON form of the Blosc codec in Zarr V2. """ + id: ReadOnly[Literal["blosc"]] + class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ The JSON form of the Blosc codec in Zarr V3. diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 6f5a103cf8..9682ff9860 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -45,6 +45,7 @@ class BytesJSON_V2(BytesConfig): """ JSON representation of the bytes codec for zarr v2. 
""" + id: ReadOnly[Literal["bytes"]] diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index a73275ac14..6fa656b3b8 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -10,8 +10,9 @@ from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration -from zarr.registry import register_codec from zarr.errors import CodecValidationError +from zarr.registry import register_codec + if TYPE_CHECKING: from typing import Self diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index fb4dc70209..bddaa2bca6 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -7,7 +7,8 @@ from numcodecs.gzip import GZip from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 + +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( JSON, @@ -41,6 +42,7 @@ class GZipJSON_V2(GZipConfig): """ The JSON form of the GZip codec in Zarr V2. 
""" + id: ReadOnly[Literal["gzip"]] diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 3bda7112dd..e2fcb7020a 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -17,9 +17,10 @@ cast, overload, ) -from typing_extensions import ReadOnly + import numpy as np import numpy.typing as npt +from typing_extensions import ReadOnly from zarr.abc.codec import ( ArrayBytesCodec, @@ -31,7 +32,6 @@ CodecJSON_V3, CodecPipeline, ) -from zarr.errors import CodecValidationError from zarr.abc.store import ( ByteGetter, ByteRequest, @@ -67,6 +67,7 @@ morton_order_iter, ) from zarr.core.metadata.v3 import parse_codecs +from zarr.errors import CodecValidationError from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec if TYPE_CHECKING: @@ -101,8 +102,10 @@ class ShardingJSON_V2(ShardingConfigV2): """ The JSON form of the sharding codec in Zarr V2. """ + id: ReadOnly[Literal["sharding_indexed"]] + class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 7176521c74..2585479839 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -6,7 +6,8 @@ import numpy as np from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2 + +from zarr.abc.codec import ArrayArrayCodec, CodecJSON from zarr.core.array_spec import ArraySpec from zarr.core.common import ( JSON, @@ -41,8 +42,10 @@ class TransposeJSON_V2(TransposeConfig): """ The JSON form of the Transpose codec in Zarr V2. """ + id: ReadOnly[Literal["transpose"]] + class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): """ The JSON form of the Transpose codec in Zarr V3. 
diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 802b63447e..ba0bc2461a 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -9,8 +9,8 @@ import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version -from zarr.errors import CodecValidationError from typing_extensions import ReadOnly + from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( @@ -18,6 +18,7 @@ NamedRequiredConfig, ZarrFormat, ) +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -40,6 +41,7 @@ class ZstdJSON_V2(ZstdConfig_V2): """ The JSON form of the ZStandard codec in Zarr v2. """ + id: ReadOnly[Literal["zstd"]] diff --git a/src/zarr/errors.py b/src/zarr/errors.py index d859b12e02..b1a8d206a5 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -67,6 +67,7 @@ class NodeTypeValidationError(MetadataValidationError): for example an 'array' node when we expected a 'group'. 
""" + class CodecValidationError(BaseZarrError): """Raised when the Zarr codec metadata is invalid in some way""" diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 1abb613e2c..8d3b1c45bd 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -13,18 +13,38 @@ from zarr.storage import StorePath -@pytest.mark.parametrize('shuffle', BLOSC_SHUFFLE) -@pytest.mark.parametrize('cname', BLOSC_CNAME) -@pytest.mark.parametrize('clevel', [1,2]) -@pytest.mark.parametrize('blocksize', [1,2]) -@pytest.mark.parametrize('typesize', [1,2]) -def test_to_json_v2(cname: BloscCname, shuffle: BloscShuffle, clevel: int, blocksize: int, typesize: int) -> None: - codec= BloscCodec(shuffle=shuffle, cname=cname, clevel=clevel, blocksize=blocksize, typesize=typesize) - expected_v2: BloscJSON_V2 = {"id": "blosc", "cname": cname, "clevel": clevel, "shuffle": BLOSC_SHUFFLE.index(shuffle), "blocksize": blocksize} - expected_v3: BloscJSON_V3 = {"name": "blosc", "configuration": {"cname": cname, "clevel": clevel, "shuffle": shuffle, "blocksize": blocksize, "typesize": typesize}} +@pytest.mark.parametrize("shuffle", BLOSC_SHUFFLE) +@pytest.mark.parametrize("cname", BLOSC_CNAME) +@pytest.mark.parametrize("clevel", [1, 2]) +@pytest.mark.parametrize("blocksize", [1, 2]) +@pytest.mark.parametrize("typesize", [1, 2]) +def test_to_json_v2( + cname: BloscCname, shuffle: BloscShuffle, clevel: int, blocksize: int, typesize: int +) -> None: + codec = BloscCodec( + shuffle=shuffle, cname=cname, clevel=clevel, blocksize=blocksize, typesize=typesize + ) + expected_v2: BloscJSON_V2 = { + "id": "blosc", + "cname": cname, + "clevel": clevel, + "shuffle": BLOSC_SHUFFLE.index(shuffle), + "blocksize": blocksize, + } + expected_v3: BloscJSON_V3 = { + "name": "blosc", + "configuration": { + "cname": cname, + "clevel": clevel, + "shuffle": shuffle, + "blocksize": blocksize, + "typesize": typesize, + }, + } assert codec.to_json(zarr_format=2) == expected_v2 
assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype", ["uint8", "uint16"]) async def test_blosc_evolve(store: Store, dtype: str) -> None: From 4c7fe8ab18e42ce67a964966c1ef5ebf9478a3d4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 17:07:02 +0200 Subject: [PATCH 12/54] make codecvalidationerror effectively single-argument --- src/zarr/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/errors.py b/src/zarr/errors.py index b1a8d206a5..24c3996d42 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -71,4 +71,4 @@ class NodeTypeValidationError(MetadataValidationError): class CodecValidationError(BaseZarrError): """Raised when the Zarr codec metadata is invalid in some way""" - _msg = "Invalid value for '{}'. Expected '{}'. Got '{}'." + _msg = "{}" From 1e23a91ce3601dde8b366bfe673baf970bab1910 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 17:11:47 +0200 Subject: [PATCH 13/54] rename test_endian to test_bytes --- tests/test_codecs/{test_endian.py => test_bytes.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/test_codecs/{test_endian.py => test_bytes.py} (100%) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_bytes.py similarity index 100% rename from tests/test_codecs/test_endian.py rename to tests/test_codecs/test_bytes.py From d7d4e02e2809a18585fdbbf25302015320ee12f6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 17:54:22 +0200 Subject: [PATCH 14/54] bring in update codec abc --- src/zarr/abc/codec.py | 57 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d5c995d2ca..c1140ee4be 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,13 +2,20 @@ from abc import abstractmethod from 
collections.abc import Mapping -from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar +from typing import ( + TYPE_CHECKING, + Generic, + Literal, + TypedDict, + TypeVar, + overload, +) -from typing_extensions import ReadOnly, TypedDict +from typing_extensions import ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -41,22 +48,16 @@ class CodecJSON_V2(TypedDict, Generic[TName]): - """The JSON representation of a codec for Zarr V2""" - id: ReadOnly[TName] -def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: - return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str) - +CodecConfig_V3 = NamedConfig[str, Mapping[str, object]] -CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]] -"""The JSON representation of a codec for Zarr V3.""" +CodecJSON_V3 = str | CodecConfig_V3 -# The widest type we will *accept* for a codec JSON +# The widest type we will accept for a codec JSON # This covers v2 and v3 CodecJSON = str | Mapping[str, object] -"""The widest type of JSON-like input that could specify a codec.""" class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): @@ -181,6 +182,34 @@ async def encode( """ return await _batching_helper(self._encode_single, chunks_and_specs) + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... 
+ + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + raise NotImplementedError + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" @@ -471,3 +500,7 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return await func(chunk, chunk_spec) return wrap + + +# Raised when a codec JSON data is invalid +class CodecValidationError(ValueError): ... From 99808230e526b8c6bfa5dc9725d614f95cdce3bd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 18:08:27 +0200 Subject: [PATCH 15/54] add to_json_tests --- tests/test_codecs/test_blosc.py | 2 +- tests/test_codecs/test_bytes.py | 17 +++++++++++++++ tests/test_codecs/test_gzip.py | 17 +++++++++++++++ tests/test_codecs/test_sharding.py | 32 ++++++++++++++++++++++++++++- tests/test_codecs/test_transpose.py | 12 +++++++++++ tests/test_codecs/test_vlen.py | 11 ++++++++++ tests/test_codecs/test_zstd.py | 18 ++++++++++++++++ 7 files changed, 107 insertions(+), 2 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 8d3b1c45bd..ad09414e32 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -8,7 +8,7 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BloscCodec -from zarr.codecs.blosc import BLOSC_CNAME, BLOSC_SHUFFLE, BloscCname, BloscJSON_V2, BloscShuffle +from zarr.codecs.blosc import 
BLOSC_CNAME, BLOSC_SHUFFLE, BloscCname, BloscJSON_V2, BloscJSON_V3, BloscShuffle from zarr.core.buffer import default_buffer_prototype from zarr.storage import StorePath diff --git a/tests/test_codecs/test_bytes.py b/tests/test_codecs/test_bytes.py index ab64afb1b8..02afd523b9 100644 --- a/tests/test_codecs/test_bytes.py +++ b/tests/test_codecs/test_bytes.py @@ -6,10 +6,27 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec +from zarr.codecs.bytes import BytesJSON_V2, BytesJSON_V3 from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy +@pytest.mark.parametrize("endian", ["big", "little"]) +def test_bytescodec_to_json(endian: Literal["big", "little"]) -> None: + codec = BytesCodec(endian=endian) + expected_v2: BytesJSON_V2 = { + "id": "bytes", + "endian": endian, + } + expected_v3: BytesJSON_V3 = { + "name": "bytes", + "configuration": { + "endian": endian, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 4753036c87..3f7cb204fb 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -4,8 +4,25 @@ import zarr from zarr.abc.store import Store from zarr.codecs import GzipCodec +from zarr.codecs.gzip import GZipJSON_V2, GZipJSON_V3 from zarr.storage import StorePath +@pytest.mark.parametrize("level", [1, 5, 9]) +def test_json(level: int) -> None: + codec = GzipCodec(level=level) + expected_v2: GZipJSON_V2 = { + "id": "gzip", + "level": level, + } + expected_v3: GZipJSON_V3 = { + "name": "gzip", + "configuration": { + "level": level, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", 
["local", "memory"], indirect=["store"]) def test_gzip(store: Store) -> None: diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 403fd80e81..68dfede522 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,6 @@ +from codecs import Codec import pickle -from typing import Any +from typing import Any, Literal import numpy as np import numpy.typing as npt @@ -16,6 +17,9 @@ ShardingCodecIndexLocation, TransposeCodec, ) +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.crc32c_ import Crc32cCodec +from zarr.codecs.sharding import ShardingJSON_V2, ShardingJSON_V3 from zarr.core.buffer import NDArrayLike, default_buffer_prototype from zarr.storage import StorePath @@ -23,6 +27,32 @@ from .test_codecs import _AsyncArrayProxy, order_from_dim +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("chunk_shape", [(32, 32), (64, 64)]) +@pytest.mark.parametrize('codecs', [(BytesCodec(),)]) +@pytest.mark.parametrize('index_codecs', [(Crc32cCodec(),)]) +def test_sharding_codec_to_json(index_location: Literal["start", "end"], chunk_shape: tuple[int, ...], codecs: tuple[Codec, ...], index_codecs: tuple[Codec, ...]) -> None: + codec = ShardingCodec(chunk_shape=chunk_shape, codecs=codecs, index_location=index_location, index_codecs=index_codecs) + expected_v2: ShardingJSON_V2 = { + "id": "sharding_indexed", + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=2) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=2) for c in index_codecs), + "index_location": index_location, + } + expected_v3: ShardingJSON_V3 = { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=3) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=3) for c in index_codecs), + "index_location": index_location, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert 
codec.to_json(zarr_format=3) == expected_v3 + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize( diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 06ec668ad3..ce342d45a0 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -5,12 +5,24 @@ from zarr import AsyncArray, config from zarr.abc.store import Store from zarr.codecs import TransposeCodec +from zarr.codecs.transpose import TransposeJSON_V2, TransposeJSON_V3 from zarr.core.common import MemoryOrder from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy +@pytest.mark.parametrize("order", [(1,2,3), (2,1,0)]) +def test_transpose_to_json(order: tuple[int, ...]) -> None: + codec = TransposeCodec(order=order) + expected_v2: TransposeJSON_V2 = {"id": "transpose", "order": order} + expected_v3: TransposeJSON_V3 = { + "name": "transpose", + "configuration": {"order": order}, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) @pytest.mark.parametrize("runtime_read_order", ["F", "C"]) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index cf0905daca..8710ffe51b 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,6 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.codecs.vlen_utf8 import VLenUTF8Codec, VLenUTF8JSON_V2, VLenUTF8JSON_V3 from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata @@ -21,6 +22,16 @@ else: expected_array_string_dtype = np.dtype("O") +def 
test_vlen_utf8_to_json() -> None: + codec = VLenUTF8Codec() + expected_v2: VLenUTF8JSON_V2 = { + "id": "vlen-utf8"} + expected_v3: VLenUTF8JSON_V3 = { + "name": "vlen-utf8", + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 6068f53443..77ecc79a81 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -4,8 +4,26 @@ import zarr from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.codecs.zstd import ZstdJSON_V2, ZstdJSON_V3 from zarr.storage import StorePath +@pytest.mark.parametrize("level", [1, 5, 9]) +@pytest.mark.parametrize('checksum', [True, False]) +def test_json(level: int, checksum: bool) -> None: + codec = ZstdCodec(level=level, checksum=checksum) + expected_v2: ZstdJSON_V2 = { + "id": "zstd", + "level": level, + } + expected_v3: ZstdJSON_V3 = { + "name": "zstd", + "configuration": { + "level": level, + "checksum": checksum + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("checksum", [True, False]) From cbb32d7f121e89f757d238797732e034da89a764 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 19:36:51 +0200 Subject: [PATCH 16/54] lint --- tests/test_codecs/test_blosc.py | 9 +++++++- tests/test_codecs/test_bytes.py | 7 ++++-- tests/test_codecs/test_gzip.py | 7 +++++- tests/test_codecs/test_sharding.py | 34 +++++++++++++++++++---------- tests/test_codecs/test_transpose.py | 9 ++++++-- tests/test_codecs/test_vlen.py | 4 ++-- tests/test_codecs/test_zstd.py | 15 ++++++++----- 7 files changed, 60 insertions(+), 25 
deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index ad09414e32..296889a55a 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -8,7 +8,14 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BloscCodec -from zarr.codecs.blosc import BLOSC_CNAME, BLOSC_SHUFFLE, BloscCname, BloscJSON_V2, BloscJSON_V3, BloscShuffle +from zarr.codecs.blosc import ( + BLOSC_CNAME, + BLOSC_SHUFFLE, + BloscCname, + BloscJSON_V2, + BloscJSON_V3, + BloscShuffle, +) from zarr.core.buffer import default_buffer_prototype from zarr.storage import StorePath diff --git a/tests/test_codecs/test_bytes.py b/tests/test_codecs/test_bytes.py index 02afd523b9..e991885fa6 100644 --- a/tests/test_codecs/test_bytes.py +++ b/tests/test_codecs/test_bytes.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np import pytest @@ -6,11 +6,14 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec -from zarr.codecs.bytes import BytesJSON_V2, BytesJSON_V3 from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.bytes import BytesJSON_V2, BytesJSON_V3 + + @pytest.mark.parametrize("endian", ["big", "little"]) def test_bytescodec_to_json(endian: Literal["big", "little"]) -> None: codec = BytesCodec(endian=endian) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 3f7cb204fb..db146c04de 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,12 +1,17 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest import zarr from zarr.abc.store import Store from zarr.codecs import GzipCodec -from zarr.codecs.gzip import GZipJSON_V2, GZipJSON_V3 from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.gzip import GZipJSON_V2, GZipJSON_V3 + + @pytest.mark.parametrize("level", [1, 5, 9]) def 
test_json(level: int) -> None: codec = GzipCodec(level=level) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 68dfede522..e7ef5bc61c 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,6 +1,6 @@ -from codecs import Codec import pickle -from typing import Any, Literal +from codecs import Codec +from typing import TYPE_CHECKING, Any, Literal import numpy as np import numpy.typing as npt @@ -19,20 +19,32 @@ ) from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.codecs.sharding import ShardingJSON_V2, ShardingJSON_V3 from zarr.core.buffer import NDArrayLike, default_buffer_prototype from zarr.storage import StorePath from ..conftest import ArrayRequest from .test_codecs import _AsyncArrayProxy, order_from_dim +if TYPE_CHECKING: + from zarr.codecs.sharding import ShardingJSON_V2, ShardingJSON_V3 + @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("chunk_shape", [(32, 32), (64, 64)]) -@pytest.mark.parametrize('codecs', [(BytesCodec(),)]) -@pytest.mark.parametrize('index_codecs', [(Crc32cCodec(),)]) -def test_sharding_codec_to_json(index_location: Literal["start", "end"], chunk_shape: tuple[int, ...], codecs: tuple[Codec, ...], index_codecs: tuple[Codec, ...]) -> None: - codec = ShardingCodec(chunk_shape=chunk_shape, codecs=codecs, index_location=index_location, index_codecs=index_codecs) +@pytest.mark.parametrize("codecs", [(BytesCodec(),)]) +@pytest.mark.parametrize("index_codecs", [(Crc32cCodec(),)]) +def test_sharding_codec_to_json( + index_location: Literal["start", "end"], + chunk_shape: tuple[int, ...], + codecs: tuple[Codec, ...], + index_codecs: tuple[Codec, ...], +) -> None: + codec = ShardingCodec( + chunk_shape=chunk_shape, + codecs=codecs, + index_location=index_location, + index_codecs=index_codecs, + ) expected_v2: ShardingJSON_V2 = { "id": "sharding_indexed", "chunk_shape": chunk_shape, @@ 
-43,10 +55,10 @@ def test_sharding_codec_to_json(index_location: Literal["start", "end"], chunk_s expected_v3: ShardingJSON_V3 = { "name": "sharding_indexed", "configuration": { - "chunk_shape": chunk_shape, - "codecs": tuple(c.to_json(zarr_format=3) for c in codecs), - "index_codecs": tuple(c.to_json(zarr_format=3) for c in index_codecs), - "index_location": index_location, + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=3) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=3) for c in index_codecs), + "index_location": index_location, }, } assert codec.to_json(zarr_format=2) == expected_v2 diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index ce342d45a0..0d48d3fddf 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -5,14 +7,16 @@ from zarr import AsyncArray, config from zarr.abc.store import Store from zarr.codecs import TransposeCodec -from zarr.codecs.transpose import TransposeJSON_V2, TransposeJSON_V3 from zarr.core.common import MemoryOrder from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.transpose import TransposeJSON_V2, TransposeJSON_V3 + -@pytest.mark.parametrize("order", [(1,2,3), (2,1,0)]) +@pytest.mark.parametrize("order", [(1, 2, 3), (2, 1, 0)]) def test_transpose_to_json(order: tuple[int, ...]) -> None: codec = TransposeCodec(order=order) expected_v2: TransposeJSON_V2 = {"id": "transpose", "order": order} @@ -23,6 +27,7 @@ def test_transpose_to_json(order: tuple[int, ...]) -> None: assert codec.to_json(zarr_format=2) == expected_v2 assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) @pytest.mark.parametrize("runtime_read_order", ["F", "C"]) diff --git 
a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 8710ffe51b..50c69b6ae2 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -22,10 +22,10 @@ else: expected_array_string_dtype = np.dtype("O") + def test_vlen_utf8_to_json() -> None: codec = VLenUTF8Codec() - expected_v2: VLenUTF8JSON_V2 = { - "id": "vlen-utf8"} + expected_v2: VLenUTF8JSON_V2 = {"id": "vlen-utf8"} expected_v3: VLenUTF8JSON_V3 = { "name": "vlen-utf8", } diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 77ecc79a81..7c18a25103 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,14 +1,19 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest import zarr from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.codecs.zstd import ZstdJSON_V2, ZstdJSON_V3 from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.zstd import ZstdJSON_V2, ZstdJSON_V3 + + @pytest.mark.parametrize("level", [1, 5, 9]) -@pytest.mark.parametrize('checksum', [True, False]) +@pytest.mark.parametrize("checksum", [True, False]) def test_json(level: int, checksum: bool) -> None: codec = ZstdCodec(level=level, checksum=checksum) expected_v2: ZstdJSON_V2 = { @@ -17,14 +22,12 @@ def test_json(level: int, checksum: bool) -> None: } expected_v3: ZstdJSON_V3 = { "name": "zstd", - "configuration": { - "level": level, - "checksum": checksum - }, + "configuration": {"level": level, "checksum": checksum}, } assert codec.to_json(zarr_format=2) == expected_v2 assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("checksum", [True, False]) def test_zstd(store: Store, checksum: bool) -> None: From e2d4df86465e8d201fd271a02f001a8b7cbebef0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 19:48:12 +0200 Subject: [PATCH 17/54] fix broken tests that used invalid 
codec JSON --- tests/test_array.py | 4 ++-- tests/test_config.py | 2 +- tests/test_group.py | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 6342ce6430..8453bb05d5 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1105,8 +1105,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": True}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": True}},), ], ) @pytest.mark.parametrize( diff --git a/tests/test_config.py b/tests/test_config.py index da5b2cc488..b1de6e392c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -180,7 +180,7 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {}}], + compressors=[{"name": "blosc", "configuration": {"cname": "lz4", "clevel": 1, "shuffle": "noshuffle", "blocksize": 1, "typesize": 1}},], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_group.py b/tests/test_group.py index 7705fa205a..e9cd2779a4 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -16,6 +16,7 @@ import zarr import zarr.api.asynchronous import zarr.api.synchronous +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 import zarr.storage from zarr import Array, AsyncArray, AsyncGroup, Group from zarr.abc.store import Store @@ -523,7 +524,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "chunks": (1,), "order": "C", "filters": None, - "compressor": Blosc(), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { @@ -550,8 +551,11 @@ 
def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "name": "default", }, "codecs": ( - {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + default_serializer_v3(dtype).to_json(zarr_format=zarr_format), + *[ + c.to_json(zarr_format=zarr_format) + for c in default_compressors_v3(dtype) + ], ), "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, From d91b0e9e80213c13b9600f2f73a152d0ba197ec3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 19:50:53 +0200 Subject: [PATCH 18/54] update test_info --- tests/test_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_info.py b/tests/test_info.py index 28c8803c83..08f2318dc2 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -74,7 +74,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : ()""") @@ -117,7 +117,7 @@ def test_array_info_complete( Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No. 
bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted}) From 1eb5b3c148e97576084e8e0553aa91132aa91f85 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 20:42:59 +0200 Subject: [PATCH 19/54] avoid circular imports by moving numcodec protocol to codec abc --- src/zarr/abc/codec.py | 23 ++++++++++++++++++++++- src/zarr/codecs/_numcodecs.py | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index c1140ee4be..03a267fdc8 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -4,14 +4,16 @@ from collections.abc import Mapping from typing import ( TYPE_CHECKING, + ClassVar, Generic, Literal, + Self, TypedDict, TypeVar, overload, ) -from typing_extensions import ReadOnly +from typing_extensions import Protocol, ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer @@ -504,3 +506,22 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | # Raised when a codec JSON data is invalid class CodecValidationError(ValueError): ... + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... 
diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index b00f258db5..33bc216d09 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,5 @@ from zarr.abc.codec import CodecJSON_V2 -from zarr.codecs._v2 import Numcodec +from zarr.abc.codec import Numcodec def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: From 94ba77ad2dcfb103f61d167f8b4165c4483824cd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 20:46:28 +0200 Subject: [PATCH 20/54] use Numcodec instead of numcodecs.abc.Codec --- src/zarr/api/asynchronous.py | 3 +- src/zarr/api/synchronous.py | 3 +- src/zarr/codecs/_numcodecs.py | 3 +- src/zarr/codecs/_v2.py | 24 ++------------ src/zarr/core/_info.py | 8 ++--- src/zarr/core/array.py | 32 +++++++++---------- src/zarr/core/metadata/v2.py | 15 +++++---- tests/test_config.py | 13 +++++++- tests/test_group.py | 5 ++- .../test_v2_dtype_regression.py | 6 ++-- 10 files changed, 49 insertions(+), 63 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 044d881c22..34e80a9a08 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -46,8 +46,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from zarr.abc.codec import Codec - from zarr.codecs._v2 import Numcodec + from zarr.abc.codec import Codec, Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index df667c0b23..4048b4090b 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -17,9 +17,8 @@ import numpy as np import numpy.typing as npt - from zarr.abc.codec import Codec + from zarr.abc.codec import Codec, Numcodec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, diff 
--git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 33bc216d09..2d915c5368 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,4 @@ -from zarr.abc.codec import CodecJSON_V2 -from zarr.abc.codec import Numcodec +from zarr.abc.codec import CodecJSON_V2, Numcodec def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8deae99a6d..4303bf6de6 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,13 +2,12 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING, TypeGuard import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec, Numcodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: @@ -16,25 +15,6 @@ from zarr.core.buffer import Buffer, NDBuffer -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... - - def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: """ Check if the given object implements the Numcodec protocol. 
diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..6e51442d50 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -4,9 +4,9 @@ import textwrap from typing import TYPE_CHECKING, Literal -if TYPE_CHECKING: - import numcodecs.abc +from zarr.abc.codec import Numcodec +if TYPE_CHECKING: from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -88,9 +88,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = () _serializer: ArrayBytesCodec | None = None - _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () + _compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] = () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 68a4694a55..edfb5be87a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -25,9 +25,9 @@ from typing_extensions import deprecated import zarr -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, Numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import Numcodec, V2Codec +from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -1033,7 +1033,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] 
| tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1062,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1075,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2227,7 +2227,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2243,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2254,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. 
Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -3900,15 +3900,13 @@ def _build_parents( FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] @@ -4775,7 +4773,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ Given a data type, return the default filters for that data type. @@ -4813,12 +4811,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Numcodec, ...] 
| None + _compressor: Numcodec | None if compressor is None or compressor == (): _compressor = None diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 17af3538a9..532b1695ee 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -7,6 +7,7 @@ import numcodecs.abc +from zarr.abc.codec import Numcodec from zarr.abc.metadata import Metadata from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json @@ -56,7 +57,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -66,9 +67,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Numcodec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Numcodec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -82,7 +83,7 @@ def __init__( order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + filters: Iterable[Numcodec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -262,11 +263,11 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[Numcodec, ...] 
| None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Numcodec] = [] if data is None: return data @@ -291,7 +292,7 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Numcodec | None: """ Parse a potential compressor. """ diff --git a/tests/test_config.py b/tests/test_config.py index b1de6e392c..c30743fb30 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -180,7 +180,18 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {"cname": "lz4", "clevel": 1, "shuffle": "noshuffle", "blocksize": 1, "typesize": 1}},], + compressors=[ + { + "name": "blosc", + "configuration": { + "cname": "lz4", + "clevel": 1, + "shuffle": "noshuffle", + "blocksize": 1, + "typesize": 1, + }, + }, + ], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_group.py b/tests/test_group.py index e9cd2779a4..8ed661f8a5 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -11,17 +11,16 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr import zarr.api.asynchronous import zarr.api.synchronous -from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 import zarr.storage from zarr import Array, AsyncArray, AsyncGroup, Group from zarr.abc.store import Store from zarr.core import sync_group from zarr.core._info import GroupInfo +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.dtype.common import unpack_dtype_json @@ -524,7 +523,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat 
"chunks": (1,), "order": "C", "filters": None, - "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 9702ca7d23..37c2928e67 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal -import numcodecs import numpy as np import pytest from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd @@ -13,6 +12,7 @@ import zarr.abc import zarr.abc.codec import zarr.codecs as zarrcodecs +from zarr.abc.codec import Numcodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.bytes import VariableLengthBytes @@ -40,9 +40,9 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () + filters: tuple[Numcodec, ...] 
= () serializer: str | None = None - compressor: numcodecs.abc.Codec + compressor: Numcodec basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() From f1ca290f9e0f49bfa34b985ce6b16e160c79de58 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Aug 2025 22:11:31 +0200 Subject: [PATCH 21/54] Wip implementation of v2 / v3 codec behavior --- src/zarr/abc/codec.py | 3 + src/zarr/abc/store.py | 7 +- src/zarr/codecs/_v2.py | 104 ++++++++++++++- src/zarr/codecs/blosc.py | 2 +- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 223 ++++++++++---------------------- src/zarr/core/codec_pipeline.py | 162 ++++++++++++++++++++++- src/zarr/core/common.py | 3 +- src/zarr/core/metadata/v2.py | 106 +++++++++------ src/zarr/core/metadata/v3.py | 18 ++- src/zarr/registry.py | 93 +++++++++++-- 11 files changed, 494 insertions(+), 231 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 03a267fdc8..47f03ac4e9 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,6 +2,7 @@ from abc import abstractmethod from collections.abc import Mapping +from dataclasses import dataclass from typing import ( TYPE_CHECKING, ClassVar, @@ -525,3 +526,5 @@ def get_config(self) -> CodecJSON_V2[str]: ... @classmethod def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1fbdb3146c..0ea1d0bae4 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,10 +6,6 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.common import concurrent_map -from zarr.core.config import config - if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -435,6 +431,7 @@ async def getsize(self, key: str) -> int: FileNotFoundError When the given key does not exist in the store. 
""" + from zarr.core.buffer.core import default_buffer_prototype # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. @@ -476,6 +473,8 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). + from zarr.core.common import concurrent_map + from zarr.core.config import config keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 4303bf6de6..86e325d589 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,12 +2,15 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, TypeGuard +from typing import TYPE_CHECKING, Self, TypeGuard, overload +from typing_extensions import Literal import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from zarr.abc.codec import ArrayBytesCodec, Numcodec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BaseCodec, BytesBytesCodec, CodecJSON, CodecJSON_V2, Numcodec +from zarr.core.buffer.core import BufferPrototype +from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: @@ -133,3 +136,100 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError + + +@dataclass(frozen=True, kw_only=True) +class NumcodecsWrapper: + codec: Numcodec + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: + if zarr_format == 2: + return self.codec.get_config() + elif zarr_format == 3: + config = self.codec.get_config() + config_no_id = {k: v for k, v in config.items() if k != "id"} + return {"name": config["id"], "configuration": config_no_id} + raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotADirectoryError( + "This class does not support creating instances from JSON data for Zarr format 2." + ) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 3." + ) + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + def to_array_array(self) -> NumcodecsArrayArrayCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. + """ + return NumcodecsArrayArrayCodec(codec=self.codec) + + def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec. + """ + return NumcodecsBytesBytesCodec(codec=self.codec) + + def to_array_bytes(self) -> NumcodecsArrayBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec. 
+ """ + return NumcodecsArrayBytesCodec(codec=self.codec) + + +class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread( + as_numpy_array_wrapper, + self.codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self.codec.encode(chunk_bytes.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec): + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.decode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self.codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, 
chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) \ No newline at end of file diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2bad85257b..e25805908d 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -20,7 +20,6 @@ from typing_extensions import ReadOnly from zarr.abc.codec import BytesBytesCodec, CodecJSON -from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import ( JSON, NamedRequiredConfig, @@ -266,6 +265,7 @@ async def _decode_single( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper return await asyncio.to_thread( as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype ) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 6e51442d50..6703e07806 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -4,10 +4,8 @@ import textwrap from typing import TYPE_CHECKING, Literal -from zarr.abc.codec import Numcodec - if TYPE_CHECKING: - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Numcodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index edfb5be87a..c11e037e1f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -19,16 +19,14 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from 
typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, Numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo @@ -61,24 +59,20 @@ ZarrFormat, _default_zarr_format, _warn_order_kwarg, - ceildiv, concurrent_map, parse_shapelike, product, ) -from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( VariableLengthBytes, VariableLengthUTF8, ZDType, ZDTypeLike, - parse_dtype, + parse_data_type, ) from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( - AsyncOIndex, - AsyncVIndex, BasicIndexer, BasicSelection, BlockIndex, @@ -95,6 +89,7 @@ Selection, VIndex, _iter_grid, + ceildiv, check_fields, check_no_multi_fields, is_pure_fancy_indexing, @@ -112,7 +107,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( - CompressorLikev2, + CompressorLike_V2, get_object_codec_id, parse_compressor, parse_filters, @@ -130,7 +125,7 @@ from zarr.storage._utils import _relativize_path if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -204,8 +199,26 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + _codecs: tuple[Codec, ...] 
= () + if metadata.filters is not None: + _codecs += metadata.filters + if metadata.compressor is not None: + _codecs += (metadata.compressor,) + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance( + metadata.dtype, HasObjectCodec + ): + # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style + # chain of filters + compressor might not contain a codec identifiable as an array-bytes codec. + # In such a case, we will insert a bytes codec that applies no endian transformation. + # We skip this insertion if the data type is an instance of HasObjectCodec, because + # in zarr v2 these data types required a special codec that functioned like an array bytes codec. + _codecs = (BytesCodec(endian=None),) + _codecs + if metadata.order == "F": + # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec + # framework, we express C or F ordered arrays by adding a transpose codec to the front + # of the list of codecs. + _codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs + return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -342,7 +355,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: CompressorLikev2 | Literal["auto"] = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -619,7 +632,7 @@ async def _create( Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -818,8 +831,8 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLikev2 = None, + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -856,8 +869,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLike = "auto", + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | Literal["auto"] = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -869,14 +882,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - compressor_parsed: CompressorLikev2 + compressor_parsed: CompressorLike_V2 if compressor == "auto": compressor_parsed = default_compressor_v2(dtype) - elif isinstance(compressor, BytesBytesCodec): - raise ValueError( - "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " - "Use a numcodecs codec directly instead." - ) else: compressor_parsed = compressor @@ -1033,7 +1041,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Codec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. 
@@ -1427,56 +1435,6 @@ async def getitem( ) return await self._get_selection(indexer, prototype=prototype) - async def get_orthogonal_selection( - self, - selection: OrthogonalSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_mask_selection( - self, - mask: MaskSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_coordinate_selection( - self, - selection: CoordinateSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out_array = await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - if hasattr(out_array, "shape"): - # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) - return out_array - async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = False) -> None: """ Asynchronously save the array metadata. 
@@ -1518,7 +1476,7 @@ async def _set_selection( if isinstance(array_like, np._typing._SupportsArrayFunc): # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + array_like_ = cast(np._typing._SupportsArrayFunc, array_like) value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): @@ -1532,8 +1490,7 @@ async def _set_selection( value = value.astype(dtype=self.dtype, order="A") else: value = np.array(value, dtype=self.dtype, order="A") - value = cast("NDArrayLike", value) - + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. @@ -1608,19 +1565,6 @@ async def setitem( ) return await self._set_selection(indexer, value, prototype=prototype) - @property - def oindex(self) -> AsyncOIndex[T_ArrayMetadata]: - """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and - :func:`set_orthogonal_selection` for documentation and examples.""" - return AsyncOIndex(self) - - @property - def vindex(self) -> AsyncVIndex[T_ArrayMetadata]: - """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, - :func:`set_coordinate_selection`, :func:`get_mask_selection` and - :func:`set_mask_selection` for documentation and examples.""" - return AsyncVIndex(self) - async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: """ Asynchronously resize the array to a new shape. 
@@ -4301,7 +4245,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - zdtype = parse_dtype(dtype, zarr_format=zarr_format) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4710,26 +4654,6 @@ def _parse_chunk_key_encoding( return result -def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: - """ - Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. - """ - - dtype_category = categorize_data_type(dtype) - - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) - - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) - - def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ Given a data type, return the default filters for that data type. @@ -4773,7 +4697,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: """ Given a data type, return the default filters for that data type. 
@@ -4782,28 +4706,22 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ if isinstance(dtype, HasObjectCodec): if dtype.object_codec_id == "vlen-bytes": - from numcodecs import VLenBytes - - return (VLenBytes(),) + return (VLenBytesCodec(),) elif dtype.object_codec_id == "vlen-utf8": - from numcodecs import VLenUTF8 - - return (VLenUTF8(),) + return (VLenUTF8Codec(),) else: msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." raise ValueError(msg) return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> BytesBytesCodec: """ Given a data type, return the default compressors for that data type. - This is just the numcodecs ``Zstd`` codec. + This is just the ``Zstd`` codec. """ - from numcodecs import Zstd - - return Zstd(level=0, checksum=False) + return ZstdCodec() def _parse_chunk_encoding_v2( @@ -4811,23 +4729,20 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: +) -> tuple[tuple[Codec, ...] | None, Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ - _filters: tuple[Numcodec, ...] | None - _compressor: Numcodec | None + _filters: tuple[Codec, ...] | None + _compressor: Codec | None if compressor is None or compressor == (): _compressor = None elif compressor == "auto": _compressor = default_compressor_v2(dtype) - elif isinstance(compressor, tuple | list) and len(compressor) == 1: + elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." 
- raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4835,14 +4750,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters_v2(dtype) else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) if isinstance(dtype, HasObjectCodec): # check the filters and the compressor for the object codec required for this data type @@ -4850,12 +4757,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.to_json(zarr_format=2),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.to_json(zarr_format=2) for f in _filters], + _compressor.to_json(zarr_format=2) if _compressor is not None else None, ) ) if object_codec_id is None: @@ -4895,7 +4802,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = (filters,) else: maybe_array_array = cast("Iterable[Codec | dict[str, JSON]]", filters) - out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + out_array_array = tuple( + _parse_array_array_codec(c, zarr_format=3) for c in maybe_array_array + ) if serializer == "auto": out_array_bytes = default_serializer_v3(dtype) @@ -4903,26 +4812,34 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. 
- out_array_bytes = _parse_array_bytes_codec(serializer) + out_array_bytes = _parse_array_bytes_codec(serializer, zarr_format=3) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] = () elif compressors == "auto": out_bytes_bytes = default_compressors_v3(dtype) else: - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - if isinstance(compressors, dict | Codec): + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] + if isinstance(compressors, dict | Codec | Numcodec): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = cast("Iterable[Codec | dict[str, JSON]]", compressors) + maybe_bytes_bytes = compressors # type: ignore[assignment] + + out_bytes_bytes = tuple( + _parse_bytes_bytes_codec(c, zarr_format=3) for c in maybe_bytes_bytes + ) - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + # specialize codecs as needed given the dtype + + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. 
- - # TODO: add checks to ensure that the right serializer is used for vlen data types + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): + # The default endianness in the bytescodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes @@ -4942,9 +4859,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) # type: ignore[assignment] - elif zarr_format == 2 and compressor == compressors == "auto": - compressors = ({"id": "blosc"},) + compressors = (compressor,) return compressors diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 23c27e40c6..36458e3c66 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -14,6 +14,7 @@ Codec, CodecPipeline, ) +from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar @@ -87,7 +88,6 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) - return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, @@ -219,7 +219,6 @@ async def encode_batch( zip(chunk_array_batch, chunk_specs, strict=False) ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -494,15 +493,168 @@ def codecs_from_list( from zarr.codecs.sharding import ShardingCodec array_array: tuple[ArrayArrayCodec, ...] = () - array_bytes_maybe: ArrayBytesCodec | None = None + array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...] 
= () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + # handle two cases + # either all of the codecs are numcodecwrapper instances, in which case we set the last element + # to array-bytes and the rest to array-array + # or one of the codecs is an array-bytes, in which case we convert any preceding numcodecswrapper + # instances to array-array, and any following numcodecswrapper instances to bytes-bytes + + codecs_tup = tuple(codecs) + array_array_idcs: tuple[tuple[int, ArrayArrayCodec], ...] = () + array_bytes_idcs: tuple[tuple[int, ArrayBytesCodec], ...] = () + bytes_bytes_idcs: tuple[tuple[int, BytesBytesCodec], ...] = () + numcodec_wrapper_idcs: tuple[tuple[int, NumcodecsWrapper], ...] = () + + for idx, codec in enumerate(codecs_tup): + match codec: + case ArrayArrayCodec(): + array_array_idcs += ((idx, codec),) + case ArrayBytesCodec(): + array_bytes_idcs += ((idx, codec),) + case BytesBytesCodec(): + bytes_bytes_idcs += ((idx, codec),) + case NumcodecsWrapper(): # type: ignore[union-attr] + numcodec_wrapper_idcs += ((idx, codec),) + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs_tup) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", stacklevel=3, ) + if len(array_bytes_idcs) == 0: + # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an + # array-bytes codec, this is an error. + if len(numcodec_wrapper_idcs) == 0: + msg = ( + f"The codecs {codecs_tup} do not include an ArrayBytesCodec or a codec castable to an " + "ArrayBytesCodec, such as a NumcodecsWrapper. This is an invalid sequence of codecs." + ) + raise ValueError(msg) + elif len(numcodec_wrapper_idcs) == len(codecs_tup): + # All the codecs are numcodecs wrappers. 
This means we have no information about which + # codec is array-array, array-bytes, and bytes-bytes, so we just cast the numcodecs wrappers + # into a sequence of array-array codecs terminated by a single array-bytes codec. + # This choice is almost arbitrary. + # It would be equally valid to convert the first codec to an array-bytes, and the remaining + # codecs to bytes-bytes, or to pick a random codec and convert it to array-bytes, then + # converting all the preceding codecs to array-array, and the following codecs to bytes-bytes. + # But we know from experience that the Zarr V2-style chunk encoding pipelines typically + # start with array-array transformations, so casting all but one of the unknown codecs + # to array-array is a safe choice. + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) + else: + # There are no array-bytes codecs, there is at least one numcodec wrapper, but there are + # also some array-array and / or bytes-bytes codecs + if len(array_array_idcs) > 0: + # There is at least one array-array codec. We will use it as a reference point for + # casting any numcodecs wrappers. + last_array_array_idx = array_array_idcs[-1][0] + + if last_array_array_idx == len(codecs_tup) - 1: + # The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec. This + # cannot be fixed by converting numcodecs wrappers, so we raise an exception. + raise ValueError( + "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." + ) + + for idx, aac in enumerate(codecs_tup[: (last_array_array_idx + 1)]): + # Iterate over the codecs leading up to the last array-array codec.
+ if isinstance(aac, ArrayArrayCodec): + # Any array-array codec gets added to the list of array-array codecs + array_array += (aac,) + elif isinstance(aac, NumcodecsWrapper): + # Any numcodecs wrapper gets converted to an array-array codec + array_array += (aac.to_array_array(),) + else: + # Any other kind of codec is invalid and we raise an exception. + msg = f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + + if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + # The codec following the last array-array codec is a numcodecs wrapper. + # We will cast it to an array-bytes codec. + array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() + else: + # The codec following the last array-array codec was a bytes bytes codec, or + # something else entirely. This is invalid and we raise an exception. + msg = ( + f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " + f"{last_array_array_idx + 1}." + "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " + f"{type(codecs_tup[last_array_array_idx + 1])}" + ) + raise TypeError(msg) + + start = last_array_array_idx + 2 + for idx, rem in enumerate(codecs_tup[start:]): + # We have already checked the codec after the last array-array codec, so we start + # iterating over the codecs after that. + if isinstance(rem, BytesBytesCodec): + bytes_bytes += (rem,) + elif isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) + else: + msg = f"Invalid codec {rem} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + # there are no array-array codecs, just numcodecs wrappers and bytes-bytes codecs + first_bytes_bytes_idx = bytes_bytes_idcs[0][0] + if first_bytes_bytes_idx == 0: + raise ValueError( + "The first codec is a BytesBytesCodec, but there is no ArrayBytesCodec." + ) + else: + # Iterate over all codecs. 
Cast all numcodecs wrappers to array-array codecs, until + # the codec immediately prior to the first bytes-bytes codec, which we cast to + # an array-bytes codec. All codecs after that point are cast to bytes-bytes codecs. + for idx, bb_codec in enumerate(codecs_tup): + if idx < first_bytes_bytes_idx - 1: + # This must be a numcodecs wrapper. cast it to array-array + array_array += (bb_codec.to_array_array(),) + elif idx == first_bytes_bytes_idx - 1: + array_bytes_maybe = bb_codec.to_array_bytes() + else: + if isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + elif isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + else: + msg = f"Invalid codec {bb_codec} at index {idx}. Expected a NumcodecsWrapper" + raise TypeError(msg) + + elif len(array_bytes_idcs) == 1: + bb_idx, ab_codec = array_bytes_idcs[0] + array_bytes_maybe = ab_codec + + end = bb_idx + + for idx, aa_codec in enumerate(codecs_tup[:end]): + if isinstance(aa_codec, ArrayArrayCodec): + array_array += (aa_codec,) + elif isinstance(aa_codec, NumcodecsWrapper): + array_array += (aa_codec.to_array_array(),) + else: + msg = f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + start = bb_idx + 1 + if bb_idx < len(codecs_tup) - 1: + for idx, bb_codec in enumerate(codecs_tup[start:]): + if isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + elif isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + else: + msg = f"Invalid codec {bb_codec} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + raise ValueError("More than one ArrayBytes codec found, that is a big error!") + + return array_array, array_bytes_maybe, bytes_bytes for prev_codec, cur_codec in pairwise((None, *codecs)): if isinstance(cur_codec, ArrayArrayCodec): @@ -540,7 +692,7 @@ def codecs_from_list( f"Got {type(prev_codec)} instead." 
) bytes_bytes += (cur_codec,) - else: + elif isinstance(cur_codec, NumcodecsWrapper): raise TypeError if array_bytes_maybe is None: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index a199dd4399..d4cdebc39c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -48,7 +48,8 @@ DimensionNames = Iterable[str | None] | None TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) +BaseConfig = Mapping[str, object] +TConfig = TypeVar("TConfig", bound=BaseConfig) class NamedConfig(TypedDict, Generic[TName, TConfig]): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 532b1695ee..bedd069b6c 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,17 +1,17 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - -from zarr.abc.codec import Numcodec +from zarr.abc.codec import ArrayArrayCodec, Codec, Numcodec from zarr.abc.metadata import Metadata +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.dtype.common import OBJECT_CODEC_IDS +from zarr.registry import get_codec if TYPE_CHECKING: from typing import Literal, Self @@ -20,18 +20,17 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.common import DTypeSpec_V2 from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, TDType_co, TScalar_co, - ZDType, ) import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec 
@@ -44,6 +43,9 @@ parse_shapelike, ) from zarr.core.config import config, parse_indexing_order +from zarr.core.dtype.wrapper import ( + ZDType, +) from zarr.core.metadata.common import parse_attributes @@ -57,7 +59,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None +CompressorLike_V2: TypeAlias = Mapping[str, JSON] | Numcodec | Codec @dataclass(frozen=True, kw_only=True) @@ -67,9 +69,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[Numcodec, ...] | None = None + filters: tuple[Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: Numcodec | None + compressor: Codec attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -82,8 +84,8 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: CompressorLikev2 = None, - filters: Iterable[Numcodec | dict[str, JSON]] | None = None, + compressor: CompressorLike_V2 | None = None, + filters: Iterable[CompressorLike_V2] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -91,6 +93,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) + # TODO: remove this + if not isinstance(dtype, ZDType): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -100,6 +105,18 @@ def __init__( fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value + + array_spec = ArraySpec( + shape=shape_parsed, + dtype=dtype, + fill_value=fill_value_parsed, + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. 
+ prototype=default_buffer_prototype(), # TODO: prototype is not needed here. + ) + if compressor_parsed is not None: + compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + if filters_parsed is not None: + filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -133,10 +150,10 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(zattrs_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() ), } @@ -159,6 +176,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: object_codec_id = get_object_codec_id((_compressor,)) # we add a layer of indirection here around the dtype attribute of the array metadata # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { "name": data["dtype"], "object_codec_id": object_codec_id, @@ -198,34 +216,24 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - + if self.compressor is not None: + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + else: + zarray_dict["compressor"] = None + new_filters = [] 
if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] - # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters + new_filters.extend([f.to_json(zarr_format=2) for f in self.filters]) + else: + new_filters = None + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] + # serialize the dtype after fill value-specific JSON encoding + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] # type: ignore[assignment] return zarray_dict @@ -263,20 +271,23 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[Numcodec, ...] | None: +def parse_filters(data: object) -> tuple[ArrayArrayCodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[Numcodec] = [] + out: list[Codec | NumcodecsWrapper] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if isinstance(val, (Codec, NumcodecsWrapper)): out.append(val) + elif isinstance(val, Numcodec): + out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + codec = get_codec(val, zarr_format=2) + out.append(codec) else: msg = f"Invalid filter at index {idx}. 
Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -286,20 +297,29 @@ def parse_filters(data: object) -> tuple[Numcodec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if isinstance(data, Numcodec): + return (NumcodecsWrapper(codec=data),) + elif isinstance(data, Codec): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> Numcodec | None: +def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + # TODO: only validate the compressor in one place. currently we do it twice, once in init_array + # and again when constructing metadata + if data is None or isinstance(data, Codec | NumcodecsWrapper): return data + if isinstance(data, Numcodec): + try: + return get_codec(data.get_config(), zarr_format=2) + except KeyError: + return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_codec(data, zarr_format=2) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." 
raise ValueError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6f79fb4b09..26cf35423f 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -30,13 +30,12 @@ ZARR_JSON, ChunkCoords, DimensionNames, - parse_named_configuration, parse_shapelike, ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class +from zarr.registry import get_codec def parse_zarr_format(data: object) -> Literal[3]: @@ -63,8 +62,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out += (get_codec_class(name_parsed).from_dict(c),) + out += (get_codec(c, zarr_format=3),) return out @@ -82,9 +80,13 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" + # avoid circular import from zarr.codecs.sharding import ShardingCodec + from zarr.core.codec_pipeline import codecs_from_list + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) + _codecs = (*array_array_codecs, array_bytes_codec, *bytes_bytes_codecs) - abc = validate_array_bytes_codec(codecs) + abc = validate_array_bytes_codec(_codecs) # Recursively resolve array-bytes codecs within sharding codecs while isinstance(abc, ShardingCodec): @@ -288,7 +290,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: d = self.to_dict() return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(d, allow_nan=True, indent=json_indent).encode() + json.dumps(d, allow_nan=False, indent=json_indent).encode() ) } @@ -335,6 +337,10 @@ def to_dict(self) -> dict[str, 
JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + out_dict["codecs"] = () + for codec in self.codecs: + out_dict["codecs"] += (codec.to_json(zarr_format=3),) + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` # until then, we have this hack here, which relies on the fact that to_dict will pass through diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 189d42abed..661c4a4657 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -5,6 +5,7 @@ from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar +from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.config import BadConfigError, config from zarr.core.dtype import data_type_registry @@ -16,10 +17,11 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON, CodecPipeline, ) from zarr.core.buffer import Buffer, NDBuffer - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat __all__ = [ "Registry", @@ -141,15 +143,18 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: +def _get_codec_class( + key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False +) -> type[Codec]: if reload_config: _reload_config() - if key in __codec_registries: + if key in registry: # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) - __codec_registries[key].lazy_load() + registry[key].lazy_load() + + codec_classes = registry[key] - codec_classes = __codec_registries[key] if not codec_classes: raise KeyError(key) @@ -169,6 +174,47 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec: + """ + Get an instance of a codec 
from a name and a configuration + """ + # avoid circular import + from zarr.codecs._numcodecs import get_numcodec + + codec_name: str + if zarr_format == 2: + if isinstance(request, str): + raise TypeError( + f"Invalid request type {type(request)} for zarr format 2. Expected dict, got {request!r}" + ) + else: + codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} + elif zarr_format == 3: + if isinstance(request, str): + codec_name = request + codec_config = {} + else: + codec_name = request["name"] + codec_config = request.get("configuration", {}) + else: + raise ValueError( + f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" + ) # pragma: no cover + + try: + codec_cls = get_codec_class(codec_name) + return codec_cls.from_json(request, zarr_format=zarr_format) + except KeyError: + # if we can't find the codec in the zarr python registry, try the numcodecs registry + codec = get_numcodec(request) + return NumcodecsWrapper(codec=codec) + + +def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: + return _get_codec_class(key, __codec_registries, reload_config=reload_config) + + def _resolve_codec(data: dict[str, JSON]) -> Codec: """ Get a codec instance from a dict representation of that codec. @@ -177,19 +223,28 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. 
""" + # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsBytesBytesCodec, NumcodecsWrapper + result: BytesBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_bytes_bytes() if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") @@ -197,19 +252,26 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_bytes() if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." 
raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") @@ -217,19 +279,26 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_array() + elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. 
Got {type(data)} instead.") From 84c9780616399512412c838ac54d8d68dcb65216 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:15:37 +0200 Subject: [PATCH 22/54] avoid circular imports by importing lower-level routines exactly where needed --- src/zarr/abc/store.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1fbdb3146c..31e9728f8a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,10 +6,6 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.common import concurrent_map -from zarr.core.config import config - if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -438,6 +434,8 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. + from zarr.core.buffer.core import default_buffer_prototype + value = await self.get(key, prototype=default_buffer_prototype()) if value is None: raise FileNotFoundError(key) @@ -476,6 +474,9 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). 
+ from zarr.core.common import concurrent_map + from zarr.core.config import config + keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) From 9a2f35ba7efd639caa9bd7d92414bfa82f7b509b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:16:26 +0200 Subject: [PATCH 23/54] push numcodec prototol into abcs; remove all numcodecs.abc.Codec type annotations --- src/zarr/abc/numcodec.py | 59 +++++++++++++++++++ src/zarr/api/asynchronous.py | 2 +- src/zarr/api/synchronous.py | 2 +- src/zarr/codecs/_numcodecs.py | 30 ---------- src/zarr/codecs/_v2.py | 59 +------------------ src/zarr/core/_info.py | 7 +-- src/zarr/core/array.py | 47 +++++++-------- src/zarr/core/metadata/v2.py | 39 ++++++------ src/zarr/registry.py | 30 ++++++++++ tests/test_array.py | 18 +++--- tests/test_codecs/test_numcodecs.py | 4 +- .../test_v2_dtype_regression.py | 8 +-- 12 files changed, 154 insertions(+), 151 deletions(-) create mode 100644 src/zarr/abc/numcodec.py delete mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py new file mode 100644 index 0000000000..d2c9380146 --- /dev/null +++ b/src/zarr/abc/numcodec.py @@ -0,0 +1,59 @@ +from typing import Self, TypeGuard + +from typing_extensions import Protocol + +from zarr.abc.codec import CodecJSON_V2 +from zarr.core.buffer import Buffer, NDBuffer + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... 
+ + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. 
+ """ + return _is_numcodec_cls(type(obj)) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d8d3a5f21d..9137d6395c 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -47,7 +47,7 @@ from collections.abc import Iterable from zarr.abc.codec import Codec - from zarr.codecs._v2 import Numcodec + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index df667c0b23..a368d37a5b 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,8 +18,8 @@ import numpy.typing as npt from zarr.abc.codec import Codec + from zarr.abc.numcodec import Numcodec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py deleted file mode 100644 index b00f258db5..0000000000 --- a/src/zarr/codecs/_numcodecs.py +++ /dev/null @@ -1,30 +0,0 @@ -from zarr.abc.codec import CodecJSON_V2 -from zarr.codecs._v2 import Numcodec - - -def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: - """ - Resolve a numcodec codec from the numcodecs registry. - - This requires the Numcodecs package to be installed. - - Parameters - ---------- - data : CodecJSON_V2 - The JSON metadata for the codec. 
- - Returns - ------- - codec : Numcodec - - Examples - -------- - - >>> codec = get_codec({'id': 'zlib', 'level': 1}) - >>> codec - Zlib(level=1) - """ - - from numcodecs.registry import get_codec - - return get_codec(data) # type: ignore[no-any-return] diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8deae99a6d..92eda38226 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,73 +2,20 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... - - -def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: - """ - Check if the given object implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. 
- """ - return _is_numcodec_cls(type(obj)) - - -def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: - """ - Check if the given object is a class implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. - """ - return ( - isinstance(obj, type) - and hasattr(obj, "codec_id") - and isinstance(obj.codec_id, str) - and hasattr(obj, "encode") - and callable(obj.encode) - and hasattr(obj, "decode") - and callable(obj.decode) - and hasattr(obj, "get_config") - and callable(obj.get_config) - and hasattr(obj, "from_config") - and callable(obj.from_config) - ) - - @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): filters: tuple[Numcodec, ...] | None diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..fef424346a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,9 +5,8 @@ from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: - import numcodecs.abc - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.abc.numcodec import Numcodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -88,9 +87,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = () _serializer: ArrayBytesCodec | None = None - _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () + _compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 68a4694a55..85cd84ae1b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,15 +19,14 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import Numcodec, V2Codec +from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -1033,7 +1032,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1061,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1074,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. 
Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2227,7 +2226,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2242,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2253,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
@@ -3900,15 +3899,13 @@ def _build_parents( FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] @@ -4775,7 +4772,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ Given a data type, return the default filters for that data type. @@ -4797,7 +4794,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: """ Given a data type, return the default compressors for that data type. @@ -4805,7 +4802,7 @@ def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: """ from numcodecs import Zstd - return Zstd(level=0, checksum=False) + return Zstd(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4813,12 +4810,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. 
""" - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Numcodec, ...] | None + _compressor: Numcodec | None if compressor is None or compressor == (): _compressor = None @@ -4839,7 +4836,7 @@ def _parse_chunk_encoding_v2( else: if isinstance(filters, Iterable): for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): + if not _is_numcodec(f): msg = ( "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." @@ -4852,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.get_config() for f in _filters], # type: ignore[arg-type] + _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] ) ) if object_codec_id is None: @@ -4944,7 +4941,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) # type: ignore[assignment] + compressors = (compressor,) elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 17af3538a9..0c5d09583f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,12 +5,12 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - from zarr.abc.metadata import Metadata +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import 
get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.registry import get_numcodec if TYPE_CHECKING: from typing import Literal, Self @@ -30,7 +30,6 @@ import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -56,7 +55,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -66,9 +65,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Numcodec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Numcodec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -82,7 +81,7 @@ def __init__( order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + filters: Iterable[Numcodec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -197,12 +196,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + if _is_numcodec(zarray_dict["compressor"]): codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] 
= codec_config + codec_config.pop("checksum") # type: ignore[typeddict-item] + zarray_dict["compressor"] = codec_config # type: ignore[assignment] if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -212,11 +211,11 @@ def to_dict(self) -> dict[str, JSON]: raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): + if _is_numcodec(f): new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters + zarray_dict["filters"] = new_filters # type: ignore[assignment] # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: @@ -262,20 +261,20 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[Numcodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Numcodec] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if _is_numcodec(val): out.append(val) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + out.append(get_numcodec(val)) # type: ignore[arg-type] else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -285,20 +284,20 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if _is_numcodec(data): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." 
raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Numcodec | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + if data is None or _is_numcodec(data): return data if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_numcodec(data) # type: ignore[arg-type] msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 189d42abed..879adb2058 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -16,8 +16,10 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON_V2, CodecPipeline, ) + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -278,3 +280,31 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: _collect_entrypoints() + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. 
+ + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_codec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] diff --git a/tests/test_array.py b/tests/test_array.py index 6342ce6430..a6b829041a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -69,6 +69,7 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr.abc.codec import CodecJSON_V3 from zarr.core.metadata.v3 import ArrayV3Metadata @@ -1315,11 +1316,11 @@ async def test_v2_chunk_encoding( assert arr.metadata.filters == filters_expected # Normalize for property getters - compressor_expected = () if compressor_expected is None else (compressor_expected,) - filters_expected = () if filters_expected is None else filters_expected + arr_compressors_expected = () if compressor_expected is None else (compressor_expected,) + arr_filters_expected = () if filters_expected is None else filters_expected - assert arr.compressors == compressor_expected - assert arr.filters == filters_expected + assert arr.compressors == arr_compressors_expected + assert arr.filters == arr_filters_expected @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) @@ -1357,11 +1358,12 @@ async def test_default_filters_compressors( if default_filters is None: expected_filters = () else: - expected_filters = default_filters + expected_filters = default_filters # type: ignore[assignment] + if default_compressors is None: expected_compressors = () else: - expected_compressors = (default_compressors,) + expected_compressors = (default_compressors,) # type: ignore[assignment] expected_serializer = None else: raise ValueError(f"Invalid zarr_format: {zarr_format}") @@ -1665,7 +1667,7 @@ def test_roundtrip_numcodecs() -> None: {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, {"name": "numcodecs.zlib", "configuration": 
{"level": 4}}, ] - filters = [ + filters: list[CodecJSON_V3] = [ { "name": "numcodecs.fixedscaleoffset", "configuration": { @@ -1685,7 +1687,7 @@ def test_roundtrip_numcodecs() -> None: chunks=(720, 1440), dtype="float64", compressors=compressors, # type: ignore[arg-type] - filters=filters, + filters=filters, # type: ignore[arg-type] fill_value=-9.99, dimension_names=["lat", "lon"], ) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index bb381c615a..1c4d550587 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -2,8 +2,8 @@ from numcodecs import GZip -from zarr.codecs._numcodecs import get_numcodec -from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls +from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.registry import get_numcodec def test_get_numcodec() -> None: diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 9702ca7d23..ffe273490d 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal -import numcodecs import numpy as np import pytest from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd @@ -13,6 +12,7 @@ import zarr.abc import zarr.abc.codec import zarr.codecs as zarrcodecs +from zarr.abc.numcodec import Numcodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.bytes import VariableLengthBytes @@ -40,12 +40,12 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () + filters: tuple[Numcodec, ...] 
= () serializer: str | None = None - compressor: numcodecs.abc.Codec + compressor: Numcodec -basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_codecs: tuple[Numcodec, ...] = GZip(), Blosc(), LZ4(), LZMA(), Zstd() basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" string_dtypes = "U4" From 0d0712f0974c243e376ba2fd9de0078be1978bae Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:32:56 +0200 Subject: [PATCH 24/54] add tests for codecjson typeguard --- tests/test_abc/__init__.py | 0 tests/test_abc/test_codec.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 tests/test_abc/__init__.py create mode 100644 tests/test_abc/test_codec.py diff --git a/tests/test_abc/__init__.py b/tests/test_abc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_abc/test_codec.py b/tests/test_abc/test_codec.py new file mode 100644 index 0000000000..e0f9ddb7bb --- /dev/null +++ b/tests/test_abc/test_codec.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from zarr.abc.codec import _check_codecjson_v2 + + +def test_check_codecjson_v2_valid() -> None: + """ + Test that the _check_codecjson_v2 function works + """ + assert _check_codecjson_v2({"id": "gzip"}) + assert not _check_codecjson_v2({"id": 10}) + assert not _check_codecjson_v2([10, 11]) From 931bf2fd35f909a9eb2458fba25cb795207798de Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 17:01:52 +0200 Subject: [PATCH 25/54] avoid using zarr's buffer / ndbuffer for numcodec encode / decode --- src/zarr/abc/numcodec.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index d2c9380146..db6ff4655a 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,9 +1,8 @@ from typing import Self, TypeGuard -from typing_extensions import Protocol +from typing_extensions import Buffer, Protocol from 
zarr.abc.codec import CodecJSON_V2 -from zarr.core.buffer import Buffer, NDBuffer class Numcodec(Protocol): @@ -13,11 +12,9 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + def encode(self, buf: Buffer) -> Buffer: ... - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... + def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... def get_config(self) -> CodecJSON_V2[str]: ... From 01bd4b71d132f09673aeca5b8f9d983b1f2ea6b6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 19:12:35 +0200 Subject: [PATCH 26/54] use Any to model input / output types of numcodec protocol --- src/zarr/abc/numcodec.py | 14 ++++++-------- src/zarr/codecs/_v2.py | 14 ++++++-------- src/zarr/core/array.py | 6 +++--- src/zarr/core/metadata/v2.py | 6 +++--- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index db6ff4655a..c671428388 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,8 +1,6 @@ -from typing import Self, TypeGuard +from typing import Any, Self, TypeGuard -from typing_extensions import Buffer, Protocol - -from zarr.abc.codec import CodecJSON_V2 +from typing_extensions import Protocol class Numcodec(Protocol): @@ -12,14 +10,14 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer) -> Buffer: ... + def encode(self, buf: Any) -> Any: ... - def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... + def decode(self, buf: Any, out: Any | None = None) -> Any: ... - def get_config(self) -> CodecJSON_V2[str]: ... + def get_config(self) -> Any: ... @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + def from_config(cls, config: Any) -> Self: ... 
def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 92eda38226..3c6c99c21c 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -31,9 +31,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] + chunk = await asyncio.to_thread(self.compressor.decode, cdata) else: - chunk = cdata # type: ignore[assignment] + chunk = cdata # apply filters if self.filters: @@ -54,7 +54,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -83,18 +83,16 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] - + chunk = await asyncio.to_thread(f.encode, chunk) # check object encoding if ensure_ndarray_like(chunk).dtype == object: raise RuntimeError("cannot write object array without object codec") # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] + cdata = await asyncio.to_thread(self.compressor.encode, chunk) else: - cdata = chunk # type: ignore[assignment] - + cdata = chunk cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 85cd84ae1b..02a97aee0f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4849,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = 
get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] + object_codec_id = get_object_codec_id((_compressor.get_config(),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], # type: ignore[arg-type] - _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] + *[f.get_config() for f in _filters], + _compressor.get_config() if _compressor is not None else None, ) ) if object_codec_id is None: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 0c5d09583f..ae1d44c44d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -200,8 +200,8 @@ def to_dict(self) -> dict[str, JSON]: codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") # type: ignore[typeddict-item] - zarray_dict["compressor"] = codec_config # type: ignore[assignment] + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -215,7 +215,7 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters # type: ignore[assignment] + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: From f06c6aa4bf30db5a768a374376d25b2a99289901 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 15:44:18 +0200 Subject: [PATCH 27/54] add numcodec protocol --- src/zarr/abc/codec.py | 28 ++++++++++++- src/zarr/api/asynchronous.py | 5 +-- src/zarr/api/synchronous.py | 4 +- src/zarr/codecs/_numcodecs.py | 37 ++++++++++++++++ src/zarr/codecs/_v2.py | 77 ++++++++++++++++++++++++++++------ src/zarr/core/array.py | 16 +++---- tests/test_api.py | 2 +- 
tests/test_codecs/test_vlen.py | 2 +- 8 files changed, 141 insertions(+), 30 deletions(-) create mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..d5c995d2ca 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,14 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar + +from typing_extensions import ReadOnly, TypedDict from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +37,27 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecJSON_V2(TypedDict, Generic[TName]): + """The JSON representation of a codec for Zarr V2""" + + id: ReadOnly[TName] + + +def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: + return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str) + + +CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]] +"""The JSON representation of a codec for Zarr V3.""" + +# The widest type we will *accept* for a codec JSON +# This covers v2 and v3 +CodecJSON = str | Mapping[str, object] +"""The widest type of JSON-like input that could specify a codec.""" + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. 
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 78b68caf73..861557b5f3 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -52,9 +52,8 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc - from zarr.abc.codec import Codec + from zarr.codecs._v2 import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike @@ -877,7 +876,7 @@ async def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index ed1ae2cf2a..6db173d5d4 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -15,12 +15,12 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc import numpy as np import numpy.typing as npt from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, @@ -610,7 +610,7 @@ def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..4a8f43b5c6 --- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,37 @@ +import numcodecs.registry as numcodecs_registry + +from zarr.abc.codec import 
CodecJSON_V2 +from zarr.codecs._v2 import Numcodec + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. + + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + codec_id = data["id"] + cls = numcodecs_registry.codec_registry.get(codec_id) + if cls is None and data in numcodecs_registry.entries: + cls = numcodecs_registry.entries[data].load() + numcodecs_registry.register_codec(cls, codec_id=data) + if cls is not None: + return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] + raise KeyError(data) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..8deae99a6d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,26 +2,77 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard -import numcodecs import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like +from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: - import numcodecs.abc - from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... 
+ + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return _is_numcodec_cls(type(obj)) + + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): - filters: tuple[numcodecs.abc.Codec, ...] | None - compressor: numcodecs.abc.Codec | None + filters: tuple[Numcodec, ...] | None + compressor: Numcodec | None is_fixed_size = False @@ -33,9 +84,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) + chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] else: - chunk = cdata + chunk = cdata # type: ignore[assignment] # apply filters if self.filters: @@ -56,7 +107,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. 
- chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -85,7 +136,7 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) + chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] # check object encoding if ensure_ndarray_like(chunk).dtype == object: @@ -93,9 +144,9 @@ async def _encode_single( # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) + cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] else: - cdata = chunk + cdata = chunk # type: ignore[assignment] cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ca8bc414cc..1d939771cb 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -27,7 +27,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec +from zarr.codecs._v2 import Numcodec, V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -607,7 +607,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -818,7 +818,7 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = 
DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLikev2 = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: @@ -856,7 +856,7 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, @@ -3898,7 +3898,7 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec @@ -3911,10 +3911,10 @@ def _build_parents( ) CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4944,7 +4944,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) + compressors = (compressor,) # type: ignore[assignment] elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/tests/test_api.py b/tests/test_api.py index 12acf80589..69fc9b5b16 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1283,7 +1283,7 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None: dtype=src.dtype, overwrite=True, zarr_format=zarr_format, - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] ) z[:10, :10] = src[:10, :10] diff --git a/tests/test_codecs/test_vlen.py 
b/tests/test_codecs/test_vlen.py index 6fe1863464..cf0905daca 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -40,7 +40,7 @@ def test_vlen_string( chunks=data.shape, dtype=data.dtype, fill_value="", - compressors=compressor, + compressors=compressor, # type: ignore[arg-type] ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy From b71e8ac1c41c7cb81195d22e72ef1a611f5cc815 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:25:36 +0200 Subject: [PATCH 28/54] add tests for numcodecs compatibility --- tests/test_codecs/test_numcodecs.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/test_codecs/test_numcodecs.py diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py new file mode 100644 index 0000000000..a3824cc386 --- /dev/null +++ b/tests/test_codecs/test_numcodecs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from numcodecs import GZip + +from zarr.codecs._numcodecs import get_numcodec +from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls + + +def test_get_numcodec() -> None: + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + + +def test_is_numcodec() -> None: + """ + Test the _is_numcodec function + """ + assert _is_numcodec(GZip()) + + +def test_is_numcodec_cls() -> None: + """ + Test the _is_numcodec_cls function + """ + assert _is_numcodec_cls(GZip) From bcaa9ee5428312f03c44fb10d1d9ad7b3970d475 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:32:31 +0200 Subject: [PATCH 29/54] changelog --- changes/3318.misc.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3318.misc.rst diff --git a/changes/3318.misc.rst b/changes/3318.misc.rst new file mode 100644 index 0000000000..f8308e6b97 --- /dev/null +++ b/changes/3318.misc.rst @@ -0,0 +1,2 @@ +Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. 
This is groundwork toward +making ``numcodecs`` an optional dependency for ``zarr-python``. \ No newline at end of file From 7e49f39f46ebe262f4027ec45c9c5ed839872989 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:43:41 +0200 Subject: [PATCH 30/54] ignore unknown key --- tests/test_codecs/test_numcodecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index a3824cc386..bb381c615a 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -7,7 +7,7 @@ def test_get_numcodec() -> None: - assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] def test_is_numcodec() -> None: From 4b53f5df298c3c6156685ac960ac502f501a49b4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 Aug 2025 11:05:43 +0200 Subject: [PATCH 31/54] remove re-implementation of get_codec --- src/zarr/codecs/_numcodecs.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 4a8f43b5c6..b00f258db5 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,3 @@ -import numcodecs.registry as numcodecs_registry - from zarr.abc.codec import CodecJSON_V2 from zarr.codecs._v2 import Numcodec @@ -22,16 +20,11 @@ def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: Examples -------- - >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec = get_codec({'id': 'zlib', 'level': 1}) >>> codec Zlib(level=1) """ - codec_id = data["id"] - cls = numcodecs_registry.codec_registry.get(codec_id) - if cls is None and data in numcodecs_registry.entries: - cls = numcodecs_registry.entries[data].load() - numcodecs_registry.register_codec(cls, codec_id=data) - if cls is not None: - return cls.from_config({k: v for 
k, v in data.items() if k != "id"}) # type: ignore[no-any-return] - raise KeyError(data) + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] From b35e6c96ac2b4ee48457ec5d5482dd1e813696c7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:15:37 +0200 Subject: [PATCH 32/54] avoid circular imports by importing lower-level routines exactly where needed --- src/zarr/abc/store.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1fbdb3146c..31e9728f8a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,10 +6,6 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.common import concurrent_map -from zarr.core.config import config - if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -438,6 +434,8 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. + from zarr.core.buffer.core import default_buffer_prototype + value = await self.get(key, prototype=default_buffer_prototype()) if value is None: raise FileNotFoundError(key) @@ -476,6 +474,9 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). 
+ from zarr.core.common import concurrent_map + from zarr.core.config import config + keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) From deef94ae9ffc7505fb8b496fa140ee669f7b23a1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:16:26 +0200 Subject: [PATCH 33/54] push numcodec prototol into abcs; remove all numcodecs.abc.Codec type annotations --- src/zarr/abc/numcodec.py | 59 +++++++++++++++++++ src/zarr/api/asynchronous.py | 2 +- src/zarr/api/synchronous.py | 2 +- src/zarr/codecs/_numcodecs.py | 30 ---------- src/zarr/codecs/_v2.py | 59 +------------------ src/zarr/core/_info.py | 7 +-- src/zarr/core/array.py | 47 +++++++-------- src/zarr/core/metadata/v2.py | 39 ++++++------ src/zarr/registry.py | 30 ++++++++++ tests/test_array.py | 16 ++--- tests/test_codecs/test_numcodecs.py | 4 +- .../test_v2_dtype_regression.py | 8 +-- 12 files changed, 153 insertions(+), 150 deletions(-) create mode 100644 src/zarr/abc/numcodec.py delete mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py new file mode 100644 index 0000000000..d2c9380146 --- /dev/null +++ b/src/zarr/abc/numcodec.py @@ -0,0 +1,59 @@ +from typing import Self, TypeGuard + +from typing_extensions import Protocol + +from zarr.abc.codec import CodecJSON_V2 +from zarr.core.buffer import Buffer, NDBuffer + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... 
+ + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. 
+ """ + return _is_numcodec_cls(type(obj)) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 861557b5f3..a044ba8594 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -53,7 +53,7 @@ from collections.abc import Iterable from zarr.abc.codec import Codec - from zarr.codecs._v2 import Numcodec + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 6db173d5d4..50a1c0fa20 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -19,8 +19,8 @@ import numpy.typing as npt from zarr.abc.codec import Codec + from zarr.abc.numcodec import Numcodec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py deleted file mode 100644 index b00f258db5..0000000000 --- a/src/zarr/codecs/_numcodecs.py +++ /dev/null @@ -1,30 +0,0 @@ -from zarr.abc.codec import CodecJSON_V2 -from zarr.codecs._v2 import Numcodec - - -def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: - """ - Resolve a numcodec codec from the numcodecs registry. - - This requires the Numcodecs package to be installed. - - Parameters - ---------- - data : CodecJSON_V2 - The JSON metadata for the codec. 
- - Returns - ------- - codec : Numcodec - - Examples - -------- - - >>> codec = get_codec({'id': 'zlib', 'level': 1}) - >>> codec - Zlib(level=1) - """ - - from numcodecs.registry import get_codec - - return get_codec(data) # type: ignore[no-any-return] diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8deae99a6d..92eda38226 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,73 +2,20 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... - - -def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: - """ - Check if the given object implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. 
- """ - return _is_numcodec_cls(type(obj)) - - -def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: - """ - Check if the given object is a class implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. - """ - return ( - isinstance(obj, type) - and hasattr(obj, "codec_id") - and isinstance(obj.codec_id, str) - and hasattr(obj, "encode") - and callable(obj.encode) - and hasattr(obj, "decode") - and callable(obj.decode) - and hasattr(obj, "get_config") - and callable(obj.get_config) - and hasattr(obj, "from_config") - and callable(obj.from_config) - ) - - @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): filters: tuple[Numcodec, ...] | None diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..fef424346a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,9 +5,8 @@ from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: - import numcodecs.abc - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.abc.numcodec import Numcodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -88,9 +87,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = () _serializer: ArrayBytesCodec | None = None - _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () + _compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1d939771cb..71de2f58f5 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,15 +19,14 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import Numcodec, V2Codec +from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -1033,7 +1032,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1061,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.", category=ZarrDeprecationWarning) - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1074,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. 
Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2227,7 +2226,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2242,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.", category=ZarrDeprecationWarning) - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2253,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
@@ -3900,15 +3899,13 @@ def _build_parents( FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] @@ -4775,7 +4772,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ Given a data type, return the default filters for that data type. @@ -4797,7 +4794,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: """ Given a data type, return the default compressors for that data type. @@ -4805,7 +4802,7 @@ def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: """ from numcodecs import Zstd - return Zstd(level=0, checksum=False) + return Zstd(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4813,12 +4810,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. 
""" - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Numcodec, ...] | None + _compressor: Numcodec | None if compressor is None or compressor == (): _compressor = None @@ -4839,7 +4836,7 @@ def _parse_chunk_encoding_v2( else: if isinstance(filters, Iterable): for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): + if not _is_numcodec(f): msg = ( "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." @@ -4852,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.get_config() for f in _filters], # type: ignore[arg-type] + _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] ) ) if object_codec_id is None: @@ -4944,7 +4941,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) # type: ignore[assignment] + compressors = (compressor,) elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 9ad6b3bc42..934befec91 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,13 +5,13 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - from zarr.abc.metadata import Metadata +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import 
get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 from zarr.errors import ZarrUserWarning +from zarr.registry import get_numcodec if TYPE_CHECKING: from typing import Literal, Self @@ -31,7 +31,6 @@ import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -57,7 +56,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -67,9 +66,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Numcodec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Numcodec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -83,7 +82,7 @@ def __init__( order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + filters: Iterable[Numcodec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -198,12 +197,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + if _is_numcodec(zarray_dict["compressor"]): codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - 
codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config + codec_config.pop("checksum") # type: ignore[typeddict-item] + zarray_dict["compressor"] = codec_config # type: ignore[assignment] if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -213,11 +212,11 @@ def to_dict(self) -> dict[str, JSON]: raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): + if _is_numcodec(f): new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters + zarray_dict["filters"] = new_filters # type: ignore[assignment] # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: @@ -263,20 +262,20 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[Numcodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Numcodec] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if _is_numcodec(val): out.append(val) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + out.append(get_numcodec(val)) # type: ignore[arg-type] else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -286,20 +285,20 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if _is_numcodec(data): return (data,) msg = f"Invalid filters. 
Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Numcodec | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + if data is None or _is_numcodec(data): return data if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_numcodec(data) # type: ignore[arg-type] msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index fc3ffd7f7c..46216205f7 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -17,8 +17,10 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON_V2, CodecPipeline, ) + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -280,3 +282,31 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: _collect_entrypoints() + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. 
+ + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] diff --git a/tests/test_array.py b/tests/test_array.py index 46b78de7bf..6a11682f98 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -73,6 +73,7 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr.abc.codec import CodecJSON_V3 from zarr.core.metadata.v3 import ArrayV3Metadata @@ -1322,11 +1323,11 @@ async def test_v2_chunk_encoding( assert arr.metadata.filters == filters_expected # Normalize for property getters - compressor_expected = () if compressor_expected is None else (compressor_expected,) - filters_expected = () if filters_expected is None else filters_expected + arr_compressors_expected = () if compressor_expected is None else (compressor_expected,) + arr_filters_expected = () if filters_expected is None else filters_expected - assert arr.compressors == compressor_expected - assert arr.filters == filters_expected + assert arr.compressors == arr_compressors_expected + assert arr.filters == arr_filters_expected @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) @@ -1364,11 +1365,12 @@ async def test_default_filters_compressors( if default_filters is None: expected_filters = () else: - expected_filters = default_filters + expected_filters = default_filters # type: ignore[assignment] + if default_compressors is None: expected_compressors = () else: - expected_compressors = (default_compressors,) + expected_compressors = (default_compressors,) # type: ignore[assignment] expected_serializer = None else: raise ValueError(f"Invalid zarr_format: {zarr_format}") @@ -1672,7 +1674,7 @@ def test_roundtrip_numcodecs() -> None: {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, {"name": "numcodecs.zlib", "configuration": 
{"level": 4}}, ] - filters = [ + filters: list[CodecJSON_V3] = [ { "name": "numcodecs.fixedscaleoffset", "configuration": { diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index bb381c615a..1c4d550587 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -2,8 +2,8 @@ from numcodecs import GZip -from zarr.codecs._numcodecs import get_numcodec -from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls +from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.registry import get_numcodec def test_get_numcodec() -> None: diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 9702ca7d23..ffe273490d 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal -import numcodecs import numpy as np import pytest from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd @@ -13,6 +12,7 @@ import zarr.abc import zarr.abc.codec import zarr.codecs as zarrcodecs +from zarr.abc.numcodec import Numcodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.bytes import VariableLengthBytes @@ -40,12 +40,12 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () + filters: tuple[Numcodec, ...] = () serializer: str | None = None - compressor: numcodecs.abc.Codec + compressor: Numcodec -basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_codecs: tuple[Numcodec, ...] 
= GZip(), Blosc(), LZ4(), LZMA(), Zstd() basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" string_dtypes = "U4" From f057525adf2bbcad643d6fe9bfc6d8328f299130 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:32:56 +0200 Subject: [PATCH 34/54] add tests for codecjson typeguard --- tests/test_abc/__init__.py | 0 tests/test_abc/test_codec.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 tests/test_abc/__init__.py create mode 100644 tests/test_abc/test_codec.py diff --git a/tests/test_abc/__init__.py b/tests/test_abc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_abc/test_codec.py b/tests/test_abc/test_codec.py new file mode 100644 index 0000000000..e0f9ddb7bb --- /dev/null +++ b/tests/test_abc/test_codec.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from zarr.abc.codec import _check_codecjson_v2 + + +def test_check_codecjson_v2_valid() -> None: + """ + Test that the _check_codecjson_v2 function works + """ + assert _check_codecjson_v2({"id": "gzip"}) + assert not _check_codecjson_v2({"id": 10}) + assert not _check_codecjson_v2([10, 11]) From 190e1b2744e6305db6fe0c54001d48d76d57b18f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 17:01:52 +0200 Subject: [PATCH 35/54] avoid using zarr's buffer / ndbuffer for numcodec encode / decode --- src/zarr/abc/numcodec.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index d2c9380146..db6ff4655a 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,9 +1,8 @@ from typing import Self, TypeGuard -from typing_extensions import Protocol +from typing_extensions import Buffer, Protocol from zarr.abc.codec import CodecJSON_V2 -from zarr.core.buffer import Buffer, NDBuffer class Numcodec(Protocol): @@ -13,11 +12,9 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: 
Buffer | NDBuffer) -> Buffer | NDBuffer: ... + def encode(self, buf: Buffer) -> Buffer: ... - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... + def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... def get_config(self) -> CodecJSON_V2[str]: ... From 82992c5710e9a83a266dffdce1a20d3f5a18606c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 19:12:35 +0200 Subject: [PATCH 36/54] use Any to model input / output types of numcodec protocol --- src/zarr/abc/numcodec.py | 14 ++++++-------- src/zarr/codecs/_v2.py | 14 ++++++-------- src/zarr/core/array.py | 6 +++--- src/zarr/core/metadata/v2.py | 6 +++--- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index db6ff4655a..c671428388 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,8 +1,6 @@ -from typing import Self, TypeGuard +from typing import Any, Self, TypeGuard -from typing_extensions import Buffer, Protocol - -from zarr.abc.codec import CodecJSON_V2 +from typing_extensions import Protocol class Numcodec(Protocol): @@ -12,14 +10,14 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer) -> Buffer: ... + def encode(self, buf: Any) -> Any: ... - def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... + def decode(self, buf: Any, out: Any | None = None) -> Any: ... - def get_config(self) -> CodecJSON_V2[str]: ... + def get_config(self) -> Any: ... @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + def from_config(cls, config: Any) -> Self: ... 
def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 92eda38226..3c6c99c21c 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -31,9 +31,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] + chunk = await asyncio.to_thread(self.compressor.decode, cdata) else: - chunk = cdata # type: ignore[assignment] + chunk = cdata # apply filters if self.filters: @@ -54,7 +54,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -83,18 +83,16 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] - + chunk = await asyncio.to_thread(f.encode, chunk) # check object encoding if ensure_ndarray_like(chunk).dtype == object: raise RuntimeError("cannot write object array without object codec") # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] + cdata = await asyncio.to_thread(self.compressor.encode, chunk) else: - cdata = chunk # type: ignore[assignment] - + cdata = chunk cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 71de2f58f5..9e84e68d46 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4849,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = 
get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] + object_codec_id = get_object_codec_id((_compressor.get_config(),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], # type: ignore[arg-type] - _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] + *[f.get_config() for f in _filters], + _compressor.get_config() if _compressor is not None else None, ) ) if object_codec_id is None: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 934befec91..efc6bd7949 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -201,8 +201,8 @@ def to_dict(self) -> dict[str, JSON]: codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") # type: ignore[typeddict-item] - zarray_dict["compressor"] = codec_config # type: ignore[assignment] + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -216,7 +216,7 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters # type: ignore[assignment] + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: From 0183eb58ff833050b8f8202547ace4924225136d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 18:38:09 +0200 Subject: [PATCH 37/54] bring in contents of numcodecs.zarr3 --- src/zarr/codecs/_numcodecs.py | 351 ++++++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..7d70a9f67e 
--- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,351 @@ +""" +This module provides the compatibility for :py:mod:`numcodecs` in Zarr version 3. + +A compatibility module is required because the codec handling in Zarr version 3 is different from Zarr version 2. + +You can use codecs from :py:mod:`numcodecs` by constructing codecs from :py:mod:`numcodecs.zarr3` using the same parameters as the original codecs. + +>>> import zarr +>>> import numcodecs.zarr3 +>>> +>>> array = zarr.create_array( +... store="data.zarr", +... shape=(1024, 1024), +... chunks=(64, 64), +... dtype="uint32", +... filters=[numcodecs.zarr3.Delta()], +... compressors=[numcodecs.zarr3.BZ2(level=5)]) +>>> array[:] = np.arange(*array.shape).astype(array.dtype) + +.. note:: + + Please note that the codecs in :py:mod:`numcodecs.zarr3` are not part of the Zarr version 3 specification. + Using these codecs might cause interoperability issues with other Zarr implementations. +""" + +from __future__ import annotations + +import asyncio +import math +from dataclasses import dataclass, replace +from functools import cached_property +from typing import TYPE_CHECKING, Any, Self +from warnings import warn + +import numpy as np + +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.abc.metadata import Metadata +from zarr.core.buffer.cpu import as_numpy_array_wrapper +from zarr.core.common import JSON, parse_named_configuration, product +from zarr.dtype import UInt8, ZDType, parse_dtype +from zarr.errors import ZarrUserWarning +from zarr.registry import get_numcodec + +if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer + +CODEC_PREFIX = "numcodecs." + + +def _expect_name_prefix(codec_name: str) -> str: + if not codec_name.startswith(CODEC_PREFIX): + raise ValueError( + f"Expected name to start with '{CODEC_PREFIX}'. Got {codec_name} instead." 
+ ) # pragma: no cover + return codec_name.removeprefix(CODEC_PREFIX) + + +def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]: + parsed_name, parsed_configuration = parse_named_configuration(data) + if not parsed_name.startswith(CODEC_PREFIX): + raise ValueError( + f"Expected name to start with '{CODEC_PREFIX}'. Got {parsed_name} instead." + ) # pragma: no cover + id = _expect_name_prefix(parsed_name) + return {"id": id, **parsed_configuration} + + +@dataclass(frozen=True) +class _NumcodecsCodec(Metadata): + codec_name: str + codec_config: dict[str, JSON] + + def __init_subclass__(cls, *, codec_name: str | None = None, **kwargs: Any) -> None: + """To be used only when creating the actual public-facing codec class.""" + super().__init_subclass__(**kwargs) + if codec_name is not None: + namespace = codec_name + + cls_name = f"{CODEC_PREFIX}{namespace}.{cls.__name__}" + cls.codec_name = f"{CODEC_PREFIX}{namespace}" + cls.__doc__ = f""" + See :class:`{cls_name}` for more details and parameters. + """ + + def __init__(self, **codec_config: JSON) -> None: + if not self.codec_name: + raise ValueError( + "The codec name needs to be supplied through the `codec_name` attribute." + ) # pragma: no cover + unprefixed_codec_name = _expect_name_prefix(self.codec_name) + + if "id" not in codec_config: + codec_config = {"id": unprefixed_codec_name, **codec_config} + elif codec_config["id"] != unprefixed_codec_name: + raise ValueError( + f"Codec id does not match {unprefixed_codec_name}. Got: {codec_config['id']}." 
+ ) # pragma: no cover + + object.__setattr__(self, "codec_config", codec_config) + warn( + "Numcodecs codecs are not in the Zarr version 3 specification and " + "may not be supported by other zarr implementations.", + category=ZarrUserWarning, + stacklevel=2, + ) + + @cached_property + def _codec(self) -> Numcodec: + return get_numcodec(self.codec_config) # type: ignore[arg-type] + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + codec_config = _parse_codec_configuration(data) + return cls(**codec_config) + + def to_dict(self) -> dict[str, JSON]: + codec_config = self.codec_config.copy() + codec_config.pop("id", None) + return { + "name": self.codec_name, + "configuration": codec_config, + } + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError # pragma: no cover + + # Override __repr__ because dynamically constructed classes don't seem to work otherwise + def __repr__(self) -> str: + codec_config = self.codec_config.copy() + codec_config.pop("id", None) + return f"{self.__class__.__name__}(codec_name={self.codec_name!r}, codec_config={codec_config!r})" + + +class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread( + as_numpy_array_wrapper, + self._codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self._codec.encode(chunk_data.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, 
chunk_spec.prototype) + + +class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) + + +class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self._codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) + + +# bytes-to-bytes codecs +class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"): + pass + + +class LZ4(_NumcodecsBytesBytesCodec, codec_name="lz4"): + pass + + +class Zstd(_NumcodecsBytesBytesCodec, codec_name="zstd"): + pass + + +class Zlib(_NumcodecsBytesBytesCodec, codec_name="zlib"): + pass + + +class GZip(_NumcodecsBytesBytesCodec, codec_name="gzip"): + pass + + +class BZ2(_NumcodecsBytesBytesCodec, codec_name="bz2"): + pass + + +class LZMA(_NumcodecsBytesBytesCodec, codec_name="lzma"): + pass + + +class 
Shuffle(_NumcodecsBytesBytesCodec, codec_name="shuffle"): + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle: + if self.codec_config.get("elementsize") is None: + dtype = array_spec.dtype.to_native_dtype() + return Shuffle(**{**self.codec_config, "elementsize": dtype.itemsize}) + return self # pragma: no cover + + +# array-to-array codecs ("filters") +class Delta(_NumcodecsArrayArrayCodec, codec_name="delta"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + if astype := self.codec_config.get("astype"): + dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] + return replace(chunk_spec, dtype=dtype) + return chunk_spec + + +class BitRound(_NumcodecsArrayArrayCodec, codec_name="bitround"): + pass + + +class FixedScaleOffset(_NumcodecsArrayArrayCodec, codec_name="fixedscaleoffset"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + if astype := self.codec_config.get("astype"): + dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] + return replace(chunk_spec, dtype=dtype) + return chunk_spec + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset: + if self.codec_config.get("dtype") is None: + dtype = array_spec.dtype.to_native_dtype() + return FixedScaleOffset(**{**self.codec_config, "dtype": str(dtype)}) + return self + + +class Quantize(_NumcodecsArrayArrayCodec, codec_name="quantize"): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Quantize: + if self.codec_config.get("dtype") is None: + dtype = array_spec.dtype.to_native_dtype() + return Quantize(**{**self.codec_config, "dtype": str(dtype)}) + return self + + +class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + return replace( + chunk_spec, + shape=(1 + math.ceil(product(chunk_spec.shape) / 
8),), + dtype=UInt8(), + ) + + # todo: remove this type: ignore when this class can be defined w.r.t. + # a single zarr dtype API + def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None: + # this is bugged and will fail + _dtype = dtype.to_native_dtype() + if _dtype != np.dtype("bool"): + raise ValueError(f"Packbits filter requires bool dtype. Got {dtype}.") + + +class AsType(_NumcodecsArrayArrayCodec, codec_name="astype"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + dtype = parse_dtype(np.dtype(self.codec_config["encode_dtype"]), zarr_format=3) # type: ignore[arg-type] + return replace(chunk_spec, dtype=dtype) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> AsType: + if self.codec_config.get("decode_dtype") is None: + # TODO: remove these coverage exemptions the correct way, i.e. with tests + dtype = array_spec.dtype.to_native_dtype() # pragma: no cover + return AsType(**{**self.codec_config, "decode_dtype": str(dtype)}) # pragma: no cover + return self + + +# bytes-to-bytes checksum codecs +class _NumcodecsChecksumCodec(_NumcodecsBytesBytesCodec): + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + return input_byte_length + 4 # pragma: no cover + + +class CRC32(_NumcodecsChecksumCodec, codec_name="crc32"): + pass + + +class CRC32C(_NumcodecsChecksumCodec, codec_name="crc32c"): + pass + + +class Adler32(_NumcodecsChecksumCodec, codec_name="adler32"): + pass + + +class Fletcher32(_NumcodecsChecksumCodec, codec_name="fletcher32"): + pass + + +class JenkinsLookup3(_NumcodecsChecksumCodec, codec_name="jenkins_lookup3"): + pass + + +# array-to-bytes codecs +class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"): + pass + + +class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"): + pass + + +__all__ = [ + "BZ2", + "CRC32", + "CRC32C", + "LZ4", + "LZMA", + "ZFPY", + "Adler32", + "AsType", + "BitRound", + "Blosc", + "Delta", + "FixedScaleOffset", + "Fletcher32", + 
"GZip", + "JenkinsLookup3", + "PCodec", + "PackBits", + "Quantize", + "Shuffle", + "Zlib", + "Zstd", +] From 5f06c9e0ae95fcfef997f028b5bf426d36f7d330 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 21:34:29 +0200 Subject: [PATCH 38/54] fix tests --- src/zarr/codecs/_numcodecs.py | 25 ++- src/zarr/core/config.py | 21 ++ src/zarr/registry.py | 1 - tests/test_array.py | 4 +- tests/test_codecs/test_numcodecs.py | 325 ++++++++++++++++++++++++++++ 5 files changed, 372 insertions(+), 4 deletions(-) diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 7d70a9f67e..0f84f57594 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -40,7 +40,7 @@ from zarr.core.common import JSON, parse_named_configuration, product from zarr.dtype import UInt8, ZDType, parse_dtype from zarr.errors import ZarrUserWarning -from zarr.registry import get_numcodec +from zarr.registry import get_numcodec, register_codec if TYPE_CHECKING: from zarr.abc.numcodec import Numcodec @@ -326,6 +326,29 @@ class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"): pass +# TODO: move the codec registration outside this module +register_codec("numcodecs.bz2", BZ2) +register_codec("numcodecs.crc32", CRC32) +register_codec("numcodecs.crc32c", CRC32C) +register_codec("numcodecs.lz4", LZ4) +register_codec("numcodecs.lzma", LZMA) +register_codec("numcodecs.zfpy", ZFPY) +register_codec("numcodecs.adler32", Adler32) +register_codec("numcodecs.astype", AsType) +register_codec("numcodecs.bitround", BitRound) +register_codec("numcodecs.blosc", Blosc) +register_codec("numcodecs.delta", Delta) +register_codec("numcodecs.fixedscaleoffset", FixedScaleOffset) +register_codec("numcodecs.fletcher32", Fletcher32) +register_codec("numcodecs.gzip", GZip) +register_codec("numcodecs.jenkins_lookup3", JenkinsLookup3) +register_codec("numcodecs.pcodec", PCodec) +register_codec("numcodecs.packbits", PackBits) +register_codec("numcodecs.quantize", Quantize) 
+register_codec("numcodecs.shuffle", Shuffle) +register_codec("numcodecs.zlib", Zlib) +register_codec("numcodecs.zstd", Zstd) + __all__ = [ "BZ2", "CRC32", diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index cc3c33cd17..b955788642 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -125,6 +125,27 @@ def enable_gpu(self) -> ConfigSet: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs._numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs._numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 46216205f7..11f2be234a 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -155,7 +155,6 @@ def get_codec_class(key: str, reload_config: 
bool = False) -> type[Codec]: codec_classes = __codec_registries[key] if not codec_classes: raise KeyError(key) - config_entry = config.get("codecs", {}).get(key) if config_entry is None: if len(codec_classes) == 1: diff --git a/tests/test_array.py b/tests/test_array.py index a316ee127f..742166a229 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1713,7 +1713,7 @@ def test_roundtrip_numcodecs() -> None: # Create the array with the correct codecs root = zarr.group(store) warn_msg = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." - with pytest.warns(UserWarning, match=warn_msg): + with pytest.warns(ZarrUserWarning, match=warn_msg): root.create_array( "test", shape=(720, 1440), @@ -1728,7 +1728,7 @@ def test_roundtrip_numcodecs() -> None: BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} # Read in the array again and check compressor config root = zarr.open_group(store) - with pytest.warns(UserWarning, match=warn_msg): + with pytest.warns(ZarrUserWarning, match=warn_msg): metadata = root["test"].metadata.to_dict() expected = (*filters, BYTES_CODEC, *compressors) assert metadata["codecs"] == expected diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index 1c4d550587..ee01bdc85f 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -1,10 +1,56 @@ from __future__ import annotations +import contextlib +import pickle +from typing import TYPE_CHECKING, Any + +import numpy as np +import pytest from numcodecs import GZip +from zarr import config, create_array, open_array from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.codecs import _numcodecs +from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec +if TYPE_CHECKING: + from collections.abc import Iterator + + +@contextlib.contextmanager +def codec_conf() -> Iterator[Any]: + base_conf = 
config.get("codecs") + new_conf = { + "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs._numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs._numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", + "numcodecs.jenkinslookup3": "zarr.codecs._numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", + } + + yield config.set({"codecs": new_conf | base_conf}) + + +if TYPE_CHECKING: + from zarr.core.common import JSON + def test_get_numcodec() -> None: assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] @@ -22,3 +68,282 @@ def test_is_numcodec_cls() -> None: Test the _is_numcodec_cls function """ assert _is_numcodec_cls(GZip) + + +EXPECTED_WARNING_STR = "Numcodecs codecs are not in the Zarr version 3.*" + +ALL_CODECS = [getattr(_numcodecs, cls_name) for cls_name in _numcodecs.__all__] + + +@pytest.mark.parametrize("codec_class", ALL_CODECS) +def test_docstring(codec_class: type[_numcodecs._NumcodecsCodec]) -> None: + assert "See :class:`numcodecs." 
in codec_class.__doc__ # type: ignore[operator] + + +@pytest.mark.parametrize( + "codec_class", + [ + _numcodecs.Blosc, + _numcodecs.LZ4, + _numcodecs.Zstd, + _numcodecs.Zlib, + _numcodecs.GZip, + _numcodecs.BZ2, + _numcodecs.LZMA, + _numcodecs.Shuffle, + ], +) +def test_generic_compressor(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) + + a[:, :] = data.copy() + np.testing.assert_array_equal(data, a[:, :]) + + +@pytest.mark.parametrize( + ("codec_class", "codec_config"), + [ + (_numcodecs.Delta, {"dtype": "float32"}), + (_numcodecs.FixedScaleOffset, {"offset": 0, "scale": 25.5}), + (_numcodecs.FixedScaleOffset, {"offset": 0, "scale": 51, "astype": "uint16"}), + (_numcodecs.AsType, {"encode_dtype": "float32", "decode_dtype": "float32"}), + ], + ids=[ + "delta", + "fixedscaleoffset", + "fixedscaleoffset2", + "astype", + ], +) +def test_generic_filter( + codec_class: type[_numcodecs._NumcodecsArrayArrayCodec], + codec_config: dict[str, JSON], +) -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + codec_class(**codec_config), + ], + ) + + a[:, :] = data.copy() + with codec_conf(): + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +def test_generic_filter_bitround() -> None: + data = np.linspace(0, 1, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + 
dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.BitRound(keepbits=3)], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + assert np.allclose(data, b[:, :], atol=0.1) + + +def test_generic_filter_quantize() -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.Quantize(digits=3)], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + assert np.allclose(data, b[:, :], atol=0.001) + + +def test_generic_filter_packbits() -> None: + data = np.zeros((16, 16), dtype="bool") + data[0:4, :] = True + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.PackBits()], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + with pytest.raises(ValueError, match=".*requires bool dtype.*"): + create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype="uint32", + fill_value=0, + filters=[_numcodecs.PackBits()], + ) + + +@pytest.mark.parametrize( + "codec_class", + [ + _numcodecs.CRC32, + _numcodecs.CRC32C, + _numcodecs.Adler32, + _numcodecs.Fletcher32, + _numcodecs.JenkinsLookup3, + ], +) +def test_generic_checksum(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 
16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) + + a[:, :] = data.copy() + with codec_conf(): + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +@pytest.mark.parametrize("codec_class", [_numcodecs.PCodec, _numcodecs.ZFPY]) +def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCodec]) -> None: + try: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec_class()._codec # noqa: B018 + except ValueError as e: # pragma: no cover + if "codec not available" in str(e): + pytest.xfail(f"{codec_class.codec_name} is not available: {e}") + else: + raise + except ImportError as e: # pragma: no cover + pytest.xfail(f"{codec_class.codec_name} is not available: {e}") + + data = np.arange(0, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + serializer=codec_class(), + ) + + a[:, :] = data.copy() + np.testing.assert_array_equal(data, a[:, :]) + + +def test_delta_astype() -> None: + data = np.linspace(0, 10, 256, dtype="i8").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + _numcodecs.Delta(dtype="i8", astype="i2"), + ], + ) + + a[:, :] = data.copy() + with codec_conf(): + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +def test_repr() -> None: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec = _numcodecs.LZ4(level=5) + assert repr(codec) == "LZ4(codec_name='numcodecs.lz4', codec_config={'level': 5})" + + +def test_to_dict() -> None: + with pytest.warns(ZarrUserWarning, 
match=EXPECTED_WARNING_STR): + codec = _numcodecs.LZ4(level=5) + assert codec.to_dict() == {"name": "numcodecs.lz4", "configuration": {"level": 5}} + + +@pytest.mark.parametrize( + "codec_cls", + [ + _numcodecs.Blosc, + _numcodecs.LZ4, + _numcodecs.Zstd, + _numcodecs.Zlib, + _numcodecs.GZip, + _numcodecs.BZ2, + _numcodecs.LZMA, + _numcodecs.Shuffle, + _numcodecs.BitRound, + _numcodecs.Delta, + _numcodecs.FixedScaleOffset, + _numcodecs.Quantize, + _numcodecs.PackBits, + _numcodecs.AsType, + _numcodecs.CRC32, + _numcodecs.CRC32C, + _numcodecs.Adler32, + _numcodecs.Fletcher32, + _numcodecs.JenkinsLookup3, + _numcodecs.PCodec, + _numcodecs.ZFPY, + ], +) +def test_codecs_pickleable(codec_cls: type[_numcodecs._NumcodecsCodec]) -> None: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec = codec_cls() + + expected = codec + + p = pickle.dumps(codec) + actual = pickle.loads(p) + assert actual == expected From e64830f9dae15f4bf9bdd27591ce27a1ced7722e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 21:38:23 +0200 Subject: [PATCH 39/54] fix docs --- docs/user-guide/config.rst | 42 ++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 0ae8017ca9..e87187d63b 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -42,23 +42,43 @@ requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec This is the current default configuration:: >>> zarr.config.pprint() - {'array': {'order': 'C', - 'write_empty_chunks': False}, - 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.buffer.cpu.Buffer', - 'codec_pipeline': {'batch_size': 1, - 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, - 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + {'array': {'order': 'C', 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 
'zarr.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', 'bytes': 'zarr.codecs.bytes.BytesCodec', 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', 'endian': 'zarr.codecs.bytes.BytesCodec', 'gzip': 'zarr.codecs.gzip.GzipCodec', + 'numcodecs.adler32': 'zarr.codecs._numcodecs.Adler32', + 'numcodecs.astype': 'zarr.codecs._numcodecs.AsType', + 'numcodecs.bitround': 'zarr.codecs._numcodecs.BitRound', + 'numcodecs.blosc': 'zarr.codecs._numcodecs.Blosc', + 'numcodecs.bz2': 'zarr.codecs._numcodecs.BZ2', + 'numcodecs.crc32': 'zarr.codecs._numcodecs.CRC32', + 'numcodecs.crc32c': 'zarr.codecs._numcodecs.CRC32C', + 'numcodecs.delta': 'zarr.codecs._numcodecs.Delta', + 'numcodecs.fixedscaleoffset': 'zarr.codecs._numcodecs.FixedScaleOffset', + 'numcodecs.fletcher32': 'zarr.codecs._numcodecs.Fletcher32', + 'numcodecs.gZip': 'zarr.codecs._numcodecs.GZip', + 'numcodecs.jenkins_lookup3': 'zarr.codecs._numcodecs.JenkinsLookup3', + 'numcodecs.lz4': 'zarr.codecs._numcodecs.LZ4', + 'numcodecs.lzma': 'zarr.codecs._numcodecs.LZMA', + 'numcodecs.packbits': 'zarr.codecs._numcodecs.PackBits', + 'numcodecs.pcodec': 'zarr.codecs._numcodecs.PCodec', + 'numcodecs.quantize': 'zarr.codecs._numcodecs.Quantize', + 'numcodecs.shuffle': 'zarr.codecs._numcodecs.Shuffle', + 'numcodecs.zfpy': 'zarr.codecs._numcodecs.ZFPY', + 'numcodecs.zlib': 'zarr.codecs._numcodecs.Zlib', + 'numcodecs.zstd': 'zarr.codecs._numcodecs.Zstd', 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', 'transpose': 'zarr.codecs.transpose.TransposeCodec', 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, - 'default_zarr_format': 3, - 'json_indent': 2, - 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', - 'threading': {'max_workers': None}} + 'default_zarr_format': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', + 
'threading': {'max_workers': None}} From b6b2260a953e38bd4e432508b2063c5beda11703 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 22:01:25 +0200 Subject: [PATCH 40/54] fix config test --- tests/test_config.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_config.py b/tests/test_config.py index 0c029dda3a..2d308c2816 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -73,6 +73,27 @@ def test_config_defaults_set() -> None: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs._numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs._numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", From 223ae65806dbfa93f08f16dafae5ee6457c879d8 Mon Sep 17 00:00:00 2001 From: Davis 
Vann Bennett Date: Mon, 18 Aug 2025 21:42:31 +0200 Subject: [PATCH 41/54] make zarr.codecs.numcodecs --- src/zarr/codecs/numcodecs/__init__.py | 80 +++++++++++++++++++ .../{_numcodecs.py => numcodecs/_codecs.py} | 63 ++------------- tests/test_codecs/test_numcodecs.py | 2 +- 3 files changed, 88 insertions(+), 57 deletions(-) create mode 100644 src/zarr/codecs/numcodecs/__init__.py rename src/zarr/codecs/{_numcodecs.py => numcodecs/_codecs.py} (85%) diff --git a/src/zarr/codecs/numcodecs/__init__.py b/src/zarr/codecs/numcodecs/__init__.py new file mode 100644 index 0000000000..5f5bebea13 --- /dev/null +++ b/src/zarr/codecs/numcodecs/__init__.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from zarr.codecs.numcodecs._codecs import ( + BZ2, + CRC32, + CRC32C, + LZ4, + LZMA, + ZFPY, + Adler32, + AsType, + BitRound, + Blosc, + Delta, + FixedScaleOffset, + Fletcher32, + GZip, + JenkinsLookup3, + PackBits, + PCodec, + Quantize, + Shuffle, + Zlib, + Zstd, + _NumcodecsArrayArrayCodec, + _NumcodecsArrayBytesCodec, + _NumcodecsBytesBytesCodec, + _NumcodecsCodec, +) +from zarr.registry import register_codec + +register_codec("numcodecs.bz2", BZ2) +register_codec("numcodecs.crc32", CRC32) +register_codec("numcodecs.crc32c", CRC32C) +register_codec("numcodecs.lz4", LZ4) +register_codec("numcodecs.lzma", LZMA) +register_codec("numcodecs.zfpy", ZFPY) +register_codec("numcodecs.adler32", Adler32) +register_codec("numcodecs.astype", AsType) +register_codec("numcodecs.bitround", BitRound) +register_codec("numcodecs.blosc", Blosc) +register_codec("numcodecs.delta", Delta) +register_codec("numcodecs.fixedscaleoffset", FixedScaleOffset) +register_codec("numcodecs.fletcher32", Fletcher32) +register_codec("numcodecs.gzip", GZip) +register_codec("numcodecs.jenkins_lookup3", JenkinsLookup3) +register_codec("numcodecs.pcodec", PCodec) +register_codec("numcodecs.packbits", PackBits) +register_codec("numcodecs.quantize", Quantize) +register_codec("numcodecs.shuffle", Shuffle) 
+register_codec("numcodecs.zlib", Zlib) +register_codec("numcodecs.zstd", Zstd) + +__all__ = [ + "BZ2", + "CRC32", + "CRC32C", + "LZ4", + "LZMA", + "ZFPY", + "Adler32", + "AsType", + "BitRound", + "Blosc", + "Delta", + "FixedScaleOffset", + "Fletcher32", + "GZip", + "JenkinsLookup3", + "PCodec", + "PackBits", + "Quantize", + "Shuffle", + "Zlib", + "Zstd", + "_NumcodecsArrayArrayCodec", + "_NumcodecsArrayBytesCodec", + "_NumcodecsBytesBytesCodec", + "_NumcodecsCodec", +] diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/numcodecs/_codecs.py similarity index 85% rename from src/zarr/codecs/_numcodecs.py rename to src/zarr/codecs/numcodecs/_codecs.py index 0f84f57594..c7884700c7 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -1,12 +1,10 @@ """ -This module provides the compatibility for :py:mod:`numcodecs` in Zarr version 3. +This module provides compatibility for :py:mod:`numcodecs` in Zarr version 3. -A compatibility module is required because the codec handling in Zarr version 3 is different from Zarr version 2. - -You can use codecs from :py:mod:`numcodecs` by constructing codecs from :py:mod:`numcodecs.zarr3` using the same parameters as the original codecs. +These codecs were previously defined in :py:mod:`numcodecs`, and have now been moved to `zarr`. >>> import zarr ->>> import numcodecs.zarr3 +>>> import zarr.codecs.numcodecs as numcodecs >>> >>> array = zarr.create_array( ... store="data.zarr", @@ -19,8 +17,9 @@ .. note:: - Please note that the codecs in :py:mod:`numcodecs.zarr3` are not part of the Zarr version 3 specification. - Using these codecs might cause interoperability issues with other Zarr implementations. + Please note that the codecs in :py:mod:`zarr.codecs.numcodecs` are not part of the Zarr version + 3 specification. Using these codecs might cause interoperability issues with other Zarr + implementations. 
""" from __future__ import annotations @@ -40,7 +39,7 @@ from zarr.core.common import JSON, parse_named_configuration, product from zarr.dtype import UInt8, ZDType, parse_dtype from zarr.errors import ZarrUserWarning -from zarr.registry import get_numcodec, register_codec +from zarr.registry import get_numcodec if TYPE_CHECKING: from zarr.abc.numcodec import Numcodec @@ -324,51 +323,3 @@ class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"): class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"): pass - - -# TODO: move the codec registration outside this module -register_codec("numcodecs.bz2", BZ2) -register_codec("numcodecs.crc32", CRC32) -register_codec("numcodecs.crc32c", CRC32C) -register_codec("numcodecs.lz4", LZ4) -register_codec("numcodecs.lzma", LZMA) -register_codec("numcodecs.zfpy", ZFPY) -register_codec("numcodecs.adler32", Adler32) -register_codec("numcodecs.astype", AsType) -register_codec("numcodecs.bitround", BitRound) -register_codec("numcodecs.blosc", Blosc) -register_codec("numcodecs.delta", Delta) -register_codec("numcodecs.fixedscaleoffset", FixedScaleOffset) -register_codec("numcodecs.fletcher32", Fletcher32) -register_codec("numcodecs.gzip", GZip) -register_codec("numcodecs.jenkins_lookup3", JenkinsLookup3) -register_codec("numcodecs.pcodec", PCodec) -register_codec("numcodecs.packbits", PackBits) -register_codec("numcodecs.quantize", Quantize) -register_codec("numcodecs.shuffle", Shuffle) -register_codec("numcodecs.zlib", Zlib) -register_codec("numcodecs.zstd", Zstd) - -__all__ = [ - "BZ2", - "CRC32", - "CRC32C", - "LZ4", - "LZMA", - "ZFPY", - "Adler32", - "AsType", - "BitRound", - "Blosc", - "Delta", - "FixedScaleOffset", - "Fletcher32", - "GZip", - "JenkinsLookup3", - "PCodec", - "PackBits", - "Quantize", - "Shuffle", - "Zlib", - "Zstd", -] diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index ee01bdc85f..f823b60c06 100644 --- a/tests/test_codecs/test_numcodecs.py +++ 
b/tests/test_codecs/test_numcodecs.py @@ -10,7 +10,7 @@ from zarr import config, create_array, open_array from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls -from zarr.codecs import _numcodecs +from zarr.codecs import numcodecs as _numcodecs from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec From 1720a758be90a0ccc8ab676d88676fb64cdfabbb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 19 Aug 2025 19:25:35 +0200 Subject: [PATCH 42/54] complete move to zarr.codecs.numcodecs --- docs/user-guide/config.rst | 42 +++++++++++----------- src/zarr/codecs/numcodecs/__init__.py | 48 ++++++++++++++----------- src/zarr/core/config.py | 42 +++++++++++----------- src/zarr/registry.py | 4 +-- tests/test_codecs/test_numcodecs.py | 52 +++++++++++++++------------ tests/test_config.py | 42 +++++++++++----------- 6 files changed, 122 insertions(+), 108 deletions(-) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index e87187d63b..05cfa1e53c 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -52,27 +52,27 @@ This is the current default configuration:: 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', 'endian': 'zarr.codecs.bytes.BytesCodec', 'gzip': 'zarr.codecs.gzip.GzipCodec', - 'numcodecs.adler32': 'zarr.codecs._numcodecs.Adler32', - 'numcodecs.astype': 'zarr.codecs._numcodecs.AsType', - 'numcodecs.bitround': 'zarr.codecs._numcodecs.BitRound', - 'numcodecs.blosc': 'zarr.codecs._numcodecs.Blosc', - 'numcodecs.bz2': 'zarr.codecs._numcodecs.BZ2', - 'numcodecs.crc32': 'zarr.codecs._numcodecs.CRC32', - 'numcodecs.crc32c': 'zarr.codecs._numcodecs.CRC32C', - 'numcodecs.delta': 'zarr.codecs._numcodecs.Delta', - 'numcodecs.fixedscaleoffset': 'zarr.codecs._numcodecs.FixedScaleOffset', - 'numcodecs.fletcher32': 'zarr.codecs._numcodecs.Fletcher32', - 'numcodecs.gZip': 'zarr.codecs._numcodecs.GZip', - 'numcodecs.jenkins_lookup3': 'zarr.codecs._numcodecs.JenkinsLookup3', - 'numcodecs.lz4': 
'zarr.codecs._numcodecs.LZ4', - 'numcodecs.lzma': 'zarr.codecs._numcodecs.LZMA', - 'numcodecs.packbits': 'zarr.codecs._numcodecs.PackBits', - 'numcodecs.pcodec': 'zarr.codecs._numcodecs.PCodec', - 'numcodecs.quantize': 'zarr.codecs._numcodecs.Quantize', - 'numcodecs.shuffle': 'zarr.codecs._numcodecs.Shuffle', - 'numcodecs.zfpy': 'zarr.codecs._numcodecs.ZFPY', - 'numcodecs.zlib': 'zarr.codecs._numcodecs.Zlib', - 'numcodecs.zstd': 'zarr.codecs._numcodecs.Zstd', + 'numcodecs.adler32': 'zarr.codecs.numcodecs.Adler32', + 'numcodecs.astype': 'zarr.codecs.numcodecs.AsType', + 'numcodecs.bitround': 'zarr.codecs.numcodecs.BitRound', + 'numcodecs.blosc': 'zarr.codecs.numcodecs.Blosc', + 'numcodecs.bz2': 'zarr.codecs.numcodecs.BZ2', + 'numcodecs.crc32': 'zarr.codecs.numcodecs.CRC32', + 'numcodecs.crc32c': 'zarr.codecs.numcodecs.CRC32C', + 'numcodecs.delta': 'zarr.codecs.numcodecs.Delta', + 'numcodecs.fixedscaleoffset': 'zarr.codecs.numcodecs.FixedScaleOffset', + 'numcodecs.fletcher32': 'zarr.codecs.numcodecs.Fletcher32', + 'numcodecs.gZip': 'zarr.codecs.numcodecs.GZip', + 'numcodecs.jenkins_lookup3': 'zarr.codecs.numcodecs.JenkinsLookup3', + 'numcodecs.lz4': 'zarr.codecs.numcodecs.LZ4', + 'numcodecs.lzma': 'zarr.codecs.numcodecs.LZMA', + 'numcodecs.packbits': 'zarr.codecs.numcodecs.PackBits', + 'numcodecs.pcodec': 'zarr.codecs.numcodecs.PCodec', + 'numcodecs.quantize': 'zarr.codecs.numcodecs.Quantize', + 'numcodecs.shuffle': 'zarr.codecs.numcodecs.Shuffle', + 'numcodecs.zfpy': 'zarr.codecs.numcodecs.ZFPY', + 'numcodecs.zlib': 'zarr.codecs.numcodecs.Zlib', + 'numcodecs.zstd': 'zarr.codecs.numcodecs.Zstd', 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', 'transpose': 'zarr.codecs.transpose.TransposeCodec', 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', diff --git a/src/zarr/codecs/numcodecs/__init__.py b/src/zarr/codecs/numcodecs/__init__.py index 5f5bebea13..08ccf535de 100644 --- a/src/zarr/codecs/numcodecs/__init__.py +++ 
b/src/zarr/codecs/numcodecs/__init__.py @@ -29,27 +29,33 @@ ) from zarr.registry import register_codec -register_codec("numcodecs.bz2", BZ2) -register_codec("numcodecs.crc32", CRC32) -register_codec("numcodecs.crc32c", CRC32C) -register_codec("numcodecs.lz4", LZ4) -register_codec("numcodecs.lzma", LZMA) -register_codec("numcodecs.zfpy", ZFPY) -register_codec("numcodecs.adler32", Adler32) -register_codec("numcodecs.astype", AsType) -register_codec("numcodecs.bitround", BitRound) -register_codec("numcodecs.blosc", Blosc) -register_codec("numcodecs.delta", Delta) -register_codec("numcodecs.fixedscaleoffset", FixedScaleOffset) -register_codec("numcodecs.fletcher32", Fletcher32) -register_codec("numcodecs.gzip", GZip) -register_codec("numcodecs.jenkins_lookup3", JenkinsLookup3) -register_codec("numcodecs.pcodec", PCodec) -register_codec("numcodecs.packbits", PackBits) -register_codec("numcodecs.quantize", Quantize) -register_codec("numcodecs.shuffle", Shuffle) -register_codec("numcodecs.zlib", Zlib) -register_codec("numcodecs.zstd", Zstd) +register_codec("numcodecs.bz2", BZ2, qualname="zarr.codecs.numcodecs.BZ2") +register_codec("numcodecs.crc32", CRC32, qualname="zarr.codecs.numcodecs.CRC32") +register_codec("numcodecs.crc32c", CRC32C, qualname="zarr.codecs.numcodecs.CRC32C") +register_codec("numcodecs.lz4", LZ4, qualname="zarr.codecs.numcodecs.LZ4") +register_codec("numcodecs.lzma", LZMA, qualname="zarr.codecs.numcodecs.LZMA") +register_codec("numcodecs.zfpy", ZFPY, qualname="zarr.codecs.numcodecs.ZFPY") +register_codec("numcodecs.adler32", Adler32, qualname="zarr.codecs.numcodecs.Adler32") +register_codec("numcodecs.astype", AsType, qualname="zarr.codecs.numcodecs.AsType") +register_codec("numcodecs.bitround", BitRound, qualname="zarr.codecs.numcodecs.BitRound") +register_codec("numcodecs.blosc", Blosc, qualname="zarr.codecs.numcodecs.Blosc") +register_codec("numcodecs.delta", Delta, qualname="zarr.codecs.numcodecs.Delta") +register_codec( + 
"numcodecs.fixedscaleoffset", + FixedScaleOffset, + qualname="zarr.codecs.numcodecs.FixedScaleOffset", +) +register_codec("numcodecs.fletcher32", Fletcher32, qualname="zarr.codecs.numcodecs.Fletcher32") +register_codec("numcodecs.gzip", GZip, qualname="zarr.codecs.numcodecs.GZip") +register_codec( + "numcodecs.jenkins_lookup3", JenkinsLookup3, qualname="zarr.codecs.numcodecs.JenkinsLookup3" +) +register_codec("numcodecs.pcodec", PCodec, qualname="zarr.codecs.numcodecs.pcodec") +register_codec("numcodecs.packbits", PackBits, qualname="zarr.codecs.numcodecs.PackBits") +register_codec("numcodecs.quantize", Quantize, qualname="zarr.codecs.numcodecs.Quantize") +register_codec("numcodecs.shuffle", Shuffle, qualname="zarr.codecs.numcodecs.Shuffle") +register_codec("numcodecs.zlib", Zlib, qualname="zarr.codecs.numcodecs.Zlib") +register_codec("numcodecs.zstd", Zstd, qualname="zarr.codecs.numcodecs.Zstd") __all__ = [ "BZ2", diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index b955788642..2b7fbbe0c6 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -125,27 +125,27 @@ def enable_gpu(self) -> ConfigSet: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs._numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs._numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": 
"zarr.codecs._numcodecs.Fletcher32", - "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", - "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", - "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 11f2be234a..5483b65c54 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -126,10 +126,10 @@ def fully_qualified_name(cls: type) -> str: return module + "." 
+ cls.__qualname__ -def register_codec(key: str, codec_cls: type[Codec]) -> None: +def register_codec(key: str, codec_cls: type[Codec], *, qualname: str | None = None) -> None: if key not in __codec_registries: __codec_registries[key] = Registry() - __codec_registries[key].register(codec_cls) + __codec_registries[key].register(codec_cls, qualname=qualname) def register_pipeline(pipe_cls: type[CodecPipeline]) -> None: diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index f823b60c06..99c4685a67 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -22,27 +22,27 @@ def codec_conf() -> Iterator[Any]: base_conf = config.get("codecs") new_conf = { - "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs._numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs._numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", - "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", - "numcodecs.jenkinslookup3": "zarr.codecs._numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", - "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": 
"zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkinslookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", } yield config.set({"codecs": new_conf | base_conf}) @@ -72,11 +72,19 @@ def test_is_numcodec_cls() -> None: EXPECTED_WARNING_STR = "Numcodecs codecs are not in the Zarr version 3.*" -ALL_CODECS = [getattr(_numcodecs, cls_name) for cls_name in _numcodecs.__all__] +ALL_CODECS = tuple( + filter( + lambda v: isinstance(v, _numcodecs._NumcodecsCodec), + tuple(getattr(_numcodecs, cls_name) for cls_name in _numcodecs.__all__), + ) +) @pytest.mark.parametrize("codec_class", ALL_CODECS) def test_docstring(codec_class: type[_numcodecs._NumcodecsCodec]) -> None: + """ + Test that the docstring for the zarr.numcodecs codecs references the wrapped numcodecs class. + """ assert "See :class:`numcodecs." 
in codec_class.__doc__ # type: ignore[operator] diff --git a/tests/test_config.py b/tests/test_config.py index 2d308c2816..60e541efd3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -73,27 +73,27 @@ def test_config_defaults_set() -> None: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs._numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs._numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", - "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", - "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", - "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": 
"zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", From 9ac07dad6e23cd3d691017acce206860d27cdfbb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 10:17:55 +0200 Subject: [PATCH 43/54] adjust sharding JSON model --- src/zarr/codecs/sharding.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index f0ec45ae01..e44a4461b7 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -105,7 +105,16 @@ class ShardingJSON_V2(ShardingConfigV2): id: ReadOnly[Literal["sharding_indexed"]] -class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): + """ + The JSON form of sharding codec for Zarr V3. + + Attributes + ---------- + name : Literal["sharding_indexed"] + The name of the sharding codec. 
+ configuration : ShardingConfigV3 + """ class ShardingCodecIndexLocation(Enum): From 90a7acab128f10ee4071f360ffc70d6aacae210e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 10:20:21 +0200 Subject: [PATCH 44/54] fix info tests --- tests/test_array.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 6e6d7eca7e..8376f9600e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,6 +35,7 @@ _parse_chunk_encoding_v3, chunks_initialized, create_array, + default_compressor_v2, default_filters_v2, default_serializer_v3, ) @@ -470,14 +471,14 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _zarr_format=2, _data_type=arr._async_array._zdtype, _fill_value=arr.fill_value, - _shape=(8, 8), + _shape=arr.shape, _chunk_shape=chunks, _shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=arr.compressors, ) assert result == expected @@ -488,14 +489,14 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _zarr_format=3, _data_type=arr._async_array._zdtype, _fill_value=arr.fill_value, - _shape=(8, 8), + _shape=arr.shape, _chunk_shape=chunks, _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _compressors=(ZstdCodec(),), - _serializer=BytesCodec(), + _compressors=arr.compressors, + _serializer=arr.serializer, _count_bytes=512, ) assert result == expected From d64a5d2994cdd02200743d1cb430dc16d1b8e64a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 10:48:05 +0200 Subject: [PATCH 45/54] remove redundant declaration --- src/zarr/abc/codec.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 4c4ed524c6..bdede01b35 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -518,22 +518,3 @@ async 
def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return await func(chunk, chunk_spec) return wrap - - -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... From 7895f8f0062e3ec9fa40a16c07fc1de461b98f52 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 10:48:31 +0200 Subject: [PATCH 46/54] make test_info more robust --- tests/test_array.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 8376f9600e..4229d0b9f0 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -551,14 +551,14 @@ async def test_info_v2_async( _zarr_format=2, _data_type=Float64(), _fill_value=arr.metadata.fill_value, - _shape=(8, 8), - _chunk_shape=(2, 2), + _shape=arr.shape, + _chunk_shape=arr.chunks, _shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=arr.compressors, ) assert result == expected @@ -583,8 +583,8 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _compressors=(ZstdCodec(),), - _serializer=BytesCodec(), + _compressors=arr.compressors, + _serializer=arr.serializer, _count_bytes=512, ) assert result == expected From 41459eb958713ff6ca5ff3db9273536aeeb2ec7c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 10:48:58 +0200 Subject: [PATCH 47/54] fix imports and type checking --- src/zarr/core/array.py | 10 ++++------ src/zarr/core/metadata/v2.py | 4 ++-- src/zarr/registry.py | 12 +++++++----- 3 files changed, 13 
insertions(+), 13 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e249e9e58a..f2b8d22ad1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -24,7 +24,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec -from zarr.abc.numcodec import Numcodec +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete from zarr.codecs.bytes import BytesCodec from zarr.codecs.transpose import TransposeCodec @@ -4716,15 +4716,13 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> BytesBytesCodec: """ Given a data type, return the default compressors for that data type. This is just the ``Zstd`` codec. """ - from numcodecs import Zstd - - return Zstd(level=0, checksum=False) # type: ignore[no-any-return] + return ZstdCodec(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4823,7 +4821,7 @@ def _parse_chunk_encoding_v3( out_bytes_bytes = default_compressors_v3(dtype) else: maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] - if isinstance(compressors, dict | Codec | Numcodec): + if isinstance(compressors, dict | Codec) or _is_numcodec(compressors): maybe_bytes_bytes = (compressors,) else: maybe_bytes_bytes = compressors # type: ignore[assignment] diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 5f5b13af94..e267541dd2 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,9 +5,9 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -from zarr.abc.codec import ArrayArrayCodec, Codec, Numcodec +from zarr.abc.codec import ArrayArrayCodec, Codec from zarr.abc.metadata import Metadata -from zarr.abc.numcodec import _is_numcodec +from 
zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid diff --git a/src/zarr/registry.py b/src/zarr/registry.py index e3ca422697..6ad54e282f 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -232,7 +232,8 @@ def _parse_bytes_bytes_codec( """ # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec - from zarr.codecs._v2 import Numcodec, NumcodecsBytesBytesCodec, NumcodecsWrapper + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsBytesBytesCodec, NumcodecsWrapper result: BytesBytesCodec if isinstance(data, dict): @@ -242,7 +243,7 @@ def _parse_bytes_bytes_codec( if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) - elif isinstance(data, Numcodec): + elif _is_numcodec(data): return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): @@ -269,7 +270,7 @@ def _parse_array_bytes_codec( if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) - elif isinstance(data, Numcodec): + elif _is_numcodec(data): return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): @@ -287,7 +288,8 @@ def _parse_array_array_codec( is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. 
""" from zarr.abc.codec import ArrayArrayCodec - from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec, NumcodecsWrapper + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsArrayArrayCodec, NumcodecsWrapper if isinstance(data, dict): result = get_codec(data, zarr_format=zarr_format) @@ -296,7 +298,7 @@ def _parse_array_array_codec( elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) - elif isinstance(data, Numcodec): + elif _is_numcodec(data): return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): From 096cc1aa1296223ea4619d5ff7d8dba534259898 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 11:35:24 +0200 Subject: [PATCH 48/54] add temporary default implementations of json ser/de serialization --- src/zarr/abc/codec.py | 10 ++++++---- src/zarr/registry.py | 9 +++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index bdede01b35..b454702251 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -202,12 +202,14 @@ def to_json( raise NotImplementedError @classmethod - def _from_json_v2(cls, data: CodecJSON) -> Self: - raise NotImplementedError + def _from_json_v2(cls, data: CodecJSON_V2[str]) -> Self: + return cls(**{k: v for k, v in data.items() if k != "id"}) @classmethod - def _from_json_v3(cls, data: CodecJSON) -> Self: - raise NotImplementedError + def _from_json_v3(cls, data: CodecJSON_V3) -> Self: + if isinstance(data, str): + return cls() + return cls(**data["configuration"]) @classmethod def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 6ad54e282f..69246ccafd 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -192,21 +192,25 @@ def get_codec(request: 
CodecJSON, *, zarr_format: ZarrFormat) -> Codec | Numcode ) else: codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} elif zarr_format == 3: if isinstance(request, str): codec_name = request + codec_config = {} else: codec_name = request["name"] + codec_config = request["configuration"] else: raise ValueError( f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" ) # pragma: no cover + try: codec_cls = get_codec_class(codec_name) return codec_cls.from_json(request, zarr_format=zarr_format) except KeyError: # if we can't find the codec in the zarr python registry, try the numcodecs registry - codec = get_numcodec(request) + codec = get_numcodec({'id': codec_name, **codec_config}) return NumcodecsWrapper(codec=codec) @@ -261,7 +265,8 @@ def _parse_array_bytes_codec( is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayBytesCodec - from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec, NumcodecsWrapper + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): result = get_codec(data, zarr_format=zarr_format) From de9e3ddd10bca1b16d6aefe448c724d4b3fd8db9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 11:35:49 +0200 Subject: [PATCH 49/54] temporarily skip codec refinement in v2 metadata --- src/zarr/core/metadata/v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index e267541dd2..6aa7fd46f5 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -116,9 +116,11 @@ def __init__( prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
) if compressor_parsed is not None: - compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + pass + # compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) if filters_parsed is not None: - filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) + pass + # filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) From d5e0461e4802a7d092d1af48fd05c44a21e78ac5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 11:52:02 +0200 Subject: [PATCH 50/54] changelog --- changes/3376.misc.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changes/3376.misc.rst diff --git a/changes/3376.misc.rst b/changes/3376.misc.rst new file mode 100644 index 0000000000..68b0d9d855 --- /dev/null +++ b/changes/3376.misc.rst @@ -0,0 +1,3 @@ +Define Zarr V3-specific codecs from numcodecs inside this repo. These codecs can be found in +:mod:`zarr.codecs.numcodecs`. This is necessary to resolve a circular dependency between Zarr +and Numcodecs. 
\ No newline at end of file From 0a724e6cc15d057481a00dc06ea786c922ecc35b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 13:03:07 +0200 Subject: [PATCH 51/54] register codecs in codecs/__init__.py --- src/zarr/codecs/__init__.py | 67 +++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 165dbe476d..7cb4e2a44d 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -4,10 +4,34 @@ from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.numcodecs import ( + BZ2, + CRC32, + CRC32C, + LZ4, + LZMA, + ZFPY, + Adler32, + AsType, + BitRound, + Blosc, + Delta, + FixedScaleOffset, + Fletcher32, + GZip, + JenkinsLookup3, + PackBits, + PCodec, + Quantize, + Shuffle, + Zlib, + Zstd, +) from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec +from zarr.registry import register_codec __all__ = [ "BloscCname", @@ -24,3 +48,46 @@ "VLenUTF8Codec", "ZstdCodec", ] + +register_codec("blosc", BloscCodec) +register_codec("bytes", BytesCodec) + +# compatibility with earlier versions of ZEP1 +register_codec("endian", BytesCodec) +register_codec("crc32c", Crc32cCodec) +register_codec("gzip", GzipCodec) +register_codec("sharding_indexed", ShardingCodec) +register_codec("zstd", ZstdCodec) +register_codec("vlen-utf8", VLenUTF8Codec) +register_codec("vlen-bytes", VLenBytesCodec) +register_codec("transpose", TransposeCodec) + +# Register all the codecs formerly contained in numcodecs.zarr3 + +register_codec("numcodecs.bz2", BZ2, qualname="zarr.codecs.numcodecs.BZ2") +register_codec("numcodecs.crc32", CRC32, qualname="zarr.codecs.numcodecs.CRC32") +register_codec("numcodecs.crc32c", CRC32C, 
qualname="zarr.codecs.numcodecs.CRC32C") +register_codec("numcodecs.lz4", LZ4, qualname="zarr.codecs.numcodecs.LZ4") +register_codec("numcodecs.lzma", LZMA, qualname="zarr.codecs.numcodecs.LZMA") +register_codec("numcodecs.zfpy", ZFPY, qualname="zarr.codecs.numcodecs.ZFPY") +register_codec("numcodecs.adler32", Adler32, qualname="zarr.codecs.numcodecs.Adler32") +register_codec("numcodecs.astype", AsType, qualname="zarr.codecs.numcodecs.AsType") +register_codec("numcodecs.bitround", BitRound, qualname="zarr.codecs.numcodecs.BitRound") +register_codec("numcodecs.blosc", Blosc, qualname="zarr.codecs.numcodecs.Blosc") +register_codec("numcodecs.delta", Delta, qualname="zarr.codecs.numcodecs.Delta") +register_codec( + "numcodecs.fixedscaleoffset", + FixedScaleOffset, + qualname="zarr.codecs.numcodecs.FixedScaleOffset", +) +register_codec("numcodecs.fletcher32", Fletcher32, qualname="zarr.codecs.numcodecs.Fletcher32") +register_codec("numcodecs.gzip", GZip, qualname="zarr.codecs.numcodecs.GZip") +register_codec( + "numcodecs.jenkins_lookup3", JenkinsLookup3, qualname="zarr.codecs.numcodecs.JenkinsLookup3" +) +register_codec("numcodecs.pcodec", PCodec, qualname="zarr.codecs.numcodecs.pcodec") +register_codec("numcodecs.packbits", PackBits, qualname="zarr.codecs.numcodecs.PackBits") +register_codec("numcodecs.quantize", Quantize, qualname="zarr.codecs.numcodecs.Quantize") +register_codec("numcodecs.shuffle", Shuffle, qualname="zarr.codecs.numcodecs.Shuffle") +register_codec("numcodecs.zlib", Zlib, qualname="zarr.codecs.numcodecs.Zlib") +register_codec("numcodecs.zstd", Zstd, qualname="zarr.codecs.numcodecs.Zstd") From f023487a587ec45311406d5b43d11cff146128b0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 13:04:24 +0200 Subject: [PATCH 52/54] remove old registration sites --- src/zarr/codecs/blosc.py | 4 ---- src/zarr/codecs/bytes.py | 7 ------- src/zarr/codecs/crc32c_.py | 4 ---- src/zarr/codecs/gzip.py | 4 ---- 
src/zarr/codecs/numcodecs/__init__.py | 29 --------------------------- src/zarr/codecs/sharding.py | 5 +---- src/zarr/codecs/transpose.py | 4 ---- src/zarr/codecs/vlen_utf8.py | 5 ----- src/zarr/codecs/zstd.py | 4 ---- 9 files changed, 1 insertion(+), 65 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f89f127852..6a482ed6e5 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -14,7 +14,6 @@ from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasItemSize -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -199,6 +198,3 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError - - -register_codec("blosc", BloscCodec) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 7576119c82..39c26bd4a8 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -11,7 +11,6 @@ from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasEndianness -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -119,9 +118,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - - -register_codec("bytes", BytesCodec) - -# compatibility with earlier versions of ZEP1 -register_codec("endian", BytesCodec) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index c2e30f689a..b2ea356b0c 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -9,7 +9,6 @@ from zarr.abc.codec import BytesBytesCodec from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec if TYPE_CHECKING: from 
typing import Self @@ -65,6 +64,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 - - -register_codec("crc32c", Crc32cCodec) diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 9e6515a4d1..610ca9dadd 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -9,7 +9,6 @@ from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -73,6 +72,3 @@ def compute_encoded_size( _chunk_spec: ArraySpec, ) -> int: raise NotImplementedError - - -register_codec("gzip", GzipCodec) diff --git a/src/zarr/codecs/numcodecs/__init__.py b/src/zarr/codecs/numcodecs/__init__.py index 08ccf535de..d68ad3fba6 100644 --- a/src/zarr/codecs/numcodecs/__init__.py +++ b/src/zarr/codecs/numcodecs/__init__.py @@ -27,35 +27,6 @@ _NumcodecsBytesBytesCodec, _NumcodecsCodec, ) -from zarr.registry import register_codec - -register_codec("numcodecs.bz2", BZ2, qualname="zarr.codecs.numcodecs.BZ2") -register_codec("numcodecs.crc32", CRC32, qualname="zarr.codecs.numcodecs.CRC32") -register_codec("numcodecs.crc32c", CRC32C, qualname="zarr.codecs.numcodecs.CRC32C") -register_codec("numcodecs.lz4", LZ4, qualname="zarr.codecs.numcodecs.LZ4") -register_codec("numcodecs.lzma", LZMA, qualname="zarr.codecs.numcodecs.LZMA") -register_codec("numcodecs.zfpy", ZFPY, qualname="zarr.codecs.numcodecs.ZFPY") -register_codec("numcodecs.adler32", Adler32, qualname="zarr.codecs.numcodecs.Adler32") -register_codec("numcodecs.astype", AsType, qualname="zarr.codecs.numcodecs.AsType") -register_codec("numcodecs.bitround", BitRound, qualname="zarr.codecs.numcodecs.BitRound") -register_codec("numcodecs.blosc", Blosc, qualname="zarr.codecs.numcodecs.Blosc") -register_codec("numcodecs.delta", Delta, 
qualname="zarr.codecs.numcodecs.Delta") -register_codec( - "numcodecs.fixedscaleoffset", - FixedScaleOffset, - qualname="zarr.codecs.numcodecs.FixedScaleOffset", -) -register_codec("numcodecs.fletcher32", Fletcher32, qualname="zarr.codecs.numcodecs.Fletcher32") -register_codec("numcodecs.gzip", GZip, qualname="zarr.codecs.numcodecs.GZip") -register_codec( - "numcodecs.jenkins_lookup3", JenkinsLookup3, qualname="zarr.codecs.numcodecs.JenkinsLookup3" -) -register_codec("numcodecs.pcodec", PCodec, qualname="zarr.codecs.numcodecs.pcodec") -register_codec("numcodecs.packbits", PackBits, qualname="zarr.codecs.numcodecs.PackBits") -register_codec("numcodecs.quantize", Quantize, qualname="zarr.codecs.numcodecs.Quantize") -register_codec("numcodecs.shuffle", Shuffle, qualname="zarr.codecs.numcodecs.Shuffle") -register_codec("numcodecs.zlib", Zlib, qualname="zarr.codecs.numcodecs.Zlib") -register_codec("numcodecs.zstd", Zstd, qualname="zarr.codecs.numcodecs.Zstd") __all__ = [ "BZ2", diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 58d34f62e7..ecbd258ccf 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -51,7 +51,7 @@ morton_order_iter, ) from zarr.core.metadata.v3 import parse_codecs -from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec +from zarr.registry import get_ndbuffer_class, get_pipeline_class if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -760,6 +760,3 @@ async def _load_full_shard_maybe( def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int: chunks_per_shard = self._get_chunks_per_shard(shard_spec) return input_byte_length + self._shard_index_size(chunks_per_shard) - - -register_codec("sharding_indexed", ShardingCodec) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 92e7da81e1..a8570b6e8f 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -9,7 +9,6 @@ from 
zarr.abc.codec import ArrayArrayCodec from zarr.core.array_spec import ArraySpec from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -113,6 +112,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - - -register_codec("transpose", TransposeCodec) diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 28c64be1c0..fa1a229855 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -9,7 +9,6 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -112,7 +111,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: # what is input_byte_length for an object dtype? 
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs") - - -register_codec("vlen-utf8", VLenUTF8Codec) -register_codec("vlen-bytes", VLenBytesCodec) diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index ead41e7b5f..27cc9a7777 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -12,7 +12,6 @@ from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -92,6 +91,3 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError - - -register_codec("zstd", ZstdCodec) From 6636c10f4b166d8db0f57e8df7aa9b1f7f41bfdc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 18:32:41 +0200 Subject: [PATCH 53/54] wip --- src/zarr/abc/codec.py | 3 +- src/zarr/codecs/_v2.py | 36 +++++ src/zarr/codecs/blosc.py | 218 ++++++++++++++++++-------- src/zarr/codecs/numcodecs/__init__.py | 27 ++++ src/zarr/codecs/numcodecs/_codecs.py | 31 ++-- src/zarr/core/codec_pipeline.py | 47 +----- src/zarr/core/metadata/v2.py | 4 + src/zarr/registry.py | 6 +- tests/test_array.py | 10 +- tests/test_codecs/test_numcodecs.py | 3 +- 10 files changed, 254 insertions(+), 131 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index b454702251..7c9dc20ae2 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -4,7 +4,6 @@ from collections.abc import Mapping from typing import ( TYPE_CHECKING, - ClassVar, Generic, Literal, Self, @@ -14,7 +13,7 @@ overload, ) -from typing_extensions import Protocol, ReadOnly +from typing_extensions import ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8ea1143771..6a826559bb 100644 --- 
a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -14,6 +14,8 @@ CodecJSON, CodecJSON_V2, ) +from zarr.core.chunk_grids import ChunkGrid +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: @@ -141,6 +143,40 @@ def _from_json_v3(cls, data: CodecJSON) -> Self: def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: raise NotImplementedError + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + """Fills in codec configuration parameters that can be automatically + inferred from the array metadata. + + Parameters + ---------- + array_spec : ArraySpec + + Returns + ------- + Self + """ + return self + + def validate( + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGrid, + ) -> None: + """Validates that the codec configuration is compatible with the array metadata. + Raises errors when the codec configuration is not compatible. + + Parameters + ---------- + shape : tuple[int, ...] + The array shape + dtype : np.dtype[Any] + The array data type + chunk_grid : ChunkGrid + The array chunk grid + """ + def to_array_array(self) -> NumcodecsArrayArrayCodec: """ Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. 
diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 6a482ed6e5..2da1e6697b 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,19 +1,32 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Final, + Literal, + NotRequired, + TypedDict, + TypeGuard, + overload, +) import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version - -from zarr.abc.codec import BytesBytesCodec -from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from typing_extensions import ReadOnly + +from zarr.abc.codec import BytesBytesCodec, CodecJSON +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.core.dtype.common import HasItemSize +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self @@ -21,39 +34,66 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") + +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") + + +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int + typesize: NotRequired[int] -class BloscShuffle(Enum): + +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: int + shuffle: BloscShuffle + blocksize: int + typesize: int + + +class BloscJSON_V2(BloscConfigV2): """ - Enum for shuffle filter used by blosc. + The JSON form of the Blosc codec in Zarr V2. 
""" - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" - - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] + id: ReadOnly[Literal["blosc"]] -class BloscCname(Enum): +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V3. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"} + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) + + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.") + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -84,31 +124,35 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. 
Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): - """blosc codec""" - is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -119,24 +163,74 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname.value, + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), 
+ ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and blocksize need to be set for encoding.") + if zarr_format == 2: + return { + "id": "blosc", "clevel": self.clevel, - "shuffle": self.shuffle.value, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, - }, - } + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize, + }, + } + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 @@ -146,10 +240,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec @@ -157,15 +248,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 @@ -178,6 +264,8 @@ async def _decode_single( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper + return await asyncio.to_thread( as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype ) diff --git a/src/zarr/codecs/numcodecs/__init__.py b/src/zarr/codecs/numcodecs/__init__.py index d68ad3fba6..cd0541797a 100644 --- a/src/zarr/codecs/numcodecs/__init__.py +++ b/src/zarr/codecs/numcodecs/__init__.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Final + from zarr.codecs.numcodecs._codecs import ( BZ2, CRC32, @@ -28,6 +30,31 @@ _NumcodecsCodec, ) +# This is a fixed dictionary of numcodecs codecs for which we have pre-made Zarr V3 wrappers +numcodecs_wrappers: Final[dict[str, type[_NumcodecsCodec]]] = { + "bz2": BZ2, + "crc32": 
CRC32, + "crc32c": CRC32C, + "lz4": LZ4, + "lzma": LZMA, + "zfpy": ZFPY, + "adler32": Adler32, + "astype": AsType, + "bitround": BitRound, + "blosc": Blosc, + "delta": Delta, + "fixedscaleoffset": FixedScaleOffset, + "fletcher32": Fletcher32, + "gzip": GZip, + "jenkins_lookup3": JenkinsLookup3, + "packbits": PackBits, + "pcodec": PCodec, + "quantize": Quantize, + "shuffle": Shuffle, + "zlib": Zlib, + "zstd": Zstd, +} + __all__ = [ "BZ2", "CRC32", diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py index c7884700c7..493736b9b8 100644 --- a/src/zarr/codecs/numcodecs/_codecs.py +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -26,17 +26,18 @@ import asyncio import math +from collections.abc import Mapping from dataclasses import dataclass, replace from functools import cached_property -from typing import TYPE_CHECKING, Any, Self +from typing import TYPE_CHECKING, Any, Final, Literal, Self, overload from warnings import warn import numpy as np -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecJSON_V2 from zarr.abc.metadata import Metadata from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration, product +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration, product from zarr.dtype import UInt8, ZDType, parse_dtype from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec @@ -46,7 +47,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer -CODEC_PREFIX = "numcodecs." +CODEC_PREFIX: Final = "numcodecs." 
def _expect_name_prefix(codec_name: str) -> str: @@ -116,12 +117,22 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: return cls(**codec_config) def to_dict(self) -> dict[str, JSON]: - codec_config = self.codec_config.copy() - codec_config.pop("id", None) - return { - "name": self.codec_name, - "configuration": codec_config, - } + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + codec_id = self.codec_config["id"] + codec_config = {k: v for k, v in self.codec_config.items() if k != "id"} + if zarr_format == 2: + return {"id": codec_id, **codec_config} + else: + return {"name": codec_id, "configuration": codec_config} def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: raise NotImplementedError # pragma: no cover diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 100e0b07d3..1bda0eabf1 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from itertools import islice, pairwise +from itertools import islice from typing import TYPE_CHECKING, Any, TypeVar from warnings import warn @@ -531,6 +531,7 @@ def codecs_from_list( category=ZarrUserWarning, stacklevel=3, ) + if len(array_bytes_idcs) == 0: # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an # array-bytes codec, this is an error. 
@@ -662,49 +663,5 @@ def codecs_from_list( return array_array, array_bytes_maybe, bytes_bytes - for prev_codec, cur_codec in pairwise((None, *codecs)): - if isinstance(cur_codec, ArrayArrayCodec): - if isinstance(prev_codec, ArrayBytesCodec | BytesBytesCodec): - msg = ( - f"Invalid codec order. ArrayArrayCodec {cur_codec}" - "must be preceded by another ArrayArrayCodec. " - f"Got {type(prev_codec)} instead." - ) - raise TypeError(msg) - array_array += (cur_codec,) - - elif isinstance(cur_codec, ArrayBytesCodec): - if isinstance(prev_codec, BytesBytesCodec): - msg = ( - f"Invalid codec order. ArrayBytes codec {cur_codec}" - f" must be preceded by an ArrayArrayCodec. Got {type(prev_codec)} instead." - ) - raise TypeError(msg) - - if array_bytes_maybe is not None: - msg = ( - f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {cur_codec}. " - "Only one array-to-bytes codec is allowed." - ) - raise ValueError(msg) - - array_bytes_maybe = cur_codec - - elif isinstance(cur_codec, BytesBytesCodec): - if isinstance(prev_codec, ArrayArrayCodec): - msg = ( - f"Invalid codec order. BytesBytesCodec {cur_codec}" - "must be preceded by either another BytesBytesCodec, or an ArrayBytesCodec. " - f"Got {type(prev_codec)} instead." 
- ) - bytes_bytes += (cur_codec,) - elif isinstance(cur_codec, NumcodecsWrapper): - raise TypeError - - if array_bytes_maybe is None: - raise ValueError("Required ArrayBytesCodec was not found.") - else: - return array_array, array_bytes_maybe, bytes_bytes - register_pipeline(BatchedCodecPipeline) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6aa7fd46f5..7346e04f93 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -338,6 +338,10 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data +def is_object_codec(codec: JSON) -> bool: + return codec.get("id") in OBJECT_CODEC_IDS + + def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: """ Inspect a sequence of codecs / filters for an "object codec", i.e. a codec diff --git a/src/zarr/registry.py b/src/zarr/registry.py index e74e9f7df8..40d21ceece 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -198,7 +198,7 @@ def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec | Numcode codec_config = {} else: codec_name = request["name"] - codec_config = request["configuration"] + codec_config = request.get("configuration", {}) else: raise ValueError( f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" @@ -209,7 +209,7 @@ def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec | Numcode return codec_cls.from_json(request, zarr_format=zarr_format) except KeyError: # if we can't find the codec in the zarr python registry, try the numcodecs registry - codec = get_numcodec({'id': codec_name, **codec_config}) + codec = get_numcodec({"id": codec_name, **codec_config}) return NumcodecsWrapper(codec=codec) @@ -264,7 +264,7 @@ def _parse_array_bytes_codec( is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. 
""" from zarr.abc.codec import ArrayBytesCodec - from zarr.abc.numcodec import _is_numcodec + from zarr.abc.numcodec import _is_numcodec from zarr.codecs._v2 import NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): diff --git a/tests/test_array.py b/tests/test_array.py index e402d99cc2..89f9f74cdb 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,7 +35,6 @@ _parse_chunk_encoding_v3, chunks_initialized, create_array, - default_compressor_v2, default_filters_v2, default_serializer_v3, ) @@ -1729,9 +1728,12 @@ def test_roundtrip_numcodecs() -> None: BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} # Read in the array again and check compressor config root = zarr.open_group(store) - with pytest.warns(ZarrUserWarning, match=warn_msg): - metadata = root["test"].metadata.to_dict() - expected = (*filters, BYTES_CODEC, *compressors) + metadata = root["test"].metadata.to_dict() + # The names will change because numcodecs. is an alias for + expected = tuple( + {"name": v["name"].removeprefix("numcodecs."), "configuration": v["configuration"]} + for v in (*filters, BYTES_CODEC, *compressors) + ) assert metadata["codecs"] == expected diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index 99c4685a67..a15e59dce4 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -153,8 +153,7 @@ def test_generic_filter( a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) From 53c6b465f450afaa5ce5a809394751f9b09f850e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 21 Aug 2025 20:02:53 +0200 Subject: [PATCH 54/54] wip --- src/zarr/codecs/_v2.py | 27 ++-- src/zarr/codecs/blosc.py | 10 +- src/zarr/codecs/numcodecs/_codecs.py | 154 ++++++++--------------- src/zarr/core/array.py | 65 
++++++++++ tests/test_codecs/test_numcodecs.py | 17 +-- tests/test_metadata/test_consolidated.py | 4 +- 6 files changed, 143 insertions(+), 134 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 6a826559bb..71389f1311 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -1,8 +1,10 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, overload +from functools import cached_property +from typing import TYPE_CHECKING, ClassVar, Literal, Self, overload import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like @@ -112,7 +114,12 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) @dataclass(frozen=True, kw_only=True) class NumcodecsWrapper: - codec: Numcodec + codec_cls: ClassVar[type[Numcodec]] + config: Mapping[str, object] + + @cached_property + def codec(self) -> Numcodec: + return self.codec_cls(**self.config) @overload def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... @@ -121,7 +128,7 @@ def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: if zarr_format == 2: - return self.codec.get_config() + return {"id": self.codec_cls.codec_id, **self.config} elif zarr_format == 3: config = self.codec.get_config() config_no_id = {k: v for k, v in config.items() if k != "id"} @@ -130,15 +137,11 @@ def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[st @classmethod def _from_json_v2(cls, data: CodecJSON) -> Self: - raise NotADirectoryError( - "This class does not support creating instances from JSON data for Zarr format 2." 
- ) + return cls(config=data) @classmethod def _from_json_v3(cls, data: CodecJSON) -> Self: - raise NotImplementedError( - "This class does not support creating instances from JSON data for Zarr format 3." - ) + return cls(config=data.get("configuration", {})) def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: raise NotImplementedError @@ -181,19 +184,19 @@ def to_array_array(self) -> NumcodecsArrayArrayCodec: """ Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. """ - return NumcodecsArrayArrayCodec(codec=self.codec) + return NumcodecsArrayArrayCodec(config=self.config) def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: """ Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec. """ - return NumcodecsBytesBytesCodec(codec=self.codec) + return NumcodecsBytesBytesCodec(config=self.config) def to_array_bytes(self) -> NumcodecsArrayBytesCodec: """ Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec. """ - return NumcodecsArrayBytesCodec(codec=self.codec) + return NumcodecsArrayBytesCodec(config=self.config) class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2da1e6697b..57d44bd783 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -134,10 +134,10 @@ def parse_shuffle(data: object) -> BloscShuffle: class BloscCodec(BytesBytesCodec): is_fixed_size = False - typesize: int | None + typesize: int cname: BloscCname clevel: int - shuffle: BloscShuffle | None + shuffle: BloscShuffle blocksize: int def __init__( @@ -149,10 +149,10 @@ def __init__( shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: - typesize_parsed = parse_typesize(typesize) if typesize is not None else None + typesize_parsed = parse_typesize(typesize) if typesize is not None else 1 cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_shuffle(shuffle) if shuffle is 
not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else "noshuffle" blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -207,8 +207,6 @@ def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: - if self.typesize is None or self.shuffle is None: - raise ValueError("typesize and blocksize need to be set for encoding.") if zarr_format == 2: return { "id": "blosc", diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py index 493736b9b8..52f4718f07 100644 --- a/src/zarr/codecs/numcodecs/_codecs.py +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -30,16 +30,13 @@ from dataclasses import dataclass, replace from functools import cached_property from typing import TYPE_CHECKING, Any, Final, Literal, Self, overload -from warnings import warn import numpy as np from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecJSON_V2 -from zarr.abc.metadata import Metadata from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration, product from zarr.dtype import UInt8, ZDType, parse_dtype -from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec if TYPE_CHECKING: @@ -69,55 +66,12 @@ def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]: @dataclass(frozen=True) -class _NumcodecsCodec(Metadata): - codec_name: str - codec_config: dict[str, JSON] - - def __init_subclass__(cls, *, codec_name: str | None = None, **kwargs: Any) -> None: - """To be used only when creating the actual public-facing codec class.""" - super().__init_subclass__(**kwargs) - if codec_name is not None: - namespace = codec_name - - cls_name = f"{CODEC_PREFIX}{namespace}.{cls.__name__}" - cls.codec_name = 
f"{CODEC_PREFIX}{namespace}" - cls.__doc__ = f""" - See :class:`{cls_name}` for more details and parameters. - """ - - def __init__(self, **codec_config: JSON) -> None: - if not self.codec_name: - raise ValueError( - "The codec name needs to be supplied through the `codec_name` attribute." - ) # pragma: no cover - unprefixed_codec_name = _expect_name_prefix(self.codec_name) - - if "id" not in codec_config: - codec_config = {"id": unprefixed_codec_name, **codec_config} - elif codec_config["id"] != unprefixed_codec_name: - raise ValueError( - f"Codec id does not match {unprefixed_codec_name}. Got: {codec_config['id']}." - ) # pragma: no cover - - object.__setattr__(self, "codec_config", codec_config) - warn( - "Numcodecs codecs are not in the Zarr version 3 specification and " - "may not be supported by other zarr implementations.", - category=ZarrUserWarning, - stacklevel=2, - ) +class _NumcodecsCodec: + codec_cls: type[Numcodec] @cached_property def _codec(self) -> Numcodec: - return get_numcodec(self.codec_config) # type: ignore[arg-type] - - @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: - codec_config = _parse_codec_configuration(data) - return cls(**codec_config) - - def to_dict(self) -> dict[str, JSON]: - return self.to_json(zarr_format=3) + return get_numcodec(self.to_json(zarr_format=2)) # type: ignore[arg-type] @overload def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... 
@@ -134,20 +88,8 @@ def to_json( else: return {"name": codec_id, "configuration": codec_config} - def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: - raise NotImplementedError # pragma: no cover - - # Override __repr__ because dynamically constructed classes don't seem to work otherwise - def __repr__(self) -> str: - codec_config = self.codec_config.copy() - codec_config.pop("id", None) - return f"{self.__class__.__name__}(codec_name={self.codec_name!r}, codec_config={codec_config!r})" - class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec): - def __init__(self, **codec_config: JSON) -> None: - super().__init__(**codec_config) - async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: return await asyncio.to_thread( as_numpy_array_wrapper, @@ -167,9 +109,6 @@ async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buf class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec): - def __init__(self, **codec_config: JSON) -> None: - super().__init__(**codec_config) - async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) @@ -197,35 +136,37 @@ async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> B # bytes-to-bytes codecs -class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"): - pass +class Blosc(_NumcodecsBytesBytesCodec): + codec_name = "blosc" -class LZ4(_NumcodecsBytesBytesCodec, codec_name="lz4"): - pass +class LZ4(_NumcodecsBytesBytesCodec): + codec_name = "lz4" -class Zstd(_NumcodecsBytesBytesCodec, codec_name="zstd"): - pass +class Zstd(_NumcodecsBytesBytesCodec): + codec_name = "zstd" -class Zlib(_NumcodecsBytesBytesCodec, codec_name="zlib"): - pass +class Zlib(_NumcodecsBytesBytesCodec): + codec_name = "zlib" -class GZip(_NumcodecsBytesBytesCodec, codec_name="gzip"): - pass +class 
GZip(_NumcodecsBytesBytesCodec): + codec_name = "gzip" -class BZ2(_NumcodecsBytesBytesCodec, codec_name="bz2"): - pass +class BZ2(_NumcodecsBytesBytesCodec): + codec_name = "bz2" -class LZMA(_NumcodecsBytesBytesCodec, codec_name="lzma"): - pass +class LZMA(_NumcodecsBytesBytesCodec): + codec_name = "lzma" -class Shuffle(_NumcodecsBytesBytesCodec, codec_name="shuffle"): +class Shuffle(_NumcodecsBytesBytesCodec): + codec_name = "shuffle" + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle: if self.codec_config.get("elementsize") is None: dtype = array_spec.dtype.to_native_dtype() @@ -234,7 +175,9 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle: # array-to-array codecs ("filters") -class Delta(_NumcodecsArrayArrayCodec, codec_name="delta"): +class Delta(_NumcodecsArrayArrayCodec): + codec_name = "delta" + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: if astype := self.codec_config.get("astype"): dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] @@ -242,11 +185,13 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: return chunk_spec -class BitRound(_NumcodecsArrayArrayCodec, codec_name="bitround"): - pass +class BitRound(_NumcodecsArrayArrayCodec): + codec_name = "bitround" -class FixedScaleOffset(_NumcodecsArrayArrayCodec, codec_name="fixedscaleoffset"): +class FixedScaleOffset(_NumcodecsArrayArrayCodec): + codec_name = "fixedscaleoffset" + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: if astype := self.codec_config.get("astype"): dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] @@ -260,18 +205,19 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset: return self -class Quantize(_NumcodecsArrayArrayCodec, codec_name="quantize"): - def __init__(self, **codec_config: JSON) -> None: - super().__init__(**codec_config) +class Quantize(_NumcodecsArrayArrayCodec): + codec_name = "quantize" - def 
evolve_from_array_spec(self, array_spec: ArraySpec) -> Quantize: + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if self.codec_config.get("dtype") is None: dtype = array_spec.dtype.to_native_dtype() return Quantize(**{**self.codec_config, "dtype": str(dtype)}) return self -class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"): +class PackBits(_NumcodecsArrayArrayCodec): + codec_name = "packbits" + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: return replace( chunk_spec, @@ -288,7 +234,9 @@ def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None: raise ValueError(f"Packbits filter requires bool dtype. Got {dtype}.") -class AsType(_NumcodecsArrayArrayCodec, codec_name="astype"): +class AsType(_NumcodecsArrayArrayCodec): + codec_name = "astype" + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: dtype = parse_dtype(np.dtype(self.codec_config["encode_dtype"]), zarr_format=3) # type: ignore[arg-type] return replace(chunk_spec, dtype=dtype) @@ -307,30 +255,30 @@ def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> return input_byte_length + 4 # pragma: no cover -class CRC32(_NumcodecsChecksumCodec, codec_name="crc32"): - pass +class CRC32(_NumcodecsChecksumCodec): + codec_name = "crc32" -class CRC32C(_NumcodecsChecksumCodec, codec_name="crc32c"): - pass +class CRC32C(_NumcodecsChecksumCodec): + codec_name = "crc32c" -class Adler32(_NumcodecsChecksumCodec, codec_name="adler32"): - pass +class Adler32(_NumcodecsChecksumCodec): + codec_name = "adler32" -class Fletcher32(_NumcodecsChecksumCodec, codec_name="fletcher32"): - pass +class Fletcher32(_NumcodecsChecksumCodec): + codec_name = "fletcher32" -class JenkinsLookup3(_NumcodecsChecksumCodec, codec_name="jenkins_lookup3"): - pass +class JenkinsLookup3(_NumcodecsChecksumCodec): + codec_name = "jenkins_lookup3" # array-to-bytes codecs -class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"): - pass +class 
PCodec(_NumcodecsArrayBytesCodec): + codec_name = "pcodec" -class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"): - pass +class ZFPY(_NumcodecsArrayBytesCodec): + codec_name = "zfpy" diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6a7b7d6ee7..aae684ad2c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -73,6 +73,8 @@ ) from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( + AsyncOIndex, + AsyncVIndex, BasicIndexer, BasicSelection, BlockIndex, @@ -1451,6 +1453,56 @@ async def getitem( ) return await self._get_selection(indexer, prototype=prototype) + async def get_orthogonal_selection( + self, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + if prototype is None: + prototype = default_buffer_prototype() + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + return await self._get_selection( + indexer=indexer, out=out, fields=fields, prototype=prototype + ) + + async def get_mask_selection( + self, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + if prototype is None: + prototype = default_buffer_prototype() + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + return await self._get_selection( + indexer=indexer, out=out, fields=fields, prototype=prototype + ) + + async def get_coordinate_selection( + self, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + if prototype is None: + prototype = default_buffer_prototype() + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + out_array = await self._get_selection( + indexer=indexer, out=out, 
fields=fields, prototype=prototype + ) + + if hasattr(out_array, "shape"): + # restore shape + out_array = np.array(out_array).reshape(indexer.sel_shape) + return out_array + async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = False) -> None: """ Asynchronously save the array metadata. @@ -1581,6 +1633,19 @@ async def setitem( ) return await self._set_selection(indexer, value, prototype=prototype) + @property + def oindex(self) -> AsyncOIndex[T_ArrayMetadata]: + """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and + :func:`set_orthogonal_selection` for documentation and examples.""" + return AsyncOIndex(self) + + @property + def vindex(self) -> AsyncVIndex[T_ArrayMetadata]: + """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, + :func:`set_coordinate_selection`, :func:`get_mask_selection` and + :func:`set_mask_selection` for documentation and examples.""" + return AsyncVIndex(self) + async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: """ Asynchronously resize the array to a new shape. 
diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index a15e59dce4..39b98cded4 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -171,8 +171,7 @@ def test_generic_filter_bitround() -> None: ) a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") assert np.allclose(data, b[:, :], atol=0.1) @@ -190,8 +189,7 @@ def test_generic_filter_quantize() -> None: ) a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") assert np.allclose(data, b[:, :], atol=0.001) @@ -210,8 +208,7 @@ def test_generic_filter_packbits() -> None: ) a[:, :] = data.copy() - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): @@ -251,8 +248,7 @@ def test_generic_checksum(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) @@ -302,8 +298,7 @@ def test_delta_astype() -> None: a[:, :] = data.copy() with codec_conf(): - with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): - b = open_array(a.store, mode="r") + b = open_array(a.store, mode="r") np.testing.assert_array_equal(data, b[:, :]) @@ -316,7 +311,7 @@ def test_repr() -> None: def test_to_dict() -> None: with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): codec = _numcodecs.LZ4(level=5) - assert codec.to_dict() == {"name": "numcodecs.lz4", "configuration": {"level": 5}} + assert codec.to_dict() == {"name": "lz4", 
"configuration": {"level": 5}} @pytest.mark.parametrize( diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 0995be3c6d..8cc8a9ad11 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,7 +5,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr.api.asynchronous import zarr.api.synchronous @@ -17,6 +16,7 @@ open, open_consolidated, ) +from zarr.core.array import default_compressor_v2 from zarr.core.buffer import cpu, default_buffer_prototype from zarr.core.dtype import parse_dtype from zarr.core.group import ConsolidatedMetadata, GroupMetadata @@ -576,7 +576,7 @@ async def test_consolidated_metadata_v2(self) -> None: attributes={"key": "a"}, chunks=(1,), fill_value=0, - compressor=Blosc(), + compressor=default_compressor_v2(dtype=dtype), order="C", ), "g1": GroupMetadata(