diff --git a/changes/3376.misc.rst b/changes/3376.misc.rst new file mode 100644 index 0000000000..68b0d9d855 --- /dev/null +++ b/changes/3376.misc.rst @@ -0,0 +1,3 @@ +Define Zarr V3-specific codecs from numcodecs inside this repo. These codecs can be found in +:mod:`zarr.codecs.numcodecs`. This is necessary to resolve a circular dependency between Zarr +and Numcodecs. \ No newline at end of file diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 0ae8017ca9..05cfa1e53c 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -42,23 +42,43 @@ requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec This is the current default configuration:: >>> zarr.config.pprint() - {'array': {'order': 'C', - 'write_empty_chunks': False}, - 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.buffer.cpu.Buffer', - 'codec_pipeline': {'batch_size': 1, - 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, - 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + {'array': {'order': 'C', 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 'zarr.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', 'bytes': 'zarr.codecs.bytes.BytesCodec', 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', 'endian': 'zarr.codecs.bytes.BytesCodec', 'gzip': 'zarr.codecs.gzip.GzipCodec', + 'numcodecs.adler32': 'zarr.codecs.numcodecs.Adler32', + 'numcodecs.astype': 'zarr.codecs.numcodecs.AsType', + 'numcodecs.bitround': 'zarr.codecs.numcodecs.BitRound', + 'numcodecs.blosc': 'zarr.codecs.numcodecs.Blosc', + 'numcodecs.bz2': 'zarr.codecs.numcodecs.BZ2', + 'numcodecs.crc32': 'zarr.codecs.numcodecs.CRC32', + 'numcodecs.crc32c': 'zarr.codecs.numcodecs.CRC32C', + 'numcodecs.delta': 'zarr.codecs.numcodecs.Delta', + 'numcodecs.fixedscaleoffset': 
'zarr.codecs.numcodecs.FixedScaleOffset', + 'numcodecs.fletcher32': 'zarr.codecs.numcodecs.Fletcher32', + 'numcodecs.gzip': 'zarr.codecs.numcodecs.GZip', + 'numcodecs.jenkins_lookup3': 'zarr.codecs.numcodecs.JenkinsLookup3', + 'numcodecs.lz4': 'zarr.codecs.numcodecs.LZ4', + 'numcodecs.lzma': 'zarr.codecs.numcodecs.LZMA', + 'numcodecs.packbits': 'zarr.codecs.numcodecs.PackBits', + 'numcodecs.pcodec': 'zarr.codecs.numcodecs.PCodec', + 'numcodecs.quantize': 'zarr.codecs.numcodecs.Quantize', + 'numcodecs.shuffle': 'zarr.codecs.numcodecs.Shuffle', + 'numcodecs.zfpy': 'zarr.codecs.numcodecs.ZFPY', + 'numcodecs.zlib': 'zarr.codecs.numcodecs.Zlib', + 'numcodecs.zstd': 'zarr.codecs.numcodecs.Zstd', 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', 'transpose': 'zarr.codecs.transpose.TransposeCodec', 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, - 'default_zarr_format': 3, - 'json_indent': 2, - 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', - 'threading': {'max_workers': None}} + 'default_zarr_format': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', + 'threading': {'max_workers': None}} diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d41c457b4e..7c9dc20ae2 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,13 +2,22 @@ from abc import abstractmethod from collections.abc import Mapping -from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar - -from typing_extensions import ReadOnly, TypedDict +from typing import ( + TYPE_CHECKING, + Generic, + Literal, + Self, + TypedDict, + TypeGuard, + TypeVar, + overload, +) + +from typing_extensions import ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import NamedConfig, concurrent_map +from zarr.core.common import NamedConfig, ZarrFormat, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@
-181,6 +190,36 @@ async def encode( """ return await _batching_helper(self._encode_single, chunks_and_specs) + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + raise NotImplementedError + + @classmethod + def _from_json_v2(cls, data: CodecJSON_V2[str]) -> Self: + return cls(**{k: v for k, v in data.items() if k != "id"}) + + @classmethod + def _from_json_v3(cls, data: CodecJSON_V3) -> Self: + if isinstance(data, str): + return cls() + return cls(**data["configuration"]) + + @classmethod + def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 53e981c3bd..2421bce37a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -431,6 +431,7 @@ async def getsize(self, key: str) -> int: FileNotFoundError When the given key does not exist in the store. """ + # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. 
diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 165dbe476d..7cb4e2a44d 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -4,10 +4,34 @@ from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.numcodecs import ( + BZ2, + CRC32, + CRC32C, + LZ4, + LZMA, + ZFPY, + Adler32, + AsType, + BitRound, + Blosc, + Delta, + FixedScaleOffset, + Fletcher32, + GZip, + JenkinsLookup3, + PackBits, + PCodec, + Quantize, + Shuffle, + Zlib, + Zstd, +) from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec +from zarr.registry import register_codec __all__ = [ "BloscCname", @@ -24,3 +48,46 @@ "VLenUTF8Codec", "ZstdCodec", ] + +register_codec("blosc", BloscCodec) +register_codec("bytes", BytesCodec) + +# compatibility with earlier versions of ZEP1 +register_codec("endian", BytesCodec) +register_codec("crc32c", Crc32cCodec) +register_codec("gzip", GzipCodec) +register_codec("sharding_indexed", ShardingCodec) +register_codec("zstd", ZstdCodec) +register_codec("vlen-utf8", VLenUTF8Codec) +register_codec("vlen-bytes", VLenBytesCodec) +register_codec("transpose", TransposeCodec) + +# Register all the codecs formerly contained in numcodecs.zarr3 + +register_codec("numcodecs.bz2", BZ2, qualname="zarr.codecs.numcodecs.BZ2") +register_codec("numcodecs.crc32", CRC32, qualname="zarr.codecs.numcodecs.CRC32") +register_codec("numcodecs.crc32c", CRC32C, qualname="zarr.codecs.numcodecs.CRC32C") +register_codec("numcodecs.lz4", LZ4, qualname="zarr.codecs.numcodecs.LZ4") +register_codec("numcodecs.lzma", LZMA, qualname="zarr.codecs.numcodecs.LZMA") +register_codec("numcodecs.zfpy", ZFPY, qualname="zarr.codecs.numcodecs.ZFPY") +register_codec("numcodecs.adler32", Adler32, 
qualname="zarr.codecs.numcodecs.Adler32") +register_codec("numcodecs.astype", AsType, qualname="zarr.codecs.numcodecs.AsType") +register_codec("numcodecs.bitround", BitRound, qualname="zarr.codecs.numcodecs.BitRound") +register_codec("numcodecs.blosc", Blosc, qualname="zarr.codecs.numcodecs.Blosc") +register_codec("numcodecs.delta", Delta, qualname="zarr.codecs.numcodecs.Delta") +register_codec( + "numcodecs.fixedscaleoffset", + FixedScaleOffset, + qualname="zarr.codecs.numcodecs.FixedScaleOffset", +) +register_codec("numcodecs.fletcher32", Fletcher32, qualname="zarr.codecs.numcodecs.Fletcher32") +register_codec("numcodecs.gzip", GZip, qualname="zarr.codecs.numcodecs.GZip") +register_codec( + "numcodecs.jenkins_lookup3", JenkinsLookup3, qualname="zarr.codecs.numcodecs.JenkinsLookup3" +) +register_codec("numcodecs.pcodec", PCodec, qualname="zarr.codecs.numcodecs.PCodec") +register_codec("numcodecs.packbits", PackBits, qualname="zarr.codecs.numcodecs.PackBits") +register_codec("numcodecs.quantize", Quantize, qualname="zarr.codecs.numcodecs.Quantize") +register_codec("numcodecs.shuffle", Shuffle, qualname="zarr.codecs.numcodecs.Shuffle") +register_codec("numcodecs.zlib", Zlib, qualname="zarr.codecs.numcodecs.Zlib") +register_codec("numcodecs.zstd", Zstd, qualname="zarr.codecs.numcodecs.Zstd") diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 3c6c99c21c..6a826559bb 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,18 +2,28 @@ from __future__ import annotations import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, overload import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + CodecJSON, + CodecJSON_V2, +) +from zarr.core.chunk_grids import ChunkGrid +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from
zarr.registry import get_ndbuffer_class if TYPE_CHECKING: from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer + from zarr.core.buffer.core import BufferPrototype + from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat @dataclass(frozen=True) @@ -98,3 +108,136 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError + + +@dataclass(frozen=True, kw_only=True) +class NumcodecsWrapper: + codec: Numcodec + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: + if zarr_format == 2: + return self.codec.get_config() + elif zarr_format == 3: + config = self.codec.get_config() + config_no_id = {k: v for k, v in config.items() if k != "id"} + return {"name": config["id"], "configuration": config_no_id} + raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 2." + ) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 3." + ) + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + """Fills in codec configuration parameters that can be automatically + inferred from the array metadata.
+ + Parameters + ---------- + array_spec : ArraySpec + + Returns + ------- + Self + """ + return self + + def validate( + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGrid, + ) -> None: + """Validates that the codec configuration is compatible with the array metadata. + Raises errors when the codec configuration is not compatible. + + Parameters + ---------- + shape : tuple[int, ...] + The array shape + dtype : np.dtype[Any] + The array data type + chunk_grid : ChunkGrid + The array chunk grid + """ + + def to_array_array(self) -> NumcodecsArrayArrayCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. + """ + return NumcodecsArrayArrayCodec(codec=self.codec) + + def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec. + """ + return NumcodecsBytesBytesCodec(codec=self.codec) + + def to_array_bytes(self) -> NumcodecsArrayBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec. 
+ """ + return NumcodecsArrayBytesCodec(codec=self.codec) + + +class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper + + return await asyncio.to_thread( + as_numpy_array_wrapper, + self.codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self.codec.encode(chunk_bytes.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec): + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.decode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self.codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + 
+ async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f89f127852..2da1e6697b 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,20 +1,32 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Final, + Literal, + NotRequired, + TypedDict, + TypeGuard, + overload, +) import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version - -from zarr.abc.codec import BytesBytesCodec -from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from typing_extensions import ReadOnly + +from zarr.abc.codec import BytesBytesCodec, CodecJSON +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.core.dtype.common import HasItemSize -from zarr.registry import register_codec +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self @@ -22,39 +34,66 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") + +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") + + +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int + typesize: NotRequired[int] + -class BloscShuffle(Enum): +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: 
int + shuffle: BloscShuffle + blocksize: int + typesize: int + + +class BloscJSON_V2(BloscConfigV2): """ - Enum for shuffle filter used by blosc. + The JSON form of the Blosc codec in Zarr V2. """ - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" + id: ReadOnly[Literal["blosc"]] - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] - -class BloscCname(Enum): +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V3. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"} + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) + + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.") + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -85,31 +124,35 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. 
Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): - """blosc codec""" - is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -120,24 +163,74 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname.value, + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> 
Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), + ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and blocksize need to be set for encoding.") + if zarr_format == 2: + return { + "id": "blosc", "clevel": self.clevel, - "shuffle": self.shuffle.value, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, - }, - } + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize, + }, + } + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 @@ -147,10 +240,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec @@ -158,15 +248,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 @@ -179,6 +264,8 @@ async def _decode_single( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper + return await asyncio.to_thread( as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype ) @@ -199,6 +286,3 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError - - -register_codec("blosc", BloscCodec) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 7576119c82..a227f073eb 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,17 +1,18 @@ from __future__ import annotations import sys +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from 
typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np +from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness -from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self @@ -28,35 +29,111 @@ class Endian(Enum): little = "little" -default_system_endian = Endian(sys.byteorder) +# TODO: unify with the endianness defined in core.dtype.common +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" + +default_system_endian = sys.byteorder + + +class BytesConfig(TypedDict): + endian: NotRequired[EndiannessStr] + + +class BytesJSON_V2(BytesConfig): + """ + JSON representation of the bytes codec for zarr v2. + """ + + id: ReadOnly[Literal["bytes"]] + + +BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] + + +def parse_endianness(data: object) -> EndiannessStr: + if data in ENDIANNESS_STR: + return data # type: ignore [return-value] + raise ValueError(f"Invalid endianness: {data!r}. 
Expected one of {ENDIANNESS_STR}") + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BytesJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"id", "endian"}, {"id"}) + and data["id"] == "bytes" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BytesJSON_V3]: + return data == "bytes" or ( + ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name"}, {"name", "configuration"}) + and data["name"] == "bytes" + ) + and isinstance(data.get("configuration", {}), Mapping) + and set(data.get("configuration", {}).keys()) in ({"endian"}, set()) + ) @dataclass(frozen=True) class BytesCodec(ArrayBytesCodec): - """bytes codec""" - is_fixed_size = True - endian: Endian | None + endian: EndiannessStr | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: EndiannessStr | str | None = default_system_endian) -> None: + endian_parsed = None if endian is None else parse_endianness(endian) object.__setattr__(self, "endian", endian_parsed) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.endian is None: + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls(endian=data.get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + # Three different representations of the exact same codec... 
+ if data in ("bytes", {"name": "bytes"}, {"name": "bytes", "configuration": {}}): + return cls() + else: + return cls(endian=data["configuration"].get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @overload + def to_json(self, zarr_format: Literal[2]) -> BytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BytesJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BytesJSON_V2 | BytesJSON_V3: + if zarr_format == 2: + if self.endian is not None: + return { + "id": "bytes", + "endian": self.endian, + } + return {"id": "bytes"} + elif zarr_format == 3: + if self.endian is not None: + return { + "name": "bytes", + "configuration": {"endian": self.endian}, + } return {"name": "bytes"} - else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if not isinstance(array_spec.dtype, HasEndianness): @@ -75,9 +152,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None - if isinstance(chunk_spec.dtype, HasEndianness): - dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + endian = self.endian if self.endian is not None else None + if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: + dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] else: dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() @@ -109,7 +186,7 @@ async def _encode_single( ): # type-ignore is a numpy bug # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = 
chunk_array.dtype.newbyteorder(self.endian) # type: ignore[arg-type] chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() @@ -119,9 +196,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - - -register_codec("bytes", BytesCodec) - -# compatibility with earlier versions of ZEP1 -register_codec("endian", BytesCodec) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index c2e30f689a..e1c9dfeec8 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,15 +1,16 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, cast, overload import numpy as np import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec -from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self @@ -18,20 +19,77 @@ from zarr.core.buffer import Buffer +class Crc32Config(TypedDict): ... + + +class Crc32cJSON_V2(CodecJSON_V2[Literal["crc32c"]]): ... + + +class Crc32cJSON_V3(NamedConfig[Literal["crc32c"], Crc32Config]): ... 
+ + +def check_json_v2(data: CodecJSON) -> TypeGuard[Crc32cJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()) == {"id"} and data["id"] == "crc32c" + + +def check_json_v3(data: CodecJSON) -> TypeGuard[Crc32cJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name", "configuration"}, {"name"}) + and data["name"] == "crc32c" + and data.get("configuration") in ({}, None) + ) + + @dataclass(frozen=True) class Crc32cCodec(BytesBytesCodec): - """crc32c codec""" - is_fixed_size = True @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) parse_named_configuration(data, "crc32c", require_configuration=False) return cls() + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls() + msg = ( + "Invalid Zarr V2 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('id')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls() + msg = ( + "Invalid Zarr V3 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('name')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) return {"name": "crc32c"} + @overload + def to_json(self, zarr_format: Literal[2]) -> Crc32cJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Crc32cJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "crc32c"} + elif zarr_format == 3: + return {"name": "crc32c"} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + async def _decode_single( self, chunk_bytes: Buffer, @@ -65,6 +123,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 - - -register_codec("crc32c", Crc32cCodec) diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 9e6515a4d1..7da9acaefe 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -1,15 +1,20 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from numcodecs.gzip import GZip +from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) if TYPE_CHECKING: from typing import Self @@ -28,10 +33,26 @@ def parse_gzip_level(data: JSON) -> int: return data +class GZipConfig(TypedDict): + level: int + + +class GZipJSON_V2(GZipConfig): + """ + The JSON form of the GZip codec in Zarr V2. + """ + + id: ReadOnly[Literal["gzip"]] + + +class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): + """ + The JSON form of the GZip codec in Zarr V3. 
+ """ + + @dataclass(frozen=True) class GzipCodec(BytesBytesCodec): - """gzip codec""" - is_fixed_size = False level: int = 5 @@ -43,11 +64,56 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> GZipJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: + if zarr_format == 2: + return {"id": "gzip", "level": self.level} + elif zarr_format == 3: + return {"name": "gzip", "configuration": {"level": self.level}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "level"} + and data["id"] == "gzip" + and isinstance(data["level"], int) + ) + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "gzip" + and isinstance(data["configuration"], dict) + and "level" in data["configuration"] + and isinstance(data["configuration"]["level"], int) + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls(level=data["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls(level=data["configuration"]["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}") async def _decode_single( self, @@ -73,6 +139,3 @@ def compute_encoded_size( _chunk_spec: ArraySpec, ) -> int: raise NotImplementedError - - -register_codec("gzip", GzipCodec) diff --git a/src/zarr/codecs/numcodecs/__init__.py b/src/zarr/codecs/numcodecs/__init__.py new file mode 100644 index 0000000000..cd0541797a --- /dev/null +++ b/src/zarr/codecs/numcodecs/__init__.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Final + +from zarr.codecs.numcodecs._codecs import ( + BZ2, + CRC32, + CRC32C, + LZ4, + LZMA, + ZFPY, + Adler32, + AsType, + BitRound, + Blosc, + Delta, + FixedScaleOffset, + Fletcher32, + GZip, + JenkinsLookup3, + PackBits, + PCodec, + Quantize, + Shuffle, + Zlib, + Zstd, + _NumcodecsArrayArrayCodec, + _NumcodecsArrayBytesCodec, + _NumcodecsBytesBytesCodec, + _NumcodecsCodec, +) + +# This is a fixed dictionary of numcodecs codecs for which we have pre-made Zarr V3 wrappers +numcodecs_wrappers: 
Final[dict[str, type[_NumcodecsCodec]]] = { + "bz2": BZ2, + "crc32": CRC32, + "crc32c": CRC32C, + "lz4": LZ4, + "lzma": LZMA, + "zfpy": ZFPY, + "adler32": Adler32, + "astype": AsType, + "bitround": BitRound, + "blosc": Blosc, + "delta": Delta, + "fixedscaleoffset": FixedScaleOffset, + "fletcher32": Fletcher32, + "gzip": GZip, + "jenkins_lookup3": JenkinsLookup3, + "packbits": PackBits, + "pcodec": PCodec, + "quantize": Quantize, + "shuffle": Shuffle, + "zlib": Zlib, + "zstd": Zstd, +} + +__all__ = [ + "BZ2", + "CRC32", + "CRC32C", + "LZ4", + "LZMA", + "ZFPY", + "Adler32", + "AsType", + "BitRound", + "Blosc", + "Delta", + "FixedScaleOffset", + "Fletcher32", + "GZip", + "JenkinsLookup3", + "PCodec", + "PackBits", + "Quantize", + "Shuffle", + "Zlib", + "Zstd", + "_NumcodecsArrayArrayCodec", + "_NumcodecsArrayBytesCodec", + "_NumcodecsBytesBytesCodec", + "_NumcodecsCodec", +] diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py new file mode 100644 index 0000000000..493736b9b8 --- /dev/null +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -0,0 +1,336 @@ +""" +This module provides compatibility for :py:mod:`numcodecs` in Zarr version 3. + +These codecs were previously defined in :py:mod:`numcodecs`, and have now been moved to `zarr`. + +>>> import zarr +>>> import zarr.codecs.numcodecs as numcodecs +>>> +>>> array = zarr.create_array( +... store="data.zarr", +... shape=(1024, 1024), +... chunks=(64, 64), +... dtype="uint32", +... filters=[numcodecs.zarr3.Delta()], +... compressors=[numcodecs.zarr3.BZ2(level=5)]) +>>> array[:] = np.arange(*array.shape).astype(array.dtype) + +.. note:: + + Please note that the codecs in :py:mod:`zarr.codecs.numcodecs` are not part of the Zarr version + 3 specification. Using these codecs might cause interoperability issues with other Zarr + implementations. 
+""" + +from __future__ import annotations + +import asyncio +import math +from collections.abc import Mapping +from dataclasses import dataclass, replace +from functools import cached_property +from typing import TYPE_CHECKING, Any, Final, Literal, Self, overload +from warnings import warn + +import numpy as np + +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecJSON_V2 +from zarr.abc.metadata import Metadata +from zarr.core.buffer.cpu import as_numpy_array_wrapper +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration, product +from zarr.dtype import UInt8, ZDType, parse_dtype +from zarr.errors import ZarrUserWarning +from zarr.registry import get_numcodec + +if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer + +CODEC_PREFIX: Final = "numcodecs." + + +def _expect_name_prefix(codec_name: str) -> str: + if not codec_name.startswith(CODEC_PREFIX): + raise ValueError( + f"Expected name to start with '{CODEC_PREFIX}'. Got {codec_name} instead." + ) # pragma: no cover + return codec_name.removeprefix(CODEC_PREFIX) + + +def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]: + parsed_name, parsed_configuration = parse_named_configuration(data) + if not parsed_name.startswith(CODEC_PREFIX): + raise ValueError( + f"Expected name to start with '{CODEC_PREFIX}'. Got {parsed_name} instead." 
+ ) # pragma: no cover + id = _expect_name_prefix(parsed_name) + return {"id": id, **parsed_configuration} + + +@dataclass(frozen=True) +class _NumcodecsCodec(Metadata): + codec_name: str + codec_config: dict[str, JSON] + + def __init_subclass__(cls, *, codec_name: str | None = None, **kwargs: Any) -> None: + """To be used only when creating the actual public-facing codec class.""" + super().__init_subclass__(**kwargs) + if codec_name is not None: + namespace = codec_name + + cls_name = f"{CODEC_PREFIX}{namespace}.{cls.__name__}" + cls.codec_name = f"{CODEC_PREFIX}{namespace}" + cls.__doc__ = f""" + See :class:`{cls_name}` for more details and parameters. + """ + + def __init__(self, **codec_config: JSON) -> None: + if not self.codec_name: + raise ValueError( + "The codec name needs to be supplied through the `codec_name` attribute." + ) # pragma: no cover + unprefixed_codec_name = _expect_name_prefix(self.codec_name) + + if "id" not in codec_config: + codec_config = {"id": unprefixed_codec_name, **codec_config} + elif codec_config["id"] != unprefixed_codec_name: + raise ValueError( + f"Codec id does not match {unprefixed_codec_name}. Got: {codec_config['id']}." + ) # pragma: no cover + + object.__setattr__(self, "codec_config", codec_config) + warn( + "Numcodecs codecs are not in the Zarr version 3 specification and " + "may not be supported by other zarr implementations.", + category=ZarrUserWarning, + stacklevel=2, + ) + + @cached_property + def _codec(self) -> Numcodec: + return get_numcodec(self.codec_config) # type: ignore[arg-type] + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + codec_config = _parse_codec_configuration(data) + return cls(**codec_config) + + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... 
+ + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + codec_id = self.codec_config["id"] + codec_config = {k: v for k, v in self.codec_config.items() if k != "id"} + if zarr_format == 2: + return {"id": codec_id, **codec_config} + else: + return {"name": codec_id, "configuration": codec_config} + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError # pragma: no cover + + # Override __repr__ because dynamically constructed classes don't seem to work otherwise + def __repr__(self) -> str: + codec_config = self.codec_config.copy() + codec_config.pop("id", None) + return f"{self.__class__.__name__}(codec_name={self.codec_name!r}, codec_config={codec_config!r})" + + +class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread( + as_numpy_array_wrapper, + self._codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self._codec.encode(chunk_data.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.decode, 
chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) + + +class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self._codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) + + +# bytes-to-bytes codecs +class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"): + pass + + +class LZ4(_NumcodecsBytesBytesCodec, codec_name="lz4"): + pass + + +class Zstd(_NumcodecsBytesBytesCodec, codec_name="zstd"): + pass + + +class Zlib(_NumcodecsBytesBytesCodec, codec_name="zlib"): + pass + + +class GZip(_NumcodecsBytesBytesCodec, codec_name="gzip"): + pass + + +class BZ2(_NumcodecsBytesBytesCodec, codec_name="bz2"): + pass + + +class LZMA(_NumcodecsBytesBytesCodec, codec_name="lzma"): + pass + + +class Shuffle(_NumcodecsBytesBytesCodec, codec_name="shuffle"): + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle: + if self.codec_config.get("elementsize") is None: + dtype = array_spec.dtype.to_native_dtype() + return Shuffle(**{**self.codec_config, "elementsize": dtype.itemsize}) + return self # pragma: no cover + + +# array-to-array codecs ("filters") +class 
Delta(_NumcodecsArrayArrayCodec, codec_name="delta"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + if astype := self.codec_config.get("astype"): + dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] + return replace(chunk_spec, dtype=dtype) + return chunk_spec + + +class BitRound(_NumcodecsArrayArrayCodec, codec_name="bitround"): + pass + + +class FixedScaleOffset(_NumcodecsArrayArrayCodec, codec_name="fixedscaleoffset"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + if astype := self.codec_config.get("astype"): + dtype = parse_dtype(np.dtype(astype), zarr_format=3) # type: ignore[call-overload] + return replace(chunk_spec, dtype=dtype) + return chunk_spec + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset: + if self.codec_config.get("dtype") is None: + dtype = array_spec.dtype.to_native_dtype() + return FixedScaleOffset(**{**self.codec_config, "dtype": str(dtype)}) + return self + + +class Quantize(_NumcodecsArrayArrayCodec, codec_name="quantize"): + def __init__(self, **codec_config: JSON) -> None: + super().__init__(**codec_config) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Quantize: + if self.codec_config.get("dtype") is None: + dtype = array_spec.dtype.to_native_dtype() + return Quantize(**{**self.codec_config, "dtype": str(dtype)}) + return self + + +class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + return replace( + chunk_spec, + shape=(1 + math.ceil(product(chunk_spec.shape) / 8),), + dtype=UInt8(), + ) + + # todo: remove this type: ignore when this class can be defined w.r.t. + # a single zarr dtype API + def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None: + # this is bugged and will fail + _dtype = dtype.to_native_dtype() + if _dtype != np.dtype("bool"): + raise ValueError(f"Packbits filter requires bool dtype. 
Got {dtype}.") + + +class AsType(_NumcodecsArrayArrayCodec, codec_name="astype"): + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + dtype = parse_dtype(np.dtype(self.codec_config["encode_dtype"]), zarr_format=3) # type: ignore[arg-type] + return replace(chunk_spec, dtype=dtype) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> AsType: + if self.codec_config.get("decode_dtype") is None: + # TODO: remove these coverage exemptions the correct way, i.e. with tests + dtype = array_spec.dtype.to_native_dtype() # pragma: no cover + return AsType(**{**self.codec_config, "decode_dtype": str(dtype)}) # pragma: no cover + return self + + +# bytes-to-bytes checksum codecs +class _NumcodecsChecksumCodec(_NumcodecsBytesBytesCodec): + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + return input_byte_length + 4 # pragma: no cover + + +class CRC32(_NumcodecsChecksumCodec, codec_name="crc32"): + pass + + +class CRC32C(_NumcodecsChecksumCodec, codec_name="crc32c"): + pass + + +class Adler32(_NumcodecsChecksumCodec, codec_name="adler32"): + pass + + +class Fletcher32(_NumcodecsChecksumCodec, codec_name="fletcher32"): + pass + + +class JenkinsLookup3(_NumcodecsChecksumCodec, codec_name="jenkins_lookup3"): + pass + + +# array-to-bytes codecs +class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"): + pass + + +class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"): + pass diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 58d34f62e7..ed2a13ce03 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,20 +1,35 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, 
NamedTuple, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + NamedTuple, + NotRequired, + Self, + TypedDict, + TypeGuard, + cast, + overload, +) import numpy as np import numpy.typing as npt +from typing_extensions import ReadOnly from zarr.abc.codec import ( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, + CodecJSON, + CodecJSON_V2, + CodecJSON_V3, CodecPipeline, ) from zarr.abc.store import ( @@ -36,9 +51,9 @@ ) from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.core.common import ( + NamedRequiredConfig, ShapeLike, parse_enum, - parse_named_configuration, parse_shapelike, product, ) @@ -51,7 +66,8 @@ morton_order_iter, ) from zarr.core.metadata.v3 import parse_codecs -from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec +from zarr.errors import CodecValidationError +from zarr.registry import get_ndbuffer_class, get_pipeline_class if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -64,6 +80,42 @@ ShardMapping = Mapping[tuple[int, ...], Buffer] ShardMutableMapping = MutableMapping[tuple[int, ...], Buffer] +IndexLocation = Literal["start", "end"] + + +class ShardingConfigV2(TypedDict): + codecs: tuple[CodecJSON_V2[str], ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V2[str], ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingConfigV3(TypedDict): + codecs: tuple[CodecJSON_V3, ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V3, ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingJSON_V2(ShardingConfigV2): + """ + The JSON form of the sharding codec in Zarr V2. + """ + + id: ReadOnly[Literal["sharding_indexed"]] + + +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): + """ + The JSON form of sharding codec for Zarr V3. 
+ + Attributes + ---------- + name : Literal["sharding_indexed"] + The name of the sharding codec. + configuration : ShardingConfigV3 + """ + class ShardingCodecIndexLocation(Enum): """ @@ -78,6 +130,37 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +def check_json_v2(data: CodecJSON) -> TypeGuard[ShardingJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "codecs", "chunk_shape"} + and data["id"] == "sharding_indexed" + and isinstance(data["chunk_shape"], Sequence) + and not isinstance(data["chunk_shape"], str) + and isinstance(data["codecs"], Sequence) + and not isinstance(data["codecs"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ShardingJSON_V3]: + # TODO: Automate this with a function that does runtime type checking on typeddicts. + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "sharding_indexed" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"codecs", "chunk_shape", "index_codecs", "index_location"} + and isinstance(data["configuration"]["chunk_shape"], Sequence) + and not isinstance(data["configuration"]["chunk_shape"], str) + and isinstance(data["configuration"]["codecs"], Sequence) + and not isinstance(data["configuration"]["codecs"], str) + and isinstance(data["configuration"]["index_codecs"], Sequence) + and not isinstance(data["configuration"]["index_codecs"], str) + and data["configuration"]["index_location"] in ("start", "end") + ) + + @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): shard_dict: ShardMapping @@ -384,23 +467,76 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") - return cls(**configuration_parsed) # type: ignore[arg-type] + return 
cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + codecs=data["codecs"], + index_codecs=data["index_codecs"], + index_location=data["index_location"], + chunk_shape=data["chunk_shape"], + ) + msg = ( + "Invalid Zarr V2 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'codecs', 'index_codecs', 'chunk_shape', 'index_location')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + codecs=data["configuration"]["codecs"], + index_codecs=data["configuration"]["index_codecs"], + index_location=data["configuration"]["index_location"], + chunk_shape=data["configuration"]["chunk_shape"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('codecs', 'index_codecs', 'index_location', 'chunk_shape')" + ) + raise CodecValidationError(msg) @property def codec_pipeline(self) -> CodecPipeline: return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: - return { - "name": "sharding_indexed", - "configuration": { + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ShardingJSON_V3: ... 
+ + def to_json(self, zarr_format: int) -> ShardingJSON_V2 | ShardingJSON_V3: + if zarr_format == 2: + return { + "id": "sharding_indexed", + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), "chunk_shape": self.chunk_shape, - "codecs": tuple(s.to_dict() for s in self.codecs), - "index_codecs": tuple(s.to_dict() for s in self.index_codecs), "index_location": self.index_location.value, - }, - } + } + elif zarr_format == 3: + return { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": self.chunk_shape, + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "index_location": self.index_location.value, + }, + } + raise ValueError(f"Unsupported Zarr format {zarr_format}. Expected 2 or 3.") def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) @@ -760,6 +896,3 @@ async def _load_full_shard_maybe( def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int: chunks_per_shard = self._get_chunks_per_shard(shard_spec) return input_byte_length + self._shard_index_size(chunks_per_shard) - - -register_codec("sharding_indexed", ShardingCodec) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 92e7da81e1..51c3d4209d 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -1,15 +1,20 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np +from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayArrayCodec +from zarr.abc.codec 
import ArrayArrayCodec, CodecJSON from zarr.core.array_spec import ArraySpec -from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self @@ -27,10 +32,46 @@ def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: return tuple(cast("Iterable[int]", data)) +class TransposeConfig(TypedDict): + order: tuple[int, ...] + + +class TransposeJSON_V2(TransposeConfig): + """ + The JSON form of the Transpose codec in Zarr V2. + """ + + id: ReadOnly[Literal["transpose"]] + + +class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): + """ + The JSON form of the Transpose codec in Zarr V3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[TransposeJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "configuration"} + and data["id"] == "transpose" + and isinstance(data["order"], Sequence) + and not isinstance(data["order"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[TransposeJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "transpose" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"order"} + ) + + @dataclass(frozen=True) class TransposeCodec(ArrayArrayCodec): - """Transpose codec""" - is_fixed_size = True order: tuple[int, ...] 
@@ -42,12 +83,47 @@ def __init__(self, *, order: Iterable[int]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v2(data): + return cls(order=data["order"]) # type: ignore[arg-type] + msg = ( + "Invalid Zarr V2 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'order')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v3(data): + return cls(order=data["configuration"]["order"]) + msg = ( + "Invalid Zarr V3 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('order')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} + @overload + def to_json(self, zarr_format: Literal[2]) -> TransposeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> TransposeJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "transpose", "order": self.order} + elif zarr_format == 3: + return {"name": "transpose", "configuration": {"order": self.order}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + def validate( self, shape: tuple[int, ...], @@ -113,6 +189,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - - -register_codec("transpose", TransposeCodec) diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 28c64be1c0..cd034cf804 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,15 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration if TYPE_CHECKING: from typing import Self @@ -22,12 +21,29 @@ _vlen_bytes_codec = VLenBytes() +class VlenUF8Config(TypedDict): ... + + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): ... + + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... + + +class VLenBytesConfig(TypedDict): ... + + +class VLenBytesJSON_V2(CodecJSON_V2[Literal["vlen-bytes"]]): ... 
+ + +VLenBytesJSON_V3 = NamedConfig[Literal["vlen-bytes"], VLenBytesConfig] | Literal["vlen-bytes"] + + @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): - """Variable-length UTF8 codec""" - @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration( data, "vlen-utf8", require_configuration=False ) @@ -37,6 +53,40 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-utf8", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenUTF8JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenUTF8JSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: + if zarr_format == 2: + return {"id": "vlen-utf8"} + else: + return {"name": "vlen-utf8"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V2]: + return data == {"id": "vlen-utf8"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V3]: + return data in ( + {"name": "vlen-utf8"}, + {"name": "vlen-utf8", "configuration": {}}, + "vlen-utf8", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -74,15 +124,45 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "vlen-bytes", 
require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenBytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenBytesJSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> VLenBytesJSON_V2 | VLenBytesJSON_V3: + if zarr_format == 2: + return {"id": "vlen-bytes"} + else: + return {"name": "vlen-bytes"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V2]: + return data == {"id": "vlen-bytes"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V3]: + return data in ( + {"name": "vlen-bytes"}, + {"name": "vlen-bytes", "configuration": {}}, + "vlen-bytes", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -112,7 +192,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: # what is input_byte_length for an object dtype? 
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs") - - -register_codec("vlen-utf8", VLenUTF8Codec) -register_codec("vlen-bytes", VLenBytesCodec) diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index ead41e7b5f..106987f830 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -1,18 +1,24 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, overload import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version +from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration -from zarr.registry import register_codec +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) +from zarr.errors import CodecValidationError if TYPE_CHECKING: from typing import Self @@ -21,6 +27,43 @@ from zarr.core.buffer import Buffer +class ZstdConfig_V2(TypedDict): + level: int + + +class ZstdConfig_V3(TypedDict): + level: int + checksum: bool + + +class ZstdJSON_V2(ZstdConfig_V2): + """ + The JSON form of the ZStandard codec in Zarr v2. + """ + + id: ReadOnly[Literal["zstd"]] + + +class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): + """ + The JSON form of the ZStandard codec in Zarr v3. 
+ """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[ZstdJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()).issuperset({"id", "level"}) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "zstd" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"level", "checksum"} + ) + + def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): if data >= 23: @@ -37,8 +80,6 @@ def parse_checksum(data: JSON) -> bool: @dataclass(frozen=True) class ZstdCodec(BytesBytesCodec): - """zstd codec""" - is_fixed_size = True level: int = 0 @@ -61,11 +102,52 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "zstd") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + if "checksum" in data: + return cls(level=data["level"], checksum=data["checksum"]) + else: + return cls(level=data["level"]) + + msg = ( + "Invalid Zarr V2 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'level')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + level=data["configuration"]["level"], checksum=data["configuration"]["checksum"] + ) + msg = ( + "Invalid Zarr V3 JSON representation of the zstd codec. 
" + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration') " + "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ZstdJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: + if zarr_format == 2: + return {"id": "zstd", "level": self.level} + else: + return { + "name": "zstd", + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: @@ -92,6 +174,3 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError - - -register_codec("zstd", ZstdCodec) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7b3f11d8cd..6a7b7d6ee7 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable, Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -26,8 +26,8 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo @@ -59,24 +59,20 @@ ZarrFormat, _default_zarr_format, 
_warn_order_kwarg, - ceildiv, concurrent_map, parse_shapelike, product, ) -from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( VariableLengthBytes, VariableLengthUTF8, ZDType, ZDTypeLike, - parse_dtype, + parse_data_type, ) from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( - AsyncOIndex, - AsyncVIndex, BasicIndexer, BasicSelection, BlockIndex, @@ -93,6 +89,7 @@ Selection, VIndex, _iter_grid, + ceildiv, check_fields, check_no_multi_fields, is_pure_fancy_indexing, @@ -110,7 +107,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( - CompressorLikev2, + CompressorLike_V2, get_object_codec_id, parse_compressor, parse_filters, @@ -133,7 +130,7 @@ from zarr.storage._utils import _relativize_path if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -207,8 +204,26 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + _codecs: tuple[Codec, ...] = () + if metadata.filters is not None: + _codecs += metadata.filters + if metadata.compressor is not None: + _codecs += (metadata.compressor,) + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance( + metadata.dtype, HasObjectCodec + ): + # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style + # chain of filters + compressor might not contain a codec identifiable as an array-bytes codec. + # In such a case, we will insert a bytes codec that applies no endian transformation. 
+ # We skip this insertion if the data type is an instance of HasObjectCodec, because + # in zarr v2 these data types required a special codec that functioned like an array bytes codec. + _codecs = (BytesCodec(endian=None),) + _codecs + if metadata.order == "F": + # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec + # framework, we express C or F ordered arrays by adding a transpose codec to the front + # of the list of codecs. + _codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs + return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -357,7 +372,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: CompressorLikev2 | Literal["auto"] = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -634,7 +649,7 @@ async def _create( Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -833,8 +848,8 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLikev2 = None, + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -871,8 +886,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLike = "auto", + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | Literal["auto"] = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -884,14 +899,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - compressor_parsed: CompressorLikev2 + compressor_parsed: CompressorLike_V2 if compressor == "auto": compressor_parsed = default_compressor_v2(dtype) - elif isinstance(compressor, BytesBytesCodec): - raise ValueError( - "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " - "Use a numcodecs codec directly instead." 
- ) else: compressor_parsed = compressor @@ -1441,56 +1451,6 @@ async def getitem( ) return await self._get_selection(indexer, prototype=prototype) - async def get_orthogonal_selection( - self, - selection: OrthogonalSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_mask_selection( - self, - mask: MaskSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_coordinate_selection( - self, - selection: CoordinateSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out_array = await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - if hasattr(out_array, "shape"): - # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) - return out_array - async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = False) -> None: """ Asynchronously save the array metadata. 
@@ -1532,7 +1492,7 @@ async def _set_selection( if isinstance(array_like, np._typing._SupportsArrayFunc): # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + array_like_ = cast(np._typing._SupportsArrayFunc, array_like) value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): @@ -1546,8 +1506,7 @@ async def _set_selection( value = value.astype(dtype=self.dtype, order="A") else: value = np.array(value, dtype=self.dtype, order="A") - value = cast("NDArrayLike", value) - + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. @@ -1622,19 +1581,6 @@ async def setitem( ) return await self._set_selection(indexer, value, prototype=prototype) - @property - def oindex(self) -> AsyncOIndex[T_ArrayMetadata]: - """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and - :func:`set_orthogonal_selection` for documentation and examples.""" - return AsyncOIndex(self) - - @property - def vindex(self) -> AsyncVIndex[T_ArrayMetadata]: - """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, - :func:`set_coordinate_selection`, :func:`get_mask_selection` and - :func:`set_mask_selection` for documentation and examples.""" - return AsyncVIndex(self) - async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: """ Asynchronously resize the array to a new shape. 
@@ -4317,7 +4263,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - zdtype = parse_dtype(dtype, zarr_format=zarr_format) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4726,26 +4672,6 @@ def _parse_chunk_key_encoding( return result -def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: - """ - Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. - """ - - dtype_category = categorize_data_type(dtype) - - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) - - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) - - def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ Given a data type, return the default filters for that data type. @@ -4789,7 +4715,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: """ Given a data type, return the default filters for that data type. 
@@ -4798,28 +4724,22 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ if isinstance(dtype, HasObjectCodec): if dtype.object_codec_id == "vlen-bytes": - from numcodecs import VLenBytes - - return (VLenBytes(),) + return (VLenBytesCodec(),) elif dtype.object_codec_id == "vlen-utf8": - from numcodecs import VLenUTF8 - - return (VLenUTF8(),) + return (VLenUTF8Codec(),) else: msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." raise ValueError(msg) return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> BytesBytesCodec: """ Given a data type, return the default compressors for that data type. - This is just the numcodecs ``Zstd`` codec. + This is just the ``Zstd`` codec. """ - from numcodecs import Zstd - - return Zstd(level=0, checksum=False) # type: ignore[no-any-return] + return ZstdCodec(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4838,12 +4758,9 @@ def _parse_chunk_encoding_v2( _compressor = None elif compressor == "auto": _compressor = default_compressor_v2(dtype) - elif isinstance(compressor, tuple | list) and len(compressor) == 1: + elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4851,14 +4768,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters_v2(dtype) else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not _is_numcodec(f): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. 
" - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) if isinstance(dtype, HasObjectCodec): # check the filters and the compressor for the object codec required for this data type @@ -4866,12 +4775,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.to_json(zarr_format=2),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.to_json(zarr_format=2) for f in _filters], + _compressor.to_json(zarr_format=2) if _compressor is not None else None, ) ) if object_codec_id is None: @@ -4911,7 +4820,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = (filters,) else: maybe_array_array = cast("Iterable[Codec | dict[str, JSON]]", filters) - out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + out_array_array = tuple( + _parse_array_array_codec(c, zarr_format=3) for c in maybe_array_array + ) if serializer == "auto": out_array_bytes = default_serializer_v3(dtype) @@ -4919,26 +4830,34 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. - out_array_bytes = _parse_array_bytes_codec(serializer) + out_array_bytes = _parse_array_bytes_codec(serializer, zarr_format=3) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () elif compressors == "auto": out_bytes_bytes = default_compressors_v3(dtype) else: - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - if isinstance(compressors, dict | Codec): + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] + if isinstance(compressors, dict | Codec) or _is_numcodec(compressors): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = cast("Iterable[Codec | dict[str, JSON]]", compressors) + maybe_bytes_bytes = compressors # type: ignore[assignment] - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + out_bytes_bytes = tuple( + _parse_bytes_bytes_codec(c, zarr_format=3) for c in maybe_bytes_bytes + ) + + # specialize codecs as needed given the dtype + + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. 
- - # TODO: add checks to ensure that the right serializer is used for vlen data types + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): + # The default endianness in the bytescodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes @@ -4959,8 +4878,6 @@ def _parse_deprecated_compressor( compressors = () else: compressors = (compressor,) - elif zarr_format == 2 and compressor == compressors == "auto": - compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 63fcda7065..1bda0eabf1 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from itertools import islice, pairwise +from itertools import islice from typing import TYPE_CHECKING, Any, TypeVar from warnings import warn @@ -14,6 +14,7 @@ Codec, CodecPipeline, ) +from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar @@ -88,7 +89,6 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) - return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, @@ -224,7 +224,6 @@ async def encode_batch( zip(chunk_array_batch, chunk_specs, strict=False) ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -499,10 +498,33 @@ def codecs_from_list( from zarr.codecs.sharding import ShardingCodec array_array: tuple[ArrayArrayCodec, ...] 
= () - array_bytes_maybe: ArrayBytesCodec | None = None + array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...] = () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + # handle two cases + # either all of the codecs are numcodecwrapper instances, in which case we set the last element + # to array-bytes and the rest to array-array + # or one of the codecs is an array-bytes, in which case we convert any preceding numcodecswrapper + # instances to array-array, and any following numcodecswrapper instances to bytes-bytes + + codecs_tup = tuple(codecs) + array_array_idcs: tuple[tuple[int, ArrayArrayCodec], ...] = () + array_bytes_idcs: tuple[tuple[int, ArrayBytesCodec], ...] = () + bytes_bytes_idcs: tuple[tuple[int, BytesBytesCodec], ...] = () + numcodec_wrapper_idcs: tuple[tuple[int, NumcodecsWrapper], ...] = () + + for idx, codec in enumerate(codecs_tup): + match codec: + case ArrayArrayCodec(): + array_array_idcs += ((idx, codec),) + case ArrayBytesCodec(): + array_bytes_idcs += ((idx, codec),) + case BytesBytesCodec(): + bytes_bytes_idcs += ((idx, codec),) + case NumcodecsWrapper(): # type: ignore[union-attr] + numcodec_wrapper_idcs += ((idx, codec),) + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs_tup) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", @@ -510,49 +532,136 @@ def codecs_from_list( stacklevel=3, ) - for prev_codec, cur_codec in pairwise((None, *codecs)): - if isinstance(cur_codec, ArrayArrayCodec): - if isinstance(prev_codec, ArrayBytesCodec | BytesBytesCodec): - msg = ( - f"Invalid codec order. ArrayArrayCodec {cur_codec}" - "must be preceded by another ArrayArrayCodec. " - f"Got {type(prev_codec)} instead." 
- ) - raise TypeError(msg) - array_array += (cur_codec,) - - elif isinstance(cur_codec, ArrayBytesCodec): - if isinstance(prev_codec, BytesBytesCodec): - msg = ( - f"Invalid codec order. ArrayBytes codec {cur_codec}" - f" must be preceded by an ArrayArrayCodec. Got {type(prev_codec)} instead." - ) - raise TypeError(msg) - - if array_bytes_maybe is not None: - msg = ( - f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {cur_codec}. " - "Only one array-to-bytes codec is allowed." - ) - raise ValueError(msg) - - array_bytes_maybe = cur_codec - - elif isinstance(cur_codec, BytesBytesCodec): - if isinstance(prev_codec, ArrayArrayCodec): - msg = ( - f"Invalid codec order. BytesBytesCodec {cur_codec}" - "must be preceded by either another BytesBytesCodec, or an ArrayBytesCodec. " - f"Got {type(prev_codec)} instead." - ) - bytes_bytes += (cur_codec,) + if len(array_bytes_idcs) == 0: + # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an + # array-bytes codec, this is an error. + if len(numcodec_wrapper_idcs) == 0: + msg = ( + f"The codecs {codecs_tup} do not include an ArrayBytesCodec or a codec castable to an " + "ArrayBytesCodec, such as a NumcodecsWrapper. This is an invalid sequence of codecs." + ) + raise ValueError(msg) + elif len(numcodec_wrapper_idcs) == len(codecs_tup): + # All the codecs are numcodecs wrappers. This means we have no information about which + # codec is array-array, array-bytes, and bytes-bytes, so we just cast the numcodecs wrappers + # into a sequence of array-array codecs terminated by a single array-bytes codec. + # This choice is almost arbitrary. + # It would be equally valid to convert the first codec to an array-bytes, and the remaining + # codecs to bytes-bytes, or to pick a random codec and convert it to array-bytes, then + # converting all the preceding codecs to array-array, and the following codecs to bytes-bytes.
+ # But we know from experience that the Zarr V2-style chunk encoding pipelines typically + # start with array-array transformations, so casting all but one of the unknown codecs + # to array-array is a safe choice. + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) else: - raise TypeError + # There are no array-bytes codecs, there is at least one numcodec wrapper, but there are + # also some array-array and / or bytes-bytes codecs + if len(array_array_idcs) > 0: + # There is at least one array-array codec. We will use it as a reference point for + # casting any numcodecs wrappers. + last_array_array_idx = array_array_idcs[-1][0] + + if last_array_array_idx == len(codecs_tup) - 1: + # The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec. This + # cannot be fixed by converting numcodecs wrappers, so we raise an exception. + raise ValueError( + "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." + ) - if array_bytes_maybe is None: - raise ValueError("Required ArrayBytesCodec was not found.") + for idx, aac in enumerate(codecs_tup[: (last_array_array_idx + 1)]): + # Iterate over the codecs leading up to the last array-array codec. + if isinstance(aac, ArrayArrayCodec): + # Any array-array codec gets added to the list of array-array codecs + array_array += (aac,) + elif isinstance(aac, NumcodecsWrapper): + # Any numcodecs wrapper gets converted to an array-array codec + array_array += (aac.to_array_array(),) + else: + # Any other kind of codec is invalid and we raise an exception. + msg = f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + + if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + # The codec following the last array-array codec is a numcodecs wrapper. + # We will cast it to an array-bytes codec. 
+ array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() + else: + # The codec following the last array-array codec was a bytes bytes codec, or + # something else entirely. This is invalid and we raise an exception. + msg = ( + f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " + f"{last_array_array_idx + 1}." + "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " + f"{type(codecs_tup[last_array_array_idx + 1])}" + ) + raise TypeError(msg) + + start = last_array_array_idx + 2 + for idx, rem in enumerate(codecs_tup[start:]): + # We have already checked the codec after the last array-array codec, so we start + # iterating over the codecs after that. + if isinstance(rem, BytesBytesCodec): + bytes_bytes += (rem,) + elif isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) + else: + msg = f"Invalid codec {rem} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + # there are no array-array codecs, just numcodecs wrappers and bytes-bytes codecs + first_bytes_bytes_idx = bytes_bytes_idcs[0][0] + if first_bytes_bytes_idx == 0: + raise ValueError( + "The first codec is a BytesBytesCodec, but there is no ArrayBytesCodec." + ) + else: + # Iterate over all codecs. Cast all numcodecs wrappers to array-array codecs, until + # the codec immediately prior to the first bytes-bytes codec, which we cast to + # an array-bytes codec. All codecs after that point are cast to bytes-bytes codecs. + for idx, bb_codec in enumerate(codecs_tup): + if idx < first_bytes_bytes_idx - 1: + # This must be a numcodecs wrapper. 
cast it to array-array + array_array += (bb_codec.to_array_array(),) + elif idx == first_bytes_bytes_idx - 1: + array_bytes_maybe = bb_codec.to_array_bytes() + else: + if isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + elif isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + else: + msg = f"Invalid codec {bb_codec} at index {idx}. Expected a NumcodecsWrapper" + raise TypeError(msg) + + elif len(array_bytes_idcs) == 1: + bb_idx, ab_codec = array_bytes_idcs[0] + array_bytes_maybe = ab_codec + + end = bb_idx + + for idx, aa_codec in enumerate(codecs_tup[:end]): + if isinstance(aa_codec, ArrayArrayCodec): + array_array += (aa_codec,) + elif isinstance(aa_codec, NumcodecsWrapper): + array_array += (aa_codec.to_array_array(),) + else: + msg = f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + start = bb_idx + 1 + if bb_idx < len(codecs_tup) - 1: + for idx, bb_codec in enumerate(codecs_tup[start:]): + if isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + elif isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + else: + msg = f"Invalid codec {bb_codec} at index {start + idx}. 
Expected a BytesBytesCodec" + raise TypeError(msg) else: - return array_array, array_bytes_maybe, bytes_bytes + raise ValueError("More than one ArrayBytes codec found, that is a big error!") + + return array_array, array_bytes_maybe, bytes_bytes register_pipeline(BatchedCodecPipeline) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index ed28fd2da4..dfa0456e45 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -14,6 +14,7 @@ Final, Generic, Literal, + NotRequired, TypedDict, TypeVar, cast, @@ -48,13 +49,32 @@ DimensionNames = Iterable[str | None] | None TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) +BaseConfig = Mapping[str, object] +TConfig = TypeVar("TConfig", bound=BaseConfig) class NamedConfig(TypedDict, Generic[TName, TConfig]): """ - A typed dictionary representing an object with a name and configuration, where the configuration - is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + A typed dictionary representing an object with `"name"` and `"configuration"` keys. + + The configuration key is not required. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: NotRequired[ReadOnly[TConfig]] + """The configuration of the object.""" + + +class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): + """ + A typed dictionary representing an object with `"name"` and `"configuration"` keys. + + The configuration is required. This class is generic with two type parameters: the type of the name (``TName``) and the type of the configuration (``TConfig``).
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index cc3c33cd17..2b7fbbe0c6 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -125,6 +125,27 @@ def enable_gpu(self) -> ConfigSet: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3204543426..7346e04f93 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,17 +1,20 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import 
cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast +from zarr.abc.codec import ArrayArrayCodec, Codec from zarr.abc.metadata import Metadata from zarr.abc.numcodec import Numcodec, _is_numcodec +from zarr.codecs._v2 import NumcodecsWrapper +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.dtype.common import OBJECT_CODEC_IDS from zarr.errors import ZarrUserWarning -from zarr.registry import get_numcodec +from zarr.registry import get_codec if TYPE_CHECKING: from typing import Literal, Self @@ -19,12 +22,12 @@ import numpy.typing as npt from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.dtype.common import DTypeSpec_V2 from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, TDType_co, TScalar_co, - ZDType, ) import json @@ -42,6 +45,9 @@ parse_shapelike, ) from zarr.core.config import config, parse_indexing_order +from zarr.core.dtype.wrapper import ( + ZDType, +) from zarr.core.metadata.common import parse_attributes @@ -55,7 +61,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None +CompressorLike_V2: TypeAlias = Mapping[str, JSON] | Numcodec | Codec @dataclass(frozen=True, kw_only=True) @@ -65,9 +71,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[Numcodec, ...] | None = None + filters: tuple[Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." 
- compressor: Numcodec | None + compressor: Codec attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -80,8 +86,8 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: CompressorLikev2 = None, - filters: Iterable[Numcodec | dict[str, JSON]] | None = None, + compressor: CompressorLike_V2 | None = None, + filters: Iterable[CompressorLike_V2] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -89,6 +95,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) + # TODO: remove this + if not isinstance(dtype, ZDType): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -98,6 +107,20 @@ def __init__( fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value + + array_spec = ArraySpec( + shape=shape_parsed, + dtype=dtype, + fill_value=fill_value_parsed, + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
+ ) + if compressor_parsed is not None: + pass + # compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + if filters_parsed is not None: + pass + # filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -131,10 +154,10 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(zattrs_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() ), } @@ -157,6 +180,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: object_codec_id = get_object_codec_id((_compressor,)) # we add a layer of indirection here around the dtype attribute of the array metadata # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { "name": data["dtype"], "object_codec_id": object_codec_id, @@ -196,34 +220,24 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if _is_numcodec(zarray_dict["compressor"]): - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - + if self.compressor is not None: + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + else: + zarray_dict["compressor"] = None + new_filters = [] if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] 
- # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if _is_numcodec(f): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters + new_filters.extend([f.to_json(zarr_format=2) for f in self.filters]) + else: + new_filters = None + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] + # serialize the dtype after fill value-specific JSON encoding + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] # type: ignore[assignment] return zarray_dict @@ -261,20 +275,23 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[Numcodec, ...] | None: +def parse_filters(data: object) -> tuple[ArrayArrayCodec | NumcodecsWrapper, ...] | None: """ Parse a potential tuple of filters """ - out: list[Numcodec] = [] + out: list[Codec | NumcodecsWrapper] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if _is_numcodec(val): + if isinstance(val, (Codec, NumcodecsWrapper)): out.append(val) + elif _is_numcodec(val): + out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - out.append(get_numcodec(val)) # type: ignore[arg-type] + codec = get_codec(val, zarr_format=2) + out.append(codec) else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. 
Got {type(val)} instead." raise TypeError(msg) @@ -285,19 +302,28 @@ def parse_filters(data: object) -> tuple[Numcodec, ...] | None: return tuple(out) # take a single codec instance and wrap it in a tuple if _is_numcodec(data): + return (NumcodecsWrapper(codec=data),) + elif isinstance(data, Codec): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> Numcodec | None: +def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ - if data is None or _is_numcodec(data): + # TODO: only validate the compressor in one place. currently we do it twice, once in init_array + # and again when constructing metadata + if data is None or isinstance(data, Codec | NumcodecsWrapper): return data + if _is_numcodec(data): + try: + return get_codec(data.get_config(), zarr_format=2) + except KeyError: + return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return get_numcodec(data) # type: ignore[arg-type] + return get_codec(data, zarr_format=2) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) @@ -312,6 +338,10 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data +def is_object_codec(codec: JSON) -> bool: + return codec.get("id") in OBJECT_CODEC_IDS + + def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: """ Inspect a sequence of codecs / filters for an "object codec", i.e. 
a codec diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index d1420b1ddd..1c240c09d4 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -29,13 +29,12 @@ JSON, ZARR_JSON, DimensionNames, - parse_named_configuration, parse_shapelike, ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError, UnknownCodecError -from zarr.registry import get_codec_class +from zarr.registry import get_codec def parse_zarr_format(data: object) -> Literal[3]: @@ -62,10 +61,8 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - try: - out += (get_codec_class(name_parsed).from_dict(c),) + out += (get_codec(c, zarr_format=3),) except KeyError as e: raise UnknownCodecError(f"Unknown codec: {e.args[0]!r}") from e @@ -85,9 +82,14 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" + # avoid circular import from zarr.codecs.sharding import ShardingCodec + from zarr.core.codec_pipeline import codecs_from_list + + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) + _codecs = (*array_array_codecs, array_bytes_codec, *bytes_bytes_codecs) - abc = validate_array_bytes_codec(codecs) + abc = validate_array_bytes_codec(_codecs) # Recursively resolve array-bytes codecs within sharding codecs while isinstance(abc, ShardingCodec): @@ -291,7 +293,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: d = self.to_dict() return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(d, allow_nan=True, indent=json_indent).encode() + json.dumps(d, allow_nan=False, 
indent=json_indent).encode() ) } @@ -338,6 +340,10 @@ def to_dict(self) -> dict[str, JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + out_dict["codecs"] = () + for codec in self.codecs: + out_dict["codecs"] += (codec.to_json(zarr_format=3),) + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` # until then, we have this hack here, which relies on the fact that to_dict will pass through diff --git a/src/zarr/errors.py b/src/zarr/errors.py index 472199ff1b..cf860cdc7b 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -146,3 +146,6 @@ class ZarrRuntimeWarning(RuntimeWarning): """ A warning for dubious runtime behavior. """ + + +class CodecValidationError(ValueError): ... diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 46216205f7..40d21ceece 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -17,12 +17,14 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON, CodecJSON_V2, CodecPipeline, ) from zarr.abc.numcodec import Numcodec + from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.buffer import Buffer, NDBuffer - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat __all__ = [ "Registry", @@ -126,10 +128,10 @@ def fully_qualified_name(cls: type) -> str: return module + "." 
+ cls.__qualname__ -def register_codec(key: str, codec_cls: type[Codec]) -> None: +def register_codec(key: str, codec_cls: type[Codec], *, qualname: str | None = None) -> None: if key not in __codec_registries: __codec_registries[key] = Registry() - __codec_registries[key].register(codec_cls) + __codec_registries[key].register(codec_cls, qualname=qualname) def register_pipeline(pipe_cls: type[CodecPipeline]) -> None: @@ -144,18 +146,20 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: +def _get_codec_class( + key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False +) -> type[Codec]: if reload_config: _reload_config() - if key in __codec_registries: + if key in registry: # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) - __codec_registries[key].lazy_load() + registry[key].lazy_load() + + codec_classes = registry[key] - codec_classes = __codec_registries[key] if not codec_classes: raise KeyError(key) - config_entry = config.get("codecs", {}).get(key) if config_entry is None: if len(codec_classes) == 1: @@ -173,6 +177,46 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec | NumcodecsWrapper: + """ + Get an instance of a codec from a name and a configuration + """ + from zarr.codecs._v2 import NumcodecsWrapper + + codec_name: str + if zarr_format == 2: + if isinstance(request, str): + raise TypeError( + f"Invalid request type {type(request)} for zarr format 2. 
Expected dict, got {request!r}" + ) + else: + codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} + elif zarr_format == 3: + if isinstance(request, str): + codec_name = request + codec_config = {} + else: + codec_name = request["name"] + codec_config = request.get("configuration", {}) + else: + raise ValueError( + f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" + ) # pragma: no cover + + try: + codec_cls = get_codec_class(codec_name) + return codec_cls.from_json(request, zarr_format=zarr_format) + except KeyError: + # if we can't find the codec in the zarr python registry, try the numcodecs registry + codec = get_numcodec({"id": codec_name, **codec_config}) + return NumcodecsWrapper(codec=codec) + + +def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: + return _get_codec_class(key, __codec_registries, reload_config=reload_config) + + def _resolve_codec(data: dict[str, JSON]) -> Codec: """ Get a codec instance from a dict representation of that codec. @@ -181,19 +225,29 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. 
""" + # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsBytesBytesCodec, NumcodecsWrapper + result: BytesBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_bytes_bytes() if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif _is_numcodec(data): + return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") @@ -201,19 +255,27 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayBytesCodec + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_bytes() if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." 
raise TypeError(msg) + elif _is_numcodec(data): + return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") @@ -221,19 +283,27 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec + from zarr.abc.numcodec import _is_numcodec + from zarr.codecs._v2 import NumcodecsArrayArrayCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_array() + elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif _is_numcodec(data): + return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. 
Got {type(data)} instead.") diff --git a/tests/test_api.py b/tests/test_api.py index 3668ef306a..4f5b095adb 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1346,17 +1346,6 @@ def test_v2_without_compressor() -> None: assert arr.compressors == () -def test_v2_with_v3_compressor() -> None: - # Check trying to create a v2 array with a v3 compressor fails - with pytest.raises( - ValueError, - match="Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. Use a numcodecs codec directly instead.", - ): - zarr.create( - store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() - ) - - def add_empty_file(path: Path) -> Path: fpath = path / "a.txt" fpath.touch() diff --git a/tests/test_array.py b/tests/test_array.py index a316ee127f..89f9f74cdb 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -470,14 +470,14 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _zarr_format=2, _data_type=arr._async_array._zdtype, _fill_value=arr.fill_value, - _shape=(8, 8), + _shape=arr.shape, _chunk_shape=chunks, _shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=arr.compressors, ) assert result == expected @@ -488,14 +488,14 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _zarr_format=3, _data_type=arr._async_array._zdtype, _fill_value=arr.fill_value, - _shape=(8, 8), + _shape=arr.shape, _chunk_shape=chunks, _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _compressors=(ZstdCodec(),), - _serializer=BytesCodec(), + _compressors=arr.compressors, + _serializer=arr.serializer, _count_bytes=512, ) assert result == expected @@ -550,14 +550,14 @@ async def test_info_v2_async( _zarr_format=2, _data_type=Float64(), _fill_value=arr.metadata.fill_value, - _shape=(8, 8), - _chunk_shape=(2, 2), + _shape=arr.shape, + _chunk_shape=arr.chunks, 
_shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=arr.compressors, ) assert result == expected @@ -582,8 +582,8 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _compressors=(ZstdCodec(),), - _serializer=BytesCodec(), + _compressors=arr.compressors, + _serializer=arr.serializer, _count_bytes=512, ) assert result == expected @@ -1137,8 +1137,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": True}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": True}},), ], ) @pytest.mark.parametrize( @@ -1713,7 +1713,7 @@ def test_roundtrip_numcodecs() -> None: # Create the array with the correct codecs root = zarr.group(store) warn_msg = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." - with pytest.warns(UserWarning, match=warn_msg): + with pytest.warns(ZarrUserWarning, match=warn_msg): root.create_array( "test", shape=(720, 1440), @@ -1728,9 +1728,12 @@ def test_roundtrip_numcodecs() -> None: BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} # Read in the array again and check compressor config root = zarr.open_group(store) - with pytest.warns(UserWarning, match=warn_msg): - metadata = root["test"].metadata.to_dict() - expected = (*filters, BYTES_CODEC, *compressors) + metadata = root["test"].metadata.to_dict() + # The names will change because numcodecs. 
is an alias for + expected = tuple( + {"name": v["name"].removeprefix("numcodecs."), "configuration": v["configuration"]} + for v in (*filters, BYTES_CODEC, *compressors) + ) assert metadata["codecs"] == expected diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 6e6e9df383..296889a55a 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -8,10 +8,50 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BloscCodec +from zarr.codecs.blosc import ( + BLOSC_CNAME, + BLOSC_SHUFFLE, + BloscCname, + BloscJSON_V2, + BloscJSON_V3, + BloscShuffle, +) from zarr.core.buffer import default_buffer_prototype from zarr.storage import StorePath +@pytest.mark.parametrize("shuffle", BLOSC_SHUFFLE) +@pytest.mark.parametrize("cname", BLOSC_CNAME) +@pytest.mark.parametrize("clevel", [1, 2]) +@pytest.mark.parametrize("blocksize", [1, 2]) +@pytest.mark.parametrize("typesize", [1, 2]) +def test_to_json_v2( + cname: BloscCname, shuffle: BloscShuffle, clevel: int, blocksize: int, typesize: int +) -> None: + codec = BloscCodec( + shuffle=shuffle, cname=cname, clevel=clevel, blocksize=blocksize, typesize=typesize + ) + expected_v2: BloscJSON_V2 = { + "id": "blosc", + "cname": cname, + "clevel": clevel, + "shuffle": BLOSC_SHUFFLE.index(shuffle), + "blocksize": blocksize, + } + expected_v3: BloscJSON_V3 = { + "name": "blosc", + "configuration": { + "cname": cname, + "clevel": clevel, + "shuffle": shuffle, + "blocksize": blocksize, + "typesize": typesize, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype", ["uint8", "uint16"]) async def test_blosc_evolve(store: Store, dtype: str) -> None: diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_bytes.py similarity index 76% rename from tests/test_codecs/test_endian.py rename 
to tests/test_codecs/test_bytes.py index ab64afb1b8..e991885fa6 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_bytes.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np import pytest @@ -10,6 +10,26 @@ from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.bytes import BytesJSON_V2, BytesJSON_V3 + + +@pytest.mark.parametrize("endian", ["big", "little"]) +def test_bytescodec_to_json(endian: Literal["big", "little"]) -> None: + codec = BytesCodec(endian=endian) + expected_v2: BytesJSON_V2 = { + "id": "bytes", + "endian": endian, + } + expected_v3: BytesJSON_V3 = { + "name": "bytes", + "configuration": { + "endian": endian, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 4753036c87..db146c04de 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -6,6 +8,26 @@ from zarr.codecs import GzipCodec from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.gzip import GZipJSON_V2, GZipJSON_V3 + + +@pytest.mark.parametrize("level", [1, 5, 9]) +def test_json(level: int) -> None: + codec = GzipCodec(level=level) + expected_v2: GZipJSON_V2 = { + "id": "gzip", + "level": level, + } + expected_v3: GZipJSON_V3 = { + "name": "gzip", + "configuration": { + "level": level, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_gzip(store: Store) -> None: diff --git a/tests/test_codecs/test_numcodecs.py 
b/tests/test_codecs/test_numcodecs.py index 1c4d550587..a15e59dce4 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -1,10 +1,56 @@ from __future__ import annotations +import contextlib +import pickle +from typing import TYPE_CHECKING, Any + +import numpy as np +import pytest from numcodecs import GZip +from zarr import config, create_array, open_array from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.codecs import numcodecs as _numcodecs +from zarr.errors import ZarrUserWarning from zarr.registry import get_numcodec +if TYPE_CHECKING: + from collections.abc import Iterator + + +@contextlib.contextmanager +def codec_conf() -> Iterator[Any]: + base_conf = config.get("codecs") + new_conf = { + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", + } + + yield config.set({"codecs": new_conf | base_conf}) + + +if
TYPE_CHECKING: + from zarr.core.common import JSON + def test_get_numcodec() -> None: assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] @@ -22,3 +68,289 @@ def test_is_numcodec_cls() -> None: Test the _is_numcodec_cls function """ assert _is_numcodec_cls(GZip) + + +EXPECTED_WARNING_STR = "Numcodecs codecs are not in the Zarr version 3.*" + +ALL_CODECS = tuple( + filter( + lambda v: isinstance(v, _numcodecs._NumcodecsCodec), + tuple(getattr(_numcodecs, cls_name) for cls_name in _numcodecs.__all__), + ) +) + + +@pytest.mark.parametrize("codec_class", ALL_CODECS) +def test_docstring(codec_class: type[_numcodecs._NumcodecsCodec]) -> None: + """ + Test that the docstring for the zarr.numcodecs codecs references the wrapped numcodecs class. + """ + assert "See :class:`numcodecs." in codec_class.__doc__ # type: ignore[operator] + + +@pytest.mark.parametrize( + "codec_class", + [ + _numcodecs.Blosc, + _numcodecs.LZ4, + _numcodecs.Zstd, + _numcodecs.Zlib, + _numcodecs.GZip, + _numcodecs.BZ2, + _numcodecs.LZMA, + _numcodecs.Shuffle, + ], +) +def test_generic_compressor(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) + + a[:, :] = data.copy() + np.testing.assert_array_equal(data, a[:, :]) + + +@pytest.mark.parametrize( + ("codec_class", "codec_config"), + [ + (_numcodecs.Delta, {"dtype": "float32"}), + (_numcodecs.FixedScaleOffset, {"offset": 0, "scale": 25.5}), + (_numcodecs.FixedScaleOffset, {"offset": 0, "scale": 51, "astype": "uint16"}), + (_numcodecs.AsType, {"encode_dtype": "float32", "decode_dtype": "float32"}), + ], + ids=[ + "delta", + "fixedscaleoffset", + "fixedscaleoffset2", + "astype", + ], +) +def test_generic_filter( + codec_class: 
type[_numcodecs._NumcodecsArrayArrayCodec], + codec_config: dict[str, JSON], +) -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + codec_class(**codec_config), + ], + ) + + a[:, :] = data.copy() + with codec_conf(): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +def test_generic_filter_bitround() -> None: + data = np.linspace(0, 1, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.BitRound(keepbits=3)], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + assert np.allclose(data, b[:, :], atol=0.1) + + +def test_generic_filter_quantize() -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.Quantize(digits=3)], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + assert np.allclose(data, b[:, :], atol=0.001) + + +def test_generic_filter_packbits() -> None: + data = np.zeros((16, 16), dtype="bool") + data[0:4, :] = True + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[_numcodecs.PackBits()], + ) + + a[:, :] = data.copy() + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + 
np.testing.assert_array_equal(data, b[:, :]) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + with pytest.raises(ValueError, match=".*requires bool dtype.*"): + create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype="uint32", + fill_value=0, + filters=[_numcodecs.PackBits()], + ) + + +@pytest.mark.parametrize( + "codec_class", + [ + _numcodecs.CRC32, + _numcodecs.CRC32C, + _numcodecs.Adler32, + _numcodecs.Fletcher32, + _numcodecs.JenkinsLookup3, + ], +) +def test_generic_checksum(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None: + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + compressors=[codec_class()], + ) + + a[:, :] = data.copy() + with codec_conf(): + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +@pytest.mark.parametrize("codec_class", [_numcodecs.PCodec, _numcodecs.ZFPY]) +def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCodec]) -> None: + try: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec_class()._codec # noqa: B018 + except ValueError as e: # pragma: no cover + if "codec not available" in str(e): + pytest.xfail(f"{codec_class.codec_name} is not available: {e}") + else: + raise + except ImportError as e: # pragma: no cover + pytest.xfail(f"{codec_class.codec_name} is not available: {e}") + + data = np.arange(0, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + serializer=codec_class(), + ) + + a[:, :] = data.copy() + np.testing.assert_array_equal(data, a[:, :]) + + +def test_delta_astype() -> None: + data 
= np.linspace(0, 10, 256, dtype="i8").reshape((16, 16)) + + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + a = create_array( + {}, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + filters=[ + _numcodecs.Delta(dtype="i8", astype="i2"), + ], + ) + + a[:, :] = data.copy() + with codec_conf(): + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + b = open_array(a.store, mode="r") + np.testing.assert_array_equal(data, b[:, :]) + + +def test_repr() -> None: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec = _numcodecs.LZ4(level=5) + assert repr(codec) == "LZ4(codec_name='numcodecs.lz4', codec_config={'level': 5})" + + +def test_to_dict() -> None: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec = _numcodecs.LZ4(level=5) + assert codec.to_dict() == {"name": "numcodecs.lz4", "configuration": {"level": 5}} + + +@pytest.mark.parametrize( + "codec_cls", + [ + _numcodecs.Blosc, + _numcodecs.LZ4, + _numcodecs.Zstd, + _numcodecs.Zlib, + _numcodecs.GZip, + _numcodecs.BZ2, + _numcodecs.LZMA, + _numcodecs.Shuffle, + _numcodecs.BitRound, + _numcodecs.Delta, + _numcodecs.FixedScaleOffset, + _numcodecs.Quantize, + _numcodecs.PackBits, + _numcodecs.AsType, + _numcodecs.CRC32, + _numcodecs.CRC32C, + _numcodecs.Adler32, + _numcodecs.Fletcher32, + _numcodecs.JenkinsLookup3, + _numcodecs.PCodec, + _numcodecs.ZFPY, + ], +) +def test_codecs_pickleable(codec_cls: type[_numcodecs._NumcodecsCodec]) -> None: + with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR): + codec = codec_cls() + + expected = codec + + p = pickle.dumps(codec) + actual = pickle.loads(p) + assert actual == expected diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index eb80545ff3..ba25b07da4 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,6 @@ import pickle -from typing import Any +from codecs import Codec +from typing 
import TYPE_CHECKING, Any, Literal import numpy as np import numpy.typing as npt @@ -16,6 +17,8 @@ ShardingCodecIndexLocation, TransposeCodec, ) +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.crc32c_ import Crc32cCodec from zarr.core.buffer import NDArrayLike, default_buffer_prototype from zarr.errors import ZarrUserWarning from zarr.storage import StorePath, ZipStore @@ -23,6 +26,45 @@ from ..conftest import ArrayRequest from .test_codecs import _AsyncArrayProxy, order_from_dim +if TYPE_CHECKING: + from zarr.codecs.sharding import ShardingJSON_V2, ShardingJSON_V3 + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("chunk_shape", [(32, 32), (64, 64)]) +@pytest.mark.parametrize("codecs", [(BytesCodec(),)]) +@pytest.mark.parametrize("index_codecs", [(Crc32cCodec(),)]) +def test_sharding_codec_to_json( + index_location: Literal["start", "end"], + chunk_shape: tuple[int, ...], + codecs: tuple[Codec, ...], + index_codecs: tuple[Codec, ...], +) -> None: + codec = ShardingCodec( + chunk_shape=chunk_shape, + codecs=codecs, + index_location=index_location, + index_codecs=index_codecs, + ) + expected_v2: ShardingJSON_V2 = { + "id": "sharding_indexed", + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=2) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=2) for c in index_codecs), + "index_location": index_location, + } + expected_v3: ShardingJSON_V3 = { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=3) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=3) for c in index_codecs), + "index_location": index_location, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("index_location", ["start", "end"]) diff --git 
a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 06ec668ad3..0d48d3fddf 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -10,6 +12,21 @@ from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.transpose import TransposeJSON_V2, TransposeJSON_V3 + + +@pytest.mark.parametrize("order", [(1, 2, 3), (2, 1, 0)]) +def test_transpose_to_json(order: tuple[int, ...]) -> None: + codec = TransposeCodec(order=order) + expected_v2: TransposeJSON_V2 = {"id": "transpose", "order": order} + expected_v3: TransposeJSON_V3 = { + "name": "transpose", + "configuration": {"order": order}, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index cf0905daca..50c69b6ae2 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,6 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.codecs.vlen_utf8 import VLenUTF8Codec, VLenUTF8JSON_V2, VLenUTF8JSON_V3 from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata @@ -22,6 +23,16 @@ expected_array_string_dtype = np.dtype("O") +def test_vlen_utf8_to_json() -> None: + codec = VLenUTF8Codec() + expected_v2: VLenUTF8JSON_V2 = {"id": "vlen-utf8"} + expected_v3: VLenUTF8JSON_V3 = { + "name": "vlen-utf8", + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") 
@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 6068f53443..7c18a25103 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -6,6 +8,25 @@ from zarr.codecs import ZstdCodec from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.zstd import ZstdJSON_V2, ZstdJSON_V3 + + +@pytest.mark.parametrize("level", [1, 5, 9]) +@pytest.mark.parametrize("checksum", [True, False]) +def test_json(level: int, checksum: bool) -> None: + codec = ZstdCodec(level=level, checksum=checksum) + expected_v2: ZstdJSON_V2 = { + "id": "zstd", + "level": level, + } + expected_v3: ZstdJSON_V3 = { + "name": "zstd", + "configuration": {"level": level, "checksum": checksum}, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("checksum", [True, False]) diff --git a/tests/test_config.py b/tests/test_config.py index 0c029dda3a..4dc831137c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -73,6 +73,27 @@ def test_config_defaults_set() -> None: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": 
"zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", @@ -181,7 +202,18 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {}}], + compressors=[ + { + "name": "blosc", + "configuration": { + "cname": "lz4", + "clevel": 1, + "shuffle": "noshuffle", + "blocksize": 1, + "typesize": 1, + }, + }, + ], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_group.py b/tests/test_group.py index e5cfe82daa..bc6398761a 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -12,7 +12,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr import zarr.api.asynchronous @@ -22,6 +21,7 @@ from zarr.abc.store import Store from zarr.core import sync_group from zarr.core._info import GroupInfo +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.dtype.common import unpack_dtype_json @@ -597,7 +597,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "chunks": (1,), "order": "C", 
"filters": None, - "compressor": Blosc(), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { @@ -624,8 +624,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "name": "default", }, "codecs": ( - {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + default_serializer_v3(dtype).to_json(zarr_format=zarr_format), + *[ + c.to_json(zarr_format=zarr_format) + for c in default_compressors_v3(dtype) + ], ), "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, diff --git a/tests/test_info.py b/tests/test_info.py index 28c8803c83..08f2318dc2 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -74,7 +74,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : ()""") @@ -117,7 +117,7 @@ def test_array_info_complete( Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted})