From 5ac14517e7c8344b5b19fd75294f0b6f2ff92955 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 11:37:25 +0200 Subject: [PATCH 01/25] docs: design spec for statically-typed config (drop donfig) Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- ...26-06-25-statically-typed-config-design.md | 298 ++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-25-statically-typed-config-design.md diff --git a/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md b/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md new file mode 100644 index 0000000000..189334ec7c --- /dev/null +++ b/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md @@ -0,0 +1,298 @@ +# Statically-typed configuration for zarr-python + +**Date:** 2026-06-25 +**Status:** Approved design, ready for implementation planning + +## Problem + +zarr-python's configuration is built on [donfig](https://github.com/pytroll/donfig). +donfig stores config as an untyped nested `dict`, so there is no static type +information for any configuration value. `config.get("array.order")` is typed as +`Any`, `config.array` does not exist as a typed attribute, and there is no way for +a type checker to catch a misspelled key or a wrong-typed value. + +We want to drop donfig entirely and model the configuration as plain frozen +dataclasses, which gives native static typing for attribute access +(`config.array.order`), while retaining donfig's ergonomic dotted-string API +(`config.get("array.order")`, `config.set({"array.order": "F"})`) with precise +static types via hand-written overloads. This is the technique demonstrated in the +[`tytr`](https://github.com/d-v-b/tytr) project: a flattened mapping from dotted +keys to value types, surfaced through an overloaded getter/setter. 
+ +## Non-negotiable constraint: backwards compatibility + +**Backwards compatibility is extremely important for this work.** The public +`zarr.config` object is widely used in downstream code, notebooks, and +documentation. The replacement MUST be a drop-in for every documented and +commonly-used pattern. Concretely: + +- All of these must continue to work with identical behavior and (where they + returned values) identical return values: + - `config.get("a.b.c")` and `config.get("a.b.c", default)` + - subtree retrieval: `config.get("codecs", {}).get(key)` + - `config.set({"a.b.c": value})` applied **permanently** + - `with config.set({"a.b.c": value}):` applied **scoped**, restored on exit + - `config.reset()` + - `config.enable_gpu()` + - `config.defaults` + - `BadConfigError` + - the `ZARR_FOO__BAR` environment-variable ingestion + - YAML config-file ingestion from standard locations + - the `deprecations` key-redirection/removal warnings +- Public import paths are unchanged: `from zarr.core.config import config, + BadConfigError, parse_indexing_order` and `zarr.config`. +- donfig provides a broader method surface (`to_dict`, `update`, `merge`, + `pprint`, `clear`, `refresh`, `collect`, ...). We preserve the subset zarr + itself uses (`get`, `set`, `reset`, `enable_gpu`, `defaults`, `clear`, + `refresh`) and additionally provide compatible shims for `to_dict`/`update`/ + `pprint` since these are plausible downstream uses. Any donfig method we do not + reimplement must raise a clear, actionable error pointing at the new API rather + than an `AttributeError`. +- Behavior changes are only acceptable where they are strictly additive (new + precise types) or where donfig behavior was undocumented/incidental. Any + observable change is called out in the changelog with migration guidance. +- A `towncrier` changelog entry under `changes/` documents the donfig removal and + confirms the API is preserved. + +## Architecture + +Three layers with clear boundaries. 
+ +### Layer A — schema (frozen dataclasses) + +The configuration shape is a tree of frozen, slotted dataclasses. This is the +single source of truth for both structure and defaults. + +> **Naming note:** a distinct `ArrayConfig` already exists in +> `src/zarr/core/array_spec.py` (a runtime per-array object, unrelated to the +> global config). To avoid collision and confusion, the global-config schema +> dataclasses are named with a `Config` suffix scoped under the config module +> (e.g. the array-namespace schema below). If the names below would still read +> ambiguously next to the existing `ArrayConfig`, prefer an explicit suffix such +> as `ArraySettings` / `ZarrSettings` during implementation. The final names are +> an implementation detail; the structure is what matters. + +```python +@dataclass(frozen=True, slots=True) +class ArrayConfig: + order: Literal["C", "F"] = "C" + write_empty_chunks: bool = False + read_missing_chunks: bool = True + target_shard_size_bytes: int | None = None + rectilinear_chunks: bool = False + sharding_coalesce_max_gap_bytes: int = 1 << 20 # 1 MiB + sharding_coalesce_max_bytes: int = 16 << 20 # 16 MiB + +@dataclass(frozen=True, slots=True) +class AsyncConfig: + concurrency: int = 10 + timeout: float | None = None + +@dataclass(frozen=True, slots=True) +class ThreadingConfig: + max_workers: int | None = None + +@dataclass(frozen=True, slots=True) +class CodecPipelineConfig: + path: str = "zarr.core.codec_pipeline.BatchedCodecPipeline" + batch_size: int = 1 + +@dataclass(frozen=True, slots=True) +class ZarrConfig: + default_zarr_format: Literal[2, 3] = 3 + array: ArrayConfig = field(default_factory=ArrayConfig) + async_: AsyncConfig = field(default_factory=AsyncConfig) # serialized key: "async" + threading: ThreadingConfig = field(default_factory=ThreadingConfig) + json_indent: int = 2 + codec_pipeline: CodecPipelineConfig = field(default_factory=CodecPipelineConfig) + codecs: Mapping[str, str] = field(default_factory=lambda: 
dict(DEFAULT_CODECS)) + buffer: str = "zarr.buffer.cpu.Buffer" + ndbuffer: str = "zarr.buffer.cpu.NDBuffer" +``` + +Notes: +- `config.array.order` etc. are natively typed by the dataclass — no overloads + needed for the attribute-access path. +- `async_` carries the serialized key `"async"` (an illegal Python identifier). + The mapping between Python field name and serialized dotted key is recorded in a + small per-class `__key_aliases__` (or equivalent) so the string API and ingest + layers translate correctly. Attribute access for `async` is only available via + the string API (`config.get("async.concurrency")`); this matches donfig, which + also has no `config.async` attribute. +- `codecs` is an open `Mapping[str, str]` subtree (per design decision): users + register arbitrary codec names at runtime via `config.set({"codecs.foo": ...})` + and `ZARR_CODECS__FOO=...`. Structured keys get precise static types; codec keys + degrade to the string fallback. `DEFAULT_CODECS` holds the current default codec + name → import-path mapping verbatim. + +### Layer B — state holder (base snapshot + contextvar overlay) + +State is held as immutable `ZarrConfig` snapshots. To preserve donfig's exact +runtime semantics — in particular cross-thread visibility of permanent sets — we +use a **hybrid** of a process-global base and a context-local overlay rather than a +pure `ContextVar`. + +Rationale: zarr runs work in `ThreadPoolExecutor` (`src/zarr/core/sync.py`). +`ThreadPoolExecutor` does **not** copy `contextvars` into worker threads. A pure +`ContextVar` would make a permanent `config.set({...})` invisible inside worker +threads — a silent regression versus donfig's process-global dict mutation. The +hybrid avoids this. + +- `_base: ZarrConfig` — a module-global snapshot, process-wide, visible across all + threads. A **permanent** `config.set(...)` (not used as a `with` block) replaces + this reference. +- `_overlay: ContextVar[ZarrConfig | None]` — a context-local override. 
`with + config.set(...)` sets this and resets it via the returned `Token` on exit. + Provides async-safe and thread-safe scoping for the common `with config.set(...)` + idiom. +- Resolution: the effective snapshot is `_overlay.get() or _base`. +- Every mutation produces a **new** frozen `ZarrConfig` by applying the requested + dotted-key updates through `dataclasses.replace` along the path (a small + recursive `replace_path(snapshot, "a.b.c", value) -> ZarrConfig` helper). For the + open `codecs` mapping, updates copy-and-extend the dict. + +`config.set(...)` semantics, matching donfig: +- Applies immediately (mutates effective state) **and** returns a context-manager + object. +- If used as `with config.set(...):`, the prior state is restored on `__exit__`. +- If not used as a context manager, the change persists (permanent set updates + `_base`). + +### Layer C — proxy (`config`) + +`config` is the shared singleton everyone imports. It is **not** the data; it reads +the current resolved snapshot on each access, so existing `from zarr.core.config +import config` references continue to observe live updates (preserving donfig's +import-by-reference behavior). It exposes: + +- Typed attribute properties delegating to the resolved snapshot: `config.array -> + ArrayConfig`, `config.async_ -> AsyncConfig`, `config.json_indent -> int`, etc. +- The donfig-compatible string API: `get`, `set`, `reset`, `enable_gpu`, + `defaults`, plus compat shims (`to_dict`, `update`, `pprint`). + +## The typed string API (hand-written overloads) + +Per the design decision, the dotted-key → value-type overloads are **hand-written** +(no codegen, no `tytr` runtime dependency). This is the `tytr` getter pattern, +authored directly: + +```python +class _ConfigProxy: + @overload + def get(self, key: Literal["default_zarr_format"]) -> Literal[2, 3]: ... + @overload + def get(self, key: Literal["array.order"]) -> Literal["C", "F"]: ... 
+ @overload + def get(self, key: Literal["array.write_empty_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["async.concurrency"]) -> int: ... + @overload + def get(self, key: Literal["async.timeout"]) -> float | None: ... + @overload + def get(self, key: Literal["json_indent"]) -> int: ... + # ... one overload per structured leaf key ... + @overload + def get(self, key: str, default: object = ...) -> Any: ... # codecs.*, subtrees, unknown keys + def get(self, key: str, default: object = _MISSING) -> Any: ... +``` + +`set` mirrors this: an overloaded surface (or a `TypedDict` of optional dotted +keys) so that `config.set({"array.order": "F"})` type-checks the value against the +key. The open `codecs.*` keys and whole-subtree gets (`config.get("codecs", {})`) +resolve through the `str` fallback overload. + +### Drift protection + +Hand-written overloads can drift from the dataclass schema. A regression test walks +`ZarrConfig` recursively, enumerates every structured dotted leaf key, and asserts +each has a corresponding `get` overload with a matching return type (introspected +via `typing.get_overloads`). CI fails on any missing/mismatched overload. This +neutralizes the main downside of the hand-written approach. + +## Ingest sources + +Both retained (per design decision). Reimplemented in zarr (~a few dozen lines) +rather than vendoring donfig's loader. + +Precedence, lowest to highest: + +1. dataclass defaults +2. YAML config files +3. environment variables +4. runtime `config.set(...)` + +- **Environment variables:** collect `ZARR_*`, lower-case the key, treat `__` as + nested access, `ast.literal_eval` the value (with literal-eval failure falling + back to the raw string, matching donfig). Builds overrides merged into the base + snapshot at construction. +- **YAML files:** read from standard locations — `ZARR_CONFIG` env var path(s) and + the default config directory (e.g. `~/.config/zarr`), matching donfig's search + behavior. 
Parsed with the existing YAML dependency and merged under env vars. + +Ingested values are validated/coerced into the dataclass field types where the key +is structured; unknown keys under open subtrees (`codecs.*`) pass through as +strings. + +## Deprecations + +donfig's `deprecations` mechanism (old-key → new-key, or `None` for removed) is +reimplemented. Accessing or setting a deprecated key emits the same warning and +redirects to the new key (or raises/warns for removed keys). The existing +`deprecations` mapping in `config.py` is carried over verbatim: + +```python +deprecations = { + "array.v2_default_compressor.numeric": None, + # ... unchanged ... +} +``` + +## Backwards-compatibility verification + +Beyond the per-feature preservation above: + +- A compatibility test module exercises every pattern in the "Non-negotiable + constraint" list against the new implementation. +- `config.defaults` returns a representation equivalent to today's (the existing + `test_config_defaults_set` is updated to the new snapshot representation while + asserting the same values). +- Methods not reimplemented raise an informative error naming the supported + replacement, never a bare `AttributeError`. + +## Testing + +- Existing `tests/test_config.py` remains largely valid since the string API is + preserved; only `config.defaults` structural assertions are updated. +- New tests: + - overload ↔ dataclass sync (drift protection). + - env-var ingestion (including `ZARR_CODECS__*` dynamic keys). + - YAML-file ingestion and precedence ordering. + - permanent-set visibility inside a `ThreadPoolExecutor` worker (the hybrid + state-model regression). + - `with config.set(...)` scoping under threads and asyncio tasks. + - deprecation warnings/redirects. 
+- Static-typing assertions in the test suite (the repo type-checks tests): + `reveal_type(config.get("array.order"))` is `Literal['C', 'F']`, + `reveal_type(config.array.order)` is `Literal['C', 'F']`, and a wrong-typed + `config.set({"array.order": "Q"})` is a type error. + +## Files affected + +- `src/zarr/core/config.py` — rewritten: dataclasses, proxy, state holder, string + API, ingest, deprecations. Keeps `config`, `BadConfigError`, + `parse_indexing_order` exports. +- `pyproject.toml` — remove `donfig` dependency; ensure a YAML dependency is + declared (currently transitive via donfig). +- `src/zarr/__init__.py` — remove `donfig` from the version-reporting table. +- `tests/test_config.py` and new test modules — as above. +- `changes/<PR number>.misc.md` (or `.feature.md`) — changelog entry. +- Documentation referencing donfig (`config.py` module docstring, any docs/ pages) + — updated to describe the new typed API while keeping the string-API examples. + +## Out of scope + +- Changing the set of configuration keys or their defaults. +- Migrating the `codecs` registry out of config (the open `dict[str, str]` subtree + is retained). +- Any change to the `ArrayConfig`/`ArraySpec` runtime objects in + `core/array_spec.py` beyond what is needed to read from the new config. 
From a42e0da0b22874bb919756163f122e0b75452656 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 11:46:03 +0200 Subject: [PATCH 02/25] docs: explain why the async_ alias is unavoidable Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- ...26-06-25-statically-typed-config-design.md | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md b/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md index 189334ec7c..e0549735c3 100644 --- a/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md +++ b/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md @@ -119,6 +119,47 @@ Notes: layers translate correctly. Attribute access for `async` is only available via the string API (`config.get("async.concurrency")`); this matches donfig, which also has no `config.async` attribute. + +#### Why the `async_` alias is unavoidable (and harmless) + +A natural objection: the `async_` alias is ugly — can't a programmatic +`TypedDict("ZarrConfig", {"async": int, ...})` keep the real key `"async"` and +avoid the alias? It can keep the *string* key, but it does **not** avoid the +problem, because the constraint here is a **syntax** rule, not a typing one: + +- `async` has been a hard keyword since Python 3.7. `config.async` is a + `SyntaxError` regardless of the type machinery behind it. `getattr(config, + "async")` works at runtime but cannot be statically typed precisely. So + attribute access to a field literally named `async` is impossible in any + approach. +- Functional/programmatic `TypedDict` does **not** lose static typing — type + checkers fully support `cfg["async"]` typed from a functional TypedDict. But it + does not rescue attribute access either; it merely moves you from `config.async` + (illegal) to `config["async"]` (subscript). 
It buys nothing the alias didn't, + and it gives up the natural dotted-attribute ergonomics (`config.array.order`) + for *every other* namespace, which would then also be subscript access. + +So the real axis is attribute-access vs subscript/string-access, not "typed vs +untyped". Every option is fully typed; only `config.async` (the attribute form) is +forbidden, by Python syntax, in all of them. + +Crucially, this is confined to the new typed-attribute convenience and does **not** +touch backwards compatibility. donfig never exposed a `config.async` attribute; the +only place `async` appears today is the *string key* `"async.concurrency"` (and the +env var `ZARR_ASYNC__CONCURRENCY`). Those are strings and behave identically +whether the schema is a dataclass or a TypedDict, and the serialized key stays +`"async"`. Therefore we keep both, fully typed: + +- `config.get("async.concurrency") -> int` — the real key, full backwards compat, + the **primary** documented path. +- `config.async_.concurrency -> int` — the optional typed-attribute convenience, + with the alias documented. + +Net: the dataclass approach keeps full static typing *and* clean attribute access +for every namespace except the one Python forbids by syntax — and for that one, no +approach can do better than an alias or a subscript. The `async_` wrinkle is +cosmetic, confined to attribute access, and costs nothing on the compatibility +surface that matters. - `codecs` is an open `Mapping[str, str]` subtree (per design decision): users register arbitrary codec names at runtime via `config.set({"codecs.foo": ...})` and `ZARR_CODECS__FOO=...`. 
Structured keys get precise static types; codec keys From 6ff9f652db7263e3c41a855df9cacd0a2f3b4859 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 11:51:25 +0200 Subject: [PATCH 03/25] docs: implementation plan for statically-typed config Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- .../2026-06-25-statically-typed-config.md | 997 ++++++++++++++++++ 1 file changed, 997 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-25-statically-typed-config.md diff --git a/docs/superpowers/plans/2026-06-25-statically-typed-config.md b/docs/superpowers/plans/2026-06-25-statically-typed-config.md new file mode 100644 index 0000000000..94320dc2fc --- /dev/null +++ b/docs/superpowers/plans/2026-06-25-statically-typed-config.md @@ -0,0 +1,997 @@ +# Statically-typed configuration (drop donfig) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace donfig with a hand-typed, dataclass-backed configuration object that preserves donfig's dotted-string API exactly while adding precise static types. + +**Architecture:** A tree of frozen dataclasses is the schema/source-of-truth. A process-global base snapshot plus a `ContextVar` scope provide donfig-compatible mutable-global semantics with `with`-restore. A proxy object (`config`) exposes typed attribute access and a hand-written overloaded `get`/`set` string API, plus env-var and YAML ingest and deprecation handling. + +**Tech Stack:** Python 3.11+, `dataclasses`, `typing.overload`, `contextvars`, PyYAML, pytest, mypy (strict). + +## Global Constraints + +- Backwards compatibility is the top priority. 
These must keep identical behavior: `config.get("a.b.c")`, `config.get("a.b.c", default)`, `config.get("codecs", {}).get(key)`, permanent `config.set({...})`, `with config.set({...})`, `config.reset()`, `config.refresh()`, `config.enable_gpu()`, `config.defaults`, `BadConfigError`, `parse_indexing_order`, `ZARR_FOO__BAR` env ingest, YAML ingest, deprecation warnings. +- Public import paths unchanged: `from zarr.core.config import config, BadConfigError, parse_indexing_order`; `zarr.config`. +- mypy strict must pass; PEP8, max line length 100 (prefer <90); numpydoc docstrings on public API. +- Use `uv run` for all pytest/mypy/python invocations (e.g. `uv run pytest ...`). +- No new runtime dependency on `tytr`. Overloads are hand-written. +- The serialized key for the async namespace stays `"async"`; the dataclass field is `async_`. +- `codecs` is an open `Mapping[str, str]` subtree. +- Keep all current config keys, defaults, and the existing `deprecations` mapping verbatim. +- Frequent commits; one logical change per commit. + +## File Structure + +- `src/zarr/core/config.py` — **rewritten** (single module, preserves import paths). Contains: schema dataclasses, path helpers, ingest functions, deprecations, the `ZarrConfigManager` proxy, the module-level `config` instance, `BadConfigError`, `parse_indexing_order`. +- `src/zarr/__init__.py` — remove `"donfig"` from the `required` version-report list. +- `pyproject.toml` — remove the three donfig entries; ensure `pyyaml` is a declared runtime dependency. +- `tests/test_config.py` — existing suite; update only the `defaults` structural assertion. +- `tests/test_config_typed.py` — **new**: schema/helpers/ingest/state/drift/typing unit tests. +- `changes/<PR number>.misc.md` — **new**: changelog entry. 
+ +--- + +### Task 1: Schema dataclasses + path helpers + +**Files:** +- Modify: `src/zarr/core/config.py` (add new code; do not remove donfig yet) +- Test: `tests/test_config_typed.py` + +**Interfaces:** +- Produces: + - Frozen dataclasses `ArraySettings`, `AsyncSettings`, `ThreadingSettings`, `CodecPipelineSettings`, `ZarrConfig`. + - `DEFAULT_CODECS: dict[str, str]` — the default codec-name→import-path map. + - `make_default_config() -> ZarrConfig`. + - `get_path(cfg: ZarrConfig, key: str) -> Any` — read a dotted key; raises `KeyError` if absent. + - `replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig` — return a new snapshot with the dotted key updated. + - `to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]` — donfig-style nested dict using serialized keys (`"async"`, not `"async_"`). + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_config_typed.py`: + +```python +from __future__ import annotations + +import pytest + +from zarr.core.config import ( + DEFAULT_CODECS, + ZarrConfig, + get_path, + make_default_config, + replace_path, + to_nested_dict, +) + + +def test_default_config_values() -> None: + cfg = make_default_config() + assert cfg.default_zarr_format == 3 + assert cfg.array.order == "C" + assert cfg.array.sharding_coalesce_max_bytes == 16 << 20 + assert cfg.async_.concurrency == 10 + assert cfg.async_.timeout is None + assert cfg.threading.max_workers is None + assert cfg.json_indent == 2 + assert cfg.codec_pipeline.path == "zarr.core.codec_pipeline.BatchedCodecPipeline" + assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.codecs == DEFAULT_CODECS + + +def test_get_path_structured_and_async_alias() -> None: + cfg = make_default_config() + assert get_path(cfg, "array.order") == "C" + assert get_path(cfg, "async.concurrency") == 10 # serialized key, not async_ + assert get_path(cfg, "json_indent") == 2 + assert get_path(cfg, "codecs") == DEFAULT_CODECS + assert get_path(cfg, "codecs.blosc") == 
"zarr.codecs.blosc.BloscCodec" + with pytest.raises(KeyError): + get_path(cfg, "array.nonexistent") + + +def test_replace_path_is_immutable_and_typed() -> None: + cfg = make_default_config() + cfg2 = replace_path(cfg, "array.order", "F") + assert cfg.array.order == "C" # original unchanged (frozen) + assert cfg2.array.order == "F" + cfg3 = replace_path(cfg, "async.concurrency", 99) + assert cfg3.async_.concurrency == 99 + cfg4 = replace_path(cfg, "codecs.my_codec", "my.module.MyCodec") + assert cfg4.codecs["my_codec"] == "my.module.MyCodec" + assert "my_codec" not in cfg.codecs + + +def test_to_nested_dict_uses_serialized_keys() -> None: + nested = to_nested_dict(make_default_config()) + assert nested["array"]["order"] == "C" + assert nested["async"]["concurrency"] == 10 # serialized key + assert "async_" not in nested + assert nested["codecs"]["blosc"] == "zarr.codecs.blosc.BloscCodec" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_config_typed.py -v` +Expected: FAIL — `ImportError` (names not yet defined). 
+ +- [ ] **Step 3: Write minimal implementation** + +Add near the top of `src/zarr/core/config.py` (after `from __future__ import annotations` and imports; add `from dataclasses import dataclass, field, fields, replace`, `from collections.abc import Mapping`, `from typing import Any`): + +```python +DEFAULT_CODECS: dict[str, str] = { + "blosc": "zarr.codecs.blosc.BloscCodec", + "gzip": "zarr.codecs.gzip.GzipCodec", + "zstd": "zarr.codecs.zstd.ZstdCodec", + "bytes": "zarr.codecs.bytes.BytesCodec", + "endian": "zarr.codecs.bytes.BytesCodec", + "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", + "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", + "transpose": "zarr.codecs.transpose.TransposeCodec", + "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", + "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gzip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", +} + +# Map serialized 
dotted-key segments to Python field names where they differ +# (Python keywords cannot be used as identifiers). +_FIELD_ALIASES: dict[str, str] = {"async": "async_"} +_SERIALIZED_NAMES: dict[str, str] = {v: k for k, v in _FIELD_ALIASES.items()} + + +@dataclass(frozen=True, slots=True) +class ArraySettings: + order: Literal["C", "F"] = "C" + write_empty_chunks: bool = False + read_missing_chunks: bool = True + target_shard_size_bytes: int | None = None + rectilinear_chunks: bool = False + sharding_coalesce_max_gap_bytes: int = 1 << 20 + sharding_coalesce_max_bytes: int = 16 << 20 + + +@dataclass(frozen=True, slots=True) +class AsyncSettings: + concurrency: int = 10 + timeout: float | None = None + + +@dataclass(frozen=True, slots=True) +class ThreadingSettings: + max_workers: int | None = None + + +@dataclass(frozen=True, slots=True) +class CodecPipelineSettings: + path: str = "zarr.core.codec_pipeline.BatchedCodecPipeline" + batch_size: int = 1 + + +@dataclass(frozen=True, slots=True) +class ZarrConfig: + default_zarr_format: Literal[2, 3] = 3 + array: ArraySettings = field(default_factory=ArraySettings) + async_: AsyncSettings = field(default_factory=AsyncSettings) + threading: ThreadingSettings = field(default_factory=ThreadingSettings) + json_indent: int = 2 + codec_pipeline: CodecPipelineSettings = field(default_factory=CodecPipelineSettings) + codecs: Mapping[str, str] = field(default_factory=lambda: dict(DEFAULT_CODECS)) + buffer: str = "zarr.buffer.cpu.Buffer" + ndbuffer: str = "zarr.buffer.cpu.NDBuffer" + + +def make_default_config() -> ZarrConfig: + """Return a fresh `ZarrConfig` populated with the built-in defaults.""" + return ZarrConfig() + + +def _resolve_field(obj: Any, segment: str) -> str: + """Translate a serialized key segment to the dataclass field name.""" + return _FIELD_ALIASES.get(segment, segment) + + +def get_path(cfg: ZarrConfig, key: str) -> Any: + """Read a dotted-string key from a `ZarrConfig` snapshot. 
+ + Raises + ------ + KeyError + If the key does not resolve to a value. + """ + obj: Any = cfg + segments = key.split(".") + for i, segment in enumerate(segments): + if isinstance(obj, Mapping): + # remaining segments index into an open mapping (e.g. codecs.*) + remainder = ".".join(segments[i:]) + try: + return obj[remainder] + except KeyError: + raise KeyError(key) from None + field_name = _resolve_field(obj, segment) + if not hasattr(obj, field_name): + raise KeyError(key) + obj = getattr(obj, field_name) + return obj + + +def replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig: + """Return a new `ZarrConfig` with the dotted-string key set to ``value``.""" + segments = key.split(".") + return _replace_recursive(cfg, segments, value, key) + + +def _replace_recursive(obj: Any, segments: list[str], value: Any, key: str) -> Any: + segment = segments[0] + if isinstance(obj, Mapping): + remainder = ".".join(segments) + return {**obj, remainder: value} + field_name = _resolve_field(obj, segment) + if not hasattr(obj, field_name): + raise KeyError(key) + if len(segments) == 1: + return replace(obj, **{field_name: value}) + child = getattr(obj, field_name) + new_child = _replace_recursive(child, segments[1:], value, key) + return replace(obj, **{field_name: new_child}) + + +def to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]: + """Convert a `ZarrConfig` to a donfig-style nested dict (serialized keys).""" + + def convert(obj: Any) -> Any: + if isinstance(obj, Mapping): + return dict(obj) + if hasattr(type(obj), "__dataclass_fields__"): + out: dict[str, Any] = {} + for f in fields(obj): + serialized = _SERIALIZED_NAMES.get(f.name, f.name) + out[serialized] = convert(getattr(obj, f.name)) + return out + return obj + + return convert(cfg) # type: ignore[no-any-return] +``` + +Ensure `Literal` and `Any` are imported at the top of the module. 
+ +- [ ] **Step 4: Run test to verify it passes** + +Run: `uv run pytest tests/test_config_typed.py -v` +Expected: PASS (4 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/zarr/core/config.py tests/test_config_typed.py +git commit -m "feat(config): add frozen dataclass schema and path helpers" +``` + +--- + +### Task 2: Env-var and YAML ingest + +**Files:** +- Modify: `src/zarr/core/config.py` +- Test: `tests/test_config_typed.py` + +**Interfaces:** +- Consumes: `ZarrConfig`, `replace_path` (Task 1). +- Produces: + - `collect_env(environ: Mapping[str, str]) -> dict[str, Any]` — flat dotted-key → value map from `ZARR_*` vars. + - `collect_yaml(paths: list[str]) -> dict[str, Any]` — flat dotted-key map merged from YAML files (missing files skipped). + - `apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig`. + - `build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig` — defaults < YAML < env. + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_config_typed.py`: + +```python +from zarr.core.config import apply_overrides, build_config, collect_env + + +def test_collect_env_parses_nested_and_literal() -> None: + env = { + "ZARR_ARRAY__ORDER": "F", + "ZARR_ASYNC__CONCURRENCY": "32", + "ZARR_CODECS__MY_CODEC": "my.module.MyCodec", + "UNRELATED": "ignored", + } + out = collect_env(env) + assert out["array.order"] == "F" + assert out["async.concurrency"] == 32 # ast.literal_eval -> int + assert out["codecs.my_codec"] == "my.module.MyCodec" # non-literal -> raw str + assert "unrelated" not in out + + +def test_apply_overrides_and_build_config_precedence() -> None: + cfg = apply_overrides( + build_config(environ={}), + {"array.order": "F", "codecs.x": "pkg.X"}, + ) + assert cfg.array.order == "F" + assert cfg.codecs["x"] == "pkg.X" + # env overrides defaults + cfg2 = build_config(environ={"ZARR_JSON_INDENT": "4"}) + assert cfg2.json_indent == 4 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run 
pytest tests/test_config_typed.py -k "env or precedence" -v` +Expected: FAIL — `ImportError`. + +- [ ] **Step 3: Write minimal implementation** + +Add to `src/zarr/core/config.py` (add `import ast`, `import os`, `import contextlib` at top): + +```python +ENV_PREFIX = "ZARR_" + + +def _parse_env_value(raw: str) -> Any: + """Parse an env value with ``ast.literal_eval``; fall back to the raw string.""" + try: + return ast.literal_eval(raw) + except (ValueError, SyntaxError): + return raw + + +def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: + """Collect ``ZARR_*`` environment variables into a flat dotted-key map. + + ``ZARR_FOO__BAR_BAZ=1`` becomes ``{"foo.bar_baz": 1}`` — the key is + lower-cased and ``__`` denotes nested access. + """ + out: dict[str, Any] = {} + for name, raw in environ.items(): + if not name.startswith(ENV_PREFIX): + continue + body = name[len(ENV_PREFIX) :] + dotted = body.lower().replace("__", ".") + out[dotted] = _parse_env_value(raw) + return out + + +def _config_search_paths() -> list[str]: + """Standard YAML config locations, mirroring donfig's search order.""" + paths: list[str] = [] + env_path = os.environ.get("ZARR_CONFIG") + if env_path: + paths.append(env_path) + paths.append(os.path.join(os.path.expanduser("~"), ".config", "zarr")) + return paths + + +def collect_yaml(paths: list[str]) -> dict[str, Any]: + """Merge YAML config files found at ``paths`` into a flat dotted-key map.""" + import yaml + + merged: dict[str, Any] = {} + for path in paths: + candidates: list[str] = [] + if os.path.isdir(path): + for fn in sorted(os.listdir(path)): + if fn.endswith((".yaml", ".yml")): + candidates.append(os.path.join(path, fn)) + elif os.path.isfile(path): + candidates.append(path) + for candidate in candidates: + with contextlib.suppress(FileNotFoundError): + with open(candidate) as fh: + data = yaml.safe_load(fh) + if isinstance(data, Mapping): + merged.update(_flatten_mapping(data)) + return merged + + +def 
_flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any]: + out: dict[str, Any] = {} + for k, v in data.items(): + key = f"{prefix}{k}" if not prefix else f"{prefix}.{k}" + if isinstance(v, Mapping) and k not in ("codecs",): + out.update(_flatten_mapping(v, key)) + else: + out[key] = v + return out + + +def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig: + """Apply a flat dotted-key override map to a snapshot.""" + for key, value in overrides.items(): + cfg = replace_path(cfg, key, value) + return cfg + + +def build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig: + """Build the base snapshot: defaults < YAML files < environment variables.""" + if environ is None: + environ = os.environ + cfg = make_default_config() + cfg = apply_overrides(cfg, collect_yaml(_config_search_paths())) + cfg = apply_overrides(cfg, collect_env(environ)) + return cfg +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `uv run pytest tests/test_config_typed.py -k "env or precedence" -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/zarr/core/config.py tests/test_config_typed.py +git commit -m "feat(config): add env-var and YAML ingest" +``` + +--- + +### Task 3: State holder + proxy with typed get/set/reset + +**Files:** +- Modify: `src/zarr/core/config.py` +- Test: `tests/test_config_typed.py` + +**Interfaces:** +- Consumes: `ZarrConfig`, `build_config`, `get_path`, `replace_path`, `to_nested_dict`, `deprecations` (Tasks 1–2 + existing). +- Produces: + - `class ZarrConfigManager` with: typed properties (`array`, `async_`, `threading`, `codec_pipeline`, `default_zarr_format`, `json_indent`, `codecs`, `buffer`, `ndbuffer`); overloaded `get(key, default=...)`; `set(mapping) -> _ConfigSet`; `reset()`; `refresh()`; `enable_gpu()`; `defaults` property; compat shims `to_dict()`, `update(mapping)`, `pprint()`. + - module-level `config: ZarrConfigManager`. + - `_ConfigSet` context manager. 
+ +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_config_typed.py`: + +```python +from concurrent.futures import ThreadPoolExecutor + +from zarr.core.config import ZarrConfigManager + + +def test_proxy_attribute_and_string_access() -> None: + cfg = ZarrConfigManager() + assert cfg.array.order == "C" + assert cfg.get("array.order") == "C" + assert cfg.get("async.concurrency") == 10 + assert cfg.get("codecs", {})["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.get("does.not.exist", "fallback") == "fallback" + + +def test_set_permanent_and_context() -> None: + cfg = ZarrConfigManager() + cfg.set({"array.order": "F"}) + assert cfg.get("array.order") == "F" # permanent + with cfg.set({"array.order": "C"}): + assert cfg.get("array.order") == "C" + assert cfg.get("array.order") == "F" # restored to permanent value + cfg.reset() + assert cfg.get("array.order") == "C" + + +def test_permanent_set_visible_in_worker_thread() -> None: + cfg = ZarrConfigManager() + cfg.set({"async.concurrency": 77}) + try: + with ThreadPoolExecutor(max_workers=1) as ex: + seen = ex.submit(lambda: cfg.get("async.concurrency")).result() + assert seen == 77 # ThreadPoolExecutor does not copy contextvars + finally: + cfg.reset() + + +def test_defaults_and_enable_gpu() -> None: + cfg = ZarrConfigManager() + assert cfg.defaults["array"]["order"] == "C" + with cfg.set({"buffer": "x"}): + pass + cfg.enable_gpu() + try: + assert cfg.get("buffer") == "zarr.buffer.gpu.Buffer" + assert cfg.get("ndbuffer") == "zarr.buffer.gpu.NDBuffer" + finally: + cfg.reset() +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_config_typed.py -k "proxy or set_permanent or worker or defaults_and" -v` +Expected: FAIL — `ImportError` / attribute errors. 
+ +- [ ] **Step 3: Write minimal implementation** + +Add to `src/zarr/core/config.py` (add `from contextvars import ContextVar`, `from typing import overload`, `import warnings` if not present): + +```python +_MISSING = object() + + +class _ConfigSet: + """Context manager returned by ``ZarrConfigManager.set``. + + The change is applied immediately (permanent by default); using the object + as a ``with`` block restores the prior state on exit. + """ + + def __init__(self, manager: ZarrConfigManager, prev_base: ZarrConfig, token: Any) -> None: + self._manager = manager + self._prev_base = prev_base + self._token = token + + def __enter__(self) -> _ConfigSet: + return self + + def __exit__(self, *exc: object) -> None: + self._manager._restore(self._prev_base, self._token) + + +class ZarrConfigManager: + """Typed, donfig-compatible configuration object.""" + + def __init__(self) -> None: + self._base: ZarrConfig = build_config() + self._scope: ContextVar[ZarrConfig] = ContextVar("zarr_config_scope") + + # --- state resolution ------------------------------------------------- + def _current(self) -> ZarrConfig: + return self._scope.get(self._base) + + def _restore(self, prev_base: ZarrConfig, token: Any) -> None: + self._base = prev_base + self._scope.reset(token) + + # --- typed attribute access ------------------------------------------ + @property + def default_zarr_format(self) -> Literal[2, 3]: + return self._current().default_zarr_format + + @property + def array(self) -> ArraySettings: + return self._current().array + + @property + def async_(self) -> AsyncSettings: + return self._current().async_ + + @property + def threading(self) -> ThreadingSettings: + return self._current().threading + + @property + def codec_pipeline(self) -> CodecPipelineSettings: + return self._current().codec_pipeline + + @property + def json_indent(self) -> int: + return self._current().json_indent + + @property + def codecs(self) -> Mapping[str, str]: + return self._current().codecs + 
+ @property + def buffer(self) -> str: + return self._current().buffer + + @property + def ndbuffer(self) -> str: + return self._current().ndbuffer + + # --- string API: get -------------------------------------------------- + @overload + def get(self, key: Literal["default_zarr_format"]) -> Literal[2, 3]: ... + @overload + def get(self, key: Literal["array.order"]) -> Literal["C", "F"]: ... + @overload + def get(self, key: Literal["array.write_empty_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.read_missing_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.target_shard_size_bytes"]) -> int | None: ... + @overload + def get(self, key: Literal["array.rectilinear_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.sharding_coalesce_max_gap_bytes"]) -> int: ... + @overload + def get(self, key: Literal["array.sharding_coalesce_max_bytes"]) -> int: ... + @overload + def get(self, key: Literal["async.concurrency"]) -> int: ... + @overload + def get(self, key: Literal["async.timeout"]) -> float | None: ... + @overload + def get(self, key: Literal["threading.max_workers"]) -> int | None: ... + @overload + def get(self, key: Literal["json_indent"]) -> int: ... + @overload + def get(self, key: Literal["codec_pipeline.path"]) -> str: ... + @overload + def get(self, key: Literal["codec_pipeline.batch_size"]) -> int: ... + @overload + def get(self, key: Literal["buffer"]) -> str: ... + @overload + def get(self, key: Literal["ndbuffer"]) -> str: ... + @overload + def get(self, key: str, default: Any = ...) -> Any: ... 
+ + def get(self, key: str, default: Any = _MISSING) -> Any: + resolved = self._apply_deprecation(key) + if resolved is None: + if default is _MISSING: + raise KeyError(key) + return default + try: + return get_path(self._current(), resolved) + except KeyError: + if default is _MISSING: + raise + return default + + # --- string API: set -------------------------------------------------- + def set(self, updates: Mapping[str, Any]) -> _ConfigSet: + prev_base = self._base + new = self._current() + for key, value in updates.items(): + resolved = self._apply_deprecation(key) + if resolved is None: + continue + new = replace_path(new, resolved, value) + self._base = new + token = self._scope.set(new) + return _ConfigSet(self, prev_base, token) + + # --- lifecycle -------------------------------------------------------- + def reset(self) -> None: + self._base = build_config() + with contextlib.suppress(LookupError): + self._scope.set(self._base) + + def refresh(self) -> None: + self._base = build_config() + + def enable_gpu(self) -> _ConfigSet: + return self.set( + {"buffer": "zarr.buffer.gpu.Buffer", "ndbuffer": "zarr.buffer.gpu.NDBuffer"} + ) + + # --- compat / introspection ------------------------------------------ + @property + def defaults(self) -> dict[str, Any]: + return to_nested_dict(make_default_config()) + + def to_dict(self) -> dict[str, Any]: + return to_nested_dict(self._current()) + + def update(self, updates: Mapping[str, Any]) -> None: + self.set(updates) + + def pprint(self) -> None: + import pprint as _pp + + _pp.pprint(self.to_dict()) + + # --- deprecations ----------------------------------------------------- + def _apply_deprecation(self, key: str) -> str | None: + if key not in deprecations: + return key + new_key = deprecations[key] + if new_key is None: + warnings.warn( + f"Configuration key {key!r} has been removed and no longer has " + f"any effect.", + ZarrDeprecationWarning, + stacklevel=3, + ) + return None + warnings.warn( + f"Configuration 
key {key!r} has been renamed to {new_key!r}.", + ZarrDeprecationWarning, + stacklevel=3, + ) + return new_key +``` + +Add `from zarr.errors import ZarrDeprecationWarning` to the imports — this class already exists in `src/zarr/errors.py` (a `DeprecationWarning` subclass). Define the module-level instance at the bottom of the schema/proxy section but BEFORE the existing donfig `config = Config(...)` (which Task 4 removes): + +```python +# Provisional new instance; Task 4 makes this THE module-level `config`. +_typed_config = ZarrConfigManager() +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `uv run pytest tests/test_config_typed.py -k "proxy or set_permanent or worker or defaults_and" -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/zarr/core/config.py tests/test_config_typed.py +git commit -m "feat(config): add typed proxy with get/set/reset and deprecations" +``` + +--- + +### Task 4: Swap out donfig (make `config` the new proxy) + +**Files:** +- Modify: `src/zarr/core/config.py` (remove donfig `Config` subclass and instance; promote proxy) +- Test: existing `tests/test_config.py` (and the full suite) + +**Interfaces:** +- Consumes: everything from Tasks 1–3. +- Produces: module-level `config: ZarrConfigManager`; unchanged exports `BadConfigError`, `parse_indexing_order`. 
+ +- [ ] **Step 1: Update the existing `defaults` assertion test** + +In `tests/test_config.py::test_config_defaults_set`, replace the `config.defaults == [ {...} ]` list-of-one-dict assertion with the new nested-dict form: + +```python +def test_config_defaults_set() -> None: + assert config.defaults == { + "default_zarr_format": 3, + "array": { + "order": "C", + "write_empty_chunks": False, + "read_missing_chunks": True, + "target_shard_size_bytes": None, + "rectilinear_chunks": False, + "sharding_coalesce_max_gap_bytes": 1 << 20, + "sharding_coalesce_max_bytes": 16 << 20, + }, + "async": {"concurrency": 10, "timeout": None}, + "threading": {"max_workers": None}, + "json_indent": 2, + "codec_pipeline": { + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "batch_size": 1, + }, + "codecs": dict(DEFAULT_CODECS), + "buffer": "zarr.buffer.cpu.Buffer", + "ndbuffer": "zarr.buffer.cpu.NDBuffer", + } + assert config.get("array.order") == "C" + assert config.get("async.concurrency") == 10 + assert config.get("async.timeout") is None + assert config.get("codec_pipeline.batch_size") == 1 + assert config.get("json_indent") == 2 +``` + +A `from zarr.core.config import DEFAULT_CONFIG` import is not needed; instead, add `DEFAULT_CODECS` to the test's existing config import line. + +- [ ] **Step 2: Run to verify it fails** + +Run: `uv run pytest tests/test_config.py::test_config_defaults_set -v` +Expected: FAIL — `config.defaults` is still donfig's list form. + +- [ ] **Step 3: Remove donfig and promote the proxy** + +In `src/zarr/core/config.py`: +1. Delete the `from donfig import Config as DConfig` import and the `if TYPE_CHECKING: from donfig.config_obj import ConfigSet` block. +2. Delete the `class Config(DConfig): ...` definition (its `reset`/`enable_gpu` now live on `ZarrConfigManager`). +3. Delete the `config = Config("zarr", defaults=[...], deprecations=deprecations)` block. 
The big defaults dict is now expressed by the dataclasses + `DEFAULT_CODECS`; keep the `deprecations` dict (it is consumed by `ZarrConfigManager`). +4. Replace the provisional `_typed_config = ZarrConfigManager()` line with: + +```python +config = ZarrConfigManager() +``` + +5. Update the module docstring at the top: replace donfig references with a description of the typed config and the `ZARR_FOO__BAR` env-var behavior (keep the example showing `config.set({"codecs.bytes": "your.module.NewBytesCodec"})` and the `ZARR_CODECS__BYTES` env var — both still work). +6. Keep `parse_indexing_order` and `BadConfigError` exactly as-is. + +- [ ] **Step 4: Run the full config + dependent suites** + +Run: `uv run pytest tests/test_config.py -v` +Expected: PASS. + +Run: `uv run pytest tests/test_api.py tests/test_buffer.py tests/test_codec_entrypoints.py tests/test_v2.py tests/test_sync.py tests/test_common.py -q` +Expected: PASS (these import/use `config`). + +Run: `uv run pytest tests -q` +Expected: PASS (full suite; backwards-compat gate). + +- [ ] **Step 5: Run mypy** + +Run: `uv run mypy src/zarr/core/config.py src/zarr/registry.py src/zarr/core/sync.py` +Expected: no errors. Confirm `reveal_type` is not needed here; fix any typing fallout in consumers (e.g. casts that referenced donfig types). + +- [ ] **Step 6: Commit** + +```bash +git add src/zarr/core/config.py tests/test_config.py +git commit -m "feat(config): replace donfig with typed config object" +``` + +--- + +### Task 5: Remove the donfig dependency + +**Files:** +- Modify: `pyproject.toml` (lines ~39, ~246, ~272), `src/zarr/__init__.py` (~line 71) +- Test: import smoke test + +**Interfaces:** none new. 
+ +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_config_typed.py`: + +```python +def test_donfig_not_imported() -> None: + import sys + + import zarr # noqa: F401 + + assert "donfig" not in sys.modules +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `uv run pytest tests/test_config_typed.py::test_donfig_not_imported -v` +Expected: FAIL — donfig still imported somewhere / installed and pulled in. + +- [ ] **Step 3: Edit dependency declarations** + +In `pyproject.toml`: +- Remove `'donfig>=0.8',` from the `dependencies` list (~line 39). +- Add `'pyyaml',` to the `dependencies` list (donfig previously pulled YAML support transitively; we now use it directly). +- Remove `'donfig @ git+https://github.com/pytroll/donfig',` from the `dynamic`/upstream group (~line 246). +- Remove `'donfig==0.8.*',` from the minimal-pins group (~line 272). + +In `src/zarr/__init__.py`, remove `"donfig",` from the `required` list (~line 71). + +- [ ] **Step 4: Re-sync the environment and verify** + +Run: `uv run --reinstall-package zarr pytest tests/test_config_typed.py::test_donfig_not_imported -v` +Expected: PASS. + +Run: `uv run python -c "import zarr; print(zarr.config.get('array.order'))"` +Expected: prints `C`. + +- [ ] **Step 5: Commit** + +```bash +git add pyproject.toml src/zarr/__init__.py tests/test_config_typed.py +git commit -m "build: drop donfig dependency, add pyyaml" +``` + +--- + +### Task 6: Drift-protection, typing assertions, docs, changelog + +**Files:** +- Test: `tests/test_config_typed.py` +- Modify: `src/zarr/core/config.py` docstring (if not already done in Task 4) +- Create: `changes/{PR_NUMBER}.misc.md` (where `{PR_NUMBER}` is the pull-request number) + +**Interfaces:** none new. 
+ +- [ ] **Step 1: Write the drift-protection + typing tests** + +Add to `tests/test_config_typed.py`: + +```python +import typing + +from zarr.core.config import ZarrConfig, ZarrConfigManager, _SERIALIZED_NAMES + + +def _structured_leaf_keys(cfg_cls: type, prefix: str = "") -> list[str]: + import dataclasses + + keys: list[str] = [] + for f in dataclasses.fields(cfg_cls): + serialized = _SERIALIZED_NAMES.get(f.name, f.name) + key = f"{prefix}{serialized}" if not prefix else f"{prefix}.{serialized}" + ftype = f.type + if dataclasses.is_dataclass(ftype): + keys.extend(_structured_leaf_keys(ftype, key)) + elif f.name == "codecs": + continue # open mapping, intentionally not enumerated + else: + keys.append(key) + return keys + + +def test_every_structured_key_has_a_get_overload() -> None: + overloads = typing.get_overloads(ZarrConfigManager.get) + literal_keys: set[str] = set() + for ov in overloads: + hints = typing.get_type_hints(ov) + key_hint = hints.get("key") + if typing.get_origin(key_hint) is typing.Literal: + literal_keys.update(typing.get_args(key_hint)) + missing = set(_structured_leaf_keys(ZarrConfig)) - literal_keys + assert not missing, f"get() overloads missing for: {sorted(missing)}" + + +if typing.TYPE_CHECKING: + + def _typing_smoke(cfg: ZarrConfigManager) -> None: + typing.assert_type(cfg.get("array.order"), typing.Literal["C", "F"]) + typing.assert_type(cfg.array.order, typing.Literal["C", "F"]) + typing.assert_type(cfg.get("async.concurrency"), int) +``` + +Note: `f.type` may be a string under `from __future__ import annotations`. If so, resolve with `typing.get_type_hints(cfg_cls)` inside `_structured_leaf_keys` instead of reading `f.type` directly. Adjust the helper accordingly so dataclass detection works on resolved types. 
+ +- [ ] **Step 2: Run to verify it fails (then passes once overloads complete)** + +Run: `uv run pytest tests/test_config_typed.py -k "overload" -v` +Expected: PASS if all overloads from Task 3 are present; if it lists missing keys, add the corresponding `get` overloads in `config.py` and re-run until PASS. + +- [ ] **Step 3: Type-check the typing smoke test** + +Run: `uv run mypy tests/test_config_typed.py` +Expected: no errors (`assert_type` calls confirm the precise static types). + +- [ ] **Step 4: Add the changelog entry** + +Create `changes/{PR_NUMBER}.misc.md` (replace `{PR_NUMBER}` with the PR number) with: + +```markdown +Replaced the ``donfig``-based configuration with a statically-typed +configuration object. ``zarr.config`` now provides precise static types for +attribute access (``zarr.config.array.order``) and for the dotted-string API +(``zarr.config.get("array.order")``). The string API, environment-variable +ingestion (``ZARR_FOO__BAR``), YAML config files, ``config.set`` (permanent and +as a context manager), ``config.reset``, ``config.enable_gpu``, and the +``deprecations`` mechanism are all preserved. The ``donfig`` dependency has been +removed. +``` + +- [ ] **Step 5: Update the module docstring (if not done in Task 4)** + +Confirm `src/zarr/core/config.py`'s top docstring no longer references donfig and documents the typed API + `ZARR_*` env vars + YAML. (Use single-backtick markdown — docs are mkdocs.) + +- [ ] **Step 6: Full verification + commit** + +Run: `uv run pytest tests/test_config.py tests/test_config_typed.py -q` +Expected: PASS. + +Run: `uv run pytest tests -q` +Expected: PASS. + +Run: `uv run mypy src tests/test_config_typed.py` +Expected: no errors. + +```bash +git add tests/test_config_typed.py src/zarr/core/config.py changes/ +git commit -m "test(config): drift-protection + typing assertions; docs + changelog" +``` + +--- + +## Self-Review + +**Spec coverage:** +- Schema dataclasses → Task 1. 
Open `codecs` mapping → Task 1 (`get_path`/`replace_path` mapping handling) + tests. State holder (base + contextvar) → Task 3. Proxy + typed attribute access → Task 3. Hand-written overloads → Task 3, completeness enforced Task 6. Env + YAML ingest → Task 2. Deprecations → Task 3. Backwards-compat surface → Task 4 (full suite) + preserved methods (`to_dict`/`update`/`pprint`/`refresh`/`reset`/`enable_gpu`/`defaults`). donfig removal → Task 5. Drift protection + typing assertions + changelog + docs → Task 6. +- `async_` alias rationale → realized via `_FIELD_ALIASES`/`_SERIALIZED_NAMES` in Tasks 1/3/6. + +**Type consistency:** `ZarrConfig`, `ArraySettings`, `AsyncSettings`, `ThreadingSettings`, `CodecPipelineSettings`, `get_path`, `replace_path`, `to_nested_dict`, `build_config`, `collect_env`, `collect_yaml`, `apply_overrides`, `ZarrConfigManager`, `_ConfigSet`, `_FIELD_ALIASES`, `_SERIALIZED_NAMES`, `DEFAULT_CODECS` are used consistently across tasks. + +**Known follow-ups for the implementer (not placeholders — explicit decisions):** +- If `from __future__ import annotations` makes `dataclasses.fields(...).type` a string, resolve via `get_type_hints` in the drift helper (Task 6, Step 1). +- `set` semantics note: a top-level `config.set({...})` updates `_base` (cross-thread, permanent) and the contextvar scope; `with config.set({...})` restores both on exit. This matches donfig's permanent-by-default behavior while keeping cross-thread visibility (verified by `test_permanent_set_visible_in_worker_thread`). 
From e05905f41efc861a19597b20c0ad02fb31435c59 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 11:57:21 +0200 Subject: [PATCH 04/25] feat(config): add frozen dataclass schema and path helpers --- src/zarr/core/config.py | 156 +++++++++++++++++++++++++++++++++++++ tests/test_config_typed.py | 56 +++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 tests/test_config_typed.py diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 08d2a50ace..eff60cbb3a 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -29,6 +29,8 @@ from __future__ import annotations +from collections.abc import Mapping +from dataclasses import dataclass, field, fields, replace from typing import TYPE_CHECKING, Any, Literal, cast from donfig import Config as DConfig @@ -37,6 +39,160 @@ from donfig.config_obj import ConfigSet +DEFAULT_CODECS: dict[str, str] = { + "blosc": "zarr.codecs.blosc.BloscCodec", + "gzip": "zarr.codecs.gzip.GzipCodec", + "zstd": "zarr.codecs.zstd.ZstdCodec", + "bytes": "zarr.codecs.bytes.BytesCodec", + "endian": "zarr.codecs.bytes.BytesCodec", + "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", + "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", + "transpose": "zarr.codecs.transpose.TransposeCodec", + "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", + "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs.numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", + "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs.numcodecs.Delta", + "numcodecs.fixedscaleoffset": 
"zarr.codecs.numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", + "numcodecs.gzip": "zarr.codecs.numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", +} + +# Map serialized dotted-key segments to Python field names where they differ +# (Python keywords cannot be used as identifiers). +_FIELD_ALIASES: dict[str, str] = {"async": "async_"} +_SERIALIZED_NAMES: dict[str, str] = {v: k for k, v in _FIELD_ALIASES.items()} + + +@dataclass(frozen=True, slots=True) +class ArraySettings: + order: Literal["C", "F"] = "C" + write_empty_chunks: bool = False + read_missing_chunks: bool = True + target_shard_size_bytes: int | None = None + rectilinear_chunks: bool = False + sharding_coalesce_max_gap_bytes: int = 1 << 20 + sharding_coalesce_max_bytes: int = 16 << 20 + + +@dataclass(frozen=True, slots=True) +class AsyncSettings: + concurrency: int = 10 + timeout: float | None = None + + +@dataclass(frozen=True, slots=True) +class ThreadingSettings: + max_workers: int | None = None + + +@dataclass(frozen=True, slots=True) +class CodecPipelineSettings: + path: str = "zarr.core.codec_pipeline.BatchedCodecPipeline" + batch_size: int = 1 + + +@dataclass(frozen=True, slots=True) +class ZarrConfig: + default_zarr_format: Literal[2, 3] = 3 + array: ArraySettings = field(default_factory=ArraySettings) + async_: AsyncSettings = field(default_factory=AsyncSettings) + threading: ThreadingSettings = field(default_factory=ThreadingSettings) + json_indent: int = 2 + codec_pipeline: CodecPipelineSettings = field(default_factory=CodecPipelineSettings) + codecs: Mapping[str, str] = 
field(default_factory=lambda: dict(DEFAULT_CODECS)) + buffer: str = "zarr.buffer.cpu.Buffer" + ndbuffer: str = "zarr.buffer.cpu.NDBuffer" + + +def make_default_config() -> ZarrConfig: + """Return a fresh `ZarrConfig` populated with the built-in defaults.""" + return ZarrConfig() + + +def _resolve_field(obj: Any, segment: str) -> str: + """Translate a serialized key segment to the dataclass field name.""" + return _FIELD_ALIASES.get(segment, segment) + + +def get_path(cfg: ZarrConfig, key: str) -> Any: + """Read a dotted-string key from a `ZarrConfig` snapshot. + + Raises + ------ + KeyError + If the key does not resolve to a value. + """ + obj: Any = cfg + segments = key.split(".") + for i, segment in enumerate(segments): + if isinstance(obj, Mapping): + # remaining segments index into an open mapping (e.g. codecs.*) + remainder = ".".join(segments[i:]) + try: + return obj[remainder] + except KeyError: + raise KeyError(key) from None + field_name = _resolve_field(obj, segment) + if not hasattr(obj, field_name): + raise KeyError(key) + obj = getattr(obj, field_name) + return obj + + +def replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig: + """Return a new `ZarrConfig` with the dotted-string key set to ``value``.""" + segments = key.split(".") + return cast(ZarrConfig, _replace_recursive(cfg, segments, value, key)) + + +def _replace_recursive(obj: Any, segments: list[str], value: Any, key: str) -> Any: + segment = segments[0] + if isinstance(obj, Mapping): + remainder = ".".join(segments) + return {**obj, remainder: value} + field_name = _resolve_field(obj, segment) + if not hasattr(obj, field_name): + raise KeyError(key) + if len(segments) == 1: + return replace(obj, **{field_name: value}) + child = getattr(obj, field_name) + new_child = _replace_recursive(child, segments[1:], value, key) + return replace(obj, **{field_name: new_child}) + + +def to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]: + """Convert a `ZarrConfig` to a donfig-style nested 
dict (serialized keys).""" + + def convert(obj: Any) -> Any: + if isinstance(obj, Mapping): + return dict(obj) + if hasattr(type(obj), "__dataclass_fields__"): + out: dict[str, Any] = {} + for f in fields(obj): + serialized = _SERIALIZED_NAMES.get(f.name, f.name) + out[serialized] = convert(getattr(obj, f.name)) + return out + return obj + + return convert(cfg) # type: ignore[no-any-return] + + class BadConfigError(ValueError): _msg = "bad Config: %r" diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py new file mode 100644 index 0000000000..e010a8fa87 --- /dev/null +++ b/tests/test_config_typed.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import pytest + +from zarr.core.config import ( + DEFAULT_CODECS, + get_path, + make_default_config, + replace_path, + to_nested_dict, +) + + +def test_default_config_values() -> None: + cfg = make_default_config() + assert cfg.default_zarr_format == 3 + assert cfg.array.order == "C" + assert cfg.array.sharding_coalesce_max_bytes == 16 << 20 + assert cfg.async_.concurrency == 10 + assert cfg.async_.timeout is None + assert cfg.threading.max_workers is None + assert cfg.json_indent == 2 + assert cfg.codec_pipeline.path == "zarr.core.codec_pipeline.BatchedCodecPipeline" + assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.codecs == DEFAULT_CODECS + + +def test_get_path_structured_and_async_alias() -> None: + cfg = make_default_config() + assert get_path(cfg, "array.order") == "C" + assert get_path(cfg, "async.concurrency") == 10 # serialized key, not async_ + assert get_path(cfg, "json_indent") == 2 + assert get_path(cfg, "codecs") == DEFAULT_CODECS + assert get_path(cfg, "codecs.blosc") == "zarr.codecs.blosc.BloscCodec" + with pytest.raises(KeyError): + get_path(cfg, "array.nonexistent") + + +def test_replace_path_is_immutable_and_typed() -> None: + cfg = make_default_config() + cfg2 = replace_path(cfg, "array.order", "F") + assert cfg.array.order == "C" # original unchanged 
(frozen) + assert cfg2.array.order == "F" + cfg3 = replace_path(cfg, "async.concurrency", 99) + assert cfg3.async_.concurrency == 99 + cfg4 = replace_path(cfg, "codecs.my_codec", "my.module.MyCodec") + assert cfg4.codecs["my_codec"] == "my.module.MyCodec" + assert "my_codec" not in cfg.codecs + + +def test_to_nested_dict_uses_serialized_keys() -> None: + nested = to_nested_dict(make_default_config()) + assert nested["array"]["order"] == "C" + assert nested["async"]["concurrency"] == 10 # serialized key + assert "async_" not in nested + assert nested["codecs"]["blosc"] == "zarr.codecs.blosc.BloscCodec" From 1d5a65e7ff552a3406c805f87aef8f080f3c82af Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 12:05:06 +0200 Subject: [PATCH 05/25] feat(config): add env-var and YAML ingest --- src/zarr/core/config.py | 92 ++++++++++++++++++++++++++++++++++++++ tests/test_config_typed.py | 29 ++++++++++++ 2 files changed, 121 insertions(+) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index eff60cbb3a..dd544276f0 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -29,6 +29,9 @@ from __future__ import annotations +import ast +import contextlib +import os from collections.abc import Mapping from dataclasses import dataclass, field, fields, replace from typing import TYPE_CHECKING, Any, Literal, cast @@ -193,6 +196,95 @@ def convert(obj: Any) -> Any: return convert(cfg) # type: ignore[no-any-return] +ENV_PREFIX = "ZARR_" + + +def _parse_env_value(raw: str) -> Any: + """Parse an env value with ``ast.literal_eval``; fall back to the raw string.""" + try: + return ast.literal_eval(raw) + except (ValueError, SyntaxError): + return raw + + +def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: + """Collect ``ZARR_*`` environment variables into a flat dotted-key map. + + ``ZARR_FOO__BAR_BAZ=1`` becomes ``{"foo.bar_baz": 1}`` — the key is + lower-cased and ``__`` denotes nested access. 
+ """ + out: dict[str, Any] = {} + for name, raw in environ.items(): + if not name.startswith(ENV_PREFIX): + continue + body = name[len(ENV_PREFIX) :] + dotted = body.lower().replace("__", ".") + out[dotted] = _parse_env_value(raw) + return out + + +def _config_search_paths() -> list[str]: + """Standard YAML config locations, mirroring donfig's search order.""" + paths: list[str] = [] + env_path = os.environ.get("ZARR_CONFIG") + if env_path: + paths.append(env_path) + paths.append(os.path.join(os.path.expanduser("~"), ".config", "zarr")) + return paths + + +def collect_yaml(paths: list[str]) -> dict[str, Any]: + """Merge YAML config files found at ``paths`` into a flat dotted-key map.""" + import yaml + + merged: dict[str, Any] = {} + for path in paths: + candidates: list[str] = [] + if os.path.isdir(path): + candidates.extend( + os.path.join(path, fn) + for fn in sorted(os.listdir(path)) + if fn.endswith((".yaml", ".yml")) + ) + elif os.path.isfile(path): + candidates.append(path) + for candidate in candidates: + with contextlib.suppress(FileNotFoundError): + with open(candidate) as fh: + data = yaml.safe_load(fh) + if isinstance(data, Mapping): + merged.update(_flatten_mapping(data)) + return merged + + +def _flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any]: + out: dict[str, Any] = {} + for k, v in data.items(): + key = f"{prefix}{k}" if not prefix else f"{prefix}.{k}" + if isinstance(v, Mapping) and k != "codecs": + out.update(_flatten_mapping(v, key)) + else: + out[key] = v + return out + + +def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig: + """Apply a flat dotted-key override map to a snapshot.""" + for key, value in overrides.items(): + cfg = replace_path(cfg, key, value) + return cfg + + +def build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig: + """Build the base snapshot: defaults < YAML files < environment variables.""" + if environ is None: + environ = os.environ + return 
apply_overrides( + apply_overrides(make_default_config(), collect_yaml(_config_search_paths())), + collect_env(environ), + ) + + class BadConfigError(ValueError): _msg = "bad Config: %r" diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index e010a8fa87..1fb43f9427 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -4,6 +4,9 @@ from zarr.core.config import ( DEFAULT_CODECS, + apply_overrides, + build_config, + collect_env, get_path, make_default_config, replace_path, @@ -54,3 +57,29 @@ def test_to_nested_dict_uses_serialized_keys() -> None: assert nested["async"]["concurrency"] == 10 # serialized key assert "async_" not in nested assert nested["codecs"]["blosc"] == "zarr.codecs.blosc.BloscCodec" + + +def test_collect_env_parses_nested_and_literal() -> None: + env = { + "ZARR_ARRAY__ORDER": "F", + "ZARR_ASYNC__CONCURRENCY": "32", + "ZARR_CODECS__MY_CODEC": "my.module.MyCodec", + "UNRELATED": "ignored", + } + out = collect_env(env) + assert out["array.order"] == "F" + assert out["async.concurrency"] == 32 # ast.literal_eval -> int + assert out["codecs.my_codec"] == "my.module.MyCodec" # non-literal -> raw str + assert "unrelated" not in out + + +def test_apply_overrides_and_build_config_precedence() -> None: + cfg = apply_overrides( + build_config(environ={}), + {"array.order": "F", "codecs.x": "pkg.X"}, + ) + assert cfg.array.order == "F" + assert cfg.codecs["x"] == "pkg.X" + # env overrides defaults + cfg2 = build_config(environ={"ZARR_JSON_INDENT": "4"}) + assert cfg2.json_indent == 4 From bba06b650ebb9fed738fade248bee4b68c90c20a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 12:12:27 +0200 Subject: [PATCH 06/25] fix(config): exclude ZARR_CONFIG meta-var from env ingest Add _ENV_META_VARS frozenset containing ZARR_CONFIG and skip those names in collect_env() before stripping the prefix. 
This prevents ZARR_CONFIG=/path/to/cfg.yaml from becoming {"config": "..."} which crashed build_config() with KeyError because ZarrConfig has no 'config' field. Add two regression tests covering the skip and the no-raise behaviour. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 9 +++++++++ tests/test_config_typed.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index dd544276f0..2881744143 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -198,6 +198,10 @@ def convert(obj: Any) -> Any: ENV_PREFIX = "ZARR_" +# Meta-variables that control WHERE config is loaded from, not config values themselves. +# These must be excluded from the env-override map to avoid spurious KeyErrors. +_ENV_META_VARS: frozenset[str] = frozenset({"ZARR_CONFIG"}) + def _parse_env_value(raw: str) -> Any: """Parse an env value with ``ast.literal_eval``; fall back to the raw string.""" @@ -212,11 +216,16 @@ def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: ``ZARR_FOO__BAR_BAZ=1`` becomes ``{"foo.bar_baz": 1}`` — the key is lower-cased and ``__`` denotes nested access. + + Variables listed in ``_ENV_META_VARS`` (e.g. ``ZARR_CONFIG``) are + directives about where config lives and are skipped. 
""" out: dict[str, Any] = {} for name, raw in environ.items(): if not name.startswith(ENV_PREFIX): continue + if name in _ENV_META_VARS: + continue body = name[len(ENV_PREFIX) :] dotted = body.lower().replace("__", ".") out[dotted] = _parse_env_value(raw) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 1fb43f9427..e9006245e8 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -83,3 +83,20 @@ def test_apply_overrides_and_build_config_precedence() -> None: # env overrides defaults cfg2 = build_config(environ={"ZARR_JSON_INDENT": "4"}) assert cfg2.json_indent == 4 + + +def test_collect_env_skips_zarr_config_meta_var() -> None: + """ZARR_CONFIG is a directive about where config lives, not a config key itself.""" + env = {"ZARR_CONFIG": "/some/path.yaml", "ZARR_ARRAY__ORDER": "F"} + out = collect_env(env) + assert "config" not in out + assert out["array.order"] == "F" + + +def test_build_config_zarr_config_env_does_not_raise() -> None: + """Setting ZARR_CONFIG to a nonexistent path must not crash build_config.""" + cfg = build_config(environ={"ZARR_CONFIG": "/nonexistent/path.yaml"}) + # The nonexistent YAML path is simply skipped; defaults remain intact. 
+ from zarr.core.config import make_default_config + + assert cfg == make_default_config() From ba98ec74bbd6938fe251847f8dfeb0ae79d366b6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 12:24:20 +0200 Subject: [PATCH 07/25] feat(config): add typed proxy with get/set/reset and deprecations --- src/zarr/core/config.py | 199 ++++++++++++++++++++++++++++++++++++- tests/test_config_typed.py | 47 +++++++++ 2 files changed, 243 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 2881744143..9e33ee34e0 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -32,12 +32,16 @@ import ast import contextlib import os +import warnings from collections.abc import Mapping +from contextvars import ContextVar from dataclasses import dataclass, field, fields, replace -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, Self, cast, overload from donfig import Config as DConfig +from zarr.errors import ZarrDeprecationWarning + if TYPE_CHECKING: from donfig.config_obj import ConfigSet @@ -294,6 +298,192 @@ def build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig: ) +_MISSING = object() + + +class _ConfigSet: + """Context manager returned by ``ZarrConfigManager.set``. + + The change is applied immediately (permanent by default); using the object + as a ``with`` block restores the prior state on exit. 
+ """ + + def __init__(self, manager: ZarrConfigManager, prev_base: ZarrConfig, token: Any) -> None: + self._manager = manager + self._prev_base = prev_base + self._token = token + + def __enter__(self) -> Self: + return self + + def __exit__(self, *exc: object) -> None: + self._manager._restore(self._prev_base, self._token) + + +class ZarrConfigManager: + """Typed, donfig-compatible configuration object.""" + + def __init__(self) -> None: + self._base: ZarrConfig = build_config() + self._scope: ContextVar[ZarrConfig] = ContextVar("zarr_config_scope") + + # --- state resolution ------------------------------------------------- + def _current(self) -> ZarrConfig: + return self._scope.get(self._base) + + def _restore(self, prev_base: ZarrConfig, token: Any) -> None: + self._base = prev_base + self._scope.reset(token) + + # --- typed attribute access ------------------------------------------ + @property + def default_zarr_format(self) -> Literal[2, 3]: + return self._current().default_zarr_format + + @property + def array(self) -> ArraySettings: + return self._current().array + + @property + def async_(self) -> AsyncSettings: + return self._current().async_ + + @property + def threading(self) -> ThreadingSettings: + return self._current().threading + + @property + def codec_pipeline(self) -> CodecPipelineSettings: + return self._current().codec_pipeline + + @property + def json_indent(self) -> int: + return self._current().json_indent + + @property + def codecs(self) -> Mapping[str, str]: + return self._current().codecs + + @property + def buffer(self) -> str: + return self._current().buffer + + @property + def ndbuffer(self) -> str: + return self._current().ndbuffer + + # --- string API: get -------------------------------------------------- + @overload + def get(self, key: Literal["default_zarr_format"]) -> Literal[2, 3]: ... + @overload + def get(self, key: Literal["array.order"]) -> Literal["C", "F"]: ... 
+ @overload + def get(self, key: Literal["array.write_empty_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.read_missing_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.target_shard_size_bytes"]) -> int | None: ... + @overload + def get(self, key: Literal["array.rectilinear_chunks"]) -> bool: ... + @overload + def get(self, key: Literal["array.sharding_coalesce_max_gap_bytes"]) -> int: ... + @overload + def get(self, key: Literal["array.sharding_coalesce_max_bytes"]) -> int: ... + @overload + def get(self, key: Literal["async.concurrency"]) -> int: ... + @overload + def get(self, key: Literal["async.timeout"]) -> float | None: ... + @overload + def get(self, key: Literal["threading.max_workers"]) -> int | None: ... + @overload + def get(self, key: Literal["json_indent"]) -> int: ... + @overload + def get(self, key: Literal["codec_pipeline.path"]) -> str: ... + @overload + def get(self, key: Literal["codec_pipeline.batch_size"]) -> int: ... + @overload + def get(self, key: Literal["buffer"]) -> str: ... + @overload + def get(self, key: Literal["ndbuffer"]) -> str: ... + @overload + def get(self, key: str, default: Any = ...) -> Any: ... 
+ + def get(self, key: str, default: Any = _MISSING) -> Any: + resolved = self._apply_deprecation(key) + if resolved is None: + if default is _MISSING: + raise KeyError(key) + return default + try: + return get_path(self._current(), resolved) + except KeyError: + if default is _MISSING: + raise + return default + + # --- string API: set -------------------------------------------------- + def set(self, updates: Mapping[str, Any]) -> _ConfigSet: + prev_base = self._base + new = self._current() + for key, value in updates.items(): + resolved = self._apply_deprecation(key) + if resolved is None: + continue + new = replace_path(new, resolved, value) + self._base = new + token = self._scope.set(new) + return _ConfigSet(self, prev_base, token) + + # --- lifecycle -------------------------------------------------------- + def reset(self) -> None: + self._base = build_config() + with contextlib.suppress(LookupError): + self._scope.set(self._base) + + def refresh(self) -> None: + self._base = build_config() + + def enable_gpu(self) -> _ConfigSet: + return self.set( + {"buffer": "zarr.buffer.gpu.Buffer", "ndbuffer": "zarr.buffer.gpu.NDBuffer"} + ) + + # --- compat / introspection ------------------------------------------ + @property + def defaults(self) -> dict[str, Any]: + return to_nested_dict(make_default_config()) + + def to_dict(self) -> dict[str, Any]: + return to_nested_dict(self._current()) + + def update(self, updates: Mapping[str, Any]) -> None: + self.set(updates) + + def pprint(self) -> None: + import pprint as _pp + + _pp.pprint(self.to_dict()) + + # --- deprecations ----------------------------------------------------- + def _apply_deprecation(self, key: str) -> str | None: + if key not in deprecations: + return key + new_key = deprecations[key] + if new_key is None: + warnings.warn( + f"Configuration key {key!r} has been removed and no longer has any effect.", + ZarrDeprecationWarning, + stacklevel=3, + ) + return None + warnings.warn( + f"Configuration key 
{key!r} has been renamed to {new_key!r}.", + ZarrDeprecationWarning, + stacklevel=3, + ) + return new_key + + class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -326,8 +516,8 @@ def enable_gpu(self) -> ConfigSet: # these keys were removed from the config as part of the 3.1.0 release. -# these deprecations should be removed in 3.1.1 or thereabouts. -deprecations = { +# These deprecations should be removed in 3.1.1 or thereabouts. +deprecations: dict[str, str | None] = { "array.v2_default_compressor.numeric": None, "array.v2_default_compressor.string": None, "array.v2_default_compressor.bytes": None, @@ -344,6 +534,9 @@ def enable_gpu(self) -> ConfigSet: "array.v3_default_compressors": None, } +# Provisional new instance; Task 4 makes this THE module-level `config`. +_typed_config = ZarrConfigManager() + # The default configuration for zarr config = Config( "zarr", diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index e9006245e8..4a1eeed620 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -1,9 +1,12 @@ from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor + import pytest from zarr.core.config import ( DEFAULT_CODECS, + ZarrConfigManager, apply_overrides, build_config, collect_env, @@ -100,3 +103,47 @@ def test_build_config_zarr_config_env_does_not_raise() -> None: from zarr.core.config import make_default_config assert cfg == make_default_config() + + +def test_proxy_attribute_and_string_access() -> None: + cfg = ZarrConfigManager() + assert cfg.array.order == "C" + assert cfg.get("array.order") == "C" + assert cfg.get("async.concurrency") == 10 + assert cfg.get("codecs", {})["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.get("does.not.exist", "fallback") == "fallback" + + +def test_set_permanent_and_context() -> None: + cfg = ZarrConfigManager() + cfg.set({"array.order": "F"}) + assert cfg.get("array.order") == "F" # permanent + with cfg.set({"array.order": "C"}): + 
assert cfg.get("array.order") == "C" + assert cfg.get("array.order") == "F" # restored to permanent value + cfg.reset() + assert cfg.get("array.order") == "C" + + +def test_permanent_set_visible_in_worker_thread() -> None: + cfg = ZarrConfigManager() + cfg.set({"async.concurrency": 77}) + try: + with ThreadPoolExecutor(max_workers=1) as ex: + seen = ex.submit(lambda: cfg.get("async.concurrency")).result() + assert seen == 77 # ThreadPoolExecutor does not copy contextvars + finally: + cfg.reset() + + +def test_defaults_and_enable_gpu() -> None: + cfg = ZarrConfigManager() + assert cfg.defaults["array"]["order"] == "C" + with cfg.set({"buffer": "x"}): + pass + cfg.enable_gpu() + try: + assert cfg.get("buffer") == "zarr.buffer.gpu.Buffer" + assert cfg.get("ndbuffer") == "zarr.buffer.gpu.NDBuffer" + finally: + cfg.reset() From f407ec6f429573164797713e441d884606844c9d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 12:34:45 +0200 Subject: [PATCH 08/25] fix(config): keep context scope in sync on refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After any reset() or set() call, a ContextVar scope entry is planted in the current context. Without re-syncing _scope in refresh(), a subsequent rebuild of _base is invisible in that context because _current() always prefers the scope entry. Fix refresh() to call self._scope.set(self._base) after rebuilding, matching the pattern already used in reset(). Also remove the dead contextlib.suppress(LookupError) in reset() — ContextVar.set() never raises LookupError. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 7 +++++-- tests/test_config_typed.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 9e33ee34e0..07c25d724b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -437,11 +437,14 @@ def set(self, updates: Mapping[str, Any]) -> _ConfigSet: # --- lifecycle -------------------------------------------------------- def reset(self) -> None: self._base = build_config() - with contextlib.suppress(LookupError): - self._scope.set(self._base) + # Sync the scope so _current() returns the new base in this context. + self._scope.set(self._base) def refresh(self) -> None: self._base = build_config() + # Sync the scope so the rebuilt base is visible in the calling context. + # Without this, any prior reset()/set() scope entry would shadow the refresh. + self._scope.set(self._base) def enable_gpu(self) -> _ConfigSet: return self.set( diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 4a1eeed620..0e3a3c77b8 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -147,3 +147,17 @@ def test_defaults_and_enable_gpu() -> None: assert cfg.get("ndbuffer") == "zarr.buffer.gpu.NDBuffer" finally: cfg.reset() + + +def test_refresh_not_shadowed_by_prior_scope(monkeypatch: pytest.MonkeyPatch) -> None: + """refresh() must be visible in the calling context even after a prior set()/reset().""" + mgr = ZarrConfigManager() + # plant a scope entry in this thread/context (as reset()/set() would) + mgr.set({"array.order": "F"}) + assert mgr.get("array.order") == "F" + # change the environment so a rebuild differs, then refresh + monkeypatch.setenv("ZARR_JSON_INDENT", "7") + mgr.refresh() + # refresh must be visible in THIS context, not shadowed by the prior scope + assert mgr.get("json_indent") == 7 + assert 
mgr.get("array.order") == "C" # the prior permanent set is gone after rebuild From 7e753042be4074733358ec20aa23c84c725f85c0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 13:00:34 +0200 Subject: [PATCH 09/25] feat(config): replace donfig with typed config object Remove the donfig Config subclass and its module-level instance. Promote ZarrConfigManager (the typed proxy built in Tasks 1-3) to the public `config` export. Update tests to the new nested-dict `defaults` form and fix a `codec_pipeline.name` typo (should be `codec_pipeline.path`). Also add **kwargs support to `set()` for call-sites that pass top-level keys as keyword arguments, and raise BadConfigError (ValueError) for removed deprecated keys to preserve the backwards-compat guarantee tested by `test_deprecated_config`. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 168 ++++++++++------------------------------ tests/test_config.py | 90 +++++++-------------- 2 files changed, 69 insertions(+), 189 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 07c25d724b..f8e2240ae5 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,30 +1,35 @@ """ -The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. -For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations -in the registry and then select them in the config. +Typed configuration for zarr. -Example: - An implementation of the bytes codec in a class ``your.module.NewBytesCodec`` requires the value of ``codecs.bytes`` - to be ``your.module.NewBytesCodec``. Donfig can be configured programmatically, by environment variables, or from - YAML files in standard locations. +The module exposes a single `config` object (a `ZarrConfigManager` instance) that +holds all runtime settings. 
Values can be read, overridden, and restored through a +simple string-key API that mirrors the old donfig interface: - ```python - from your.module import NewBytesCodec - from zarr.core.config import register_codec, config +- `config.get(key)` — read a dotted-key value (e.g. `config.get("async.concurrency")`). +- `config.set({key: value})` — permanent override; also usable as a context manager to + restore the previous state on exit. +- `config.reset()` — rebuild from defaults + environment. +- `config.refresh()` — alias for `reset`; called by the registry after env changes. +- `config.defaults` — nested dict of built-in default values. +- `config.enable_gpu()` — switch buffer/ndbuffer to GPU implementations. - register_codec("bytes", NewBytesCodec) - config.set({"codecs.bytes": "your.module.NewBytesCodec"}) - ``` +Environment variables use the `ZARR_` prefix and `__` for nesting: - Instead of setting the value programmatically with ``config.set``, you can also set the value with an environment - variable. The environment variable ``ZARR_CODECS__BYTES`` can be set to ``your.module.NewBytesCodec``. The double - underscore ``__`` is used to indicate nested access. +```bash +export ZARR_CODECS__BYTES="your.module.NewBytesCodec" +``` - ```bash - export ZARR_CODECS__BYTES="your.module.NewBytesCodec" - ``` +Programmatic override: -For more information, see the Donfig documentation at https://github.com/pytroll/donfig. +```python +from your.module import NewBytesCodec +from zarr.core.config import config + +config.set({"codecs.bytes": "your.module.NewBytesCodec"}) +``` + +For selecting custom implementations of codecs, pipelines, buffers, and ndbuffers, +register the implementation in the registry first, then set the path via `config.set`. 
""" from __future__ import annotations @@ -36,16 +41,10 @@ from collections.abc import Mapping from contextvars import ContextVar from dataclasses import dataclass, field, fields, replace -from typing import TYPE_CHECKING, Any, Literal, Self, cast, overload - -from donfig import Config as DConfig +from typing import Any, Literal, Self, cast, overload from zarr.errors import ZarrDeprecationWarning -if TYPE_CHECKING: - from donfig.config_obj import ConfigSet - - DEFAULT_CODECS: dict[str, str] = { "blosc": "zarr.codecs.blosc.BloscCodec", "gzip": "zarr.codecs.gzip.GzipCodec", @@ -422,10 +421,22 @@ def get(self, key: str, default: Any = _MISSING) -> Any: return default # --- string API: set -------------------------------------------------- - def set(self, updates: Mapping[str, Any]) -> _ConfigSet: + def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _ConfigSet: + """Apply one or more config overrides. + + Accepts either a mapping of dotted keys to values, keyword arguments + (for top-level keys), or both:: + + config.set({"array.order": "F"}) + config.set(default_zarr_format=2) + """ + all_updates: dict[str, Any] = {} + if updates: + all_updates.update(updates) + all_updates.update(kwargs) prev_base = self._base new = self._current() - for key, value in updates.items(): + for key, value in all_updates.items(): resolved = self._apply_deprecation(key) if resolved is None: continue @@ -473,12 +484,9 @@ def _apply_deprecation(self, key: str) -> str | None: return key new_key = deprecations[key] if new_key is None: - warnings.warn( - f"Configuration key {key!r} has been removed and no longer has any effect.", - ZarrDeprecationWarning, - stacklevel=3, + raise BadConfigError( + f"Configuration key {key!r} has been removed and no longer has any effect." 
) - return None warnings.warn( f"Configuration key {key!r} has been renamed to {new_key!r}.", ZarrDeprecationWarning, @@ -491,33 +499,6 @@ class BadConfigError(ValueError): _msg = "bad Config: %r" -class Config(DConfig): # type: ignore[misc] - """The Config will collect configuration from config files and environment variables - - Example environment variables: - Grabs environment variables of the form "ZARR_FOO__BAR_BAZ=123" and - turns these into config variables of the form ``{"foo": {"bar-baz": 123}}`` - It transforms the key and value in the following way: - - - Lower-cases the key text - - Treats ``__`` (double-underscore) as nested access - - Calls ``ast.literal_eval`` on the value - - """ - - def reset(self) -> None: - self.clear() - self.refresh() - - def enable_gpu(self) -> ConfigSet: - """ - Configure Zarr to use GPUs where possible. - """ - return self.set( - {"buffer": "zarr.buffer.gpu.Buffer", "ndbuffer": "zarr.buffer.gpu.NDBuffer"} - ) - - # these keys were removed from the config as part of the 3.1.0 release. # These deprecations should be removed in 3.1.1 or thereabouts. deprecations: dict[str, str | None] = { @@ -537,70 +518,7 @@ def enable_gpu(self) -> ConfigSet: "array.v3_default_compressors": None, } -# Provisional new instance; Task 4 makes this THE module-level `config`. 
-_typed_config = ZarrConfigManager() - -# The default configuration for zarr -config = Config( - "zarr", - defaults=[ - { - "default_zarr_format": 3, - "array": { - "order": "C", - "write_empty_chunks": False, - "read_missing_chunks": True, - "target_shard_size_bytes": None, - "rectilinear_chunks": False, - "sharding_coalesce_max_gap_bytes": 1 << 20, # 1 MiB - "sharding_coalesce_max_bytes": 16 << 20, # 16 MiB - }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, - }, - "codecs": { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", # compatibility with earlier versions of ZEP1 - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs.numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs.numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", - "numcodecs.gzip": "zarr.codecs.numcodecs.GZip", - "numcodecs.jenkins_lookup3": 
"zarr.codecs.numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", - "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", - }, - "buffer": "zarr.buffer.cpu.Buffer", - "ndbuffer": "zarr.buffer.cpu.NDBuffer", - } - ], - deprecations=deprecations, -) +config = ZarrConfigManager() def parse_indexing_order(data: Any) -> Literal["C", "F"]: diff --git a/tests/test_config.py b/tests/test_config.py index a758378dc7..e1d48a8079 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -22,7 +22,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline -from zarr.core.config import BadConfigError, config +from zarr.core.config import DEFAULT_CODECS, BadConfigError, config from zarr.core.indexing import SelectorTuple from zarr.errors import ChunkNotFoundError, ZarrUserWarning from zarr.registry import ( @@ -45,66 +45,28 @@ def test_config_defaults_set() -> None: - # regression test for available defaults - assert ( - config.defaults - == [ - { - "default_zarr_format": 3, - "array": { - "order": "C", - "write_empty_chunks": False, - "read_missing_chunks": True, - "target_shard_size_bytes": None, - "rectilinear_chunks": False, - "sharding_coalesce_max_gap_bytes": 1 << 20, - "sharding_coalesce_max_bytes": 16 << 20, - }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, - }, - "codecs": { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", # 
compatibility with earlier versions of ZEP1 - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs.numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs.numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", - "numcodecs.gzip": "zarr.codecs.numcodecs.GZip", - "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", - "numcodecs.packbits": "zarr.codecs.numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", - }, - "buffer": "zarr.buffer.cpu.Buffer", - "ndbuffer": "zarr.buffer.cpu.NDBuffer", - } - ] - ) + assert config.defaults == { + "default_zarr_format": 3, + "array": { + "order": "C", + "write_empty_chunks": False, + "read_missing_chunks": True, + "target_shard_size_bytes": None, + "rectilinear_chunks": False, + "sharding_coalesce_max_gap_bytes": 1 << 20, + "sharding_coalesce_max_bytes": 16 << 20, + }, + "async": {"concurrency": 10, "timeout": None}, + "threading": {"max_workers": None}, + "json_indent": 2, + 
"codec_pipeline": { + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "batch_size": 1, + }, + "codecs": dict(DEFAULT_CODECS), + "buffer": "zarr.buffer.cpu.Buffer", + "ndbuffer": "zarr.buffer.cpu.NDBuffer", + } assert config.get("array.order") == "C" assert config.get("async.concurrency") == 10 assert config.get("async.timeout") is None @@ -156,7 +118,7 @@ def test_config_codec_pipeline_class(store: Store) -> None: # has default value assert get_pipeline_class().__name__ != "" - config.set({"codec_pipeline.name": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) + config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) assert get_pipeline_class() == zarr.core.codec_pipeline.BatchedCodecPipeline _mock = Mock() @@ -206,7 +168,7 @@ class MockEnvCodecPipeline(CodecPipeline): @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_config_codec_implementation(store: Store) -> None: # has default value - assert fully_qualified_name(get_codec_class("blosc")) == config.defaults[0]["codecs"]["blosc"] + assert fully_qualified_name(get_codec_class("blosc")) == config.defaults["codecs"]["blosc"] _mock = Mock() @@ -259,7 +221,7 @@ def test_config_ndbuffer_implementation(store: Store) -> None: def test_config_buffer_implementation() -> None: # has default value - assert config.defaults[0]["buffer"] == "zarr.buffer.cpu.Buffer" + assert config.defaults["buffer"] == "zarr.buffer.cpu.Buffer" arr = zeros(shape=(100,), store=StoreExpectingTestBuffer()) From aa63864fa2cb06d19030ae2e5cdbc4d2310c9ed4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 13:15:59 +0200 Subject: [PATCH 10/25] fix(config): make get() honor defaults for removed deprecated keys `_apply_deprecation` now accepts a `raise_on_removed` parameter. `set()` passes `raise_on_removed=True` (keeps raising `BadConfigError` for removed keys). 
`get()` passes `raise_on_removed=False` so removed keys are treated as absent: a caller-supplied default is returned, or `KeyError` is raised when no default is given. This restores the donfig-faithful behavior where `config.get("removed.key", default)` never raised. Also removes the now-dead `if resolved is None: continue` branch from `set()` and rewrites the module docstring to drop the phrase "mirrors the old donfig interface". Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 41 +++++++++++++++++++++++++++++--------- tests/test_config_typed.py | 29 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8e2240ae5..5537f52101 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -3,7 +3,7 @@ The module exposes a single `config` object (a `ZarrConfigManager` instance) that holds all runtime settings. Values can be read, overridden, and restored through a -simple string-key API that mirrors the old donfig interface: +simple string-key API: - `config.get(key)` — read a dotted-key value (e.g. `config.get("async.concurrency")`). - `config.set({key: value})` — permanent override; also usable as a context manager to @@ -408,8 +408,9 @@ def get(self, key: Literal["ndbuffer"]) -> str: ... def get(self, key: str, default: Any = ...) -> Any: ... def get(self, key: str, default: Any = _MISSING) -> Any: - resolved = self._apply_deprecation(key) + resolved = self._apply_deprecation(key, raise_on_removed=False) if resolved is None: + # Key was removed; treat as absent — honour the caller's default. 
if default is _MISSING: raise KeyError(key) return default @@ -437,9 +438,7 @@ def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _Confi prev_base = self._base new = self._current() for key, value in all_updates.items(): - resolved = self._apply_deprecation(key) - if resolved is None: - continue + resolved = self._apply_deprecation(key, raise_on_removed=True) new = replace_path(new, resolved, value) self._base = new token = self._scope.set(new) @@ -479,14 +478,38 @@ def pprint(self) -> None: _pp.pprint(self.to_dict()) # --- deprecations ----------------------------------------------------- - def _apply_deprecation(self, key: str) -> str | None: + @overload + def _apply_deprecation(self, key: str, *, raise_on_removed: Literal[True]) -> str: ... + @overload + def _apply_deprecation(self, key: str, *, raise_on_removed: Literal[False]) -> str | None: ... + + def _apply_deprecation(self, key: str, *, raise_on_removed: bool) -> str | None: + """Resolve a possibly-deprecated config key. + + Parameters + ---------- + key : str + The dotted config key supplied by the caller. + raise_on_removed : bool + When `True` (used by `set`), raise `BadConfigError` if the key has been + removed. When `False` (used by `get`), return `None` instead so the + caller can treat the key as absent and honour the caller's default. + + Returns + ------- + str or None + The canonical (possibly redirected) key, or `None` when the key was + removed and `raise_on_removed` is `False`. + """ if key not in deprecations: return key new_key = deprecations[key] if new_key is None: - raise BadConfigError( - f"Configuration key {key!r} has been removed and no longer has any effect." - ) + if raise_on_removed: + raise BadConfigError( + f"Configuration key {key!r} has been removed and no longer has any effect." 
+ ) + return None warnings.warn( f"Configuration key {key!r} has been renamed to {new_key!r}.", ZarrDeprecationWarning, diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 0e3a3c77b8..853a75de2b 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -6,6 +6,7 @@ from zarr.core.config import ( DEFAULT_CODECS, + BadConfigError, ZarrConfigManager, apply_overrides, build_config, @@ -161,3 +162,31 @@ def test_refresh_not_shadowed_by_prior_scope(monkeypatch: pytest.MonkeyPatch) -> # refresh must be visible in THIS context, not shadowed by the prior scope assert mgr.get("json_indent") == 7 assert mgr.get("array.order") == "C" # the prior permanent set is gone after rebuild + + +# --------------------------------------------------------------------------- +# Removed-deprecated-key behavior (donfig-faithful) +# --------------------------------------------------------------------------- + +_REMOVED_KEY = "array.v2_default_compressor.numeric" + + +def test_get_removed_deprecated_key_with_default() -> None: + """get() with a removed deprecated key and a default must return the default silently.""" + mgr = ZarrConfigManager() + result = mgr.get(_REMOVED_KEY, "fallback") + assert result == "fallback" + + +def test_get_removed_deprecated_key_no_default_raises_key_error() -> None: + """get() with a removed deprecated key and no default must raise KeyError, not BadConfigError.""" + mgr = ZarrConfigManager() + with pytest.raises(KeyError): + mgr.get(_REMOVED_KEY) + + +def test_set_removed_deprecated_key_raises_bad_config_error() -> None: + """set() with a removed deprecated key must still raise BadConfigError.""" + mgr = ZarrConfigManager() + with pytest.raises(BadConfigError): + mgr.set({_REMOVED_KEY: "some_value"}) From 85595937df413ad3483605d6f1625220658d8fc7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 13:18:44 +0200 Subject: [PATCH 11/25] fix(config): tolerate unknown keys from env/YAML ingest MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apply_overrides() now catches KeyError from replace_path() for each unknown key, emits a ZarrUserWarning naming the skipped key, and continues. This prevents a stray ZARR_*=... env var or extra YAML key from crashing build_config() (and therefore import zarr). config.set() remains strict — it calls replace_path() directly, not through apply_overrides(). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 18 +++++++++++++++--- tests/test_config_typed.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 5537f52101..d2fc8d0a5b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -43,7 +43,7 @@ from dataclasses import dataclass, field, fields, replace from typing import Any, Literal, Self, cast, overload -from zarr.errors import ZarrDeprecationWarning +from zarr.errors import ZarrDeprecationWarning, ZarrUserWarning DEFAULT_CODECS: dict[str, str] = { "blosc": "zarr.codecs.blosc.BloscCodec", @@ -281,9 +281,21 @@ def _flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig: - """Apply a flat dotted-key override map to a snapshot.""" + """Apply a flat dotted-key override map to a snapshot. + + Used exclusively by `build_config` for env/YAML ingest. Unknown keys are + skipped with a warning rather than raising, so a stray environment variable + or extra YAML key never prevents `import zarr` from succeeding. 
+ """ for key, value in overrides.items(): - cfg = replace_path(cfg, key, value) + try: + cfg = replace_path(cfg, key, value) + except KeyError: + warnings.warn( + f"Unrecognized zarr config key {key!r} from environment or YAML — ignoring.", + ZarrUserWarning, + stacklevel=2, + ) return cfg diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 853a75de2b..e7cbcf7715 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -190,3 +190,37 @@ def test_set_removed_deprecated_key_raises_bad_config_error() -> None: mgr = ZarrConfigManager() with pytest.raises(BadConfigError): mgr.set({_REMOVED_KEY: "some_value"}) + + +# --------------------------------------------------------------------------- +# Tolerant ingest: unknown env/YAML keys must warn and be skipped, not crash +# --------------------------------------------------------------------------- + + +def test_build_config_unknown_env_key_warns_and_skips() -> None: + """build_config with an unrecognized env var warns and skips it; known keys still apply.""" + with pytest.warns(UserWarning, match="future.key"): + cfg = build_config(environ={"ZARR_FUTURE__KEY": "1", "ZARR_ARRAY__ORDER": "F"}) + # Known key was applied + assert cfg.array.order == "F" + # All other fields are still at default + default = make_default_config() + from dataclasses import fields as dc_fields + + for f in dc_fields(default): + if f.name != "array": + assert getattr(cfg, f.name) == getattr(default, f.name) + + +def test_apply_overrides_unknown_key_warns_and_returns_default() -> None: + """apply_overrides with a totally unknown key warns and returns an otherwise-default config.""" + default = make_default_config() + with pytest.warns(UserWarning, match="totally.bogus.key"): + result = apply_overrides(default, {"totally.bogus.key": 123}) + assert result == default + + +def test_config_set_still_strict_for_unknown_keys() -> None: + """config.set() must remain strict: unknown structured keys raise KeyError.""" 
+ with pytest.raises(KeyError): + ZarrConfigManager().set({"totally.bogus.key": 1}) From e8b781b60c3a6f13f89bb318b3b6fb62430b5a86 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 13:53:45 +0200 Subject: [PATCH 12/25] build: drop donfig dependency, add pyyaml Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- pyproject.toml | 5 ++--- src/zarr/__init__.py | 2 +- tests/test_config_typed.py | 8 ++++++++ uv.lock | 16 ++-------------- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 02e66c67e8..789a215cf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ 'numcodecs>=0.14', 'google-crc32c>=1.5', 'typing_extensions>=4.14', - 'donfig>=0.8', + 'pyyaml', ] dynamic = [ @@ -243,7 +243,6 @@ extra-dependencies = [ 's3fs @ git+https://github.com/fsspec/s3fs', 'universal_pathlib @ git+https://github.com/fsspec/universal_pathlib', 'typing_extensions @ git+https://github.com/python/typing_extensions', - 'donfig @ git+https://github.com/pytroll/donfig', 'obstore @ git+https://github.com/developmentseed/obstore@main#subdirectory=obstore', ] @@ -269,7 +268,7 @@ extra-dependencies = [ 's3fs==2023.10.0', 'universal_pathlib==0.2.0', 'typing_extensions==4.14.*', - 'donfig==0.8.*', + 'pyyaml==6.*', 'obstore==0.5.*', ] diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index cdf3840c3b..3322f76c2f 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -68,7 +68,7 @@ def print_packages(packages: list[str]) -> None: "numpy", "numcodecs", "typing_extensions", - "donfig", + "pyyaml", ] optional = [ "botocore", diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index e7cbcf7715..2d78297293 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -224,3 +224,11 @@ def test_config_set_still_strict_for_unknown_keys() -> None: """config.set() must remain strict: unknown 
structured keys raise KeyError.""" with pytest.raises(KeyError): ZarrConfigManager().set({"totally.bogus.key": 1}) + + +def test_donfig_not_imported() -> None: + import sys + + import zarr # noqa: F401 + + assert "donfig" not in sys.modules diff --git a/uv.lock b/uv.lock index 799ea6e45a..18e852c989 100644 --- a/uv.lock +++ b/uv.lock @@ -915,18 +915,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" }, ] -[[package]] -name = "donfig" -version = "0.8.1.post1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" }, -] - [[package]] name = "execnet" version = "2.1.2" @@ -3954,11 +3942,11 @@ wheels = [ name = "zarr" source = { editable = "." 
} dependencies = [ - { name = "donfig" }, { name = "google-crc32c" }, { name = "numcodecs" }, { name = "numpy" }, { name = "packaging" }, + { name = "pyyaml" }, { name = "typing-extensions" }, ] @@ -4075,13 +4063,13 @@ test = [ requires-dist = [ { name = "cast-value-rs", marker = "extra == 'cast-value-rs'" }, { name = "cupy-cuda12x", marker = "sys_platform != 'darwin' and extra == 'gpu'" }, - { name = "donfig", specifier = ">=0.8" }, { name = "fsspec", marker = "extra == 'remote'", specifier = ">=2023.10.0" }, { name = "google-crc32c", specifier = ">=1.5" }, { name = "numcodecs", specifier = ">=0.14" }, { name = "numpy", specifier = ">=2" }, { name = "obstore", marker = "extra == 'remote'", specifier = ">=0.5.1" }, { name = "packaging", specifier = ">=22.0" }, + { name = "pyyaml" }, { name = "typer", marker = "extra == 'cli'" }, { name = "typing-extensions", specifier = ">=4.14" }, { name = "universal-pathlib", marker = "extra == 'optional'" }, From 6cb60d5b3e232e63f5a740fd79071373d4f20f8f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 14:30:04 +0200 Subject: [PATCH 13/25] test(config): drift-protection + typing assertions; docs + changelog - Add _structured_leaf_keys helper (uses get_type_hints to handle from __future__ import annotations) and test_every_structured_key_has_a_get_overload that walks ZarrConfig recursively and asserts every leaf has a get() overload - Add TYPE_CHECKING smoke function using assert_type to prove precise static return types for config.get() and attribute access - Create changes/+statically-typed-config.misc.md changelog fragment - Update docs/user-guide/config.md: remove donfig prose, fix pprint call - Update docs/user-guide/installation.md: replace donfig dep with pyyaml - Pin pyyaml>=6 in pyproject.toml project dependencies Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- changes/+statically-typed-config.misc.md | 8 ++++ 
docs/user-guide/config.md | 19 ++++---- docs/user-guide/installation.md | 2 +- pyproject.toml | 2 +- tests/test_config_typed.py | 58 ++++++++++++++++++++++++ uv.lock | 2 +- 6 files changed, 78 insertions(+), 13 deletions(-) create mode 100644 changes/+statically-typed-config.misc.md diff --git a/changes/+statically-typed-config.misc.md b/changes/+statically-typed-config.misc.md new file mode 100644 index 0000000000..c42d579423 --- /dev/null +++ b/changes/+statically-typed-config.misc.md @@ -0,0 +1,8 @@ +Replaced the `donfig`-based configuration with a statically-typed +configuration object. `zarr.config` now provides precise static types for +attribute access (`zarr.config.array.order`) and for the dotted-string API +(`zarr.config.get("array.order")`). The string API, environment-variable +ingestion (`ZARR_FOO__BAR`), YAML config files, `config.set` (permanent and +as a context manager), `config.reset`, `config.enable_gpu`, and the +`deprecations` mechanism are all preserved. The `donfig` dependency has been +removed. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 71c021b070..554c34917d 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -1,7 +1,8 @@ # Runtime configuration -[`zarr.config`][] is responsible for managing the configuration of zarr and -is based on the [donfig](https://github.com/pytroll/donfig) Python library. +[`zarr.config`][] is a `ZarrConfigManager` instance that manages all runtime +settings for zarr. It provides both typed attribute access and a dotted-string +key API. Configuration values can be set using code like the following: @@ -18,12 +19,13 @@ zarr.config.set({'array.order': 'F'}) print(zarr.config.get('array.order')) ``` -Alternatively, configuration values can be set using environment variables, e.g. +Alternatively, configuration values can be set using environment variables. +The variable name uses a `ZARR_` prefix, with `__` to denote nesting, e.g. `ZARR_ARRAY__ORDER=F`. 
-The configuration can also be read from a YAML file in standard locations. -For more information, see the -[donfig documentation](https://donfig.readthedocs.io/en/latest/). +The configuration can also be read from YAML files. Place a `zarr.yaml` (or +any `.yaml`/`.yml` file) in `~/.config/zarr/`, or point the `ZARR_CONFIG` +environment variable at a specific file path. Configuration options include the following: @@ -46,8 +48,5 @@ This is the current default configuration: ```python exec="true" session="config" source="above" result="ansi" from pprint import pprint -import io -output = io.StringIO() -zarr.config.pprint(stream=output, width=60) -print(output.getvalue()) +pprint(zarr.config.to_dict()) ``` diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md index c902acf171..a710f417bc 100644 --- a/docs/user-guide/installation.md +++ b/docs/user-guide/installation.md @@ -10,7 +10,7 @@ Required dependencies include: - [numcodecs](https://numcodecs.readthedocs.io) (0.14 or later) - [google-crc32c](https://github.com/googleapis/python-crc32c) (1.5 or later) - [typing_extensions](https://typing-extensions.readthedocs.io) (4.9 or later) -- [donfig](https://donfig.readthedocs.io) (0.8 or later) +- [pyyaml](https://pyyaml.org) (6 or later) ## pip diff --git a/pyproject.toml b/pyproject.toml index 789a215cf8..5e1664b36e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ 'numcodecs>=0.14', 'google-crc32c>=1.5', 'typing_extensions>=4.14', - 'pyyaml', + 'pyyaml>=6', ] dynamic = [ diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 2d78297293..01d71b15f4 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -1,12 +1,16 @@ from __future__ import annotations +import dataclasses +import typing from concurrent.futures import ThreadPoolExecutor import pytest from zarr.core.config import ( + _SERIALIZED_NAMES, DEFAULT_CODECS, BadConfigError, + ZarrConfig, ZarrConfigManager, 
apply_overrides, build_config, @@ -232,3 +236,57 @@ def test_donfig_not_imported() -> None: import zarr # noqa: F401 assert "donfig" not in sys.modules + + +# --------------------------------------------------------------------------- +# Drift-protection: every structured leaf key must have a get() overload +# --------------------------------------------------------------------------- + + +def _structured_leaf_keys(cfg_cls: type, prefix: str = "") -> list[str]: + """Walk a settings dataclass recursively and return every dotted leaf key. + + Uses ``typing.get_type_hints`` instead of ``f.type`` so that the + ``from __future__ import annotations`` string-annotation form is resolved + to real types before ``dataclasses.is_dataclass`` is called. + """ + keys: list[str] = [] + resolved_hints = typing.get_type_hints(cfg_cls) + for f in dataclasses.fields(cfg_cls): + serialized = _SERIALIZED_NAMES.get(f.name, f.name) + key = f"{prefix}.{serialized}" if prefix else serialized + resolved_type = resolved_hints[f.name] + if dataclasses.is_dataclass(resolved_type): + keys.extend(_structured_leaf_keys(typing.cast(type, resolved_type), key)) + elif f.name == "codecs": + # open mapping — intentionally not enumerated + continue + else: + keys.append(key) + return keys + + +def test_every_structured_key_has_a_get_overload() -> None: + """Enumerate every typed leaf key in ZarrConfig and assert a matching get() overload exists.""" + overloads = typing.get_overloads(ZarrConfigManager.get) + literal_keys: set[str] = set() + for ov in overloads: + hints = typing.get_type_hints(ov) + key_hint = hints.get("key") + if typing.get_origin(key_hint) is typing.Literal: + literal_keys.update(typing.get_args(key_hint)) + leaf_keys = _structured_leaf_keys(ZarrConfig) + missing = set(leaf_keys) - literal_keys + assert not missing, f"get() overloads missing for: {sorted(missing)}" + + +# --------------------------------------------------------------------------- +# Static-typing smoke test (only 
checked by mypy, not executed at runtime) +# --------------------------------------------------------------------------- + +if typing.TYPE_CHECKING: + + def _typing_smoke(cfg: ZarrConfigManager) -> None: + typing.assert_type(cfg.get("array.order"), typing.Literal["C", "F"]) + typing.assert_type(cfg.array.order, typing.Literal["C", "F"]) + typing.assert_type(cfg.get("async.concurrency"), int) diff --git a/uv.lock b/uv.lock index 18e852c989..a39badf7ee 100644 --- a/uv.lock +++ b/uv.lock @@ -4069,7 +4069,7 @@ requires-dist = [ { name = "numpy", specifier = ">=2" }, { name = "obstore", marker = "extra == 'remote'", specifier = ">=0.5.1" }, { name = "packaging", specifier = ">=22.0" }, - { name = "pyyaml" }, + { name = "pyyaml", specifier = ">=6" }, { name = "typer", marker = "extra == 'cli'" }, { name = "typing-extensions", specifier = ">=4.14" }, { name = "universal-pathlib", marker = "extra == 'optional'" }, From 126681f5d8028982569c74e166d5652a6ba44e24 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 14:47:52 +0200 Subject: [PATCH 14/25] fix(config): deep-merge YAML codecs and thread environ to config search path Remove the `k != "codecs"` guard in `_flatten_mapping` so a YAML `codecs:` block produces flat dotted keys (e.g. `codecs.bytes`) that flow through `_replace_recursive`'s Mapping branch and MERGE into the existing codec dict rather than replacing it wholesale. Also fix `_config_search_paths` to accept and consult the `environ` mapping supplied by `build_config` instead of reading `os.environ` directly, making `build_config(environ=...)` self-consistent. Adds regression tests (RED/GREEN) and a changelog note about `config.defaults` now returning a plain `dict` instead of donfig's `list[dict]`. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- changes/+statically-typed-config.misc.md | 4 +++ src/zarr/core/config.py | 8 ++--- tests/test_config_typed.py | 43 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/changes/+statically-typed-config.misc.md b/changes/+statically-typed-config.misc.md index c42d579423..459ebad2cc 100644 --- a/changes/+statically-typed-config.misc.md +++ b/changes/+statically-typed-config.misc.md @@ -6,3 +6,7 @@ ingestion (`ZARR_FOO__BAR`), YAML config files, `config.set` (permanent and as a context manager), `config.reset`, `config.enable_gpu`, and the `deprecations` mechanism are all preserved. The `donfig` dependency has been removed. + +Note: `zarr.config.defaults` now returns a nested `dict` directly; donfig +previously returned a one-element `list[dict]`, so callers that used +`config.defaults[0]` must be updated to use `config.defaults`. diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index d2fc8d0a5b..92dc2cb615 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -235,10 +235,10 @@ def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: return out -def _config_search_paths() -> list[str]: +def _config_search_paths(environ: Mapping[str, str]) -> list[str]: """Standard YAML config locations, mirroring donfig's search order.""" paths: list[str] = [] - env_path = os.environ.get("ZARR_CONFIG") + env_path = environ.get("ZARR_CONFIG") if env_path: paths.append(env_path) paths.append(os.path.join(os.path.expanduser("~"), ".config", "zarr")) @@ -273,7 +273,7 @@ def _flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any out: dict[str, Any] = {} for k, v in data.items(): key = f"{prefix}{k}" if not prefix else f"{prefix}.{k}" - if isinstance(v, Mapping) and k != "codecs": + if isinstance(v, Mapping): out.update(_flatten_mapping(v, key)) else: out[key] = v @@ -304,7 +304,7 
@@ def build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig: if environ is None: environ = os.environ return apply_overrides( - apply_overrides(make_default_config(), collect_yaml(_config_search_paths())), + apply_overrides(make_default_config(), collect_yaml(_config_search_paths(environ))), collect_env(environ), ) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 01d71b15f4..2829470b24 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -238,6 +238,49 @@ def test_donfig_not_imported() -> None: assert "donfig" not in sys.modules +# --------------------------------------------------------------------------- +# YAML codec block merging — regression for the "wipes all defaults" bug +# --------------------------------------------------------------------------- + + +def test_yaml_codecs_block_merges_not_replaces(tmp_path: pytest.TempPathFactory) -> None: + """A YAML file with a codecs: block must MERGE into the defaults, not replace them.""" + yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file.write_text("codecs:\n bytes: my.custom.BytesCodec\n mycodec: my.Mod.MyCodec\n") + cfg = build_config(environ={"ZARR_CONFIG": str(yaml_file)}) + # overrides applied + assert cfg.codecs["bytes"] == "my.custom.BytesCodec" + assert cfg.codecs["mycodec"] == "my.Mod.MyCodec" + # defaults PRESERVED + assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.codecs["zstd"] == "zarr.codecs.zstd.ZstdCodec" + # exactly one net-new key added ("bytes" overwrites existing; "mycodec" is new) + assert len(cfg.codecs) == len(DEFAULT_CODECS) + 1 + + +def test_yaml_dotted_codec_name_merges(tmp_path: pytest.TempPathFactory) -> None: + """Dotted codec keys like numcodecs.bz2 in YAML must merge, not replace the whole dict.""" + yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file.write_text("codecs:\n numcodecs.bz2: my.Override\n") + cfg = build_config(environ={"ZARR_CONFIG": 
str(yaml_file)}) + # dotted key correctly round-tripped + assert cfg.codecs["numcodecs.bz2"] == "my.Override" + # all other defaults preserved + assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert len(cfg.codecs) == len(DEFAULT_CODECS) # bz2 was already there; just overwritten + + +def test_build_config_environ_yaml_path_is_read(tmp_path: pytest.TempPathFactory) -> None: + """ZARR_CONFIG supplied via build_config(environ=...) must actually be read.""" + yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file.write_text("json_indent: 9\n") + cfg = build_config(environ={"ZARR_CONFIG": str(yaml_file)}) + assert cfg.json_indent == 9 + # Non-existent path must still not raise + cfg2 = build_config(environ={"ZARR_CONFIG": "/nonexistent/path.yaml"}) + assert cfg2.json_indent == make_default_config().json_indent + + # --------------------------------------------------------------------------- # Drift-protection: every structured leaf key must have a get() overload # --------------------------------------------------------------------------- From c7760e74f80ac599622d168b7181846b669ef9c1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 15:36:28 +0200 Subject: [PATCH 15/25] test(config): correct tmp_path annotation to pathlib.Path Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- tests/test_config_typed.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 2829470b24..577768006f 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -21,6 +21,9 @@ to_nested_dict, ) +if typing.TYPE_CHECKING: + import pathlib + def test_default_config_values() -> None: cfg = make_default_config() @@ -243,9 +246,9 @@ def test_donfig_not_imported() -> None: # --------------------------------------------------------------------------- -def 
test_yaml_codecs_block_merges_not_replaces(tmp_path: pytest.TempPathFactory) -> None: +def test_yaml_codecs_block_merges_not_replaces(tmp_path: pathlib.Path) -> None: """A YAML file with a codecs: block must MERGE into the defaults, not replace them.""" - yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file = tmp_path / "zarr.yaml" yaml_file.write_text("codecs:\n bytes: my.custom.BytesCodec\n mycodec: my.Mod.MyCodec\n") cfg = build_config(environ={"ZARR_CONFIG": str(yaml_file)}) # overrides applied @@ -258,9 +261,9 @@ def test_yaml_codecs_block_merges_not_replaces(tmp_path: pytest.TempPathFactory) assert len(cfg.codecs) == len(DEFAULT_CODECS) + 1 -def test_yaml_dotted_codec_name_merges(tmp_path: pytest.TempPathFactory) -> None: +def test_yaml_dotted_codec_name_merges(tmp_path: pathlib.Path) -> None: """Dotted codec keys like numcodecs.bz2 in YAML must merge, not replace the whole dict.""" - yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file = tmp_path / "zarr.yaml" yaml_file.write_text("codecs:\n numcodecs.bz2: my.Override\n") cfg = build_config(environ={"ZARR_CONFIG": str(yaml_file)}) # dotted key correctly round-tripped @@ -270,9 +273,9 @@ def test_yaml_dotted_codec_name_merges(tmp_path: pytest.TempPathFactory) -> None assert len(cfg.codecs) == len(DEFAULT_CODECS) # bz2 was already there; just overwritten -def test_build_config_environ_yaml_path_is_read(tmp_path: pytest.TempPathFactory) -> None: +def test_build_config_environ_yaml_path_is_read(tmp_path: pathlib.Path) -> None: """ZARR_CONFIG supplied via build_config(environ=...) 
must actually be read.""" - yaml_file = tmp_path / "zarr.yaml" # type: ignore[operator] + yaml_file = tmp_path / "zarr.yaml" yaml_file.write_text("json_indent: 9\n") cfg = build_config(environ={"ZARR_CONFIG": str(yaml_file)}) assert cfg.json_indent == 9 From f3a40cb6407e260285c267de563ebd636ef6baa8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 17:21:12 +0200 Subject: [PATCH 16/25] doc: rename changelog fragment to PR number 4101 Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- changes/{+statically-typed-config.misc.md => 4101.misc.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changes/{+statically-typed-config.misc.md => 4101.misc.md} (100%) diff --git a/changes/+statically-typed-config.misc.md b/changes/4101.misc.md similarity index 100% rename from changes/+statically-typed-config.misc.md rename to changes/4101.misc.md From 4b173ea55c62838d7f980915e24e1a8d9d2eab21 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 17:23:00 +0200 Subject: [PATCH 17/25] docs: remove superpowers design docs from branch The design spec and implementation plan live in a gist instead: https://gist.github.com/d-v-b/2a95ff0104824ef52545ed9baf1b66c3 Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- .../2026-06-25-statically-typed-config.md | 997 ------------------ ...26-06-25-statically-typed-config-design.md | 339 ------ 2 files changed, 1336 deletions(-) delete mode 100644 docs/superpowers/plans/2026-06-25-statically-typed-config.md delete mode 100644 docs/superpowers/specs/2026-06-25-statically-typed-config-design.md diff --git a/docs/superpowers/plans/2026-06-25-statically-typed-config.md b/docs/superpowers/plans/2026-06-25-statically-typed-config.md deleted file mode 100644 index 94320dc2fc..0000000000 --- a/docs/superpowers/plans/2026-06-25-statically-typed-config.md +++ 
/dev/null @@ -1,997 +0,0 @@ -# Statically-typed configuration (drop donfig) Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Replace donfig with a hand-typed, dataclass-backed configuration object that preserves donfig's dotted-string API exactly while adding precise static types. - -**Architecture:** A tree of frozen dataclasses is the schema/source-of-truth. A process-global base snapshot plus a `ContextVar` scope provide donfig-compatible mutable-global semantics with `with`-restore. A proxy object (`config`) exposes typed attribute access and a hand-written overloaded `get`/`set` string API, plus env-var and YAML ingest and deprecation handling. - -**Tech Stack:** Python 3.11+, `dataclasses`, `typing.overload`, `contextvars`, PyYAML, pytest, mypy (strict). - -## Global Constraints - -- Backwards compatibility is the top priority. These must keep identical behavior: `config.get("a.b.c")`, `config.get("a.b.c", default)`, `config.get("codecs", {}).get(key)`, permanent `config.set({...})`, `with config.set({...})`, `config.reset()`, `config.refresh()`, `config.enable_gpu()`, `config.defaults`, `BadConfigError`, `parse_indexing_order`, `ZARR_FOO__BAR` env ingest, YAML ingest, deprecation warnings. -- Public import paths unchanged: `from zarr.core.config import config, BadConfigError, parse_indexing_order`; `zarr.config`. -- mypy strict must pass; PEP8, max line length 100 (prefer <90); numpydoc docstrings on public API. -- Use `uv run` for all pytest/mypy/python invocations (e.g. `uv run pytest ...`). -- No new runtime dependency on `tytr`. Overloads are hand-written. -- The serialized key for the async namespace stays `"async"`; the dataclass field is `async_`. -- `codecs` is an open `Mapping[str, str]` subtree. 
-- Keep all current config keys, defaults, and the existing `deprecations` mapping verbatim. -- Frequent commits; one logical change per commit. - -## File Structure - -- `src/zarr/core/config.py` — **rewritten** (single module, preserves import paths). Contains: schema dataclasses, path helpers, ingest functions, deprecations, the `ZarrConfigManager` proxy, the module-level `config` instance, `BadConfigError`, `parse_indexing_order`. -- `src/zarr/__init__.py` — remove `"donfig"` from the `required` version-report list. -- `pyproject.toml` — remove the three donfig entries; ensure `pyyaml` is a declared runtime dependency. -- `tests/test_config.py` — existing suite; update only the `defaults` structural assertion. -- `tests/test_config_typed.py` — **new**: schema/helpers/ingest/state/drift/typing unit tests. -- `changes/<PR>.misc.md` — **new**: changelog entry (`<PR>` is the pull-request number). - ---- - -### Task 1: Schema dataclasses + path helpers - -**Files:** -- Modify: `src/zarr/core/config.py` (add new code; do not remove donfig yet) -- Test: `tests/test_config_typed.py` - -**Interfaces:** -- Produces: - - Frozen dataclasses `ArraySettings`, `AsyncSettings`, `ThreadingSettings`, `CodecPipelineSettings`, `ZarrConfig`. - - `DEFAULT_CODECS: dict[str, str]` — the default codec-name→import-path map. - - `make_default_config() -> ZarrConfig`. - - `get_path(cfg: ZarrConfig, key: str) -> Any` — read a dotted key; raises `KeyError` if absent. - - `replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig` — return a new snapshot with the dotted key updated. - - `to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]` — donfig-style nested dict using serialized keys (`"async"`, not `"async_"`).
- -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_config_typed.py`: - -```python -from __future__ import annotations - -import pytest - -from zarr.core.config import ( - DEFAULT_CODECS, - ZarrConfig, - get_path, - make_default_config, - replace_path, - to_nested_dict, -) - - -def test_default_config_values() -> None: - cfg = make_default_config() - assert cfg.default_zarr_format == 3 - assert cfg.array.order == "C" - assert cfg.array.sharding_coalesce_max_bytes == 16 << 20 - assert cfg.async_.concurrency == 10 - assert cfg.async_.timeout is None - assert cfg.threading.max_workers is None - assert cfg.json_indent == 2 - assert cfg.codec_pipeline.path == "zarr.core.codec_pipeline.BatchedCodecPipeline" - assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" - assert cfg.codecs == DEFAULT_CODECS - - -def test_get_path_structured_and_async_alias() -> None: - cfg = make_default_config() - assert get_path(cfg, "array.order") == "C" - assert get_path(cfg, "async.concurrency") == 10 # serialized key, not async_ - assert get_path(cfg, "json_indent") == 2 - assert get_path(cfg, "codecs") == DEFAULT_CODECS - assert get_path(cfg, "codecs.blosc") == "zarr.codecs.blosc.BloscCodec" - with pytest.raises(KeyError): - get_path(cfg, "array.nonexistent") - - -def test_replace_path_is_immutable_and_typed() -> None: - cfg = make_default_config() - cfg2 = replace_path(cfg, "array.order", "F") - assert cfg.array.order == "C" # original unchanged (frozen) - assert cfg2.array.order == "F" - cfg3 = replace_path(cfg, "async.concurrency", 99) - assert cfg3.async_.concurrency == 99 - cfg4 = replace_path(cfg, "codecs.my_codec", "my.module.MyCodec") - assert cfg4.codecs["my_codec"] == "my.module.MyCodec" - assert "my_codec" not in cfg.codecs - - -def test_to_nested_dict_uses_serialized_keys() -> None: - nested = to_nested_dict(make_default_config()) - assert nested["array"]["order"] == "C" - assert nested["async"]["concurrency"] == 10 # serialized key - assert "async_" not 
in nested - assert nested["codecs"]["blosc"] == "zarr.codecs.blosc.BloscCodec" -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `uv run pytest tests/test_config_typed.py -v` -Expected: FAIL — `ImportError` (names not yet defined). - -- [ ] **Step 3: Write minimal implementation** - -Add near the top of `src/zarr/core/config.py` (after `from __future__ import annotations` and imports; add `from dataclasses import dataclass, field, fields, replace`, `from collections.abc import Mapping`, `from typing import Any`): - -```python -DEFAULT_CODECS: dict[str, str] = { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - "numcodecs.bz2": "zarr.codecs.numcodecs.BZ2", - "numcodecs.crc32": "zarr.codecs.numcodecs.CRC32", - "numcodecs.crc32c": "zarr.codecs.numcodecs.CRC32C", - "numcodecs.lz4": "zarr.codecs.numcodecs.LZ4", - "numcodecs.lzma": "zarr.codecs.numcodecs.LZMA", - "numcodecs.zfpy": "zarr.codecs.numcodecs.ZFPY", - "numcodecs.adler32": "zarr.codecs.numcodecs.Adler32", - "numcodecs.astype": "zarr.codecs.numcodecs.AsType", - "numcodecs.bitround": "zarr.codecs.numcodecs.BitRound", - "numcodecs.blosc": "zarr.codecs.numcodecs.Blosc", - "numcodecs.delta": "zarr.codecs.numcodecs.Delta", - "numcodecs.fixedscaleoffset": "zarr.codecs.numcodecs.FixedScaleOffset", - "numcodecs.fletcher32": "zarr.codecs.numcodecs.Fletcher32", - "numcodecs.gzip": "zarr.codecs.numcodecs.GZip", - "numcodecs.jenkins_lookup3": "zarr.codecs.numcodecs.JenkinsLookup3", - "numcodecs.pcodec": "zarr.codecs.numcodecs.PCodec", - "numcodecs.packbits": 
"zarr.codecs.numcodecs.PackBits", - "numcodecs.shuffle": "zarr.codecs.numcodecs.Shuffle", - "numcodecs.quantize": "zarr.codecs.numcodecs.Quantize", - "numcodecs.zlib": "zarr.codecs.numcodecs.Zlib", - "numcodecs.zstd": "zarr.codecs.numcodecs.Zstd", -} - -# Map serialized dotted-key segments to Python field names where they differ -# (Python keywords cannot be used as identifiers). -_FIELD_ALIASES: dict[str, str] = {"async": "async_"} -_SERIALIZED_NAMES: dict[str, str] = {v: k for k, v in _FIELD_ALIASES.items()} - - -@dataclass(frozen=True, slots=True) -class ArraySettings: - order: Literal["C", "F"] = "C" - write_empty_chunks: bool = False - read_missing_chunks: bool = True - target_shard_size_bytes: int | None = None - rectilinear_chunks: bool = False - sharding_coalesce_max_gap_bytes: int = 1 << 20 - sharding_coalesce_max_bytes: int = 16 << 20 - - -@dataclass(frozen=True, slots=True) -class AsyncSettings: - concurrency: int = 10 - timeout: float | None = None - - -@dataclass(frozen=True, slots=True) -class ThreadingSettings: - max_workers: int | None = None - - -@dataclass(frozen=True, slots=True) -class CodecPipelineSettings: - path: str = "zarr.core.codec_pipeline.BatchedCodecPipeline" - batch_size: int = 1 - - -@dataclass(frozen=True, slots=True) -class ZarrConfig: - default_zarr_format: Literal[2, 3] = 3 - array: ArraySettings = field(default_factory=ArraySettings) - async_: AsyncSettings = field(default_factory=AsyncSettings) - threading: ThreadingSettings = field(default_factory=ThreadingSettings) - json_indent: int = 2 - codec_pipeline: CodecPipelineSettings = field(default_factory=CodecPipelineSettings) - codecs: Mapping[str, str] = field(default_factory=lambda: dict(DEFAULT_CODECS)) - buffer: str = "zarr.buffer.cpu.Buffer" - ndbuffer: str = "zarr.buffer.cpu.NDBuffer" - - -def make_default_config() -> ZarrConfig: - """Return a fresh `ZarrConfig` populated with the built-in defaults.""" - return ZarrConfig() - - -def _resolve_field(obj: Any, segment: str) 
-> str: - """Translate a serialized key segment to the dataclass field name.""" - return _FIELD_ALIASES.get(segment, segment) - - -def get_path(cfg: ZarrConfig, key: str) -> Any: - """Read a dotted-string key from a `ZarrConfig` snapshot. - - Raises - ------ - KeyError - If the key does not resolve to a value. - """ - obj: Any = cfg - segments = key.split(".") - for i, segment in enumerate(segments): - if isinstance(obj, Mapping): - # remaining segments index into an open mapping (e.g. codecs.*) - remainder = ".".join(segments[i:]) - try: - return obj[remainder] - except KeyError: - raise KeyError(key) from None - field_name = _resolve_field(obj, segment) - if not hasattr(obj, field_name): - raise KeyError(key) - obj = getattr(obj, field_name) - return obj - - -def replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig: - """Return a new `ZarrConfig` with the dotted-string key set to ``value``.""" - segments = key.split(".") - return _replace_recursive(cfg, segments, value, key) - - -def _replace_recursive(obj: Any, segments: list[str], value: Any, key: str) -> Any: - segment = segments[0] - if isinstance(obj, Mapping): - remainder = ".".join(segments) - return {**obj, remainder: value} - field_name = _resolve_field(obj, segment) - if not hasattr(obj, field_name): - raise KeyError(key) - if len(segments) == 1: - return replace(obj, **{field_name: value}) - child = getattr(obj, field_name) - new_child = _replace_recursive(child, segments[1:], value, key) - return replace(obj, **{field_name: new_child}) - - -def to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]: - """Convert a `ZarrConfig` to a donfig-style nested dict (serialized keys).""" - - def convert(obj: Any) -> Any: - if isinstance(obj, Mapping): - return dict(obj) - if hasattr(type(obj), "__dataclass_fields__"): - out: dict[str, Any] = {} - for f in fields(obj): - serialized = _SERIALIZED_NAMES.get(f.name, f.name) - out[serialized] = convert(getattr(obj, f.name)) - return out - return obj - - 
return convert(cfg) # type: ignore[no-any-return] -``` - -Ensure `Literal` and `Any` are imported at the top of the module. - -- [ ] **Step 4: Run test to verify it passes** - -Run: `uv run pytest tests/test_config_typed.py -v` -Expected: PASS (4 tests). - -- [ ] **Step 5: Commit** - -```bash -git add src/zarr/core/config.py tests/test_config_typed.py -git commit -m "feat(config): add frozen dataclass schema and path helpers" -``` - ---- - -### Task 2: Env-var and YAML ingest - -**Files:** -- Modify: `src/zarr/core/config.py` -- Test: `tests/test_config_typed.py` - -**Interfaces:** -- Consumes: `ZarrConfig`, `replace_path` (Task 1). -- Produces: - - `collect_env(environ: Mapping[str, str]) -> dict[str, Any]` — flat dotted-key → value map from `ZARR_*` vars. - - `collect_yaml(paths: list[str]) -> dict[str, Any]` — flat dotted-key map merged from YAML files (missing files skipped). - - `apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig`. - - `build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig` — defaults < YAML < env. 
- -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_config_typed.py`: - -```python -from zarr.core.config import apply_overrides, build_config, collect_env - - -def test_collect_env_parses_nested_and_literal() -> None: - env = { - "ZARR_ARRAY__ORDER": "F", - "ZARR_ASYNC__CONCURRENCY": "32", - "ZARR_CODECS__MY_CODEC": "my.module.MyCodec", - "UNRELATED": "ignored", - } - out = collect_env(env) - assert out["array.order"] == "F" - assert out["async.concurrency"] == 32 # ast.literal_eval -> int - assert out["codecs.my_codec"] == "my.module.MyCodec" # non-literal -> raw str - assert "unrelated" not in out - - -def test_apply_overrides_and_build_config_precedence() -> None: - cfg = apply_overrides( - build_config(environ={}), - {"array.order": "F", "codecs.x": "pkg.X"}, - ) - assert cfg.array.order == "F" - assert cfg.codecs["x"] == "pkg.X" - # env overrides defaults - cfg2 = build_config(environ={"ZARR_JSON_INDENT": "4"}) - assert cfg2.json_indent == 4 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `uv run pytest tests/test_config_typed.py -k "env or precedence" -v` -Expected: FAIL — `ImportError`. - -- [ ] **Step 3: Write minimal implementation** - -Add to `src/zarr/core/config.py` (add `import ast`, `import os`, `import contextlib` at top): - -```python -ENV_PREFIX = "ZARR_" - - -def _parse_env_value(raw: str) -> Any: - """Parse an env value with ``ast.literal_eval``; fall back to the raw string.""" - try: - return ast.literal_eval(raw) - except (ValueError, SyntaxError): - return raw - - -def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: - """Collect ``ZARR_*`` environment variables into a flat dotted-key map. - - ``ZARR_FOO__BAR_BAZ=1`` becomes ``{"foo.bar_baz": 1}`` — the key is - lower-cased and ``__`` denotes nested access. 
- """ - out: dict[str, Any] = {} - for name, raw in environ.items(): - if not name.startswith(ENV_PREFIX): - continue - body = name[len(ENV_PREFIX) :] - dotted = body.lower().replace("__", ".") - out[dotted] = _parse_env_value(raw) - return out - - -def _config_search_paths() -> list[str]: - """Standard YAML config locations, mirroring donfig's search order.""" - paths: list[str] = [] - env_path = os.environ.get("ZARR_CONFIG") - if env_path: - paths.append(env_path) - paths.append(os.path.join(os.path.expanduser("~"), ".config", "zarr")) - return paths - - -def collect_yaml(paths: list[str]) -> dict[str, Any]: - """Merge YAML config files found at ``paths`` into a flat dotted-key map.""" - import yaml - - merged: dict[str, Any] = {} - for path in paths: - candidates: list[str] = [] - if os.path.isdir(path): - for fn in sorted(os.listdir(path)): - if fn.endswith((".yaml", ".yml")): - candidates.append(os.path.join(path, fn)) - elif os.path.isfile(path): - candidates.append(path) - for candidate in candidates: - with contextlib.suppress(FileNotFoundError): - with open(candidate) as fh: - data = yaml.safe_load(fh) - if isinstance(data, Mapping): - merged.update(_flatten_mapping(data)) - return merged - - -def _flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any]: - out: dict[str, Any] = {} - for k, v in data.items(): - key = f"{prefix}{k}" if not prefix else f"{prefix}.{k}" - if isinstance(v, Mapping) and k not in ("codecs",): - out.update(_flatten_mapping(v, key)) - else: - out[key] = v - return out - - -def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig: - """Apply a flat dotted-key override map to a snapshot.""" - for key, value in overrides.items(): - cfg = replace_path(cfg, key, value) - return cfg - - -def build_config(environ: Mapping[str, str] | None = None) -> ZarrConfig: - """Build the base snapshot: defaults < YAML files < environment variables.""" - if environ is None: - environ = os.environ - cfg = 
make_default_config() - cfg = apply_overrides(cfg, collect_yaml(_config_search_paths())) - cfg = apply_overrides(cfg, collect_env(environ)) - return cfg -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `uv run pytest tests/test_config_typed.py -k "env or precedence" -v` -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add src/zarr/core/config.py tests/test_config_typed.py -git commit -m "feat(config): add env-var and YAML ingest" -``` - ---- - -### Task 3: State holder + proxy with typed get/set/reset - -**Files:** -- Modify: `src/zarr/core/config.py` -- Test: `tests/test_config_typed.py` - -**Interfaces:** -- Consumes: `ZarrConfig`, `build_config`, `get_path`, `replace_path`, `to_nested_dict`, `deprecations` (Tasks 1–2 + existing). -- Produces: - - `class ZarrConfigManager` with: typed properties (`array`, `async_`, `threading`, `codec_pipeline`, `default_zarr_format`, `json_indent`, `codecs`, `buffer`, `ndbuffer`); overloaded `get(key, default=...)`; `set(mapping) -> _ConfigSet`; `reset()`; `refresh()`; `enable_gpu()`; `defaults` property; compat shims `to_dict()`, `update(mapping)`, `pprint()`. - - module-level `config: ZarrConfigManager`. - - `_ConfigSet` context manager. 
- -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_config_typed.py`: - -```python -from concurrent.futures import ThreadPoolExecutor - -from zarr.core.config import ZarrConfigManager - - -def test_proxy_attribute_and_string_access() -> None: - cfg = ZarrConfigManager() - assert cfg.array.order == "C" - assert cfg.get("array.order") == "C" - assert cfg.get("async.concurrency") == 10 - assert cfg.get("codecs", {})["blosc"] == "zarr.codecs.blosc.BloscCodec" - assert cfg.get("does.not.exist", "fallback") == "fallback" - - -def test_set_permanent_and_context() -> None: - cfg = ZarrConfigManager() - cfg.set({"array.order": "F"}) - assert cfg.get("array.order") == "F" # permanent - with cfg.set({"array.order": "C"}): - assert cfg.get("array.order") == "C" - assert cfg.get("array.order") == "F" # restored to permanent value - cfg.reset() - assert cfg.get("array.order") == "C" - - -def test_permanent_set_visible_in_worker_thread() -> None: - cfg = ZarrConfigManager() - cfg.set({"async.concurrency": 77}) - try: - with ThreadPoolExecutor(max_workers=1) as ex: - seen = ex.submit(lambda: cfg.get("async.concurrency")).result() - assert seen == 77 # ThreadPoolExecutor does not copy contextvars - finally: - cfg.reset() - - -def test_defaults_and_enable_gpu() -> None: - cfg = ZarrConfigManager() - assert cfg.defaults["array"]["order"] == "C" - with cfg.set({"buffer": "x"}): - pass - cfg.enable_gpu() - try: - assert cfg.get("buffer") == "zarr.buffer.gpu.Buffer" - assert cfg.get("ndbuffer") == "zarr.buffer.gpu.NDBuffer" - finally: - cfg.reset() -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `uv run pytest tests/test_config_typed.py -k "proxy or set_permanent or worker or defaults_and" -v` -Expected: FAIL — `ImportError` / attribute errors. 
- -- [ ] **Step 3: Write minimal implementation** - -Add to `src/zarr/core/config.py` (add `from contextvars import ContextVar`, `from typing import overload`, `import warnings` if not present): - -```python -_MISSING = object() - - -class _ConfigSet: - """Context manager returned by ``ZarrConfigManager.set``. - - The change is applied immediately (permanent by default); using the object - as a ``with`` block restores the prior state on exit. - """ - - def __init__(self, manager: ZarrConfigManager, prev_base: ZarrConfig, token: Any) -> None: - self._manager = manager - self._prev_base = prev_base - self._token = token - - def __enter__(self) -> _ConfigSet: - return self - - def __exit__(self, *exc: object) -> None: - self._manager._restore(self._prev_base, self._token) - - -class ZarrConfigManager: - """Typed, donfig-compatible configuration object.""" - - def __init__(self) -> None: - self._base: ZarrConfig = build_config() - self._scope: ContextVar[ZarrConfig] = ContextVar("zarr_config_scope") - - # --- state resolution ------------------------------------------------- - def _current(self) -> ZarrConfig: - return self._scope.get(self._base) - - def _restore(self, prev_base: ZarrConfig, token: Any) -> None: - self._base = prev_base - self._scope.reset(token) - - # --- typed attribute access ------------------------------------------ - @property - def default_zarr_format(self) -> Literal[2, 3]: - return self._current().default_zarr_format - - @property - def array(self) -> ArraySettings: - return self._current().array - - @property - def async_(self) -> AsyncSettings: - return self._current().async_ - - @property - def threading(self) -> ThreadingSettings: - return self._current().threading - - @property - def codec_pipeline(self) -> CodecPipelineSettings: - return self._current().codec_pipeline - - @property - def json_indent(self) -> int: - return self._current().json_indent - - @property - def codecs(self) -> Mapping[str, str]: - return self._current().codecs - 
- @property - def buffer(self) -> str: - return self._current().buffer - - @property - def ndbuffer(self) -> str: - return self._current().ndbuffer - - # --- string API: get -------------------------------------------------- - @overload - def get(self, key: Literal["default_zarr_format"]) -> Literal[2, 3]: ... - @overload - def get(self, key: Literal["array.order"]) -> Literal["C", "F"]: ... - @overload - def get(self, key: Literal["array.write_empty_chunks"]) -> bool: ... - @overload - def get(self, key: Literal["array.read_missing_chunks"]) -> bool: ... - @overload - def get(self, key: Literal["array.target_shard_size_bytes"]) -> int | None: ... - @overload - def get(self, key: Literal["array.rectilinear_chunks"]) -> bool: ... - @overload - def get(self, key: Literal["array.sharding_coalesce_max_gap_bytes"]) -> int: ... - @overload - def get(self, key: Literal["array.sharding_coalesce_max_bytes"]) -> int: ... - @overload - def get(self, key: Literal["async.concurrency"]) -> int: ... - @overload - def get(self, key: Literal["async.timeout"]) -> float | None: ... - @overload - def get(self, key: Literal["threading.max_workers"]) -> int | None: ... - @overload - def get(self, key: Literal["json_indent"]) -> int: ... - @overload - def get(self, key: Literal["codec_pipeline.path"]) -> str: ... - @overload - def get(self, key: Literal["codec_pipeline.batch_size"]) -> int: ... - @overload - def get(self, key: Literal["buffer"]) -> str: ... - @overload - def get(self, key: Literal["ndbuffer"]) -> str: ... - @overload - def get(self, key: str, default: Any = ...) -> Any: ... 
- - def get(self, key: str, default: Any = _MISSING) -> Any: - resolved = self._apply_deprecation(key) - if resolved is None: - if default is _MISSING: - raise KeyError(key) - return default - try: - return get_path(self._current(), resolved) - except KeyError: - if default is _MISSING: - raise - return default - - # --- string API: set -------------------------------------------------- - def set(self, updates: Mapping[str, Any]) -> _ConfigSet: - prev_base = self._base - new = self._current() - for key, value in updates.items(): - resolved = self._apply_deprecation(key) - if resolved is None: - continue - new = replace_path(new, resolved, value) - self._base = new - token = self._scope.set(new) - return _ConfigSet(self, prev_base, token) - - # --- lifecycle -------------------------------------------------------- - def reset(self) -> None: - self._base = build_config() - with contextlib.suppress(LookupError): - self._scope.set(self._base) - - def refresh(self) -> None: - self._base = build_config() - - def enable_gpu(self) -> _ConfigSet: - return self.set( - {"buffer": "zarr.buffer.gpu.Buffer", "ndbuffer": "zarr.buffer.gpu.NDBuffer"} - ) - - # --- compat / introspection ------------------------------------------ - @property - def defaults(self) -> dict[str, Any]: - return to_nested_dict(make_default_config()) - - def to_dict(self) -> dict[str, Any]: - return to_nested_dict(self._current()) - - def update(self, updates: Mapping[str, Any]) -> None: - self.set(updates) - - def pprint(self) -> None: - import pprint as _pp - - _pp.pprint(self.to_dict()) - - # --- deprecations ----------------------------------------------------- - def _apply_deprecation(self, key: str) -> str | None: - if key not in deprecations: - return key - new_key = deprecations[key] - if new_key is None: - warnings.warn( - f"Configuration key {key!r} has been removed and no longer has " - f"any effect.", - ZarrDeprecationWarning, - stacklevel=3, - ) - return None - warnings.warn( - f"Configuration 
key {key!r} has been renamed to {new_key!r}.", - ZarrDeprecationWarning, - stacklevel=3, - ) - return new_key -``` - -Add `from zarr.errors import ZarrDeprecationWarning` to the imports — this class already exists in `src/zarr/errors.py` (a `DeprecationWarning` subclass). Define the module-level instance at the bottom of the schema/proxy section but BEFORE the existing donfig `config = Config(...)` (which Task 4 removes): - -```python -# Provisional new instance; Task 4 makes this THE module-level `config`. -_typed_config = ZarrConfigManager() -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `uv run pytest tests/test_config_typed.py -k "proxy or set_permanent or worker or defaults_and" -v` -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add src/zarr/core/config.py tests/test_config_typed.py -git commit -m "feat(config): add typed proxy with get/set/reset and deprecations" -``` - ---- - -### Task 4: Swap out donfig (make `config` the new proxy) - -**Files:** -- Modify: `src/zarr/core/config.py` (remove donfig `Config` subclass and instance; promote proxy) -- Test: existing `tests/test_config.py` (and the full suite) - -**Interfaces:** -- Consumes: everything from Tasks 1–3. -- Produces: module-level `config: ZarrConfigManager`; unchanged exports `BadConfigError`, `parse_indexing_order`. 
- -- [ ] **Step 1: Update the existing `defaults` assertion test** - -In `tests/test_config.py::test_config_defaults_set`, replace the `config.defaults == [ {...} ]` list-of-one-dict assertion with the new nested-dict form: - -```python -def test_config_defaults_set() -> None: - assert config.defaults == { - "default_zarr_format": 3, - "array": { - "order": "C", - "write_empty_chunks": False, - "read_missing_chunks": True, - "target_shard_size_bytes": None, - "rectilinear_chunks": False, - "sharding_coalesce_max_gap_bytes": 1 << 20, - "sharding_coalesce_max_bytes": 16 << 20, - }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, - }, - "codecs": dict(DEFAULT_CODECS), - "buffer": "zarr.buffer.cpu.Buffer", - "ndbuffer": "zarr.buffer.cpu.NDBuffer", - } - assert config.get("array.order") == "C" - assert config.get("async.concurrency") == 10 - assert config.get("async.timeout") is None - assert config.get("codec_pipeline.batch_size") == 1 - assert config.get("json_indent") == 2 -``` - -Importing `DEFAULT_CONFIG` from `zarr.core.config` is not needed; instead, add `DEFAULT_CODECS` to the test's existing config import line. - -- [ ] **Step 2: Run to verify it fails** - -Run: `uv run pytest tests/test_config.py::test_config_defaults_set -v` -Expected: FAIL — `config.defaults` is still donfig's list form. - -- [ ] **Step 3: Remove donfig and promote the proxy** - -In `src/zarr/core/config.py`: -1. Delete the `from donfig import Config as DConfig` import and the `if TYPE_CHECKING: from donfig.config_obj import ConfigSet` block. -2. Delete the `class Config(DConfig): ...` definition (its `reset`/`enable_gpu` now live on `ZarrConfigManager`). -3. Delete the `config = Config("zarr", defaults=[...], deprecations=deprecations)` block.
The big defaults dict is now expressed by the dataclasses + `DEFAULT_CODECS`; keep the `deprecations` dict (it is consumed by `ZarrConfigManager`). -4. Replace the provisional `_typed_config = ZarrConfigManager()` line with: - -```python -config = ZarrConfigManager() -``` - -5. Update the module docstring at the top: replace donfig references with a description of the typed config and the `ZARR_FOO__BAR` env-var behavior (keep the example showing `config.set({"codecs.bytes": "your.module.NewBytesCodec"})` and the `ZARR_CODECS__BYTES` env var — both still work). -6. Keep `parse_indexing_order` and `BadConfigError` exactly as-is. - -- [ ] **Step 4: Run the full config + dependent suites** - -Run: `uv run pytest tests/test_config.py -v` -Expected: PASS. - -Run: `uv run pytest tests/test_api.py tests/test_buffer.py tests/test_codec_entrypoints.py tests/test_v2.py tests/test_sync.py tests/test_common.py -q` -Expected: PASS (these import/use `config`). - -Run: `uv run pytest tests -q` -Expected: PASS (full suite; backwards-compat gate). - -- [ ] **Step 5: Run mypy** - -Run: `uv run mypy src/zarr/core/config.py src/zarr/registry.py src/zarr/core/sync.py` -Expected: no errors. Confirm `reveal_type` is not needed here; fix any typing fallout in consumers (e.g. casts that referenced donfig types). - -- [ ] **Step 6: Commit** - -```bash -git add src/zarr/core/config.py tests/test_config.py -git commit -m "feat(config): replace donfig with typed config object" -``` - ---- - -### Task 5: Remove the donfig dependency - -**Files:** -- Modify: `pyproject.toml` (lines ~39, ~246, ~272), `src/zarr/__init__.py` (~line 71) -- Test: import smoke test - -**Interfaces:** none new. 
- -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_config_typed.py`: - -```python -def test_donfig_not_imported() -> None: - import sys - - import zarr # noqa: F401 - - assert "donfig" not in sys.modules -``` - -- [ ] **Step 2: Run to verify it fails** - -Run: `uv run pytest tests/test_config_typed.py::test_donfig_not_imported -v` -Expected: FAIL — donfig still imported somewhere / installed and pulled in. - -- [ ] **Step 3: Edit dependency declarations** - -In `pyproject.toml`: -- Remove `'donfig>=0.8',` from the `dependencies` list (~line 39). -- Add `'pyyaml',` to the `dependencies` list (donfig previously pulled YAML support transitively; we now use it directly). -- Remove `'donfig @ git+https://github.com/pytroll/donfig',` from the `dynamic`/upstream group (~line 246). -- Remove `'donfig==0.8.*',` from the minimal-pins group (~line 272). - -In `src/zarr/__init__.py`, remove `"donfig",` from the `required` list (~line 71). - -- [ ] **Step 4: Re-sync the environment and verify** - -Run: `uv run --reinstall-package zarr pytest tests/test_config_typed.py::test_donfig_not_imported -v` -Expected: PASS. - -Run: `uv run python -c "import zarr; print(zarr.config.get('array.order'))"` -Expected: prints `C`. - -- [ ] **Step 5: Commit** - -```bash -git add pyproject.toml src/zarr/__init__.py tests/test_config_typed.py -git commit -m "build: drop donfig dependency, add pyyaml" -``` - ---- - -### Task 6: Drift-protection, typing assertions, docs, changelog - -**Files:** -- Test: `tests/test_config_typed.py` -- Modify: `src/zarr/core/config.py` docstring (if not already done in Task 4) -- Create: `changes/<PR>.misc.md` (`<PR>` is the pull-request number) - -**Interfaces:** none new.
- -- [ ] **Step 1: Write the drift-protection + typing tests** - -Add to `tests/test_config_typed.py`: - -```python -import typing - -from zarr.core.config import ZarrConfig, ZarrConfigManager, _SERIALIZED_NAMES - - -def _structured_leaf_keys(cfg_cls: type, prefix: str = "") -> list[str]: - import dataclasses - - keys: list[str] = [] - for f in dataclasses.fields(cfg_cls): - serialized = _SERIALIZED_NAMES.get(f.name, f.name) - key = f"{prefix}{serialized}" if not prefix else f"{prefix}.{serialized}" - ftype = f.type - if dataclasses.is_dataclass(ftype): - keys.extend(_structured_leaf_keys(ftype, key)) - elif f.name == "codecs": - continue # open mapping, intentionally not enumerated - else: - keys.append(key) - return keys - - -def test_every_structured_key_has_a_get_overload() -> None: - overloads = typing.get_overloads(ZarrConfigManager.get) - literal_keys: set[str] = set() - for ov in overloads: - hints = typing.get_type_hints(ov) - key_hint = hints.get("key") - if typing.get_origin(key_hint) is typing.Literal: - literal_keys.update(typing.get_args(key_hint)) - missing = set(_structured_leaf_keys(ZarrConfig)) - literal_keys - assert not missing, f"get() overloads missing for: {sorted(missing)}" - - -if typing.TYPE_CHECKING: - - def _typing_smoke(cfg: ZarrConfigManager) -> None: - typing.assert_type(cfg.get("array.order"), typing.Literal["C", "F"]) - typing.assert_type(cfg.array.order, typing.Literal["C", "F"]) - typing.assert_type(cfg.get("async.concurrency"), int) -``` - -Note: `f.type` may be a string under `from __future__ import annotations`. If so, resolve with `typing.get_type_hints(cfg_cls)` inside `_structured_leaf_keys` instead of reading `f.type` directly. Adjust the helper accordingly so dataclass detection works on resolved types. 
- -- [ ] **Step 2: Run to verify it fails (then passes once overloads complete)** - -Run: `uv run pytest tests/test_config_typed.py -k "overload" -v` -Expected: PASS if all overloads from Task 3 are present; if it lists missing keys, add the corresponding `get` overloads in `config.py` and re-run until PASS. - -- [ ] **Step 3: Type-check the typing smoke test** - -Run: `uv run mypy tests/test_config_typed.py` -Expected: no errors (`assert_type` calls confirm the precise static types). - -- [ ] **Step 4: Add the changelog entry** - -Create `changes/<PR>.misc.md` (replace `<PR>` with the PR number) with: - -```markdown -Replaced the ``donfig``-based configuration with a statically-typed -configuration object. ``zarr.config`` now provides precise static types for -attribute access (``zarr.config.array.order``) and for the dotted-string API -(``zarr.config.get("array.order")``). The string API, environment-variable -ingestion (``ZARR_FOO__BAR``), YAML config files, ``config.set`` (permanent and -as a context manager), ``config.reset``, ``config.enable_gpu``, and the -``deprecations`` mechanism are all preserved. The ``donfig`` dependency has been -removed. -``` - -- [ ] **Step 5: Update the module docstring (if not done in Task 4)** - -Confirm `src/zarr/core/config.py`'s top docstring no longer references donfig and documents the typed API + `ZARR_*` env vars + YAML. (Use single-backtick markdown — docs are mkdocs.) - -- [ ] **Step 6: Full verification + commit** - -Run: `uv run pytest tests/test_config.py tests/test_config_typed.py -q` -Expected: PASS. - -Run: `uv run pytest tests -q` -Expected: PASS. - -Run: `uv run mypy src tests/test_config_typed.py` -Expected: no errors. - -```bash -git add tests/test_config_typed.py src/zarr/core/config.py changes/ -git commit -m "test(config): drift-protection + typing assertions; docs + changelog" -``` - ---- - -## Self-Review - -**Spec coverage:** -- Schema dataclasses → Task 1.
Open `codecs` mapping → Task 1 (`get_path`/`replace_path` mapping handling) + tests. State holder (base + contextvar) → Task 3. Proxy + typed attribute access → Task 3. Hand-written overloads → Task 3, completeness enforced Task 6. Env + YAML ingest → Task 2. Deprecations → Task 3. Backwards-compat surface → Task 4 (full suite) + preserved methods (`to_dict`/`update`/`pprint`/`refresh`/`reset`/`enable_gpu`/`defaults`). donfig removal → Task 5. Drift protection + typing assertions + changelog + docs → Task 6. -- `async_` alias rationale → realized via `_FIELD_ALIASES`/`_SERIALIZED_NAMES` in Tasks 1/3/6. - -**Type consistency:** `ZarrConfig`, `ArraySettings`, `AsyncSettings`, `ThreadingSettings`, `CodecPipelineSettings`, `get_path`, `replace_path`, `to_nested_dict`, `build_config`, `collect_env`, `collect_yaml`, `apply_overrides`, `ZarrConfigManager`, `_ConfigSet`, `_FIELD_ALIASES`, `_SERIALIZED_NAMES`, `DEFAULT_CODECS` are used consistently across tasks. - -**Known follow-ups for the implementer (not placeholders — explicit decisions):** -- If `from __future__ import annotations` makes `dataclasses.fields(...).type` a string, resolve via `get_type_hints` in the drift helper (Task 6, Step 1). -- `set` semantics note: a top-level `config.set({...})` updates `_base` (cross-thread, permanent) and the contextvar scope; `with config.set({...})` restores both on exit. This matches donfig's permanent-by-default behavior while keeping cross-thread visibility (verified by `test_permanent_set_visible_in_worker_thread`). 
diff --git a/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md b/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md deleted file mode 100644 index e0549735c3..0000000000 --- a/docs/superpowers/specs/2026-06-25-statically-typed-config-design.md +++ /dev/null @@ -1,339 +0,0 @@ -# Statically-typed configuration for zarr-python - -**Date:** 2026-06-25 -**Status:** Approved design, ready for implementation planning - -## Problem - -zarr-python's configuration is built on [donfig](https://github.com/pytroll/donfig). -donfig stores config as an untyped nested `dict`, so there is no static type -information for any configuration value. `config.get("array.order")` is typed as -`Any`, `config.array` does not exist as a typed attribute, and there is no way for -a type checker to catch a misspelled key or a wrong-typed value. - -We want to drop donfig entirely and model the configuration as plain frozen -dataclasses, which gives native static typing for attribute access -(`config.array.order`), while retaining donfig's ergonomic dotted-string API -(`config.get("array.order")`, `config.set({"array.order": "F"})`) with precise -static types via hand-written overloads. This is the technique demonstrated in the -[`tytr`](https://github.com/d-v-b/tytr) project: a flattened mapping from dotted -keys to value types, surfaced through an overloaded getter/setter. - -## Non-negotiable constraint: backwards compatibility - -**Backwards compatibility is extremely important for this work.** The public -`zarr.config` object is widely used in downstream code, notebooks, and -documentation. The replacement MUST be a drop-in for every documented and -commonly-used pattern. 
Concretely: - -- All of these must continue to work with identical behavior and (where they - returned values) identical return values: - - `config.get("a.b.c")` and `config.get("a.b.c", default)` - - subtree retrieval: `config.get("codecs", {}).get(key)` - - `config.set({"a.b.c": value})` applied **permanently** - - `with config.set({"a.b.c": value}):` applied **scoped**, restored on exit - - `config.reset()` - - `config.enable_gpu()` - - `config.defaults` - - `BadConfigError` - - the `ZARR_FOO__BAR` environment-variable ingestion - - YAML config-file ingestion from standard locations - - the `deprecations` key-redirection/removal warnings -- Public import paths are unchanged: `from zarr.core.config import config, - BadConfigError, parse_indexing_order` and `zarr.config`. -- donfig provides a broader method surface (`to_dict`, `update`, `merge`, - `pprint`, `clear`, `refresh`, `collect`, ...). We preserve the subset zarr - itself uses (`get`, `set`, `reset`, `enable_gpu`, `defaults`, `clear`, - `refresh`) and additionally provide compatible shims for `to_dict`/`update`/ - `pprint` since these are plausible downstream uses. Any donfig method we do not - reimplement must raise a clear, actionable error pointing at the new API rather - than an `AttributeError`. -- Behavior changes are only acceptable where they are strictly additive (new - precise types) or where donfig behavior was undocumented/incidental. Any - observable change is called out in the changelog with migration guidance. -- A `towncrier` changelog entry under `changes/` documents the donfig removal and - confirms the API is preserved. - -## Architecture - -Three layers with clear boundaries. - -### Layer A — schema (frozen dataclasses) - -The configuration shape is a tree of frozen, slotted dataclasses. This is the -single source of truth for both structure and defaults. 
- -> **Naming note:** a distinct `ArrayConfig` already exists in -> `src/zarr/core/array_spec.py` (a runtime per-array object, unrelated to the -> global config). To avoid collision and confusion, the global-config schema -> dataclasses are named with a `Config` suffix scoped under the config module -> (e.g. the array-namespace schema below). If the names below would still read -> ambiguously next to the existing `ArrayConfig`, prefer an explicit suffix such -> as `ArraySettings` / `ZarrSettings` during implementation. The final names are -> an implementation detail; the structure is what matters. - -```python -@dataclass(frozen=True, slots=True) -class ArrayConfig: - order: Literal["C", "F"] = "C" - write_empty_chunks: bool = False - read_missing_chunks: bool = True - target_shard_size_bytes: int | None = None - rectilinear_chunks: bool = False - sharding_coalesce_max_gap_bytes: int = 1 << 20 # 1 MiB - sharding_coalesce_max_bytes: int = 16 << 20 # 16 MiB - -@dataclass(frozen=True, slots=True) -class AsyncConfig: - concurrency: int = 10 - timeout: float | None = None - -@dataclass(frozen=True, slots=True) -class ThreadingConfig: - max_workers: int | None = None - -@dataclass(frozen=True, slots=True) -class CodecPipelineConfig: - path: str = "zarr.core.codec_pipeline.BatchedCodecPipeline" - batch_size: int = 1 - -@dataclass(frozen=True, slots=True) -class ZarrConfig: - default_zarr_format: Literal[2, 3] = 3 - array: ArrayConfig = field(default_factory=ArrayConfig) - async_: AsyncConfig = field(default_factory=AsyncConfig) # serialized key: "async" - threading: ThreadingConfig = field(default_factory=ThreadingConfig) - json_indent: int = 2 - codec_pipeline: CodecPipelineConfig = field(default_factory=CodecPipelineConfig) - codecs: Mapping[str, str] = field(default_factory=lambda: dict(DEFAULT_CODECS)) - buffer: str = "zarr.buffer.cpu.Buffer" - ndbuffer: str = "zarr.buffer.cpu.NDBuffer" -``` - -Notes: -- `config.array.order` etc. 
are natively typed by the dataclass — no overloads - needed for the attribute-access path. -- `async_` carries the serialized key `"async"` (an illegal Python identifier). - The mapping between Python field name and serialized dotted key is recorded in a - small per-class `__key_aliases__` (or equivalent) so the string API and ingest - layers translate correctly. Attribute access for `async` is only available via - the string API (`config.get("async.concurrency")`); this matches donfig, which - also has no `config.async` attribute. - -#### Why the `async_` alias is unavoidable (and harmless) - -A natural objection: the `async_` alias is ugly — can't a programmatic -`TypedDict("ZarrConfig", {"async": int, ...})` keep the real key `"async"` and -avoid the alias? It can keep the *string* key, but it does **not** avoid the -problem, because the constraint here is a **syntax** rule, not a typing one: - -- `async` has been a hard keyword since Python 3.7. `config.async` is a - `SyntaxError` regardless of the type machinery behind it. `getattr(config, - "async")` works at runtime but cannot be statically typed precisely. So - attribute access to a field literally named `async` is impossible in any - approach. -- Functional/programmatic `TypedDict` does **not** lose static typing — type - checkers fully support `cfg["async"]` typed from a functional TypedDict. But it - does not rescue attribute access either; it merely moves you from `config.async` - (illegal) to `config["async"]` (subscript). It buys nothing the alias didn't, - and it gives up the natural dotted-attribute ergonomics (`config.array.order`) - for *every other* namespace, which would then also be subscript access. - -So the real axis is attribute-access vs subscript/string-access, not "typed vs -untyped". Every option is fully typed; only `config.async` (the attribute form) is -forbidden, by Python syntax, in all of them. 
- -Crucially, this is confined to the new typed-attribute convenience and does **not** -touch backwards compatibility. donfig never exposed a `config.async` attribute; the -only place `async` appears today is the *string key* `"async.concurrency"` (and the -env var `ZARR_ASYNC__CONCURRENCY`). Those are strings and behave identically -whether the schema is a dataclass or a TypedDict, and the serialized key stays -`"async"`. Therefore we keep both, fully typed: - -- `config.get("async.concurrency") -> int` — the real key, full backwards compat, - the **primary** documented path. -- `config.async_.concurrency -> int` — the optional typed-attribute convenience, - with the alias documented. - -Net: the dataclass approach keeps full static typing *and* clean attribute access -for every namespace except the one Python forbids by syntax — and for that one, no -approach can do better than an alias or a subscript. The `async_` wrinkle is -cosmetic, confined to attribute access, and costs nothing on the compatibility -surface that matters. -- `codecs` is an open `Mapping[str, str]` subtree (per design decision): users - register arbitrary codec names at runtime via `config.set({"codecs.foo": ...})` - and `ZARR_CODECS__FOO=...`. Structured keys get precise static types; codec keys - degrade to the string fallback. `DEFAULT_CODECS` holds the current default codec - name → import-path mapping verbatim. - -### Layer B — state holder (base snapshot + contextvar overlay) - -State is held as immutable `ZarrConfig` snapshots. To preserve donfig's exact -runtime semantics — in particular cross-thread visibility of permanent sets — we -use a **hybrid** of a process-global base and a context-local overlay rather than a -pure `ContextVar`. - -Rationale: zarr runs work in `ThreadPoolExecutor` (`src/zarr/core/sync.py`). -`ThreadPoolExecutor` does **not** copy `contextvars` into worker threads. 
A pure -`ContextVar` would make a permanent `config.set({...})` invisible inside worker -threads — a silent regression versus donfig's process-global dict mutation. The -hybrid avoids this. - -- `_base: ZarrConfig` — a module-global snapshot, process-wide, visible across all - threads. A **permanent** `config.set(...)` (not used as a `with` block) replaces - this reference. -- `_overlay: ContextVar[ZarrConfig | None]` — a context-local override. `with - config.set(...)` sets this and resets it via the returned `Token` on exit. - Provides async-safe and thread-safe scoping for the common `with config.set(...)` - idiom. -- Resolution: the effective snapshot is `_overlay.get() or _base`. -- Every mutation produces a **new** frozen `ZarrConfig` by applying the requested - dotted-key updates through `dataclasses.replace` along the path (a small - recursive `replace_path(snapshot, "a.b.c", value) -> ZarrConfig` helper). For the - open `codecs` mapping, updates copy-and-extend the dict. - -`config.set(...)` semantics, matching donfig: -- Applies immediately (mutates effective state) **and** returns a context-manager - object. -- If used as `with config.set(...):`, the prior state is restored on `__exit__`. -- If not used as a context manager, the change persists (permanent set updates - `_base`). - -### Layer C — proxy (`config`) - -`config` is the shared singleton everyone imports. It is **not** the data; it reads -the current resolved snapshot on each access, so existing `from zarr.core.config -import config` references continue to observe live updates (preserving donfig's -import-by-reference behavior). It exposes: - -- Typed attribute properties delegating to the resolved snapshot: `config.array -> - ArrayConfig`, `config.async_ -> AsyncConfig`, `config.json_indent -> int`, etc. -- The donfig-compatible string API: `get`, `set`, `reset`, `enable_gpu`, - `defaults`, plus compat shims (`to_dict`, `update`, `pprint`). 
- -## The typed string API (hand-written overloads) - -Per the design decision, the dotted-key → value-type overloads are **hand-written** -(no codegen, no `tytr` runtime dependency). This is the `tytr` getter pattern, -authored directly: - -```python -class _ConfigProxy: - @overload - def get(self, key: Literal["default_zarr_format"]) -> Literal[2, 3]: ... - @overload - def get(self, key: Literal["array.order"]) -> Literal["C", "F"]: ... - @overload - def get(self, key: Literal["array.write_empty_chunks"]) -> bool: ... - @overload - def get(self, key: Literal["async.concurrency"]) -> int: ... - @overload - def get(self, key: Literal["async.timeout"]) -> float | None: ... - @overload - def get(self, key: Literal["json_indent"]) -> int: ... - # ... one overload per structured leaf key ... - @overload - def get(self, key: str, default: object = ...) -> Any: ... # codecs.*, subtrees, unknown keys - def get(self, key: str, default: object = _MISSING) -> Any: ... -``` - -`set` mirrors this: an overloaded surface (or a `TypedDict` of optional dotted -keys) so that `config.set({"array.order": "F"})` type-checks the value against the -key. The open `codecs.*` keys and whole-subtree gets (`config.get("codecs", {})`) -resolve through the `str` fallback overload. - -### Drift protection - -Hand-written overloads can drift from the dataclass schema. A regression test walks -`ZarrConfig` recursively, enumerates every structured dotted leaf key, and asserts -each has a corresponding `get` overload with a matching return type (introspected -via `typing.get_overloads`). CI fails on any missing/mismatched overload. This -neutralizes the main downside of the hand-written approach. - -## Ingest sources - -Both retained (per design decision). Reimplemented in zarr (~a few dozen lines) -rather than vendoring donfig's loader. - -Precedence, lowest to highest: - -1. dataclass defaults -2. YAML config files -3. environment variables -4. 
runtime `config.set(...)` - -- **Environment variables:** collect `ZARR_*`, lower-case the key, treat `__` as - nested access, `ast.literal_eval` the value (with literal-eval failure falling - back to the raw string, matching donfig). Builds overrides merged into the base - snapshot at construction. -- **YAML files:** read from standard locations — `ZARR_CONFIG` env var path(s) and - the default config directory (e.g. `~/.config/zarr`), matching donfig's search - behavior. Parsed with the existing YAML dependency and merged under env vars. - -Ingested values are validated/coerced into the dataclass field types where the key -is structured; unknown keys under open subtrees (`codecs.*`) pass through as -strings. - -## Deprecations - -donfig's `deprecations` mechanism (old-key → new-key, or `None` for removed) is -reimplemented. Accessing or setting a deprecated key emits the same warning and -redirects to the new key (or raises/warns for removed keys). The existing -`deprecations` mapping in `config.py` is carried over verbatim: - -```python -deprecations = { - "array.v2_default_compressor.numeric": None, - # ... unchanged ... -} -``` - -## Backwards-compatibility verification - -Beyond the per-feature preservation above: - -- A compatibility test module exercises every pattern in the "Non-negotiable - constraint" list against the new implementation. -- `config.defaults` returns a representation equivalent to today's (the existing - `test_config_defaults_set` is updated to the new snapshot representation while - asserting the same values). -- Methods not reimplemented raise an informative error naming the supported - replacement, never a bare `AttributeError`. - -## Testing - -- Existing `tests/test_config.py` remains largely valid since the string API is - preserved; only `config.defaults` structural assertions are updated. -- New tests: - - overload ↔ dataclass sync (drift protection). - - env-var ingestion (including `ZARR_CODECS__*` dynamic keys). 
- YAML-file ingestion and precedence ordering. - - permanent-set visibility inside a `ThreadPoolExecutor` worker (the hybrid - state-model regression). - - `with config.set(...)` scoping under threads and asyncio tasks. - - deprecation warnings/redirects. -- Static-typing assertions in the test suite (the repo type-checks tests): - `reveal_type(config.get("array.order"))` is `Literal['C', 'F']`, - `reveal_type(config.array.order)` is `Literal['C', 'F']`, and a wrong-typed - `config.set({"array.order": "Q"})` is a type error. - -## Files affected - -- `src/zarr/core/config.py` — rewritten: dataclasses, proxy, state holder, string - API, ingest, deprecations. Keeps `config`, `BadConfigError`, - `parse_indexing_order` exports. -- `pyproject.toml` — remove `donfig` dependency; ensure a YAML dependency is - declared (currently transitive via donfig). -- `src/zarr/__init__.py` — remove `donfig` from the version-reporting table. -- `tests/test_config.py` and new test modules — as above. -- `changes/<PR>.misc.md` (or `.feature.md`) — changelog entry. -- Documentation referencing donfig (`config.py` module docstring, any docs/ pages) - — updated to describe the new typed API while keeping the string-API examples. - -## Out of scope - -- Changing the set of configuration keys or their defaults. -- Migrating the `codecs` registry out of config (the open `dict[str, str]` subtree - is retained). -- Any change to the `ArrayConfig`/`ArraySpec` runtime objects in - `core/array_spec.py` beyond what is needed to read from the new config.
From 96fcf95bdeb609d2b9df55f7dcf9994ed724b8ac Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 17:35:51 +0200 Subject: [PATCH 18/25] test(config): refactor to table-driven Expect/ExpectFail cases Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- tests/test_config_typed.py | 389 ++++++++++++++++++++++++++----------- 1 file changed, 276 insertions(+), 113 deletions(-) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 577768006f..72dc2501b6 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -6,6 +6,7 @@ import pytest +from tests.conftest import Expect, ExpectFail from zarr.core.config import ( _SERIALIZED_NAMES, DEFAULT_CODECS, @@ -24,102 +25,293 @@ if typing.TYPE_CHECKING: import pathlib +# --------------------------------------------------------------------------- +# Module-level constants used in parametrize lists (evaluated at collection time) +# --------------------------------------------------------------------------- -def test_default_config_values() -> None: +_REMOVED_KEY = "array.v2_default_compressor.numeric" +_DEFAULT = make_default_config() + +# --------------------------------------------------------------------------- +# 1. 
get_path — success cases +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + Expect(input="array.order", output="C", id="array-order"), + Expect(input="async.concurrency", output=10, id="async-concurrency-alias"), + Expect(input="json_indent", output=2, id="json-indent"), + Expect(input="codecs", output=DEFAULT_CODECS, id="codecs-dict"), + Expect(input="codecs.blosc", output="zarr.codecs.blosc.BloscCodec", id="codecs-blosc"), + ], + ids=lambda c: c.id, +) +def test_get_path(case: Expect[str, object]) -> None: + assert get_path(make_default_config(), case.input) == case.output + + +@pytest.mark.parametrize( + "case", + [ + ExpectFail(input="array.nonexistent", exception=KeyError, id="nonexistent-key"), + ], + ids=lambda c: c.id, +) +def test_get_path_raises(case: ExpectFail[str]) -> None: + with case.raises(): + get_path(make_default_config(), case.input) + + +# --------------------------------------------------------------------------- +# 2. 
replace_path +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + Expect(input=("array.order", "F"), output="F", id="array-order"), + Expect(input=("async.concurrency", 99), output=99, id="async-concurrency-alias"), + Expect( + input=("codecs.my_codec", "my.module.MyCodec"), + output="my.module.MyCodec", + id="codec-new-key", + ), + ], + ids=lambda c: c.id, +) +def test_replace_path(case: Expect[tuple[str, object], object]) -> None: + key, value = case.input + result = replace_path(make_default_config(), key, value) + assert get_path(result, key) == case.output + + +def test_replace_path_is_immutable() -> None: + """Original config is unchanged after replace_path (frozen dataclass).""" cfg = make_default_config() - assert cfg.default_zarr_format == 3 + _ = replace_path(cfg, "array.order", "F") assert cfg.array.order == "C" - assert cfg.array.sharding_coalesce_max_bytes == 16 << 20 - assert cfg.async_.concurrency == 10 - assert cfg.async_.timeout is None - assert cfg.threading.max_workers is None - assert cfg.json_indent == 2 - assert cfg.codec_pipeline.path == "zarr.core.codec_pipeline.BatchedCodecPipeline" - assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" - assert cfg.codecs == DEFAULT_CODECS -def test_get_path_structured_and_async_alias() -> None: - cfg = make_default_config() - assert get_path(cfg, "array.order") == "C" - assert get_path(cfg, "async.concurrency") == 10 # serialized key, not async_ - assert get_path(cfg, "json_indent") == 2 - assert get_path(cfg, "codecs") == DEFAULT_CODECS - assert get_path(cfg, "codecs.blosc") == "zarr.codecs.blosc.BloscCodec" - with pytest.raises(KeyError): - get_path(cfg, "array.nonexistent") +# --------------------------------------------------------------------------- +# 3. 
collect_env +# --------------------------------------------------------------------------- -def test_replace_path_is_immutable_and_typed() -> None: - cfg = make_default_config() - cfg2 = replace_path(cfg, "array.order", "F") - assert cfg.array.order == "C" # original unchanged (frozen) - assert cfg2.array.order == "F" - cfg3 = replace_path(cfg, "async.concurrency", 99) - assert cfg3.async_.concurrency == 99 - cfg4 = replace_path(cfg, "codecs.my_codec", "my.module.MyCodec") - assert cfg4.codecs["my_codec"] == "my.module.MyCodec" - assert "my_codec" not in cfg.codecs - - -def test_to_nested_dict_uses_serialized_keys() -> None: - nested = to_nested_dict(make_default_config()) - assert nested["array"]["order"] == "C" - assert nested["async"]["concurrency"] == 10 # serialized key - assert "async_" not in nested - assert nested["codecs"]["blosc"] == "zarr.codecs.blosc.BloscCodec" - - -def test_collect_env_parses_nested_and_literal() -> None: - env = { - "ZARR_ARRAY__ORDER": "F", - "ZARR_ASYNC__CONCURRENCY": "32", - "ZARR_CODECS__MY_CODEC": "my.module.MyCodec", - "UNRELATED": "ignored", - } - out = collect_env(env) - assert out["array.order"] == "F" - assert out["async.concurrency"] == 32 # ast.literal_eval -> int - assert out["codecs.my_codec"] == "my.module.MyCodec" # non-literal -> raw str - assert "unrelated" not in out - - -def test_apply_overrides_and_build_config_precedence() -> None: - cfg = apply_overrides( - build_config(environ={}), - {"array.order": "F", "codecs.x": "pkg.X"}, - ) - assert cfg.array.order == "F" - assert cfg.codecs["x"] == "pkg.X" - # env overrides defaults - cfg2 = build_config(environ={"ZARR_JSON_INDENT": "4"}) - assert cfg2.json_indent == 4 +@pytest.mark.parametrize( + "case", + [ + Expect( + input={ + "ZARR_ARRAY__ORDER": "F", + "ZARR_ASYNC__CONCURRENCY": "32", + "ZARR_CODECS__MY_CODEC": "my.module.MyCodec", + "UNRELATED": "ignored", + }, + output={ + "array.order": "F", + "async.concurrency": 32, + "codecs.my_codec": "my.module.MyCodec", + 
}, + id="nested-and-literal", + ), + Expect( + input={"ZARR_CONFIG": "/some/path.yaml", "ZARR_ARRAY__ORDER": "F"}, + output={"array.order": "F"}, + id="zarr-config-meta-var-skipped", + ), + ], + ids=lambda c: c.id, +) +def test_collect_env(case: Expect[dict[str, str], dict[str, object]]) -> None: + assert collect_env(case.input) == case.output -def test_collect_env_skips_zarr_config_meta_var() -> None: - """ZARR_CONFIG is a directive about where config lives, not a config key itself.""" - env = {"ZARR_CONFIG": "/some/path.yaml", "ZARR_ARRAY__ORDER": "F"} - out = collect_env(env) - assert "config" not in out - assert out["array.order"] == "F" +# --------------------------------------------------------------------------- +# 4. build_config +# --------------------------------------------------------------------------- -def test_build_config_zarr_config_env_does_not_raise() -> None: - """Setting ZARR_CONFIG to a nonexistent path must not crash build_config.""" - cfg = build_config(environ={"ZARR_CONFIG": "/nonexistent/path.yaml"}) - # The nonexistent YAML path is simply skipped; defaults remain intact. - from zarr.core.config import make_default_config +@pytest.mark.parametrize( + "case", + [ + Expect(input={}, output=_DEFAULT, id="empty-environ"), + Expect( + input={"ZARR_CONFIG": "/nonexistent/path.yaml"}, + output=_DEFAULT, + id="zarr-config-nonexistent", + ), + Expect( + input={"ZARR_JSON_INDENT": "4"}, + output=replace_path(_DEFAULT, "json_indent", 4), + id="json-indent-env", + ), + ], + ids=lambda c: c.id, +) +def test_build_config(case: Expect[dict[str, str], ZarrConfig]) -> None: + assert build_config(environ=case.input) == case.output - assert cfg == make_default_config() +# --------------------------------------------------------------------------- +# 5. 
apply_overrides +# --------------------------------------------------------------------------- -def test_proxy_attribute_and_string_access() -> None: - cfg = ZarrConfigManager() + +@pytest.mark.parametrize( + "case", + [ + Expect( + input={"array.order": "F", "codecs.x": "pkg.X"}, + output=replace_path(replace_path(_DEFAULT, "array.order", "F"), "codecs.x", "pkg.X"), + id="array-order-and-codec", + ), + ], + ids=lambda c: c.id, +) +def test_apply_overrides(case: Expect[dict[str, object], ZarrConfig]) -> None: + assert apply_overrides(build_config(environ={}), case.input) == case.output + + +# --------------------------------------------------------------------------- +# 6. to_nested_dict +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + Expect( + input=make_default_config(), + output=("C", 10, "zarr.codecs.blosc.BloscCodec"), + id="default-serialized-keys", + ), + ], + ids=lambda c: c.id, +) +def test_to_nested_dict(case: Expect[ZarrConfig, tuple[str, int, str]]) -> None: + nested = to_nested_dict(case.input) + order, concurrency, blosc = case.output + assert nested["array"]["order"] == order + assert nested["async"]["concurrency"] == concurrency + assert "async_" not in nested # serialized key, not the Python attribute name + assert nested["codecs"]["blosc"] == blosc + + +# --------------------------------------------------------------------------- +# 7. 
ZarrConfigManager.get — proxy string access +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + Expect(input="array.order", output="C", id="array-order"), + Expect(input="async.concurrency", output=10, id="async-concurrency-alias"), + Expect(input="codecs", output=DEFAULT_CODECS, id="codecs-dict"), + ], + ids=lambda c: c.id, +) +def test_proxy_get(case: Expect[str, object]) -> None: + assert ZarrConfigManager().get(case.input) == case.output + + +@pytest.mark.parametrize( + "case", + [ + Expect(input=("does.not.exist", "fallback"), output="fallback", id="default-fallback"), + ], + ids=lambda c: c.id, +) +def test_proxy_get_with_default(case: Expect[tuple[str, object], object]) -> None: + key, default = case.input + assert ZarrConfigManager().get(key, default) == case.output + + +# --------------------------------------------------------------------------- +# 8. Removed-deprecated-key behavior +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + Expect(input="fallback", output="fallback", id="get-with-default"), + ], + ids=lambda c: c.id, +) +def test_removed_deprecated_key_get_default(case: Expect[str, str]) -> None: + """get() with a removed deprecated key and a default returns the default silently.""" + assert ZarrConfigManager().get(_REMOVED_KEY, case.input) == case.output + + +@pytest.mark.parametrize( + "case", + [ + ExpectFail(input=_REMOVED_KEY, exception=KeyError, id="get-no-default"), + ], + ids=lambda c: c.id, +) +def test_removed_deprecated_key_get_raises(case: ExpectFail[str]) -> None: + """get() with a removed deprecated key and no default raises KeyError.""" + with case.raises(): + ZarrConfigManager().get(case.input) + + +# --------------------------------------------------------------------------- +# 9. 
set() must raise for both removed-deprecated keys and totally unknown keys +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + ExpectFail( + input={_REMOVED_KEY: "some_value"}, + exception=BadConfigError, + id="set-removed-deprecated", + ), + ExpectFail( + input={"totally.bogus.key": 1}, + exception=KeyError, + id="set-unknown-key", + ), + ], + ids=lambda c: c.id, +) +def test_set_invalid_key_raises(case: ExpectFail[dict[str, object]]) -> None: + """set() raises for both removed deprecated keys and totally unknown structured keys.""" + with case.raises(): + ZarrConfigManager().set(case.input) + + +# --------------------------------------------------------------------------- +# Default config values (dedicated — direct attribute assertions are clearest here) +# --------------------------------------------------------------------------- + + +def test_default_config_values() -> None: + cfg = make_default_config() + assert cfg.default_zarr_format == 3 assert cfg.array.order == "C" - assert cfg.get("array.order") == "C" - assert cfg.get("async.concurrency") == 10 - assert cfg.get("codecs", {})["blosc"] == "zarr.codecs.blosc.BloscCodec" - assert cfg.get("does.not.exist", "fallback") == "fallback" + assert cfg.array.sharding_coalesce_max_bytes == 16 << 20 + assert cfg.async_.concurrency == 10 + assert cfg.async_.timeout is None + assert cfg.threading.max_workers is None + assert cfg.json_indent == 2 + assert cfg.codec_pipeline.path == "zarr.core.codec_pipeline.BatchedCodecPipeline" + assert cfg.codecs["blosc"] == "zarr.codecs.blosc.BloscCodec" + assert cfg.codecs == DEFAULT_CODECS + # proxy attribute access via ZarrConfigManager + mgr = ZarrConfigManager() + assert mgr.array.order == "C" + + +# --------------------------------------------------------------------------- +# Stateful / behavioral tests (kept as dedicated functions) +# 
--------------------------------------------------------------------------- def test_set_permanent_and_context() -> None: @@ -171,34 +363,6 @@ def test_refresh_not_shadowed_by_prior_scope(monkeypatch: pytest.MonkeyPatch) -> assert mgr.get("array.order") == "C" # the prior permanent set is gone after rebuild -# --------------------------------------------------------------------------- -# Removed-deprecated-key behavior (donfig-faithful) -# --------------------------------------------------------------------------- - -_REMOVED_KEY = "array.v2_default_compressor.numeric" - - -def test_get_removed_deprecated_key_with_default() -> None: - """get() with a removed deprecated key and a default must return the default silently.""" - mgr = ZarrConfigManager() - result = mgr.get(_REMOVED_KEY, "fallback") - assert result == "fallback" - - -def test_get_removed_deprecated_key_no_default_raises_key_error() -> None: - """get() with a removed deprecated key and no default must raise KeyError, not BadConfigError.""" - mgr = ZarrConfigManager() - with pytest.raises(KeyError): - mgr.get(_REMOVED_KEY) - - -def test_set_removed_deprecated_key_raises_bad_config_error() -> None: - """set() with a removed deprecated key must still raise BadConfigError.""" - mgr = ZarrConfigManager() - with pytest.raises(BadConfigError): - mgr.set({_REMOVED_KEY: "some_value"}) - - # --------------------------------------------------------------------------- # Tolerant ingest: unknown env/YAML keys must warn and be skipped, not crash # --------------------------------------------------------------------------- @@ -227,10 +391,9 @@ def test_apply_overrides_unknown_key_warns_and_returns_default() -> None: assert result == default -def test_config_set_still_strict_for_unknown_keys() -> None: - """config.set() must remain strict: unknown structured keys raise KeyError.""" - with pytest.raises(KeyError): - ZarrConfigManager().set({"totally.bogus.key": 1}) +# 
--------------------------------------------------------------------------- +# donfig not imported +# --------------------------------------------------------------------------- def test_donfig_not_imported() -> None: From 9ac3c76bb2924e71908aa7554331542717e5ec1c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 17:40:20 +0200 Subject: [PATCH 19/25] test(config): restore codecs in-place mutation guard in immutability test Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- tests/test_config_typed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 72dc2501b6..5a360c3508 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -93,6 +93,10 @@ def test_replace_path_is_immutable() -> None: cfg = make_default_config() _ = replace_path(cfg, "array.order", "F") assert cfg.array.order == "C" + # the open `codecs` dict must not be mutated in place either: a frozen + # dataclass forbids attribute re-assignment but not `dict.__setitem__`. 
+ _ = replace_path(cfg, "codecs.my_codec", "my.module.MyCodec") + assert "my_codec" not in cfg.codecs # --------------------------------------------------------------------------- From 4e66915b684c03a90120e77a72d2974725e812d4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 20:22:31 +0200 Subject: [PATCH 20/25] test(config): verify get() overload return types match the schema Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- tests/test_config_typed.py | 81 ++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 5a360c3508..530b45204c 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -456,27 +456,33 @@ def test_build_config_environ_yaml_path_is_read(tmp_path: pathlib.Path) -> None: # --------------------------------------------------------------------------- -def _structured_leaf_keys(cfg_cls: type, prefix: str = "") -> list[str]: - """Walk a settings dataclass recursively and return every dotted leaf key. +def _structured_leaf_specs(cfg_cls: type, prefix: str = "") -> dict[str, object]: + """Walk a settings dataclass recursively and return ``{dotted_key: resolved_type}``. Uses ``typing.get_type_hints`` instead of ``f.type`` so that the ``from __future__ import annotations`` string-annotation form is resolved - to real types before ``dataclasses.is_dataclass`` is called. + to real types before ``dataclasses.is_dataclass`` is called. The open + ``codecs`` mapping is intentionally excluded. 
""" - keys: list[str] = [] + specs: dict[str, object] = {} resolved_hints = typing.get_type_hints(cfg_cls) for f in dataclasses.fields(cfg_cls): serialized = _SERIALIZED_NAMES.get(f.name, f.name) key = f"{prefix}.{serialized}" if prefix else serialized resolved_type = resolved_hints[f.name] if dataclasses.is_dataclass(resolved_type): - keys.extend(_structured_leaf_keys(typing.cast(type, resolved_type), key)) + specs.update(_structured_leaf_specs(typing.cast(type, resolved_type), key)) elif f.name == "codecs": # open mapping — intentionally not enumerated continue else: - keys.append(key) - return keys + specs[key] = resolved_type + return specs + + +def _structured_leaf_keys(cfg_cls: type, prefix: str = "") -> list[str]: + """Return every dotted leaf key for a settings dataclass (derived from specs).""" + return list(_structured_leaf_specs(cfg_cls, prefix)) def test_every_structured_key_has_a_get_overload() -> None: @@ -493,6 +499,48 @@ def test_every_structured_key_has_a_get_overload() -> None: assert not missing, f"get() overloads missing for: {sorted(missing)}" +def test_get_overload_return_types_match_fields() -> None: + """Assert that each get() overload's return type matches the dataclass field type. + + Builds two maps using ``typing.get_type_hints`` — one from the dataclass + field annotations, one from the overload return hints — then compares them + key by key. A mismatch (e.g. ``-> str`` instead of ``-> Literal["C","F"]``) + is reported as a clear failure rather than a missing-overload failure. 
+ """ + # Build map: key -> return type from overloads + overloads = typing.get_overloads(ZarrConfigManager.get) + overload_return: dict[str, object] = {} + for ov in overloads: + hints = typing.get_type_hints(ov) + key_hint = hints.get("key") + if typing.get_origin(key_hint) is typing.Literal: + (literal_val,) = typing.get_args(key_hint) + overload_return[literal_val] = hints["return"] + + # Build map: key -> field type from the dataclass schema + field_specs = _structured_leaf_specs(ZarrConfig) + + missing: list[str] = [] + mismatched: list[str] = [] + for key, expected_type in field_specs.items(): + if key not in overload_return: + missing.append(f" {key!r}: missing overload") + elif overload_return[key] != expected_type: + mismatched.append( + f" {key!r}: overload returns {overload_return[key]!r}," + f" field type is {expected_type!r}" + ) + + errors: list[str] = [] + if missing: + errors.append("get() overloads missing for keys:\n" + "\n".join(missing)) + if mismatched: + errors.append( + "get() overload return types do not match field types:\n" + "\n".join(mismatched) + ) + assert not errors, "\n\n".join(errors) + + # --------------------------------------------------------------------------- # Static-typing smoke test (only checked by mypy, not executed at runtime) # --------------------------------------------------------------------------- @@ -500,6 +548,23 @@ def test_every_structured_key_has_a_get_overload() -> None: if typing.TYPE_CHECKING: def _typing_smoke(cfg: ZarrConfigManager) -> None: + # --- positive assertions: each distinct return shape --- typing.assert_type(cfg.get("array.order"), typing.Literal["C", "F"]) - typing.assert_type(cfg.array.order, typing.Literal["C", "F"]) typing.assert_type(cfg.get("async.concurrency"), int) + typing.assert_type(cfg.get("array.write_empty_chunks"), bool) + typing.assert_type(cfg.get("async.timeout"), float | None) + typing.assert_type(cfg.get("threading.max_workers"), int | None) + 
typing.assert_type(cfg.get("default_zarr_format"), typing.Literal[2, 3]) + typing.assert_type(cfg.get("buffer"), str) + typing.assert_type(cfg.array.order, typing.Literal["C", "F"]) + + # --- negative: precision-from-above guards --- + # The return type is Literal["C","F"], which is narrower than str. + # If the overload were widened to -> str, assert_type would pass and + # the ignore below would become unused, causing warn_unused_ignores to + # fail CI. + typing.assert_type(cfg.get("array.order"), str) # type: ignore[assert-type] + typing.assert_type(cfg.get("default_zarr_format"), int) # type: ignore[assert-type] + + # --- negative: bad key type must be rejected by all overloads --- + cfg.get(123) # type: ignore[call-overload] From 033d11773565c5e0325d398e8d567811e11bb265 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 20:23:49 +0200 Subject: [PATCH 21/25] docs(config): document why set() is not statically value-typed set() takes Mapping[str, Any] (no static value validation) because precise typing needs an open TypedDict (declared keys + arbitrary codecs.* str keys), which mypy 2.x does not support; a closed TypedDict would break the open config.set({'codecs.': ...}) idiom. Revisit when mypy ships PEP 728 or we adopt a checker that supports it. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 92dc2cb615..49058f9fb2 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -434,6 +434,28 @@ def get(self, key: str, default: Any = _MISSING) -> Any: return default # --- string API: set -------------------------------------------------- + # + # NOTE: `set` accepts `Mapping[str, Any]`, so — unlike `get`, which is fully + # typed via per-key overloads — it does NOT statically validate values: + # `config.set({"array.order": "Q"})` is not a type error; it is caught at + # runtime instead. This is a deliberate, documented limitation. + # + # Static value typing would require an *open* TypedDict — declared structured + # keys validated by type, PLUS arbitrary `codecs.*` string keys allowed + # (PEP 728 `extra_items`/`closed`). mypy (2.x) does not support PEP 728 in any syntax + # and offers no feature flag for it. A *closed* TypedDict would instead reject + # the open codec-selection idiom + # `config.set({"codecs.bytes": "your.module.NewBytesCodec"})` and any + # dynamically built `dict[str, Any]` — a backwards-compatibility regression + # (the `codecs` namespace maps a codec name to a class path and is extended at + # runtime by users/plugins, so its keys cannot be enumerated statically). + # So `set` is intentionally permissive and validated at runtime: unknown + # structured keys raise (see `replace_path`), while `codecs.*` stays writable. + # + # REVISIT when mypy ships PEP 728 open-TypedDict support, or if zarr adopts a + # type checker that supports it (e.g. pyright's open/closed TypedDicts). At + # that point `set` can take an open TypedDict for static value validation + # while keeping `codecs.*` open. 
def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _ConfigSet: """Apply one or more config overrides. @@ -442,6 +464,12 @@ def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _Confi config.set({"array.order": "F"}) config.set(default_zarr_format=2) + + Unlike `get`, `set` does not statically type-check values: an invalid + value such as `config.set({"array.order": "Q"})` is reported at runtime, + not by the type checker. See the implementation comment above for the + rationale (the open `codecs.*` namespace prevents a precise TypedDict + under current mypy). """ all_updates: dict[str, Any] = {} if updates: From 037932954e6e6997d14dba0a4b442baa1baf7863 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 20:45:23 +0200 Subject: [PATCH 22/25] refactor(config): prefer object over Any for pass-through values Use object instead of Any for parameters/returns that merely store, pass through, or compare values (path helpers, env/YAML ingest, set/update kwargs, token, parse_indexing_order). Any is kept where it is load-bearing: the dynamic dataclasses.replace/fields dispatch, the get() fallback overload that powers config.get('codecs', {}).get(name), and the heterogeneous nested tree returned by to_nested_dict/defaults/to_dict (navigated by key). object also let mypy narrow parse_indexing_order, removing a redundant cast. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 63 +++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 49058f9fb2..5127658a7e 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -39,7 +39,7 @@ import os import warnings from collections.abc import Mapping -from contextvars import ContextVar +from contextvars import ContextVar, Token from dataclasses import dataclass, field, fields, replace from typing import Any, Literal, Self, cast, overload @@ -131,12 +131,12 @@ def make_default_config() -> ZarrConfig: return ZarrConfig() -def _resolve_field(obj: Any, segment: str) -> str: +def _resolve_field(obj: object, segment: str) -> str: """Translate a serialized key segment to the dataclass field name.""" return _FIELD_ALIASES.get(segment, segment) -def get_path(cfg: ZarrConfig, key: str) -> Any: +def get_path(cfg: ZarrConfig, key: str) -> object: """Read a dotted-string key from a `ZarrConfig` snapshot. Raises @@ -144,7 +144,7 @@ def get_path(cfg: ZarrConfig, key: str) -> Any: KeyError If the key does not resolve to a value. 
""" - obj: Any = cfg + obj: object = cfg segments = key.split(".") for i, segment in enumerate(segments): if isinstance(obj, Mapping): @@ -161,13 +161,16 @@ def get_path(cfg: ZarrConfig, key: str) -> Any: return obj -def replace_path(cfg: ZarrConfig, key: str, value: Any) -> ZarrConfig: +def replace_path(cfg: ZarrConfig, key: str, value: object) -> ZarrConfig: """Return a new `ZarrConfig` with the dotted-string key set to ``value``.""" segments = key.split(".") return cast(ZarrConfig, _replace_recursive(cfg, segments, value, key)) -def _replace_recursive(obj: Any, segments: list[str], value: Any, key: str) -> Any: +# `obj: Any` is load-bearing here: the function dispatches dynamically between a +# `Mapping` (codecs subtree) and a dataclass instance, and `dataclasses.replace` +# requires a dataclass-typed argument that `object` would reject. +def _replace_recursive(obj: Any, segments: list[str], value: object, key: str) -> object: segment = segments[0] if isinstance(obj, Mapping): remainder = ".".join(segments) @@ -183,8 +186,14 @@ def _replace_recursive(obj: Any, segments: list[str], value: Any, key: str) -> A def to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]: - """Convert a `ZarrConfig` to a donfig-style nested dict (serialized keys).""" + """Convert a `ZarrConfig` to a donfig-style nested dict (serialized keys). + Returns a heterogeneous, JSON-like tree (nested dicts and scalars) that + callers navigate by key, so `Any` values are appropriate here. + """ + + # `obj: Any` is also load-bearing: `dataclasses.fields` requires a + # dataclass-typed argument that `object` would reject. 
def convert(obj: Any) -> Any: if isinstance(obj, Mapping): return dict(obj) @@ -206,7 +215,7 @@ def convert(obj: Any) -> Any: _ENV_META_VARS: frozenset[str] = frozenset({"ZARR_CONFIG"}) -def _parse_env_value(raw: str) -> Any: +def _parse_env_value(raw: str) -> object: """Parse an env value with ``ast.literal_eval``; fall back to the raw string.""" try: return ast.literal_eval(raw) @@ -214,7 +223,7 @@ def _parse_env_value(raw: str) -> Any: return raw -def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: +def collect_env(environ: Mapping[str, str]) -> dict[str, object]: """Collect ``ZARR_*`` environment variables into a flat dotted-key map. ``ZARR_FOO__BAR_BAZ=1`` becomes ``{"foo.bar_baz": 1}`` — the key is @@ -223,7 +232,7 @@ def collect_env(environ: Mapping[str, str]) -> dict[str, Any]: Variables listed in ``_ENV_META_VARS`` (e.g. ``ZARR_CONFIG``) are directives about where config lives and are skipped. """ - out: dict[str, Any] = {} + out: dict[str, object] = {} for name, raw in environ.items(): if not name.startswith(ENV_PREFIX): continue @@ -245,11 +254,11 @@ def _config_search_paths(environ: Mapping[str, str]) -> list[str]: return paths -def collect_yaml(paths: list[str]) -> dict[str, Any]: +def collect_yaml(paths: list[str]) -> dict[str, object]: """Merge YAML config files found at ``paths`` into a flat dotted-key map.""" import yaml - merged: dict[str, Any] = {} + merged: dict[str, object] = {} for path in paths: candidates: list[str] = [] if os.path.isdir(path): @@ -269,8 +278,8 @@ def collect_yaml(paths: list[str]) -> dict[str, Any]: return merged -def _flatten_mapping(data: Mapping[str, Any], prefix: str = "") -> dict[str, Any]: - out: dict[str, Any] = {} +def _flatten_mapping(data: Mapping[str, object], prefix: str = "") -> dict[str, object]: + out: dict[str, object] = {} for k, v in data.items(): key = f"{prefix}{k}" if not prefix else f"{prefix}.{k}" if isinstance(v, Mapping): @@ -280,7 +289,7 @@ def _flatten_mapping(data: Mapping[str, Any], 
prefix: str = "") -> dict[str, Any return out -def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, Any]) -> ZarrConfig: +def apply_overrides(cfg: ZarrConfig, overrides: Mapping[str, object]) -> ZarrConfig: """Apply a flat dotted-key override map to a snapshot. Used exclusively by `build_config` for env/YAML ingest. Unknown keys are @@ -319,7 +328,9 @@ class _ConfigSet: as a ``with`` block restores the prior state on exit. """ - def __init__(self, manager: ZarrConfigManager, prev_base: ZarrConfig, token: Any) -> None: + def __init__( + self, manager: ZarrConfigManager, prev_base: ZarrConfig, token: Token[ZarrConfig] + ) -> None: self._manager = manager self._prev_base = prev_base self._token = token @@ -342,7 +353,7 @@ def __init__(self) -> None: def _current(self) -> ZarrConfig: return self._scope.get(self._base) - def _restore(self, prev_base: ZarrConfig, token: Any) -> None: + def _restore(self, prev_base: ZarrConfig, token: Token[ZarrConfig]) -> None: self._base = prev_base self._scope.reset(token) @@ -417,9 +428,12 @@ def get(self, key: Literal["buffer"]) -> str: ... @overload def get(self, key: Literal["ndbuffer"]) -> str: ... @overload - def get(self, key: str, default: Any = ...) -> Any: ... + # The fallback `-> Any` is deliberate: it lets `config.get("codecs", {})` be + # used as a mapping (e.g. `.get(name)` in the registry) and supports unknown + # keys. `object` here would force every such call site to narrow first. + def get(self, key: str, default: object = ...) -> Any: ... - def get(self, key: str, default: Any = _MISSING) -> Any: + def get(self, key: str, default: object = _MISSING) -> Any: resolved = self._apply_deprecation(key, raise_on_removed=False) if resolved is None: # Key was removed; treat as absent — honour the caller's default. @@ -456,7 +470,7 @@ def get(self, key: str, default: Any = _MISSING) -> Any: # type checker that supports it (e.g. pyright's open/closed TypedDicts). 
At # that point `set` can take an open TypedDict for static value validation # while keeping `codecs.*` open. - def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _ConfigSet: + def set(self, updates: Mapping[str, object] | None = None, **kwargs: object) -> _ConfigSet: """Apply one or more config overrides. Accepts either a mapping of dotted keys to values, keyword arguments @@ -471,7 +485,7 @@ def set(self, updates: Mapping[str, Any] | None = None, **kwargs: Any) -> _Confi rationale (the open `codecs.*` namespace prevents a precise TypedDict under current mypy). """ - all_updates: dict[str, Any] = {} + all_updates: dict[str, object] = {} if updates: all_updates.update(updates) all_updates.update(kwargs) @@ -509,7 +523,7 @@ def defaults(self) -> dict[str, Any]: def to_dict(self) -> dict[str, Any]: return to_nested_dict(self._current()) - def update(self, updates: Mapping[str, Any]) -> None: + def update(self, updates: Mapping[str, object]) -> None: self.set(updates) def pprint(self) -> None: @@ -584,8 +598,9 @@ class BadConfigError(ValueError): config = ZarrConfigManager() -def parse_indexing_order(data: Any) -> Literal["C", "F"]: +def parse_indexing_order(data: object) -> Literal["C", "F"]: if data in ("C", "F"): - return cast("Literal['C', 'F']", data) + # the membership check narrows `data` to Literal["C", "F"] + return data msg = f"Expected one of ('C', 'F'), got {data} instead." raise ValueError(msg) From 86c8bcacd0cacdc69d1b6397368c25d1d3da4a08 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 22:53:37 +0200 Subject: [PATCH 23/25] feat(config): suggest the closest key on an unknown config key config.get/config.set on an unknown key now raise a KeyError that names the key and suggests the most similar valid one (via difflib), e.g. 'array.0rder' -> "Did you mean 'array.order'?". Candidates are the schema's container and leaf keys plus current codecs.* entries. Still a KeyError, so existing handlers are unaffected. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 45 +++++++++++++++++++++++++++++++++++--- tests/test_config_typed.py | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 5127658a7e..77d468434d 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -36,6 +36,7 @@ import ast import contextlib +import difflib import os import warnings from collections.abc import Mapping @@ -185,6 +186,40 @@ def _replace_recursive(obj: Any, segments: list[str], value: object, key: str) - return replace(obj, **{field_name: new_child}) +def _all_keys(cfg: ZarrConfig) -> list[str]: + """Return every valid dotted key for ``cfg``. + + Includes container keys (e.g. ``array``), leaf keys (e.g. ``array.order``), + and the current ``codecs.`` entries. Used to suggest a close match + when an unknown key is requested. + """ + keys: list[str] = [] + + # `obj: Any` is load-bearing: `dataclasses.fields` requires a dataclass arg. + def walk(obj: Any, prefix: str) -> None: + for f in fields(obj): + serialized = _SERIALIZED_NAMES.get(f.name, f.name) + key = f"{prefix}.{serialized}" if prefix else serialized + keys.append(key) + value = getattr(obj, f.name) + if isinstance(value, Mapping): + keys.extend(f"{key}.{name}" for name in value) + elif hasattr(type(value), "__dataclass_fields__"): + walk(value, key) + + walk(cfg, "") + return keys + + +def _unknown_key_error(key: str, cfg: ZarrConfig) -> KeyError: + """Build a `KeyError` for an unknown config key, suggesting the closest match.""" + msg = f"{key!r} is not a valid configuration key." + matches = difflib.get_close_matches(key, _all_keys(cfg), n=1) + if matches: + msg += f" Did you mean {matches[0]!r}?" 
+ return KeyError(msg) + + def to_nested_dict(cfg: ZarrConfig) -> dict[str, Any]: """Convert a `ZarrConfig` to a donfig-style nested dict (serialized keys). @@ -440,11 +475,12 @@ def get(self, key: str, default: object = _MISSING) -> Any: if default is _MISSING: raise KeyError(key) return default + current = self._current() try: - return get_path(self._current(), resolved) + return get_path(current, resolved) except KeyError: if default is _MISSING: - raise + raise _unknown_key_error(key, current) from None return default # --- string API: set -------------------------------------------------- @@ -493,7 +529,10 @@ def set(self, updates: Mapping[str, object] | None = None, **kwargs: object) -> new = self._current() for key, value in all_updates.items(): resolved = self._apply_deprecation(key, raise_on_removed=True) - new = replace_path(new, resolved, value) + try: + new = replace_path(new, resolved, value) + except KeyError: + raise _unknown_key_error(key, new) from None self._base = new token = self._scope.set(new) return _ConfigSet(self, prev_base, token) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 530b45204c..5ceb31db2c 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -291,6 +291,45 @@ def test_set_invalid_key_raises(case: ExpectFail[dict[str, object]]) -> None: ZarrConfigManager().set(case.input) +# --------------------------------------------------------------------------- +# 10. 
Unknown keys produce a helpful "did you mean" message (get and set) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "case", + [ + ExpectFail(input="array.0rder", exception=KeyError, msg=r"array\.order", id="get-typo"), + ExpectFail( + input="zzzzzzzz", + exception=KeyError, + msg="not a valid configuration key", + id="get-no-match", + ), + ], + ids=lambda c: c.id, +) +def test_get_unknown_key_message(case: ExpectFail[str]) -> None: + """get() on an unknown key reports it and suggests the closest valid key.""" + with case.raises(): + ZarrConfigManager().get(case.input) + + +@pytest.mark.parametrize( + "case", + [ + ExpectFail( + input={"array.0rder": "F"}, exception=KeyError, msg=r"array\.order", id="set-typo" + ), + ], + ids=lambda c: c.id, +) +def test_set_unknown_key_message(case: ExpectFail[dict[str, object]]) -> None: + """set() on an unknown structured key suggests the closest valid key.""" + with case.raises(): + ZarrConfigManager().set(case.input) + + # --------------------------------------------------------------------------- # Default config values (dedicated — direct attribute assertions are clearest here) # --------------------------------------------------------------------------- From ba646b3d336c812d27487c9e8a25b2b517c982a1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 25 Jun 2026 23:07:48 +0200 Subject: [PATCH 24/25] feat(config): roster fallback for unknown keys; scope suggestion to level When an unknown key has no close match, list the available keys at the deepest resolvable level (capped at 10, '... (N more)' beyond). Suggestions are now scoped to the failed segment vs that level's children, which avoids misleading prefix matches (e.g. 'codecs.unknown' no longer suggests 'codecs.numcodecs.zstd'; it lists the codec names instead). Use explicit len()/!= '' checks rather than collection truthiness, and is_dataclass for narrowing. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XKHgWSxDXtTmNgAebZg41U --- src/zarr/core/config.py | 75 +++++++++++++++++++++++++------------- tests/test_config_typed.py | 46 +++++++++++++++++++---- 2 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 77d468434d..cdc27f25ab 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -41,7 +41,7 @@ import warnings from collections.abc import Mapping from contextvars import ContextVar, Token -from dataclasses import dataclass, field, fields, replace +from dataclasses import dataclass, field, fields, is_dataclass, replace from typing import Any, Literal, Self, cast, overload from zarr.errors import ZarrDeprecationWarning, ZarrUserWarning @@ -186,37 +186,62 @@ def _replace_recursive(obj: Any, segments: list[str], value: object, key: str) - return replace(obj, **{field_name: new_child}) -def _all_keys(cfg: ZarrConfig) -> list[str]: - """Return every valid dotted key for ``cfg``. +_ROSTER_LIMIT = 10 + + +def _children(obj: object) -> list[str]: + """Return the immediate child key names of a config node (else an empty list).""" + if isinstance(obj, Mapping): + return list(obj) + if is_dataclass(obj): + return [_SERIALIZED_NAMES.get(f.name, f.name) for f in fields(obj)] + return [] - Includes container keys (e.g. ``array``), leaf keys (e.g. ``array.order``), - and the current ``codecs.`` entries. Used to suggest a close match - when an unknown key is requested. - """ - keys: list[str] = [] - # `obj: Any` is load-bearing: `dataclasses.fields` requires a dataclass arg. 
- def walk(obj: Any, prefix: str) -> None: - for f in fields(obj): - serialized = _SERIALIZED_NAMES.get(f.name, f.name) - key = f"{prefix}.{serialized}" if prefix else serialized - keys.append(key) - value = getattr(obj, f.name) - if isinstance(value, Mapping): - keys.extend(f"{key}.{name}" for name in value) - elif hasattr(type(value), "__dataclass_fields__"): - walk(value, key) +def _resolve_for_suggestion(cfg: ZarrConfig, key: str) -> tuple[str, list[str], str]: + """Walk ``key`` as far as it resolves. - walk(cfg, "") - return keys + Returns the deepest resolvable dotted prefix, that node's child key names, + and the first segment that failed to resolve (the remainder is treated as a + single key once an open mapping like ``codecs`` is reached). For + ``"array.bogus"`` this is ``("array", [child keys of array], "bogus")``; + for an unknown top-level key ``"foo"``, ``("", [top-level keys], "foo")``. + """ + obj: object = cfg + prefix = "" + segments = key.split(".") + for i, segment in enumerate(segments): + if isinstance(obj, Mapping): + # the remainder indexes into an open mapping as a single key + return prefix, _children(obj), ".".join(segments[i:]) + field_name = _resolve_field(obj, segment) + if not hasattr(obj, field_name): + return prefix, _children(obj), segment + obj = getattr(obj, field_name) + prefix = f"{prefix}.{segment}" if prefix else segment + return prefix, _children(obj), "" def _unknown_key_error(key: str, cfg: ZarrConfig) -> KeyError: - """Build a `KeyError` for an unknown config key, suggesting the closest match.""" + """Build a `KeyError` for an unknown config key. + + Resolves ``key`` to the deepest valid level, then suggests the closest child + key there if one is similar enough; otherwise lists the available keys at + that level (capped at `_ROSTER_LIMIT`). + """ msg = f"{key!r} is not a valid configuration key." - matches = difflib.get_close_matches(key, _all_keys(cfg), n=1) - if matches: - msg += f" Did you mean {matches[0]!r}?" 
+ prefix, children, failed = _resolve_for_suggestion(cfg, key) + matches = difflib.get_close_matches(failed, children, n=1) if failed != "" else [] + if len(matches) > 0: + suggestion = f"{prefix}.{matches[0]}" if prefix != "" else matches[0] + return KeyError(f"{msg} Did you mean {suggestion!r}?") + if len(children) > 0: + shown = sorted(children) + roster = ", ".join(shown[:_ROSTER_LIMIT]) + if len(shown) > _ROSTER_LIMIT: + roster += f", ... ({len(shown) - _ROSTER_LIMIT} more)" + where = f" under {prefix!r}" if prefix != "" else "" + msg = f"{msg} Valid keys{where}: {roster}." return KeyError(msg) diff --git a/tests/test_config_typed.py b/tests/test_config_typed.py index 5ceb31db2c..878562049a 100644 --- a/tests/test_config_typed.py +++ b/tests/test_config_typed.py @@ -299,18 +299,41 @@ def test_set_invalid_key_raises(case: ExpectFail[dict[str, object]]) -> None: @pytest.mark.parametrize( "case", [ - ExpectFail(input="array.0rder", exception=KeyError, msg=r"array\.order", id="get-typo"), + # close match at the deepest resolvable level -> "Did you mean ...?" 
ExpectFail( - input="zzzzzzzz", + input="arr4y", exception=KeyError, msg=r"Did you mean .array.", id="suggest-top" + ), + ExpectFail( + input="array.0rder", + exception=KeyError, + msg=r"Did you mean .array\.order.", + id="suggest-nested", + ), + ExpectFail( + input="codecs.bl0sc", + exception=KeyError, + msg=r"Did you mean .codecs\.blosc.", + id="suggest-codec", + ), + # no close match -> roster of available keys at the last resolvable level + ExpectFail(input="foo", exception=KeyError, msg=r"Valid keys: .*array", id="roster-top"), + ExpectFail( + input="array.foo", exception=KeyError, - msg="not a valid configuration key", - id="get-no-match", + msg=r"Valid keys under .array.: .*order", + id="roster-nested", + ), + ExpectFail( + input="codecs.zzzzzzzz", + exception=KeyError, + msg=r"under .codecs.: .*more\)", + id="roster-truncated", ), ], ids=lambda c: c.id, ) def test_get_unknown_key_message(case: ExpectFail[str]) -> None: - """get() on an unknown key reports it and suggests the closest valid key.""" + """get() on an unknown key suggests the closest key or lists what's available.""" with case.raises(): ZarrConfigManager().get(case.input) @@ -319,13 +342,22 @@ def test_get_unknown_key_message(case: ExpectFail[str]) -> None: "case", [ ExpectFail( - input={"array.0rder": "F"}, exception=KeyError, msg=r"array\.order", id="set-typo" + input={"array.0rder": "F"}, + exception=KeyError, + msg=r"Did you mean .array\.order.", + id="set-suggest", + ), + ExpectFail( + input={"array.foo": "F"}, + exception=KeyError, + msg=r"Valid keys under .array.: .*order", + id="set-roster", ), ], ids=lambda c: c.id, ) def test_set_unknown_key_message(case: ExpectFail[dict[str, object]]) -> None: - """set() on an unknown structured key suggests the closest valid key.""" + """set() shares the same helpful unknown-key error as get().""" with case.raises(): ZarrConfigManager().set(case.input) From fae7fac746b191041c8cbb858aa170a05123de5e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett 
Date: Fri, 26 Jun 2026 10:27:03 +0200 Subject: [PATCH 25/25] test(config): drop tautological codec_pipeline default assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first `set`/`get` pair set `codec_pipeline.path` to the default `BatchedCodecPipeline` and asserted that default, so it passed regardless of whether `set` did anything — a no-op assertion donfig's permissiveness had masked (it also carried the original `.name` typo). The Mock pipeline block below already exercises set->get->actual-use with a non-default class, so this pair was redundant. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_018fnKiuy15kq7cPgRSKD3dr --- tests/test_config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index e1d48a8079..1eac0a1253 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -118,9 +118,6 @@ def test_config_codec_pipeline_class(store: Store) -> None: # has default value assert get_pipeline_class().__name__ != "" - config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) - assert get_pipeline_class() == zarr.core.codec_pipeline.BatchedCodecPipeline - _mock = Mock() class MockCodecPipeline(BatchedCodecPipeline):