imcf · lguerard · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 
 > Tiled processing of arbitrarily large images — any image, any function.
 
-```
+```text
 ┌──────┬──────┬──────┐     fn(tile) → labels      ┌──────┬──────┬──────┐
 │ tile │ tile │ tile │  ─────────────────────►    │  1   │  2   │  3   │
 ├──────┼──────┼──────┤                            ├──────┼──────┼──────┤
@@ -295,6 +295,7 @@ Full docs, guides and tutorials: **<https://imcf.one/patchworks/>**
 - dask[array], numpy, zarr, scipy
 
 Optional:
+
 - `psutil` — accurate RAM sizing for `tile_shape="auto"`
 - `nvidia-ml-py` — accurate GPU VRAM sizing
 - `tqdm` — progress bars

diff --git a/docs/examples/stardist.md b/docs/examples/stardist.md
@@ -47,18 +47,18 @@ tile_process(
     Load the model **outside** the `fn` closure. If you load it inside,
     it will be re-initialised (and potentially re-downloaded) once per tile.
 
-    For distributed execution, use `functools.partial` with a cached model:
+For distributed execution, use `functools.partial` with a cached model:
 
-    ```python
-    from functools import lru_cache
+```python
+from functools import lru_cache
 
 
-    @lru_cache(maxsize=1)
-    def _get_model():
-        return StarDist2D.from_pretrained("2D_versatile_fluo")
+@lru_cache(maxsize=1)
+def _get_model():
+    return StarDist2D.from_pretrained("2D_versatile_fluo")
 
 
-    def stardist_fn(tile):
-        model = _get_model()
-        ...
-    ```
+def stardist_fn(tile):
+    model = _get_model()
+    ...
+```
diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -46,11 +46,11 @@ patchworks can be installed from PyPI on all operating systems, for Python ≥ 3
 
 ## The one function you need
 
-```python
-from patchworks import tile_process
+```python
+from patchworks import tile_process
 
-result = tile_process(image, fn)
-```
+result = tile_process(image, fn)
+```
 
 `tile_process(image, fn)` splits `image` into tiles, runs `fn` on each tile,
 and returns a globally consistent label array.
@@ -65,17 +65,17 @@ and returns a globally consistent label array.
 patchworks is method-agnostic. Your function receives a NumPy array (one tile)
 and must return an integer label array of the same shape:
 
-```python
-import numpy as np
+```python
+import numpy as np
 
 
-def my_fn(tile: np.ndarray) -> np.ndarray:
-    from skimage.filters import threshold_otsu
-    from skimage.measure import label
+def my_fn(tile: np.ndarray) -> np.ndarray:
+    from skimage.filters import threshold_otsu
+    from skimage.measure import label
 
-    binary = tile > threshold_otsu(tile)
-    return label(binary).astype("int32")
-```
+    binary = tile > threshold_otsu(tile)
+    return label(binary).astype("int32")
+```
 
 The function is called independently on every tile. patchworks ensures that
 objects spanning tile boundaries are merged into a single label.
@@ -155,14 +155,14 @@ objects spanning tile boundaries are merged into a single label.
 Methods like Cellpose and StarDist need spatial context at tile boundaries.
 Use `overlap` (in voxels) so boundary objects are fully visible:
 
-```python
-result = tile_process(
-    "image.zarr",
-    my_fn,
-    tile_shape=(1, 2048, 2048),
-    overlap=20,  # 20-voxel halo on every side
-)
-```
+```python
+result = tile_process(
+    "image.zarr",
+    my_fn,
+    tile_shape=(1, 2048, 2048),
+    overlap=20,  # 20-voxel halo on every side
+)
+```
 
 !!! info "How overlap works"
     Each tile is expanded by `overlap` voxels on every side before calling `fn`.
@@ -173,22 +173,22 @@ result = tile_process(
 
 ## Use Cellpose
 
-```python
-from patchworks import tile_process
-from patchworks.plugins.cellpose import cellpose_fn
-
-fn = cellpose_fn("cyto3", gpu=True, diameter=30)
-
-tile_process(
-    "image.zarr",
-    fn,
-    channel=0,
-    tile_shape=(1, 2048, 2048),
-    overlap=20,
-    write_to="labels.zarr",
-    progress=True,
-)
-```
+```python
+from patchworks import tile_process
+from patchworks.plugins.cellpose import cellpose_fn
+
+fn = cellpose_fn("cyto3", gpu=True, diameter=30)
+
+tile_process(
+    "image.zarr",
+    fn,
+    channel=0,
+    tile_shape=(1, 2048, 2048),
+    overlap=20,
+    write_to="labels.zarr",
+    progress=True,
+)
+```
 
 See the [Cellpose 2-D example](examples/cellpose_2d.md) for the full workflow.
 

diff --git a/docs/guide/gpu_distributed.md b/docs/guide/gpu_distributed.md
@@ -60,7 +60,7 @@ in the same process as the kernel. When your segmentation function holds the
 Python GIL (every PyTorch/CUDA `eval` does), the worker thread can't send
 heartbeats. The scheduler declares it dead, and the merge fails:
 
-```
+```text
 FutureCancelledError: lost dependencies
 ```
 

diff --git a/docs/guide/merging.md b/docs/guide/merging.md
@@ -9,7 +9,7 @@ even though it's the same cell.
 
 patchworks solves this with a zarr-native merge algorithm:
 
-```
+```text
 Tile A labels:        Tile B labels:        After merge:
 ┌────────────┐        ┌────────────┐        ┌──────────────────────┐
 │  3   1   2 │        │  1   4   2 │        │  3   1   2 │ 501 5 502│
@@ -32,7 +32,7 @@ Each tile's labels are written to a temporary zarr once. This is critical:
 without staging, any downstream operation that reads the label array re-runs
 your segmentation function. The merge internally reads labels multiple times.
 
-```
+```text
 tile_process calls fn once per tile → staged zarr
                                          │
                          merge reads from staged zarr (no fn calls)

diff --git a/docs/guide/ome_zarr_napari.md b/docs/guide/ome_zarr_napari.md
@@ -76,6 +76,28 @@ and streaming the downsampled result out through dask with bounded chunks. The
 graph never chains level-on-level and no whole plane/volume is held in RAM, so
 terabyte images convert in bounded memory.
 
+### Sharding (fewer files)
+
+A big array becomes tens of thousands of tiny chunk files, which strain
+filesystems and object stores. Sharding packs many chunks into one **shard**
+file (zarr v3), cutting the file count ~100×:
+
+```python
+to_ome_zarr("scan.ims", "scan.zarr", shard=True)        # auto ~512 MB shards
+to_ome_zarr("scan.ims", "scan.zarr", shard=(1, 16, 2048, 2048))  # explicit
+```
+
+Default is `shard=False` for maximum reader compatibility — sharding is
+zarr-v3-only, so older tools may not be able to read it (a current
+zarr/napari stack can). A sharded write holds roughly one shard per worker
+in RAM, so very large shards cost memory.
+
+### Progress
+
+All write steps show a dask progress bar **by default** (`progress=True`), so
+you can see how long a conversion will take. Pass `progress=False` to silence
+it.
+
 !!! note "Install the readers you need"
     `pip install "patchworks[bioio]"` pulls `bioio` plus the `bioio-bioformats`
     catch-all reader (needs a JVM). For speed, add native readers for your

diff --git a/docs/guide/pitfalls.md b/docs/guide/pitfalls.md
@@ -30,7 +30,7 @@ single-GPU runs — patchworks pins it to 1 thread automatically).
 
 patchworks detects in-process clients at startup and raises immediately:
 
-```
+```text
 RuntimeError: Active Dask client uses an in-process worker (processes=False).
 This breaks the label merge when fn holds the GIL. Use a process-based
 cluster instead:

diff --git a/docs/guide/skip_empty.md b/docs/guide/skip_empty.md
@@ -88,6 +88,6 @@ tile_process(
 After a `tile_process` run with `skip_empty=True`, the log reports exactly
 how many tiles ran your function:
 
-```
+```text
 INFO patchworks._core: skip_empty: 486/2200 tiles ran fn, 1714 skipped (max<=412.0)
 ```
diff --git a/docs/guide/tiling.md b/docs/guide/tiling.md
@@ -13,6 +13,7 @@ peak RAM during segmentation is approximately one tile's worth of data.
 ## Choosing a tile size
 
 The right tile size depends on:
+
 - Your available RAM (or GPU VRAM)
 - The minimum context your segmentation method needs (objects should fit fully
   inside a tile, or you need overlap)
@@ -62,7 +63,7 @@ Methods that need spatial context (Cellpose, StarDist, U-Net) produce wrong
 results near tile edges: objects at the boundary are cut off. Overlap fixes this
 by expanding each tile by `overlap` voxels on every side.
 
-```
+```text
 No overlap:        With overlap=20:
 ┌──────────┐      ┌──────────────────┐
 │          │      │  ░░░░░░░░░░░░░░  │
@@ -86,4 +87,4 @@ No overlap:        With overlap=20:
     automatically clips the depth per axis, so z-tiles of size 1 (typical in
     2-D Cellpose mode) get `depth=0` in z even if you pass `overlap=20`.
 
-    Axes that are too small for the requested overlap simply get a smaller halo.
+    Axes that are too small for the requested overlap simply get a smaller halo.
diff --git a/docs/index.md b/docs/index.md
@@ -2,7 +2,7 @@
 
 **Tiled processing of arbitrarily large images — any image, any function.**
 
-```
+```text
 ┌──────┬──────┬──────┐                    ┌──────┬──────┬──────┐
 │      │      │      │   fn(tile) → IDs   │  1   │  2   │  3   │
 │      │      │      │  ───────────────►  │      │      │      │

diff --git a/src/patchworks/_chunks.py b/src/patchworks/_chunks.py
@@ -49,6 +49,14 @@ def auto_overlap(diameter: float, safety: float = 1.0) -> int:
 
 
 def _get_available_memory() -> int:
+    """Return available system RAM in bytes.
+
+    Returns
+    -------
+    int
+        Available memory via ``psutil``, or an 8 GiB fallback if it is not
+        installed.
+    """
     try:
         import psutil
 
@@ -103,7 +111,14 @@ def safe_worker_count(
 
 
 def _get_gpu_memory() -> int:
-    """Return free GPU VRAM in bytes. Falls back to 8 GiB default."""
+    """Return free GPU VRAM in bytes.
+
+    Returns
+    -------
+    int
+        Free VRAM of GPU 0 via ``nvidia-ml-py``, or an 8 GiB fallback if the
+        query fails.
+    """
     try:
         import pynvml
 

diff --git a/src/patchworks/_cluster.py b/src/patchworks/_cluster.py
@@ -9,7 +9,14 @@
 
 
 def _distributed_client():
-    """Return the active dask.distributed Client, or None."""
+    """Return the active dask.distributed Client, or None.
+
+    Returns
+    -------
+    distributed.Client or None
+        The current client, or ``None`` if none is active / distributed is not
+        installed.
+    """
     try:
         from dask.distributed import get_client
 
@@ -19,12 +26,22 @@ def _distributed_client():
 
 
 def _client_is_in_process(client) -> bool:
-    """True if *client* runs its worker in this process (processes=False).
+    """Whether *client* runs its worker in this process (``processes=False``).
 
     An in-process worker shares the GIL. A long task that holds the GIL
     (e.g. a Cellpose/torch eval) starves the worker heartbeat, the scheduler
     declares it dead, and the P2P merge barrier drops its inputs →
     "FutureCancelledError: lost dependencies".
+
+    Parameters
+    ----------
+    client : distributed.Client
+        The client to inspect.
+
+    Returns
+    -------
+    bool
+        True if any worker address uses the ``inproc://`` transport.
     """
     try:
         for addr in client.scheduler_info().get("workers", {}):