From 92cdd6d42b8e067dac5d5eb324206dc33ca5b30e Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Wed, 29 Apr 2026 17:47:16 -0400
Subject: [PATCH 1/6] perf(fix_illum): cap PyTorch threads via
 configure_all_libraries

---
 scripts/linum_fix_illumination_3d.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/linum_fix_illumination_3d.py b/scripts/linum_fix_illumination_3d.py
index 7063c375..7f32c7d6 100644
--- a/scripts/linum_fix_illumination_3d.py
+++ b/scripts/linum_fix_illumination_3d.py
@@ -57,9 +57,12 @@ def _build_arg_parser() -> argparse.ArgumentParser:
 
 def process_tile(params: dict) -> tuple:
     """Process a tile and add it to the output mosaic."""
-    from linumpy.config.threads import apply_threadpool_limits
+    from linumpy.config.threads import configure_all_libraries
 
-    apply_threadpool_limits()
+    # configure_all_libraries() also caps PyTorch intra-/inter-op threads
+    # (used by BaSiCPy). apply_threadpool_limits() alone misses torch and
+    # lets it default to ~half the host cores, oversubscribing the node.
+    configure_all_libraries()
 
     file = params["slice_file"]
     z = params["z"]

From af469527158e11045077ff7d86481ed15f1fa279 Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Wed, 29 Apr 2026 17:42:19 -0400
Subject: [PATCH 2/6] perf(pipeline): bump fix_illumination maxForks 1->4 based
 on measurement

---
 workflows/reconst_3d/nextflow.config | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config
index f7cafa5b..3765a1ab 100644
--- a/workflows/reconst_3d/nextflow.config
+++ b/workflows/reconst_3d/nextflow.config
@@ -391,9 +391,8 @@ process {
 
     withName: "resample_mosaic_grid" {
         scratch = false
-        // Parallel resampling on GPU. Peak ~9 GB/fork (Gaussian intermediate at
-        // 2x tile size). With 2x A6000 (48 GB each), 6 forks = 3/GPU = ~27 GB,
-        // leaving headroom for fix_illumination running concurrently.
+        // Measured: ~1 GB GPU memory per fork. IO (RAID at 100%) is the
+        // gating factor at 6 forks, not GPU. Bumping higher won't help.
         maxForks = params.use_gpu ? 6 : null
     }
 
@@ -405,8 +404,9 @@ process {
     }
 
     withName: "fix_illumination" {
-        // Limit to 1 parallel instance - BaSiCPy/PyTorch is memory-intensive
-        maxForks = params.use_gpu ? 1 : null
+        // Measured: BaSiCPy/PyTorch uses ~374 MiB per fork on this dataset.
+        // 4 forks ≈ 1.5 GB, well under per-GPU capacity (49 GB).
+        maxForks = params.use_gpu ? 4 : null
         // Don't set CUDA_VISIBLE_DEVICES - let linumpy.gpu auto-select GPU with most free memory
     }
 

From 3fa415784cbcc17501346678307558810226b2b3 Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Wed, 29 Apr 2026 17:32:01 -0400
Subject: [PATCH 3/6] perf(pipeline): skip pyramids on intermediate ome-zarr
 outputs

---
 scripts/linum_crop_3d_mosaic_below_interface.py  |  3 ++-
 scripts/linum_fix_illumination_3d.py             |  3 ++-
 scripts/linum_normalize_intensities_per_slice.py |  3 ++-
 scripts/linum_stitch_3d_refined.py               |  3 ++-
 workflows/reconst_3d/soct_3d_reconst.nf          | 14 ++++++++------
 5 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/scripts/linum_crop_3d_mosaic_below_interface.py b/scripts/linum_crop_3d_mosaic_below_interface.py
index 0fbcf2c9..ecdfdb44 100644
--- a/scripts/linum_crop_3d_mosaic_below_interface.py
+++ b/scripts/linum_crop_3d_mosaic_below_interface.py
@@ -55,6 +55,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         "to finding the interface*. Original values will\n"
         "remain in output clipped volume (range [0-100]).",
     )
+    p.add_argument("--n_levels", type=int, default=5, help="Number of levels in pyramid representation. [%(default)s]")
     return p
 
 
@@ -101,7 +102,7 @@ def main() -> None:
 
     crop_dask = da.from_array(vol_crop, chunks=vol.chunks)
     # Save cropped volume as OME-Zarr
-    save_omezarr(crop_dask, output_path, voxel_size=res, chunks=vol.chunks)
+    save_omezarr(crop_dask, output_path, voxel_size=res, chunks=vol.chunks, n_levels=args.n_levels)
 
     # Collect metrics using helper function
     original_shape = vol.shape
diff --git a/scripts/linum_fix_illumination_3d.py b/scripts/linum_fix_illumination_3d.py
index 7f32c7d6..b55174e5 100644
--- a/scripts/linum_fix_illumination_3d.py
+++ b/scripts/linum_fix_illumination_3d.py
@@ -42,6 +42,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         type=float,
         help="Values above this percentile will be clipped when\nestimating the flatfield (inside range [0-100]).",
     )
+    p.add_argument("--n_levels", type=int, default=5, help="Number of levels in pyramid representation. [%(default)s]")
     p.add_argument(
         "--use_gpu",
         default=True,
@@ -185,7 +186,7 @@ def main() -> None:
         print(f"Minimum value in the output volume is {min_value}. Clipping at 0.")
         out_dask = da.clip(out_dask, 0.0, None)
 
-    save_omezarr(out_dask, output_zarr, voxel_size=resolution, chunks=vol.chunks)
+    save_omezarr(out_dask, output_zarr, voxel_size=resolution, chunks=vol.chunks, n_levels=args.n_levels)
     tmp_dir.cleanup()
 
 
diff --git a/scripts/linum_normalize_intensities_per_slice.py b/scripts/linum_normalize_intensities_per_slice.py
index 1630dd19..8b5503ca 100644
--- a/scripts/linum_normalize_intensities_per_slice.py
+++ b/scripts/linum_normalize_intensities_per_slice.py
@@ -39,6 +39,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
     p.add_argument("--sigma", type=float, default=1.0, help="Smoothing sigma for estimating the agarose mask. [%(default)s]")
     p.add_argument("--use_gpu", default=True, action=argparse.BooleanOptionalAction, help="Use GPU acceleration if available.")
     p.add_argument("--verbose", action="store_true", help="Print GPU information.")
+    p.add_argument("--n_levels", type=int, default=3, help="Number of levels in pyramid representation. [%(default)s]")
     return p
 
 
@@ -71,7 +72,7 @@ def main() -> None:
 
     vol_normalized, background_thresholds = normalize_volume(vol_data, agarose_mask, args.percentile_max)
 
-    save_omezarr(da.from_array(vol_normalized), args.out_image, res, n_levels=3)
+    save_omezarr(da.from_array(vol_normalized), args.out_image, res, n_levels=args.n_levels)
 
     collect_normalization_metrics(
         vol_normalized=vol_normalized,
diff --git a/scripts/linum_stitch_3d_refined.py b/scripts/linum_stitch_3d_refined.py
index a4490068..dfc859f8 100644
--- a/scripts/linum_stitch_3d_refined.py
+++ b/scripts/linum_stitch_3d_refined.py
@@ -84,6 +84,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         "--output_refinements", type=str, default=None, help="Output JSON file to save computed refinements for analysis."
     )
     p.add_argument("--overwrite", "-f", action="store_true", help="Overwrite output if it exists.")
+    p.add_argument("--n_levels", type=int, default=3, help="Number of levels in pyramid representation. [%(default)s]")
     return p
 
 
@@ -292,7 +293,7 @@ def main() -> None:
 
     from linumpy.io.zarr import save_omezarr
 
-    save_omezarr(da.from_array(output), output_file, resolution, n_levels=3)
+    save_omezarr(da.from_array(output), output_file, resolution, n_levels=args.n_levels)
 
     # Collect metrics
     from linumpy.metrics import PipelineMetrics
diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf
index 01966335..643113b0 100644
--- a/workflows/reconst_3d/soct_3d_reconst.nf
+++ b/workflows/reconst_3d/soct_3d_reconst.nf
@@ -128,7 +128,7 @@ process resample_mosaic_grid {
     def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu"
     """
     linum_resample_mosaic_grid.py ${mosaic_grid} "mosaic_grid_z${slice_id}_resampled.ome.zarr" \
-        -r ${params.resolution} ${gpu_flag} -v
+        -r ${params.resolution} ${gpu_flag} --n_levels 0 -v
     """
 
     stub:
@@ -147,7 +147,8 @@ process fix_focal_curvature {
     script:
     def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu"
     """
-    linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" ${gpu_flag}
+    linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" \\
+        --n_levels 0 ${gpu_flag}
     """
 
     stub:
@@ -170,7 +171,7 @@ process fix_illumination {
     """
     linum_fix_illumination_3d.py ${mosaic_grid} "mosaic_grid_z${slice_id}_illum_fix.ome.zarr" \
         --n_processes ${params.processes} \
-        --percentile_max ${params.clip_percentile_upper} ${gpu_flag}
+        --percentile_max ${params.clip_percentile_upper} ${gpu_flag} --n_levels 0
     """
 
     stub:
@@ -244,6 +245,7 @@ process stitch_3d_with_refinement {
         --refinement_mode blend_shift \
         --max_refinement_px ${params.max_blend_refinement_px} \
         ${transform_arg} \
+        --n_levels 0 \
         -f
     """
 
@@ -291,7 +293,7 @@ process beam_profile_correction {
     script:
     """
     linum_compensate_psf_model_free.py ${slice_3d} "slice_z${slice_id}_axial_corr.ome.zarr" \
-        --percentile_max ${params.clip_percentile_upper}
+        --percentile_max ${params.clip_percentile_upper} --n_levels 0
     """
 
     stub:
@@ -315,7 +317,7 @@ process crop_interface {
     linum_crop_3d_mosaic_below_interface.py ${image} "slice_z${slice_id}_crop_interface.ome.zarr" \
         --depth ${params.crop_interface_out_depth} \
         --crop_before_interface \
-        --percentile_max ${params.clip_percentile_upper}
+        --percentile_max ${params.clip_percentile_upper} --n_levels 0
     """
 
     stub:
@@ -338,7 +340,7 @@ process normalize {
     def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu"
     """
     linum_normalize_intensities_per_slice.py ${image} "slice_z${slice_id}_normalize.ome.zarr" \
-        --percentile_max ${params.clip_percentile_upper} ${gpu_flag}
+        --percentile_max ${params.clip_percentile_upper} ${gpu_flag} --n_levels 0
     """
 
     stub:

From e1a56f976c9d964ffd616cf1090ec2304605963a Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Thu, 30 Apr 2026 12:35:50 -0400
Subject: [PATCH 4/6] Docs update

Co-authored-by: Copilot <copilot@github.com>
---
 docs/LIBRARY_MODULES.md                |  20 +-
 docs/N4_GPU.md                         |   4 +-
 docs/NEXTFLOW_WORKFLOWS.md             |   6 +-
 docs/PIPELINE_OVERVIEW.md              |   6 +-
 docs/RECONSTRUCTION_TUNING.md          | 396 +++++++++++++++++++++++++
 docs/RECONSTRUCTION_TUNING_QUICKREF.md | 131 ++++++++
 docs/SCRIPTS_REFERENCE.md              |   2 +-
 docs/SHIFTS_FILE_FORMAT.md             |   2 +-
 docs/SLICE_CONFIG_FEATURE.md           |   2 +-
 docs/SLICE_INTERPOLATION_FEATURE.md    |   4 +-
 docs/pipelines.rst                     |   2 +
 11 files changed, 551 insertions(+), 24 deletions(-)
 create mode 100644 docs/RECONSTRUCTION_TUNING.md
 create mode 100644 docs/RECONSTRUCTION_TUNING_QUICKREF.md

diff --git a/docs/LIBRARY_MODULES.md b/docs/LIBRARY_MODULES.md
index 7ac2e53c..1834882d 100644
--- a/docs/LIBRARY_MODULES.md
+++ b/docs/LIBRARY_MODULES.md
@@ -51,13 +51,13 @@ from linumpy.io.thorlabs import ThorImageOCT
 point: it materialises the requested level into memory and returns
 `(array, voxel_size)`. With ``use_gpu=False`` (default) the array is a
 ``numpy.ndarray``; with ``use_gpu=True`` it dispatches through
-[`linumpy.gpu.zarr_io.read_zarr_to_gpu`](GPU_ACCELERATION.md) and returns a
+`linumpy.gpu.zarr_io.read_zarr_to_gpu` (see {doc}`GPU_ACCELERATION`) and returns a
 ``cupy.ndarray`` (kvikio/GPUDirect Storage when available, otherwise the
 ``zarr.config.enable_gpu`` fallback). Voxel size is ordered to match the array
 axes (Z, Y, X for 3D volumes; Y, X for 2D mosaics).
 
-See [Mosaic Grid Format](MOSAIC_GRID_FORMAT.md) and
-[Slice Config Feature](SLICE_CONFIG_FEATURE.md) for format details.
+See {doc}`Mosaic Grid Format <MOSAIC_GRID_FORMAT>` and
+{doc}`Slice Config Feature <SLICE_CONFIG_FEATURE>` for format details.
 
 ### `linumpy.cli` and `linumpy.config`
 
@@ -84,7 +84,7 @@ from linumpy.metrics import (
 )
 ```
 
-See [Reconstruction Diagnostics](RECONSTRUCTION_DIAGNOSTICS.md).
+See {doc}`Reconstruction Diagnostics <RECONSTRUCTION_DIAGNOSTICS>`.
 
 ### `linumpy.geometry`
 
@@ -146,7 +146,7 @@ CuPy-backed versions of hot paths. Each public entry point either takes a
   fastest backend available (kvikio/GDS native → `zarr.config.enable_gpu`).
 * `gpu.kvikio_zarr` — kvikio / GPUDirect Storage backend for the dispatcher.
 
-See [GPU Acceleration](GPU_ACCELERATION.md) and [N4 GPU](N4_GPU.md).
+See {doc}`GPU Acceleration <GPU_ACCELERATION>` and {doc}`N4 GPU <N4_GPU>`.
 
 ### `linumpy.stack_alignment`
 
@@ -158,7 +158,7 @@ from linumpy.stack_alignment.units import detect_shift_units
 from linumpy.stack_alignment.filter import filter_outliers
 ```
 
-See [Shifts File Format](SHIFTS_FILE_FORMAT.md).
+See {doc}`Shifts File Format <SHIFTS_FILE_FORMAT>`.
 
 ### `linumpy.imaging`
 
@@ -207,11 +207,11 @@ A typical pipeline run wires these subpackages as follows:
    `gpu.n4` backend when CUDA is available) and `imaging.orientation` / RAS
    alignment via `reference.allen`.
 7. **Diagnostics** use `metrics` collectors throughout; see
-   [Reconstruction Diagnostics](RECONSTRUCTION_DIAGNOSTICS.md).
+   {doc}`Reconstruction Diagnostics <RECONSTRUCTION_DIAGNOSTICS>`.
 
 ## See also
 
 * Auto-generated API reference: [api/linumpy](api/linumpy/index.rst)
-* [Pipeline Overview](PIPELINE_OVERVIEW.md)
-* [Scripts Reference](SCRIPTS_REFERENCE.md)
-* [Nextflow Workflows](NEXTFLOW_WORKFLOWS.md)
+* {doc}`Pipeline Overview <PIPELINE_OVERVIEW>`
+* {doc}`Scripts Reference <SCRIPTS_REFERENCE>`
+* {doc}`Nextflow Workflows <NEXTFLOW_WORKFLOWS>`
diff --git a/docs/N4_GPU.md b/docs/N4_GPU.md
index 1b0c965d..6515d353 100644
--- a/docs/N4_GPU.md
+++ b/docs/N4_GPU.md
@@ -134,7 +134,7 @@ server.
 ## 5. Performance
 
 Benchmarks measured on a single NVIDIA GPU (47 GB) using the
-`linum_benchmark_n4_gpu` script (see [Scripts reference](SCRIPTS_REFERENCE.md)).
+`linum_benchmark_n4_gpu` script (see {doc}`SCRIPTS_REFERENCE`).
 The CPU path is `SimpleITK.N4BiasFieldCorrectionImageFilter` with the same
 control-point spacing and iteration schedule as the GPU path. Both paths
 include the `shrink_factor` downsample. The GPU column already excludes a
@@ -218,7 +218,7 @@ directory.
 
 The Nextflow `reconst_3d` workflow exposes a single global GPU switch
 (`params.use_gpu`, defined in the reconst_3d Nextflow config; see
-[Nextflow workflows](NEXTFLOW_WORKFLOWS.md)).
+{doc}`Nextflow workflows <NEXTFLOW_WORKFLOWS>`).
 When set, the `correct_bias_field` process runs the GPU N4 backend with
 `maxForks = 1` to avoid GPU contention; otherwise it uses the SimpleITK
 CPU path with `params.processes` threads. No per-process flag is needed:
diff --git a/docs/NEXTFLOW_WORKFLOWS.md b/docs/NEXTFLOW_WORKFLOWS.md
index 2a078e64..9e5fe1f3 100644
--- a/docs/NEXTFLOW_WORKFLOWS.md
+++ b/docs/NEXTFLOW_WORKFLOWS.md
@@ -269,7 +269,7 @@ gaps are not interpolated (insufficient information) and remain as holes.
 
 When zmorph's quality gates fail the slot is left as a genuine gap (no zarr
 output); a manifest fragment and diagnostics JSON are still emitted. See
-[SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md) for details.
+{doc}`SLICE_INTERPOLATION_FEATURE` for details.
 
 #### Automatic Slice Quality Assessment
 
@@ -638,7 +638,7 @@ params {
 For GPU support:
 - NVIDIA GPU with CUDA support
 - CuPy installed: `uv pip install cupy-cuda12x`
-- See [GPU_ACCELERATION.md](GPU_ACCELERATION.md) for detailed setup
+- See {doc}`GPU_ACCELERATION` for detailed setup
 
 ### Expected Speedups
 
@@ -1124,7 +1124,7 @@ without exhaustion.
 zmorph's quality gates reject the interpolation it emits a manifest fragment
 with `interpolation_failed=true` and no zarr; `finalise_interpolation`
 stamps that into `slice_config_final.csv` and the slot stays a genuine gap
-in the stacked volume. See [`SLICE_INTERPOLATION_FEATURE.md`](SLICE_INTERPOLATION_FEATURE.md)
+in the stacked volume. See {doc}`SLICE_INTERPOLATION_FEATURE`
 for the full policy.
 
 ### `finalise_interpolation` is published-only
diff --git a/docs/PIPELINE_OVERVIEW.md b/docs/PIPELINE_OVERVIEW.md
index c8b55d0c..97cef98e 100644
--- a/docs/PIPELINE_OVERVIEW.md
+++ b/docs/PIPELINE_OVERVIEW.md
@@ -47,8 +47,6 @@ flowchart TD
     P3 --> INTER
     INTER --> R00
     INTER --> R01
-    R14 --> OUT
-    R14b --> OUT
     R16 --> OUT
 
     classDef opt fill:#f5f5f5,stroke:#999,stroke-dasharray:3 3,color:#555
@@ -273,7 +271,7 @@ interpolate_missing_slice
 - Fills single-slice gaps using the two neighbouring slices
 - Enabled by `interpolate_missing_slices = true` (default)
 - Default method is **zmorph** (z-aware morphing via fractional affine
-  transforms on the boundary planes) — see [SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md)
+  transforms on the boundary planes) — see {doc}`SLICE_INTERPOLATION_FEATURE`
 - When zmorph's quality gates fail the slot stays a genuine gap (no zarr
   emitted) — nothing is fabricated
 - `finalise_interpolation` merges per-slice manifest fragments into
@@ -664,7 +662,7 @@ nextflow run preproc_rawtiles.nf --input /path/to/data --output /path/to/output
 
 - NVIDIA GPU with CUDA support
 - CuPy installed (`uv pip install cupy-cuda12x`)
-- See [GPU_ACCELERATION.md](GPU_ACCELERATION.md) for detailed setup
+- See {doc}`GPU_ACCELERATION` for detailed setup
 
 ---
 
diff --git a/docs/RECONSTRUCTION_TUNING.md b/docs/RECONSTRUCTION_TUNING.md
new file mode 100644
index 00000000..1bd24405
--- /dev/null
+++ b/docs/RECONSTRUCTION_TUNING.md
@@ -0,0 +1,396 @@
+# Reconstruction Parameter Tuning Guide
+
+A walkthrough of the parameters that most affect 3D reconstruction quality, what
+they do, what raising/lowering them changes, and which knobs to reach for when
+specific artifacts show up. Aimed at users who are new to the pipeline; if you
+already know your way around it, the companion {doc}`Reconstruction Tuning
+Quick Reference <RECONSTRUCTION_TUNING_QUICKREF>` is denser.
+
+For the runtime-mechanics view of the same parameters, see the inline comments
+in `workflows/reconst_3d/nextflow.config` (described in {doc}`NEXTFLOW_WORKFLOWS`).
+For diagnostic *scripts* (rotation drift, tile dilation, motor-only stitching),
+see {doc}`RECONSTRUCTION_DIAGNOSTICS`.
+
+---
+
+## How to approach tuning
+
+Reconstruction problems almost always show up at slice boundaries: edges that
+don't line up between slices, intensity steps, sudden Z-jumps, or rotated
+sections. The pipeline runs in stages, and each artifact is usually produced
+by one specific stage. The fastest tuning workflow is:
+
+1. **Look at the previews first.** `output/common_space_previews/*.png` shows
+   each slice in XY/XZ/YZ. Most artifacts are visible at a glance.
+2. **Identify the stage.** Use the symptom map below to pick a stage.
+3. **Adjust the smallest set of parameters that targets that stage.**
+4. **Re-run with `-resume`.** Nextflow caches everything upstream of the
+   change, so iteration is fast.
+5. **Verify against previews and `output/stack/*.csv` metrics.**
+
+The pipeline stages, in order:
+
+```
+Tile stitching → Slice quality → Common-space alignment → Missing-slice
+  interpolation → Pairwise registration → Stacking → Bias correction → Atlas
+```
+
+A change to a parameter in stage *N* invalidates caches for stage *N* and all
+later stages. Earlier stages are reused.
+
+---
+
+## Quick orientation: which stage produces which artifact?
+
+| Artifact | Most likely stage |
+|---|---|
+| Tiles within a slice misaligned, seams visible | Tile stitching |
+| Whole slice looks degraded, smeared, or missing tissue | Slice quality / preprocessing |
+| Slice shifted left/right or up/down relative to neighbours | Common-space alignment |
+| One slice has a clear gap then resumes | Missing-slice interpolation |
+| Two consecutive slices are rotated relative to each other | Pairwise registration |
+| Sudden Z-jump or tilt between slices | Stacking |
+| Slow XY drift across many slices | Stacking (translation accumulation) |
+| Visible step in brightness between slices | Bias correction |
+| Atlas overlay is misaligned | Atlas registration |
+
+---
+
+## Profiles before parameters
+
+Before reaching for individual parameters, pick a profile. Profiles set
+sensible groups of parameters together, and you can override individual values
+on top of a profile. They are defined in `workflows/reconst_3d/nextflow.config`
+(see {doc}`NEXTFLOW_WORKFLOWS`).
+
+- **`-profile conservative`** (recommended starting point): trusts motor
+  positions for XY, applies only rotation from pairwise registration, skips
+  any registration flagged warning/error, interpolates single-slice gaps.
+  Fails gracefully when registration is unreliable.
+- **`-profile aggressive`**: applies full pairwise transforms including XY
+  translations and accumulates them. Best alignment when registration is
+  reliable. Can compound errors when it is not.
+- **`-profile minimal`**: ignores pairwise registration entirely
+  (`apply_pairwise_transforms = false`). Most stable, fastest. Use when motor
+  positions are reliable and registration repeatedly fails.
+
+If the conservative profile produces a clean reconstruction, stop. Most of the
+parameters below exist to handle specific failure modes you may not have.
+
+---
+
+## Stage 1: Tile stitching
+
+Assembles the tiles within each slice into a single 2D image. The dominant
+choice here is whether to trust motor positions or fit a transform from the
+images.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `use_motor_positions_for_stitching` | `true` | Use motor encoder positions for tile placement. Recommended — analysis showed image-based stitching introduces ~3% systematic compression. |
+| `stitch_overlap_fraction` | `0.2` | Expected fraction of overlap between adjacent tiles. **Must match the acquisition setting.** |
+| `stitch_blending_method` | `'diffusion'` | Tile-seam blending. `'diffusion'` is the smoothest, `'average'` is faster, `'none'` shows raw seams (useful when debugging tile placement). |
+| `max_blend_refinement_px` | `10` | Sub-pixel refinement budget when blending seams. Larger values let blending hide small motor inaccuracies but can over-smear true features. |
+| `stitch_global_transform` | `false` | Pool a single 2x2 affine across many slices and reuse it. Helps when individual slices have too few tiles for a stable per-slice fit. Enable if per-slice stitch transforms jitter. |
+
+**If you see seams within a slice:**
+1. Confirm `stitch_overlap_fraction` matches the acquisition.
+2. Try `stitch_blending_method = 'diffusion'` if not already.
+3. As a last resort, raise `max_blend_refinement_px` (10 → 20).
+
+**If individual slices look fine but the whole slice is rotated/skewed
+slightly compared to neighbours**: this is usually a per-slice stitch
+transform problem. Try `stitch_global_transform = true`.
+
+---
+
+## Stage 2: Slice quality assessment
+
+Optionally scores each slice and excludes the bad ones from common-space
+alignment so they don't poison their neighbours.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `auto_assess_quality` | `false` | Enable automatic quality scoring. |
+| `auto_assess_min_quality` | `0.3` | Slices scored below this are excluded. Lower = more permissive. |
+| `auto_assess_exclude_first` | `1` | Always exclude the first N calibration slices. |
+| `auto_exclude_enabled` | `true` | Detect runs of consecutive low-quality registrations after pairwise (different mechanism). |
+
+**When to enable:** if you see one or two badly degraded slices dragging the
+common-space alignment of their neighbours.
+
+---
+
+## Stage 3: Common-space alignment
+
+Aligns each stitched slice into a shared XY canvas using `shifts_xy.csv`
+(motor positions). Most XY-misalignment problems are tuned here.
+
+### Encoder glitch correction
+
+`detect_rehoming` corrects two known motor artifacts:
+
+1. **Encoder glitch spikes**: a single large step that self-cancels with the
+   adjacent step. The motor moved cleanly; the encoder briefly reported a
+   wrong position.
+2. **Tile-FOV expansion events** (legacy data only): adding a new column of
+   tiles at acquisition time produced a motor jump of exactly N × tile_FOV.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `detect_rehoming` | `true` | Enable both passes. Almost always wanted. |
+| `rehoming_max_shift_mm` | `0.5` | Steps below this magnitude aren't checked. Lower to catch smaller glitches at the cost of false positives. |
+| `rehoming_return_fraction` | `0.4` | Detection sensitivity. Lower = more conservative. |
+| `tile_fov_mm` | `null` | Set to the tile field-of-view (e.g. `0.875`) only when re-using older `shifts_xy.csv` files that contain mosaic-expansion artifacts. New shift files generated by `linum_estimate_xy_shift_from_metadata.py` no longer need this. |
+| `tile_fov_tolerance` | `0.05` | Fractional tolerance when matching steps to tile-FOV multiples. |
+
+### Image-based refinement of unreliable transitions
+
+For transitions where the motor reading is flagged unreliable
+(`reliable=0` in shifts CSV), the pipeline can fall back to image-based
+phase correlation between the two slices.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `common_space_refine_unreliable` | `false` | Enable image-based refinement. Turn on when slices grow significantly between acquisitions or when raw motor steps are noisy. |
+| `common_space_refine_max_discrepancy_px` | `0` | Reject the image estimate if it disagrees with motor by more than this many pixels. `0` = accept all. Recommended `50` if you see image refinement producing wild values. |
+| `common_space_refine_min_correlation` | `0.0` | Reject refinements with low correlation quality. Recommended `0.15`–`0.3`. |
+
+**Recipe — large XY jumps between specific slices:**
+
+1. Inspect `shifts_xy.csv` for the affected slice IDs. Look at `x_shift_mm`
+   and `y_shift_mm`. Steps > 0.5 mm are suspect.
+2. Check `output/detect_rehoming_events/shifts_xy_clean.csv`. If `reliable=0`
+   for those rows, the corrected file already accounts for them.
+3. If the misalignment persists, enable
+   `common_space_refine_unreliable = true` and re-run.
+
+### Excluded-slice interpolation
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `common_space_excluded_slice_mode` | `'local_median'` | How to fill XY positions for excluded slices. `'local_median'` averages neighbour shifts. |
+| `common_space_excluded_slice_window` | `2` | Number of neighbours considered. |
+
+---
+
+## Stage 4: Missing-slice interpolation
+
+Fills single-slice gaps so the stack has a continuous Z. The default is
+`zmorph` — a 2D registration of the boundary planes is used to morph
+fractional-affine intermediate planes between the two neighbours.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `interpolate_missing_slices` | `true` | Enable interpolation. Disable to keep gaps explicit. |
+| `interpolation_method` | `'zmorph'` | `'zmorph'` (registration-based morph), `'weighted'` (z-smoothed linear blend), `'average'` (50/50). |
+| `interpolation_blend_method` | `'gaussian'` | `'gaussian'` (feathered) or `'linear'`. |
+| `interpolation_min_overlap_correlation` | `0.3` | If the boundary-plane NCC is below this, falls back to `'weighted'`. |
+| `interpolation_min_ncc_improvement` | `0.05` | If post-registration NCC doesn't improve by this much, falls back to `'weighted'`. |
+
+**When to lower `interpolation_min_overlap_correlation`:** if zmorph keeps
+falling back to weighted on slices that visually look fine, lower to e.g.
+`0.2`. Watch for spurious deformations on noisier boundaries.
+
+**When to disable interpolation:** if you specifically want to see where the
+missing slices are (e.g. for QC), set `interpolate_missing_slices = false`.
+
+---
+
+## Stage 5: Pairwise registration
+
+Computes small inter-slice corrections (rotation, sub-pixel translation). The
+*main* alignment comes from motor positions; pairwise transforms are
+refinements applied on top during stacking.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `registration_transform` | `'euler'` | `'translation'` (XY only) or `'euler'` (XY + rotation). Use `'euler'` for any sample where slices are rotated relative to each other. |
+| `registration_max_translation` | `200.0` | Optimizer bound on translation (px). Keep large; the actual *applied* translation is governed in stacking. Increase only if pairwise metrics report frequent boundary hits. |
+| `registration_max_rotation` | `5.0` | Optimizer bound on rotation (deg). Raise (e.g. to `35.0`) for obliquely-mounted samples where slice-to-slice rotation can be large. |
+| `registration_initial_alignment` | `'both'` | Initialization. `'both'` runs centre-of-mass and gradient inits and picks the better one. |
+
+**Recipe — pairwise registration looks "stuck":**
+
+1. Open a pairwise metrics JSON in `output/register_pairwise/`. Look for
+   `mag` (translation magnitude in px) and `rotation`.
+2. If many slices have `mag` ≈ `registration_max_translation` × 0.95+, the
+   optimizer is hitting the boundary. Raise the bound (e.g. 200 → 400) but
+   inspect: usually this means the input alignment is wrong upstream, not
+   that the bound is too tight.
+3. Keep `skip_warning_transforms = true` (in stacking) so unreliable
+   boundary-hit transforms aren't applied.
+
+---
+
+## Stage 6: Stacking
+
+Where pairwise corrections actually get applied to the volume. This is the
+single most-tuned stage and the source of most reconstruction drift.
+
+### Whether to apply transforms at all
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `apply_pairwise_transforms` | `true` | Master switch. Set `false` to stack motor-only (this is what `-profile minimal` does). |
+| `apply_rotation_only` | `false` | Apply only the rotation component, keep XY from motor positions. The conservative profile sets this `true`. |
+| `use_expected_z_overlap` | `true` | Use the expected slice thickness for Z spacing instead of correlation-based matching. Recommended; correlation matching is brittle at slice boundaries. |
+| `max_rotation_deg` | `5.0` | Clamp applied rotations larger than this. Prevents single bad slices from rotating the entire stack downstream. |
+
+### Per-slice transform gating
+
+Each pairwise registration produces a status (ok/warning/error) and metrics.
+The pipeline can refuse to apply low-quality transforms.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `skip_error_transforms` | `true` | Skip transforms registered against interpolated slices etc. **Keep enabled.** |
+| `skip_warning_transforms` | `true` | Skip transforms that hit the optimizer boundary. **Keep enabled.** Disabling causes Z-positioning errors. |
+| `transform_confidence_high` | `0.6` | Above this, full transform applied. |
+| `transform_confidence_low` | `0.3` | Below this, transform skipped entirely. Between low and high, rotation-only. |
+| `load_transform_max_rotation` | `0.0` | Metric-based rotation gate. Set to e.g. `4.0` for noisy data — transforms with rotation > 4° are not loaded at all. |
+| `load_transform_min_zcorr` | `0.0` | Metric-based Z-correlation gate. Pairs with `load_transform_max_rotation`. |
+
+### Translation accumulation
+
+Pairwise translations can be accumulated as cumulative canvas offsets — this
+"steers the viewing plane" through the volume. Useful when the sample drifts
+slowly across many slices.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `stack_accumulate_translations` | `true` | Enable cumulative-offset accumulation. |
+| `stack_confidence_weight_translations` | `true` | Weight each translation by its confidence before accumulating. Reduces noise from low-quality slices. |
+| `stack_translation_smooth_sigma` | `3.0` | Gaussian smoothing (sigma in slices) over accumulated translations. Higher removes more jitter; too high and you lose legitimate trends. |
+| `stack_max_pairwise_translation` | `0` | Translations exceeding this magnitude (px) are zeroed before accumulation. **Set to 0 to disable.** Even an optimizer-boundary value carries directional information; zeroing it is usually worse than keeping it. Use a non-zero value only if specific slices have clearly erroneous translations worse than zero. |
+| `stack_max_cumulative_drift_px` | `50` | Cap on total accumulated drift. **Recommended: `0` (disabled).** If common-space alignment is correct, accumulated translations should converge naturally; a cap usually hides a real upstream problem. |
+| `stack_smooth_window` | `5` | Moving-average window over per-slice rotations. Reduces visible jumps from outlier slices. |
+
+**Recipe — slow XY drift across many slices in XZ/YZ view:**
+
+1. Plot `output/stack/translation_per_slice.csv` (or read it). If translations
+   trend monotonically, that's drift.
+2. First check that common-space alignment is correct (XY view of each slice
+   aligns). If common-space is the problem, fix it there first.
+3. If common-space looks fine, raise `stack_translation_smooth_sigma`
+   (3 → 5) to wash out noise.
+4. Disable `stack_max_cumulative_drift_px` (set `0`) if it's clamping a real
+   trend.
+
+**Recipe — sudden tilt or jump at one specific slice:**
+
+1. Open `output/stack/stacking_decisions.csv`. The affected row will usually
+   show `transform_loaded=False` (a gap) or the pairwise registration metric
+   will show a high rotation/translation outlier.
+2. If `transform_loaded=False`: check `output/register_pairwise/` metrics for
+   why it was rejected. Often `overall_status="error"` because a neighbour
+   was interpolated.
+3. If a noisy rotation: lower `load_transform_max_rotation` to a tighter
+   value (e.g. 4.0) or raise the gate.
+
+---
+
+## Stage 7: Bias field correction (optional)
+
+N4 bias correction applied after stacking. Removes depth-dependent attenuation
+and slow inter-slice intensity drift.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `correct_bias_field` | `false` | Master switch. |
+| `bias_mode` | `'two_pass'` | `'per_section'`, `'global'`, or `'two_pass'` (per-section then global). `'two_pass'` is the recommended default. |
+| `bias_strength` | `1.0` | Mixing strength. `0.0` = passthrough, `1.0` = full correction. Lower if N4 is over-correcting tissue features. |
+| `bias_histogram_match_per_zplane` | `true` | Match each Z-plane to the global tissue distribution before N4. **Strongly reduces inter-slice intensity steps.** Roughly an order of magnitude better than chunked HM in tested cases. |
+| `bias_tissue_threshold` | `0.005` | Voxels at or below this intensity are considered background and excluded from histogram matching. Lower if tissue is being treated as background. |
+| `bias_zprofile_smooth_sigma` | `2.0` | Gaussian smoothing (sigma in Z-planes) of a residual scalar gain after HM. Eliminates the small inter-slice steps HM cannot remove. `0` = disabled. Typical range 2–4. |
+
+**Recipe — visible intensity steps between slices:**
+
+1. Enable `correct_bias_field = true`.
+2. Keep `bias_histogram_match_per_zplane = true` and
+   `bias_zprofile_smooth_sigma = 2.0`.
+3. If steps remain, raise sigma (2 → 4).
+4. If tissue features are getting flattened, lower `bias_strength` (1.0 → 0.7).
+
+---
+
+## Stage 8: Atlas registration (optional)
+
+Registers the stacked volume to the Allen Mouse Brain Atlas.
+
+| Parameter | Default | Effect |
+|---|---|---|
+| `align_to_ras_enabled` | `false` | Master switch. |
+| `allen_resolution` | `25` | Atlas resolution (10/25/50/100 µm). Lower = slower & higher memory. |
+| `allen_metric` | `'MI'` | `'MI'`, `'MSE'`, `'CC'`, `'AntsCC'`. `'MI'` is robust across modalities. |
+| `allen_registration_level` | `2` | Pyramid level used for registration. Higher = faster but coarser. Output is always written at all pyramid resolutions. |
+| `ras_input_orientation` | `''` | 3-letter code (e.g. `'PIR'`) telling the registrar what orientation the input is in. Crucial for correct atlas alignment. |
+| `ras_initial_rotation` | `''` | `"Rx Ry Rz"` initial rotation hint in degrees. Use when MOMENTS-based init fails. |
+| `allen_preview` | `true` | Save a 3×3 input/aligned/template comparison. Always check this. |
+| `ras_orientation_preview` | `false` | Save a preview after orientation/initial-rotation are applied but before registration. **Enable when tuning `ras_input_orientation` and `ras_initial_rotation`** — you can verify orientation parameters without running the full registration. |
+
+**Recipe — atlas overlay is mirrored or rotated 90°:**
+
+1. Set `ras_orientation_preview = true` and re-run `align_to_ras` only.
+2. Inspect the orientation preview: if the brain is rotated, adjust
+   `ras_initial_rotation` (e.g. `"0.0 0.0 90.0"` for a 90° Z rotation).
+3. If the brain is flipped, adjust `ras_input_orientation` (try the
+   neighbour code: e.g. `'PIR'` → `'AIR'` if anterior/posterior is
+   inverted).
+4. Iterate until the orientation preview looks roughly aligned to the
+   atlas template, then re-run with the full registration.
+
+---
+
+## Reference parameter sets
+
+The canonical config and profile blocks already encode several known-good
+combinations. For per-subject overrides see the deployed configs at
+`/scratch/workspace/sub-XX/nextflow.config`. Common combinations:
+
+**For data with mosaic-grid expansion events (legacy `shifts_xy.csv`):**
+```groovy
+detect_rehoming = true
+tile_fov_mm = 0.875                              // your acquisition tile FOV
+common_space_refine_unreliable = true
+common_space_refine_max_discrepancy_px = 0
+```
+
+**For obliquely-mounted samples (large slice-to-slice rotations):**
+```groovy
+registration_transform = 'euler'
+registration_max_rotation = 35.0
+load_transform_max_rotation = 4.0
+stack_smooth_window = 5
+```
+
+**For visible inter-slice intensity steps:**
+```groovy
+correct_bias_field = true
+bias_mode = 'two_pass'
+bias_histogram_match_per_zplane = true
+bias_zprofile_smooth_sigma = 2.0
+```
+
+**For one or two bad slices contaminating the stack:**
+```groovy
+auto_assess_quality = true
+auto_assess_min_quality = 0.3
+auto_exclude_enabled = true
+auto_exclude_consecutive = 3
+auto_exclude_z_corr = 0.6
+```
+
+---
+
+## See also
+
+- {doc}`Reconstruction Tuning Quick Reference <RECONSTRUCTION_TUNING_QUICKREF>` —
+  symptom→fix table for experienced users.
+- {doc}`Reconstruction Diagnostics <RECONSTRUCTION_DIAGNOSTICS>` — diagnostic
+  scripts and the `diagnostic_mode` master switch.
+- {doc}`Pipeline Overview <PIPELINE_OVERVIEW>` — high-level stage descriptions.
+- {doc}`Nextflow Workflows <NEXTFLOW_WORKFLOWS>` — running and configuring the
+  pipeline.
+- `workflows/reconst_3d/nextflow.config` — full canonical config with
+  per-parameter inline comments (see {doc}`NEXTFLOW_WORKFLOWS`).
diff --git a/docs/RECONSTRUCTION_TUNING_QUICKREF.md b/docs/RECONSTRUCTION_TUNING_QUICKREF.md
new file mode 100644
index 00000000..b7f6f3bc
--- /dev/null
+++ b/docs/RECONSTRUCTION_TUNING_QUICKREF.md
@@ -0,0 +1,131 @@
+# Reconstruction Tuning — Quick Reference
+
+Cheat sheet for tuning the 3D reconstruction pipeline. Assumes you know what
+the stages do. For background and recipes, see the full {doc}`Reconstruction
+Parameter Tuning Guide <RECONSTRUCTION_TUNING>`.
+
+---
+
+## Profiles first
+
+| Profile | When to use |
+|---|---|
+| `-profile conservative` | Default starting point. Motor-XY + rotation-only. |
+| `-profile aggressive` | Registration is reliable; want full pairwise corrections. |
+| `-profile minimal` | Motor-only. Registration consistently fails. |
+
+---
+
+## Symptom → fix
+
+| Symptom | First parameters to try |
+|---|---|
+| Seams within a slice | `stitch_overlap_fraction` (match acquisition); `stitch_blending_method='diffusion'`; raise `max_blend_refinement_px` |
+| Per-slice stitch transform jitter | `stitch_global_transform=true` |
+| One/two bad slices contaminating neighbours | `auto_assess_quality=true`, `auto_assess_min_quality=0.3` |
+| Large XY jump at specific slices | Check `shifts_xy.csv`; `detect_rehoming=true`; `common_space_refine_unreliable=true` |
+| Repeating large XY steps near tile-FOV multiples (legacy data) | `tile_fov_mm=<acquisition_fov_mm>`, `tile_fov_tolerance=0.05` |
+| Image refinement producing wild values | `common_space_refine_max_discrepancy_px=50`, `common_space_refine_min_correlation=0.15` |
+| Single-slice gap visible in Z | `interpolate_missing_slices=true` (default); lower `interpolation_min_overlap_correlation` if zmorph keeps falling back |
+| Slice-to-slice rotation in oblique samples | `registration_transform='euler'`, raise `registration_max_rotation` (e.g. 35.0) |
+| Sudden tilt/jump at one slice | Check `output/stack/stacking_decisions.csv` for `transform_loaded=False`; tighten `load_transform_max_rotation` (e.g. 4.0) |
+| Slow XY drift across many slices | Raise `stack_translation_smooth_sigma` (3→5); set `stack_max_cumulative_drift_px=0` |
+| Optimizer-boundary translation hits | Keep `skip_warning_transforms=true`; **don't** zero them via `stack_max_pairwise_translation` unless they're clearly worse than zero |
+| Visible inter-slice intensity steps | `correct_bias_field=true`, `bias_histogram_match_per_zplane=true`, `bias_zprofile_smooth_sigma=2.0` (raise to 4 if persists) |
+| Bias correction over-flattening tissue | Lower `bias_strength` (1.0 → 0.7) |
+| Atlas overlay rotated 90° / flipped | `ras_orientation_preview=true`, then tune `ras_input_orientation` and `ras_initial_rotation` |
+
+---
+
+## Per-stage quick tables
+
+### Tile stitching
+| Param | Default | Lever |
+|---|---|---|
+| `use_motor_positions_for_stitching` | `true` | Keep on. |
+| `stitch_overlap_fraction` | `0.2` | Must match acquisition. |
+| `stitch_blending_method` | `'diffusion'` | `'none'` to debug seams. |
+| `stitch_global_transform` | `false` | Pool affine across slices. |
+
+### Common-space alignment
+| Param | Default | Lever |
+|---|---|---|
+| `detect_rehoming` | `true` | Keep on. |
+| `rehoming_max_shift_mm` | `0.5` | Lower to catch smaller glitches. |
+| `tile_fov_mm` | `null` | Set only for legacy shift CSVs with FOV-multiple jumps. |
+| `common_space_refine_unreliable` | `false` | Image-based fallback for `reliable=0` shifts. |
+| `common_space_refine_max_discrepancy_px` | `0` | Recommended `50` to gate wild image estimates. |
+| `common_space_refine_min_correlation` | `0.0` | Recommended `0.15`–`0.3`. |
+
+### Pairwise registration
+| Param | Default | Lever |
+|---|---|---|
+| `registration_transform` | `'euler'` | `'translation'` if no rotation expected. |
+| `registration_max_rotation` | `5.0` | Raise for oblique samples (e.g. 35.0). |
+| `registration_max_translation` | `200.0` | Keep large; not the actual applied bound. |
+
+### Stacking
+| Param | Default | Lever |
+|---|---|---|
+| `apply_pairwise_transforms` | `true` | `false` for motor-only. |
+| `apply_rotation_only` | `false` | `true` (conservative): keep XY from motor. |
+| `skip_warning_transforms` | `true` | **Keep on.** Disabling causes Z errors. |
+| `skip_error_transforms` | `true` | **Keep on.** |
+| `max_rotation_deg` | `5.0` | Clamp applied rotation. |
+| `load_transform_max_rotation` | `0.0` | Tighten gate (e.g. 4.0) for noisy pairwise rotations. |
+| `stack_accumulate_translations` | `true` | Cumulative canvas offsets. |
+| `stack_translation_smooth_sigma` | `3.0` | Higher = smoother accumulated drift. |
+| `stack_max_cumulative_drift_px` | `50` | Set `0` to disable cap (recommended). |
+| `stack_max_pairwise_translation` | `0` | Set `0` to disable boundary zeroing (recommended). |
+| `stack_smooth_window` | `5` | Moving avg over rotations. |
+| `transform_confidence_high` | `0.6` | Full transform threshold. |
+| `transform_confidence_low` | `0.3` | Skip threshold. Between = rotation-only. |
+
+### Bias correction
+| Param | Default | Lever |
+|---|---|---|
+| `correct_bias_field` | `false` | Master switch. |
+| `bias_mode` | `'two_pass'` | `'per_section'` if per-slice issues only. |
+| `bias_strength` | `1.0` | Lower if over-flattening. |
+| `bias_histogram_match_per_zplane` | `true` | Keep on. |
+| `bias_zprofile_smooth_sigma` | `2.0` | Range 2–4. |
+| `bias_tissue_threshold` | `0.005` | Lower if tissue mistaken for bg. |
+
+### Atlas registration
+| Param | Default | Lever |
+|---|---|---|
+| `align_to_ras_enabled` | `false` | Master switch. |
+| `allen_resolution` | `25` | Atlas µm. |
+| `allen_registration_level` | `2` | Higher = faster, coarser. |
+| `ras_input_orientation` | `''` | 3-letter code (e.g. `'PIR'`). |
+| `ras_initial_rotation` | `''` | `"Rx Ry Rz"` deg hint. |
+| `ras_orientation_preview` | `false` | Turn on while tuning orientation. |
+
+---
+
+## Diagnostic file map
+
+| File | What to look at |
+|---|---|
+| `output/common_space_previews/*.png` | Per-slice XY/XZ/YZ — most artifacts visible here. |
+| `output/stack/stacking_decisions.csv` | `transform_loaded` flags, applied rotations/translations. |
+| `output/stack/z_matches.csv` | Z-overlap correlation per pair. |
+| `output/register_pairwise/*_metrics.json` | Per-pair `mag`, `rotation`, `overall_status`, `z_correlation`. |
+| `output/detect_rehoming_events/shifts_xy_clean.csv` | Corrected shifts; `reliable` column. |
+| `shifts_xy.csv` | Raw motor steps (`x_shift_mm`, `y_shift_mm`). Steps > 0.5mm are suspect. |
+
+---
+
+## Rerun cascade rules
+
+A change to a parameter invalidates that stage and all downstream stages.
+Use `-resume` — Nextflow caches everything upstream automatically.
+
+| Change to | Re-runs from |
+|---|---|
+| `detect_rehoming`, `tile_fov_mm` | `detect_rehoming` → all downstream |
+| `common_space_*` | `common_space` → all downstream |
+| `registration_*` | `register_pairwise` → all downstream |
+| `apply_*`, `stack_*`, `load_transform_*`, `transform_confidence_*` | `stack` → all downstream |
+| `bias_*`, `correct_bias_field` | `correct_bias_field` → atlas |
+| `align_to_ras_*`, `allen_*`, `ras_*` | `align_to_ras` only |
diff --git a/docs/SCRIPTS_REFERENCE.md b/docs/SCRIPTS_REFERENCE.md
index 4450867f..1d2e97f5 100644
--- a/docs/SCRIPTS_REFERENCE.md
+++ b/docs/SCRIPTS_REFERENCE.md
@@ -593,7 +593,7 @@ genuine gap (no zarr output) and the failure is stamped into
 `slice_config_final.csv`. `average` and `weighted` are available as explicit
 baselines.
 
-See [SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md) for the
+See {doc}`SLICE_INTERPOLATION_FEATURE` for the
 physical model and parameter-tuning guidance.
 
 ```bash
diff --git a/docs/SHIFTS_FILE_FORMAT.md b/docs/SHIFTS_FILE_FORMAT.md
index c563c740..8b93b440 100644
--- a/docs/SHIFTS_FILE_FORMAT.md
+++ b/docs/SHIFTS_FILE_FORMAT.md
@@ -126,7 +126,7 @@ If some slices are excluded from reconstruction, the pairwise shifts must be acc
 
 ### Solution
 
-See [SLICE_CONFIG_FEATURE.md](SLICE_CONFIG_FEATURE.md) for the slice configuration system that handles this.
+See {doc}`SLICE_CONFIG_FEATURE` for the slice configuration system that handles this.
 
 ### Example
 
diff --git a/docs/SLICE_CONFIG_FEATURE.md b/docs/SLICE_CONFIG_FEATURE.md
index 77ea5d6c..1a726607 100644
--- a/docs/SLICE_CONFIG_FEATURE.md
+++ b/docs/SLICE_CONFIG_FEATURE.md
@@ -132,7 +132,7 @@ After `finalise_interpolation`, any slice that reached `linum_interpolate_missin
   - `interpolation_method_used` empty
   - `interpolation_fallback_reason` = one of `low_overlap_ncc`, `no_foreground_planes`, `registration_exception`, `reg_did_not_improve`, `affine_determinant_non_positive`
 
-The pipeline never fabricates a slice from a weighted blend when registration fails; blending two neighbours that could not be registered introduces ghost contours and would also be made-up data. See [`SLICE_INTERPOLATION_FEATURE.md`](SLICE_INTERPOLATION_FEATURE.md) for the interpolation algorithm details and rationale.
+The pipeline never fabricates a slice from a weighted blend when registration fails; blending two neighbours that could not be registered introduces ghost contours and would also be made-up data. See {doc}`SLICE_INTERPOLATION_FEATURE` for the interpolation algorithm details and rationale.
 
 ---
 
diff --git a/docs/SLICE_INTERPOLATION_FEATURE.md b/docs/SLICE_INTERPOLATION_FEATURE.md
index 42b86034..6d8f69e1 100644
--- a/docs/SLICE_INTERPOLATION_FEATURE.md
+++ b/docs/SLICE_INTERPOLATION_FEATURE.md
@@ -25,7 +25,7 @@ survives to the final quality report.
   therefore fabricated slice.
 - Per-slice JSON diagnostics and per-slice manifest fragments are merged
   directly into `slice_config.csv` — the single source of truth for per-slice
-  decisions, see [`SLICE_CONFIG_FEATURE.md`](SLICE_CONFIG_FEATURE.md).
+  decisions, see {doc}`SLICE_CONFIG_FEATURE`.
   Successful interpolations stamp `interpolated=true`; failures stamp
   `interpolation_failed=true` plus the specific `fallback_reason`.
 - Downstream propagation: `linum_register_pairwise.py` automatically marks
@@ -60,7 +60,7 @@ constraints.
 ### Driving the interpolator from `slice_config.csv`
 
 The preprocessing pipeline generates an initial `slice_config.csv`
-(see [`SLICE_CONFIG_FEATURE.md`](SLICE_CONFIG_FEATURE.md)). The user
+(see {doc}`SLICE_CONFIG_FEATURE`). The user
 (or the automated quality assessment) may mark additional slices with
 `use=false`. In the reconstruction pipeline those slices are filtered out
 *before* common-space alignment, which leaves a gap in the numeric slice ID
diff --git a/docs/pipelines.rst b/docs/pipelines.rst
index 25d6c691..c8363062 100644
--- a/docs/pipelines.rst
+++ b/docs/pipelines.rst
@@ -7,4 +7,6 @@ Pipelines
    PIPELINE_OVERVIEW
    NEXTFLOW_WORKFLOWS
    RECONST_2_5D_WORKFLOW
+   RECONSTRUCTION_TUNING
+   RECONSTRUCTION_TUNING_QUICKREF
    RECONSTRUCTION_DIAGNOSTICS

From d797bbe6e2039ae50aa1f017d8689e1922b9cdac Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Fri, 1 May 2026 12:24:43 -0400
Subject: [PATCH 5/6] feat(resample): GPU round-robin assignment +
 Helpers.gpuPinBlock

Squash of resample/GPU/workflow work on dev:
- Round-robin GPU assignment for resample_mosaic_grid (linumpy.gpu)
- Per-gpu active-slot counters via Helpers.gpuPinBlock for fair maxForks balance
- CUDA device pin in prefetch worker thread + workflow CUDA_VISIBLE_DEVICES
- Keeps simple per-tile CPU-read -> GPU-rescale -> CPU-write path
---
 linumpy/gpu/__init__.py                 | 47 +++++++++++++++-------
 linumpy/gpu/interpolation.py            |  4 +-
 scripts/linum_resample_mosaic_grid.py   | 44 ++++++++-------------
 workflows/reconst_3d/lib/Helpers.groovy | 52 +++++++++++++++++++++++++
 workflows/reconst_3d/nextflow.config    |  9 +++--
 workflows/reconst_3d/soct_3d_reconst.nf |  2 +
 6 files changed, 114 insertions(+), 44 deletions(-)

diff --git a/linumpy/gpu/__init__.py b/linumpy/gpu/__init__.py
index 499f8ead..ee83dc46 100644
--- a/linumpy/gpu/__init__.py
+++ b/linumpy/gpu/__init__.py
@@ -47,21 +47,43 @@
 
         # Test if CUDA is actually available
         try:
-            # First, find the GPU with most free memory
             n_devices = cp.cuda.runtime.getDeviceCount()
 
             if n_devices > 0:
-                best_gpu_id = 0
-                best_free_memory = 0
+                # Pick a GPU. If CUDA_VISIBLE_DEVICES is set externally
+                # (e.g. by Nextflow), cupy already sees only the allowed
+                # cards and n_devices reflects that — just use device 0.
+                # Otherwise round-robin across physical devices using a
+                # file-locked counter. This is robust against the race
+                # where many forks start concurrently and would all pick
+                # the "GPU with most free memory" snapshot at the same
+                # time. It is also indifferent to which work items are in
+                # flight, unlike workload-id modulo schemes.
+                if n_devices == 1 or os.environ.get("CUDA_VISIBLE_DEVICES"):
+                    best_gpu_id = 0
+                else:
+                    import fcntl
+                    import tempfile
+                    from pathlib import Path
+
+                    counter_path = Path(tempfile.gettempdir()) / f"linumpy_gpu_counter_{os.getuid()}"
+                    # Open or create the counter, take an exclusive lock,
+                    # increment, release. This serialises the assignment
+                    # decision across all linumpy processes on the host.
+                    with counter_path.open("a+") as fp:
+                        fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
+                        fp.seek(0)
+                        try:
+                            counter = int((fp.read() or "0").strip())
+                        except ValueError:
+                            counter = 0
+                        best_gpu_id = counter % n_devices
+                        fp.seek(0)
+                        fp.truncate()
+                        fp.write(str(counter + 1))
+                        fp.flush()
+                        fcntl.flock(fp.fileno(), fcntl.LOCK_UN)
 
-                for i in range(n_devices):
-                    with cp.cuda.Device(i):
-                        free, total = cp.cuda.runtime.memGetInfo()
-                        if free > best_free_memory:
-                            best_free_memory = free
-                            best_gpu_id = i
-
-                # Select the best GPU
                 cp.cuda.Device(best_gpu_id).use()
 
                 CUPY_AVAILABLE = True
@@ -74,11 +96,10 @@
                 GPU_MEMORY_GB = mem_info[1] / (1024**3)  # Total memory in GB
 
                 if n_devices > 1:
-                    # Only show message if there are multiple GPUs
                     import sys
 
                     print(
-                        f"Auto-selected GPU {best_gpu_id}: {GPU_DEVICE_NAME} ({best_free_memory / (1024**3):.1f} GB free)",
+                        f"Selected GPU {best_gpu_id}/{n_devices}: {GPU_DEVICE_NAME} ({mem_info[0] / (1024**3):.1f} GB free)",
                         file=sys.stderr,
                     )
             else:
diff --git a/linumpy/gpu/interpolation.py b/linumpy/gpu/interpolation.py
index 97da205d..c5a52d68 100644
--- a/linumpy/gpu/interpolation.py
+++ b/linumpy/gpu/interpolation.py
@@ -155,7 +155,9 @@ def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) -
     from cupyx.scipy.ndimage import zoom as cp_zoom
 
     input_was_gpu = is_cupy_array(image)
-    img_gpu = cp.asarray(image if image.dtype == np.float32 else image.astype(np.float32))
+    # Cast to float32 *after* H2D so PCIe carries the smaller (or unchanged) dtype.
+    # `copy=False` skips the device copy when the array is already float32.
+    img_gpu = cp.asarray(image).astype(np.float32, copy=False)
 
     # Scale factors: input/output for Gaussian sigma, output/input for zoom.
     scale_factors = tuple(i / o for i, o in zip(image.shape, output_shape, strict=False))
diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py
index f346266a..329b0d41 100644
--- a/scripts/linum_resample_mosaic_grid.py
+++ b/scripts/linum_resample_mosaic_grid.py
@@ -22,7 +22,6 @@
 from linumpy.geometry.resampling import resolution_is_mm
 from linumpy.gpu import GPU_AVAILABLE, print_gpu_info
 from linumpy.gpu.interpolation import resize
-from linumpy.gpu.zarr_io import gpu_zarr_context
 from linumpy.io import OmeZarrWriter, read_omezarr
 
 
@@ -67,12 +66,8 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu:
 
 
 def _read_tile(vol: Any, i: Any, j: Any, tile_shape: Any) -> Any:
-    """Read one tile from the input zarr array (I/O stage of the pipeline).
-
-    Returns whatever array type the zarr backend yields: NumPy by default, or
-    CuPy when the volume was opened inside :func:`gpu_zarr_context`.
-    """
-    return vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]]
+    """Read one tile from the input zarr array (I/O stage of the pipeline)."""
+    return np.asarray(vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]])
 
 
 def _run_pipelined(
@@ -95,10 +90,15 @@ def _run_pipelined(
     if not tile_iter:
         return
 
-    # Note: do NOT periodically call cp.get_default_memory_pool().free_all_blocks().
-    # All tiles are the same size, so the pool reuses GPU buffers across iterations.
-    # Flushing forces cudaMalloc on the next tile and stalls the GPU; CuPy already
-    # retries with free_all_blocks() automatically on OOM.
+    cp: Any = None
+    cupy_available = False
+    if use_gpu:
+        try:
+            import cupy as cp
+
+            cupy_available = True
+        except Exception:
+            pass
 
     with ThreadPoolExecutor(max_workers=1) as prefetch_executor:
         i0, j0 = tile_iter[0]
@@ -116,6 +116,9 @@ def _run_pipelined(
                 :, i * out_tile_shape[1] : (i + 1) * out_tile_shape[1], j * out_tile_shape[2] : (j + 1) * out_tile_shape[2]
             ] = resampled
 
+            if cupy_available and cp is not None and k % 10 == 9:
+                cp.get_default_memory_pool().free_all_blocks()
+
 
 def main() -> None:
     """Run function."""
@@ -169,24 +172,11 @@ def main() -> None:
     out_shape = (out_tile_shape[0], nx * out_tile_shape[1], ny * out_tile_shape[2])
     print(f"  Output shape: {out_shape} ({total_tiles} tiles)")
 
-    tile_iter = list(itertools.product(range(nx), range(ny)))
+    out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True)
 
-    if use_gpu:
-        # Open BOTH the input volume and the output writer inside the GPU zarr
-        # context. That way per-tile reads land directly on device and per-tile
-        # writes accept cupy arrays through zarr's GPU buffer prototype, so the
-        # resample loop is fully device-resident:
-        #   read tile -> cupy gaussian + zoom -> write cupy slice
-        with gpu_zarr_context():
-            vol_gpu, _ = read_omezarr(args.in_mosaic)
-            out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True)
-            _run_pipelined(vol_gpu, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu)
-    else:
-        out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True)
-        _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu)
+    tile_iter = list(itertools.product(range(nx), range(ny)))
+    _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu)
 
-    # Pyramid build reads the just-written level0 back via dask; run it outside
-    # the GPU context so dask uses the normal numpy buffer prototype.
     print("Building pyramid...")
     out_res = [target_res] * 3
     out_zarr.finalize(out_res, args.n_levels)
diff --git a/workflows/reconst_3d/lib/Helpers.groovy b/workflows/reconst_3d/lib/Helpers.groovy
index 6e60bc3e..baac906a 100644
--- a/workflows/reconst_3d/lib/Helpers.groovy
+++ b/workflows/reconst_3d/lib/Helpers.groovy
@@ -300,4 +300,56 @@ class Helpers {
         if (params.stack_translation_min_zcorr > 0) opts += " --translation_min_zcorr ${params.stack_translation_min_zcorr}"
         return opts
     }
+
+    // -------------------------------------------------------------------------
+    // GPU pinning
+    // -------------------------------------------------------------------------
+
+    /**
+     * Bash block that pins the current task to the least-loaded physical GPU.
+     *
+     * In-process pinning via `cp.cuda.Device(N).use()` is leaky — zarr's GPU
+     * buffer prototype and the kvikio/GDS path still allocate on device 0
+     * regardless. Setting `CUDA_VISIBLE_DEVICES` in the shell hides the other
+     * cards from the process so no library can route around the choice.
+     *
+     * The returned snippet maintains per-GPU active-slot counters under
+     * /tmp guarded by `flock`. On entry it picks the GPU with the fewest
+     * running forks; on shell exit (`trap ... EXIT`) it decrements that
+     * slot. This balances better than `slice_id % gpu_count` (which can
+     * give 3/1 splits when concurrent slices share parity) and better than
+     * a simple round-robin counter (which ignores fork-completion order).
+     *
+     * Pass `params` and a short `tag` used for the per-task log line.
+     * When `params.use_gpu` is false this returns an empty string.
+     */
+    static String gpuPinBlock(params, String tag) {
+        if (!params.use_gpu) return ''
+        def n = params.gpu_count as int
+        return """
+        UID_=\$(id -u)
+        GPU_LOCK=/tmp/linumpy_nf_gpu_slots_\${UID_}.lock
+        GPU_SLOT_PREFIX=/tmp/linumpy_nf_gpu_slot_\${UID_}_
+        _gpu_acquire() {
+            exec 9>\$GPU_LOCK
+            flock 9
+            local best=0 best_n=999999 i n
+            for i in \$(seq 0 \$((${n} - 1))); do
+                n=\$(cat \${GPU_SLOT_PREFIX}\$i 2>/dev/null || echo 0)
+                if [ "\$n" -lt "\$best_n" ]; then best=\$i; best_n=\$n; fi
+            done
+            echo \$((best_n + 1)) > \${GPU_SLOT_PREFIX}\$best
+            echo \$best
+        }
+        _gpu_release() {
+            exec 9>\$GPU_LOCK
+            flock 9
+            local n=\$(cat \${GPU_SLOT_PREFIX}\$1 2>/dev/null || echo 1)
+            echo \$((n - 1)) > \${GPU_SLOT_PREFIX}\$1
+        }
+        export CUDA_VISIBLE_DEVICES=\$(_gpu_acquire)
+        trap "_gpu_release \$CUDA_VISIBLE_DEVICES" EXIT
+        echo "[${tag}] CUDA_VISIBLE_DEVICES=\$CUDA_VISIBLE_DEVICES"
+        """
+    }
 }
diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config
index 3765a1ab..ad1ac377 100644
--- a/workflows/reconst_3d/nextflow.config
+++ b/workflows/reconst_3d/nextflow.config
@@ -16,6 +16,8 @@ params {
     // COMPUTE RESOURCES
     // =========================================================================
     use_gpu = true           // Enable GPU acceleration (auto-fallback to CPU if unavailable)
+    gpu_count = 1            // Number of physical GPUs to round-robin across in resample_mosaic_grid.
+                             // Override per-subject (e.g. `gpu_count = 2` on dual-A6000 hosts).
     processes = 8            // Number of parallel Python processes per Nextflow task
 
     // CPU resource management
@@ -391,9 +393,10 @@ process {
 
     withName: "resample_mosaic_grid" {
         scratch = false
-        // Measured: ~1 GB GPU memory per fork. IO (RAID at 100%) is the
-        // gating factor at 6 forks, not GPU. Bumping higher won't help.
-        maxForks = params.use_gpu ? 6 : null
+        // Per-tile GPU work (~few hundred MB peak per fork). Forks are pinned
+        // per-GPU via Helpers.gpuPinBlock so they round-robin across gpu_count
+        // physical devices.
+        maxForks = params.use_gpu ? 4 : null
     }
 
     withName: "fix_focal_curvature" {
diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf
index 643113b0..ace9115a 100644
--- a/workflows/reconst_3d/soct_3d_reconst.nf
+++ b/workflows/reconst_3d/soct_3d_reconst.nf
@@ -126,7 +126,9 @@ process resample_mosaic_grid {
 
     script:
     def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu"
+    def gpu_pin_block = Helpers.gpuPinBlock(params, "resample slice=${slice_id}")
     """
+    ${gpu_pin_block}
     linum_resample_mosaic_grid.py ${mosaic_grid} "mosaic_grid_z${slice_id}_resampled.ome.zarr" \
         -r ${params.resolution} ${gpu_flag} --n_levels 0 -v
     """

From ab8288483f6239204db839db5c394a37a2de67a2 Mon Sep 17 00:00:00 2001
From: Frans Irgolitsch <frans.irgolitsch@polymtl.ca>
Date: Fri, 1 May 2026 13:19:11 -0400
Subject: [PATCH 6/6] fix(resample): clamp output tile shape to >=1 to avoid
 zero-sized axes

Heavy downsampling of small axes (e.g. Z=5 by factor 0.1) used to round
to zero, producing zero-sized output and ZeroDivisionError downstream.
Clamp output_shape to a minimum of 1 in both rescale() and out_tile_shape.
---
 scripts/linum_resample_mosaic_grid.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py
index 329b0d41..2d29eee4 100644
--- a/scripts/linum_resample_mosaic_grid.py
+++ b/scripts/linum_resample_mosaic_grid.py
@@ -61,7 +61,9 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu:
         Rescaled image.
     """
     scale_tuple = tuple([float(scale)] * image.ndim) if isinstance(scale, (int, float)) else tuple(scale)
-    output_shape = tuple(round(s * sc) for s, sc in zip(image.shape, scale_tuple, strict=False))
+    # Clamp to >=1 so heavy downsampling of small axes (e.g. Z=5 by factor 0.1)
+    # doesn't produce a zero-sized output that triggers ZeroDivisionError downstream.
+    output_shape = tuple(max(1, round(s * sc)) for s, sc in zip(image.shape, scale_tuple, strict=False))
     return resize(image, output_shape, order=order, anti_aliasing=True, use_gpu=use_gpu)
 
 
@@ -163,7 +165,7 @@ def main() -> None:
     print(f"  Target resolution: {args.resolution} µm")
     print(f"  Scale factor: {scaling_factor}")
 
-    out_tile_shape = tuple(round(s * sc) for s, sc in zip(tile_shape, scaling_factor, strict=False))
+    out_tile_shape = tuple(max(1, round(s * sc)) for s, sc in zip(tile_shape, scaling_factor, strict=False))
 
     nx = vol.shape[1] // tile_shape[1]
     ny = vol.shape[2] // tile_shape[2]