From 92cdd6d42b8e067dac5d5eb324206dc33ca5b30e Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Wed, 29 Apr 2026 17:47:16 -0400 Subject: [PATCH 1/6] perf(fix_illum): cap PyTorch threads via configure_all_libraries --- scripts/linum_fix_illumination_3d.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/linum_fix_illumination_3d.py b/scripts/linum_fix_illumination_3d.py index 7063c375..7f32c7d6 100644 --- a/scripts/linum_fix_illumination_3d.py +++ b/scripts/linum_fix_illumination_3d.py @@ -57,9 +57,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: def process_tile(params: dict) -> tuple: """Process a tile and add it to the output mosaic.""" - from linumpy.config.threads import apply_threadpool_limits + from linumpy.config.threads import configure_all_libraries - apply_threadpool_limits() + # configure_all_libraries() also caps PyTorch intra-/inter-op threads + # (used by BaSiCPy). apply_threadpool_limits() alone misses torch and + # lets it default to ~half the host cores, oversubscribing the node. + configure_all_libraries() file = params["slice_file"] z = params["z"] From af469527158e11045077ff7d86481ed15f1fa279 Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Wed, 29 Apr 2026 17:42:19 -0400 Subject: [PATCH 2/6] perf(pipeline): bump fix_illumination maxForks 1->4 based on measurement --- workflows/reconst_3d/nextflow.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config index f7cafa5b..3765a1ab 100644 --- a/workflows/reconst_3d/nextflow.config +++ b/workflows/reconst_3d/nextflow.config @@ -391,9 +391,8 @@ process { withName: "resample_mosaic_grid" { scratch = false - // Parallel resampling on GPU. Peak ~9 GB/fork (Gaussian intermediate at - // 2x tile size). With 2x A6000 (48 GB each), 6 forks = 3/GPU = ~27 GB, - // leaving headroom for fix_illumination running concurrently. + // Measured: ~1 GB GPU memory per fork. IO (RAID at 100%) is the + // gating factor at 6 forks, not GPU. Bumping higher won't help. maxForks = params.use_gpu ? 6 : null } @@ -405,8 +404,9 @@ process { } withName: "fix_illumination" { - // Limit to 1 parallel instance - BaSiCPy/PyTorch is memory-intensive - maxForks = params.use_gpu ? 1 : null + // Measured: BaSiCPy/PyTorch uses ~374 MiB per fork on this dataset. + // 4 forks ≈ 1.5 GB, well under per-GPU capacity (49 GB). + maxForks = params.use_gpu ? 4 : null // Don't set CUDA_VISIBLE_DEVICES - let linumpy.gpu auto-select GPU with most free memory } From 3fa415784cbcc17501346678307558810226b2b3 Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Wed, 29 Apr 2026 17:32:01 -0400 Subject: [PATCH 3/6] perf(pipeline): skip pyramids on intermediate ome-zarr outputs --- scripts/linum_crop_3d_mosaic_below_interface.py | 3 ++- scripts/linum_fix_illumination_3d.py | 3 ++- scripts/linum_normalize_intensities_per_slice.py | 3 ++- scripts/linum_stitch_3d_refined.py | 3 ++- workflows/reconst_3d/soct_3d_reconst.nf | 14 ++++++++------ 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/linum_crop_3d_mosaic_below_interface.py b/scripts/linum_crop_3d_mosaic_below_interface.py index 0fbcf2c9..ecdfdb44 100644 --- a/scripts/linum_crop_3d_mosaic_below_interface.py +++ b/scripts/linum_crop_3d_mosaic_below_interface.py @@ -55,6 +55,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: "to finding the interface*. Original values will\n" "remain in output clipped volume (range [0-100]).", ) + p.add_argument("--n_levels", type=int, default=5, help="Number of levels in pyramid representation. [%(default)s]") return p @@ -101,7 +102,7 @@ def main() -> None: crop_dask = da.from_array(vol_crop, chunks=vol.chunks) # Save cropped volume as OME-Zarr - save_omezarr(crop_dask, output_path, voxel_size=res, chunks=vol.chunks) + save_omezarr(crop_dask, output_path, voxel_size=res, chunks=vol.chunks, n_levels=args.n_levels) # Collect metrics using helper function original_shape = vol.shape diff --git a/scripts/linum_fix_illumination_3d.py b/scripts/linum_fix_illumination_3d.py index 7f32c7d6..b55174e5 100644 --- a/scripts/linum_fix_illumination_3d.py +++ b/scripts/linum_fix_illumination_3d.py @@ -42,6 +42,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: type=float, help="Values above this percentile will be clipped when\nestimating the flatfield (inside range [0-100]).", ) + p.add_argument("--n_levels", type=int, default=5, help="Number of levels in pyramid representation. [%(default)s]") p.add_argument( "--use_gpu", default=True, @@ -185,7 +186,7 @@ def main() -> None: print(f"Minimum value in the output volume is {min_value}. Clipping at 0.") out_dask = da.clip(out_dask, 0.0, None) - save_omezarr(out_dask, output_zarr, voxel_size=resolution, chunks=vol.chunks) + save_omezarr(out_dask, output_zarr, voxel_size=resolution, chunks=vol.chunks, n_levels=args.n_levels) tmp_dir.cleanup() diff --git a/scripts/linum_normalize_intensities_per_slice.py b/scripts/linum_normalize_intensities_per_slice.py index 1630dd19..8b5503ca 100644 --- a/scripts/linum_normalize_intensities_per_slice.py +++ b/scripts/linum_normalize_intensities_per_slice.py @@ -39,6 +39,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: p.add_argument("--sigma", type=float, default=1.0, help="Smoothing sigma for estimating the agarose mask. [%(default)s]") p.add_argument("--use_gpu", default=True, action=argparse.BooleanOptionalAction, help="Use GPU acceleration if available.") p.add_argument("--verbose", action="store_true", help="Print GPU information.") + p.add_argument("--n_levels", type=int, default=3, help="Number of levels in pyramid representation. [%(default)s]") return p @@ -71,7 +72,7 @@ def main() -> None: vol_normalized, background_thresholds = normalize_volume(vol_data, agarose_mask, args.percentile_max) - save_omezarr(da.from_array(vol_normalized), args.out_image, res, n_levels=3) + save_omezarr(da.from_array(vol_normalized), args.out_image, res, n_levels=args.n_levels) collect_normalization_metrics( vol_normalized=vol_normalized, diff --git a/scripts/linum_stitch_3d_refined.py b/scripts/linum_stitch_3d_refined.py index a4490068..dfc859f8 100644 --- a/scripts/linum_stitch_3d_refined.py +++ b/scripts/linum_stitch_3d_refined.py @@ -84,6 +84,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: "--output_refinements", type=str, default=None, help="Output JSON file to save computed refinements for analysis." ) p.add_argument("--overwrite", "-f", action="store_true", help="Overwrite output if it exists.") + p.add_argument("--n_levels", type=int, default=3, help="Number of levels in pyramid representation. [%(default)s]") return p @@ -292,7 +293,7 @@ def main() -> None: from linumpy.io.zarr import save_omezarr - save_omezarr(da.from_array(output), output_file, resolution, n_levels=3) + save_omezarr(da.from_array(output), output_file, resolution, n_levels=args.n_levels) # Collect metrics from linumpy.metrics import PipelineMetrics diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf index 01966335..643113b0 100644 --- a/workflows/reconst_3d/soct_3d_reconst.nf +++ b/workflows/reconst_3d/soct_3d_reconst.nf @@ -128,7 +128,7 @@ process resample_mosaic_grid { def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" """ linum_resample_mosaic_grid.py ${mosaic_grid} "mosaic_grid_z${slice_id}_resampled.ome.zarr" \ - -r ${params.resolution} ${gpu_flag} -v + -r ${params.resolution} ${gpu_flag} --n_levels 0 -v """ stub: @@ -147,7 +147,8 @@ process fix_focal_curvature { script: def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" """ - linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" ${gpu_flag} + linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" \\ + --n_levels 0 ${gpu_flag} """ stub: @@ -170,7 +171,7 @@ process fix_illumination { """ linum_fix_illumination_3d.py ${mosaic_grid} "mosaic_grid_z${slice_id}_illum_fix.ome.zarr" \ --n_processes ${params.processes} \ - --percentile_max ${params.clip_percentile_upper} ${gpu_flag} + --percentile_max ${params.clip_percentile_upper} ${gpu_flag} --n_levels 0 """ stub: @@ -244,6 +245,7 @@ process stitch_3d_with_refinement { --refinement_mode blend_shift \ --max_refinement_px ${params.max_blend_refinement_px} \ ${transform_arg} \ + --n_levels 0 \ -f """ @@ -291,7 +293,7 @@ process beam_profile_correction { script: """ linum_compensate_psf_model_free.py ${slice_3d} "slice_z${slice_id}_axial_corr.ome.zarr" \ - --percentile_max ${params.clip_percentile_upper} + --percentile_max ${params.clip_percentile_upper} --n_levels 0 """ stub: @@ -315,7 +317,7 @@ process crop_interface { linum_crop_3d_mosaic_below_interface.py ${image} "slice_z${slice_id}_crop_interface.ome.zarr" \ --depth ${params.crop_interface_out_depth} \ --crop_before_interface \ - --percentile_max ${params.clip_percentile_upper} + --percentile_max ${params.clip_percentile_upper} --n_levels 0 """ stub: @@ -338,7 +340,7 @@ process normalize { def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" """ linum_normalize_intensities_per_slice.py ${image} "slice_z${slice_id}_normalize.ome.zarr" \ - --percentile_max ${params.clip_percentile_upper} ${gpu_flag} + --percentile_max ${params.clip_percentile_upper} ${gpu_flag} --n_levels 0 """ stub: From e1a56f976c9d964ffd616cf1090ec2304605963a Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Thu, 30 Apr 2026 12:35:50 -0400 Subject: [PATCH 4/6] Docs update Co-authored-by: Copilot --- docs/LIBRARY_MODULES.md | 20 +- docs/N4_GPU.md | 4 +- docs/NEXTFLOW_WORKFLOWS.md | 6 +- docs/PIPELINE_OVERVIEW.md | 6 +- docs/RECONSTRUCTION_TUNING.md | 396 +++++++++++++++++++++++++ docs/RECONSTRUCTION_TUNING_QUICKREF.md | 131 ++++++++ docs/SCRIPTS_REFERENCE.md | 2 +- docs/SHIFTS_FILE_FORMAT.md | 2 +- docs/SLICE_CONFIG_FEATURE.md | 2 +- docs/SLICE_INTERPOLATION_FEATURE.md | 4 +- docs/pipelines.rst | 2 + 11 files changed, 551 insertions(+), 24 deletions(-) create mode 100644 docs/RECONSTRUCTION_TUNING.md create mode 100644 docs/RECONSTRUCTION_TUNING_QUICKREF.md diff --git a/docs/LIBRARY_MODULES.md b/docs/LIBRARY_MODULES.md index 7ac2e53c..1834882d 100644 --- a/docs/LIBRARY_MODULES.md +++ b/docs/LIBRARY_MODULES.md @@ -51,13 +51,13 @@ from linumpy.io.thorlabs import ThorImageOCT point: it materialises the requested level into memory and returns `(array, voxel_size)`. With ``use_gpu=False`` (default) the array is a ``numpy.ndarray``; with ``use_gpu=True`` it dispatches through -[`linumpy.gpu.zarr_io.read_zarr_to_gpu`](GPU_ACCELERATION.md) and returns a +`linumpy.gpu.zarr_io.read_zarr_to_gpu` (see {doc}`GPU_ACCELERATION`) and returns a ``cupy.ndarray`` (kvikio/GPUDirect Storage when available, otherwise the ``zarr.config.enable_gpu`` fallback). Voxel size is ordered to match the array axes (Z, Y, X for 3D volumes; Y, X for 2D mosaics). -See [Mosaic Grid Format](MOSAIC_GRID_FORMAT.md) and -[Slice Config Feature](SLICE_CONFIG_FEATURE.md) for format details. +See {doc}`Mosaic Grid Format ` and +{doc}`Slice Config Feature ` for format details. ### `linumpy.cli` and `linumpy.config` @@ -84,7 +84,7 @@ from linumpy.metrics import ( ) ``` -See [Reconstruction Diagnostics](RECONSTRUCTION_DIAGNOSTICS.md). +See {doc}`Reconstruction Diagnostics `. ### `linumpy.geometry` @@ -146,7 +146,7 @@ CuPy-backed versions of hot paths. Each public entry point either takes a fastest backend available (kvikio/GDS native → `zarr.config.enable_gpu`). * `gpu.kvikio_zarr` — kvikio / GPUDirect Storage backend for the dispatcher. -See [GPU Acceleration](GPU_ACCELERATION.md) and [N4 GPU](N4_GPU.md). +See {doc}`GPU Acceleration ` and {doc}`N4 GPU `. ### `linumpy.stack_alignment` @@ -158,7 +158,7 @@ from linumpy.stack_alignment.units import detect_shift_units from linumpy.stack_alignment.filter import filter_outliers ``` -See [Shifts File Format](SHIFTS_FILE_FORMAT.md). +See {doc}`Shifts File Format `. ### `linumpy.imaging` @@ -207,11 +207,11 @@ A typical pipeline run wires these subpackages as follows: `gpu.n4` backend when CUDA is available) and `imaging.orientation` / RAS alignment via `reference.allen`. 7. **Diagnostics** use `metrics` collectors throughout; see - [Reconstruction Diagnostics](RECONSTRUCTION_DIAGNOSTICS.md). + {doc}`Reconstruction Diagnostics `. ## See also * Auto-generated API reference: [api/linumpy](api/linumpy/index.rst) -* [Pipeline Overview](PIPELINE_OVERVIEW.md) -* [Scripts Reference](SCRIPTS_REFERENCE.md) -* [Nextflow Workflows](NEXTFLOW_WORKFLOWS.md) +* {doc}`Pipeline Overview ` +* {doc}`Scripts Reference ` +* {doc}`Nextflow Workflows ` diff --git a/docs/N4_GPU.md b/docs/N4_GPU.md index 1b0c965d..6515d353 100644 --- a/docs/N4_GPU.md +++ b/docs/N4_GPU.md @@ -134,7 +134,7 @@ server. ## 5. Performance Benchmarks measured on a single NVIDIA GPU (47 GB) using the -`linum_benchmark_n4_gpu` script (see [Scripts reference](SCRIPTS_REFERENCE.md)). +`linum_benchmark_n4_gpu` script (see {doc}`SCRIPTS_REFERENCE`). The CPU path is `SimpleITK.N4BiasFieldCorrectionImageFilter` with the same control-point spacing and iteration schedule as the GPU path. Both paths include the `shrink_factor` downsample. The GPU column already excludes a @@ -218,7 +218,7 @@ directory. The Nextflow `reconst_3d` workflow exposes a single global GPU switch (`params.use_gpu`, defined in the reconst_3d Nextflow config; see -[Nextflow workflows](NEXTFLOW_WORKFLOWS.md)). +{doc}`Nextflow workflows `). When set, the `correct_bias_field` process runs the GPU N4 backend with `maxForks = 1` to avoid GPU contention; otherwise it uses the SimpleITK CPU path with `params.processes` threads. No per-process flag is needed: diff --git a/docs/NEXTFLOW_WORKFLOWS.md b/docs/NEXTFLOW_WORKFLOWS.md index 2a078e64..9e5fe1f3 100644 --- a/docs/NEXTFLOW_WORKFLOWS.md +++ b/docs/NEXTFLOW_WORKFLOWS.md @@ -269,7 +269,7 @@ gaps are not interpolated (insufficient information) and remain as holes. When zmorph's quality gates fail the slot is left as a genuine gap (no zarr output); a manifest fragment and diagnostics JSON are still emitted. See -[SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md) for details. +{doc}`SLICE_INTERPOLATION_FEATURE` for details. #### Automatic Slice Quality Assessment @@ -638,7 +638,7 @@ params { For GPU support: - NVIDIA GPU with CUDA support - CuPy installed: `uv pip install cupy-cuda12x` -- See [GPU_ACCELERATION.md](GPU_ACCELERATION.md) for detailed setup +- See {doc}`GPU_ACCELERATION` for detailed setup ### Expected Speedups @@ -1124,7 +1124,7 @@ without exhaustion. zmorph's quality gates reject the interpolation it emits a manifest fragment with `interpolation_failed=true` and no zarr; `finalise_interpolation` stamps that into `slice_config_final.csv` and the slot stays a genuine gap -in the stacked volume. See [`SLICE_INTERPOLATION_FEATURE.md`](SLICE_INTERPOLATION_FEATURE.md) +in the stacked volume. See {doc}`SLICE_INTERPOLATION_FEATURE` for the full policy. ### `finalise_interpolation` is published-only diff --git a/docs/PIPELINE_OVERVIEW.md b/docs/PIPELINE_OVERVIEW.md index c8b55d0c..97cef98e 100644 --- a/docs/PIPELINE_OVERVIEW.md +++ b/docs/PIPELINE_OVERVIEW.md @@ -47,8 +47,6 @@ flowchart TD P3 --> INTER INTER --> R00 INTER --> R01 - R14 --> OUT - R14b --> OUT R16 --> OUT classDef opt fill:#f5f5f5,stroke:#999,stroke-dasharray:3 3,color:#555 @@ -273,7 +271,7 @@ interpolate_missing_slice - Fills single-slice gaps using the two neighbouring slices - Enabled by `interpolate_missing_slices = true` (default) - Default method is **zmorph** (z-aware morphing via fractional affine - transforms on the boundary planes) — see [SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md) + transforms on the boundary planes) — see {doc}`SLICE_INTERPOLATION_FEATURE` - When zmorph's quality gates fail the slot stays a genuine gap (no zarr emitted) — nothing is fabricated - `finalise_interpolation` merges per-slice manifest fragments into @@ -664,7 +662,7 @@ nextflow run preproc_rawtiles.nf --input /path/to/data --output /path/to/output - NVIDIA GPU with CUDA support - CuPy installed (`uv pip install cupy-cuda12x`) -- See [GPU_ACCELERATION.md](GPU_ACCELERATION.md) for detailed setup +- See {doc}`GPU_ACCELERATION` for detailed setup --- diff --git a/docs/RECONSTRUCTION_TUNING.md b/docs/RECONSTRUCTION_TUNING.md new file mode 100644 index 00000000..1bd24405 --- /dev/null +++ b/docs/RECONSTRUCTION_TUNING.md @@ -0,0 +1,396 @@ +# Reconstruction Parameter Tuning Guide + +A walkthrough of the parameters that most affect 3D reconstruction quality, what +they do, what raising/lowering them changes, and which knobs to reach for when +specific artifacts show up. Aimed at users who are new to the pipeline; if you +already know your way around it, the companion {doc}`Reconstruction Tuning +Quick Reference ` is denser. + +For the runtime-mechanics view of the same parameters, see the inline comments +in `workflows/reconst_3d/nextflow.config` (described in {doc}`NEXTFLOW_WORKFLOWS`). +For diagnostic *scripts* (rotation drift, tile dilation, motor-only stitching), +see {doc}`RECONSTRUCTION_DIAGNOSTICS`. + +--- + +## How to approach tuning + +Reconstruction problems almost always show up at slice boundaries: edges that +don't line up between slices, intensity steps, sudden Z-jumps, or rotated +sections. The pipeline runs in stages, and each artifact is usually produced +by one specific stage. The fastest tuning workflow is: + +1. **Look at the previews first.** `output/common_space_previews/*.png` shows + each slice in XY/XZ/YZ. Most artifacts are visible at a glance. +2. **Identify the stage.** Use the symptom map below to pick a stage. +3. **Adjust the smallest set of parameters that targets that stage.** +4. **Re-run with `-resume`.** Nextflow caches everything upstream of the + change, so iteration is fast. +5. **Verify against previews and `output/stack/*.csv` metrics.** + +The pipeline stages, in order: + +``` +Tile stitching → Slice quality → Common-space alignment → Missing-slice + interpolation → Pairwise registration → Stacking → Bias correction → Atlas +``` + +A change to a parameter in stage *N* invalidates caches for stage *N* and all +later stages. Earlier stages are reused. + +--- + +## Quick orientation: which stage produces which artifact? + +| Artifact | Most likely stage | +|---|---| +| Tiles within a slice misaligned, seams visible | Tile stitching | +| Whole slice looks degraded, smeared, or missing tissue | Slice quality / preprocessing | +| Slice shifted left/right or up/down relative to neighbours | Common-space alignment | +| One slice has a clear gap then resumes | Missing-slice interpolation | +| Two consecutive slices are rotated relative to each other | Pairwise registration | +| Sudden Z-jump or tilt between slices | Stacking | +| Slow XY drift across many slices | Stacking (translation accumulation) | +| Visible step in brightness between slices | Bias correction | +| Atlas overlay is misaligned | Atlas registration | + +--- + +## Profiles before parameters + +Before reaching for individual parameters, pick a profile. Profiles set +sensible groups of parameters together, and you can override individual values +on top of a profile. They are defined in `workflows/reconst_3d/nextflow.config` +(see {doc}`NEXTFLOW_WORKFLOWS`). + +- **`-profile conservative`** (recommended starting point): trusts motor + positions for XY, applies only rotation from pairwise registration, skips + any registration flagged warning/error, interpolates single-slice gaps. + Fails gracefully when registration is unreliable. +- **`-profile aggressive`**: applies full pairwise transforms including XY + translations and accumulates them. Best alignment when registration is + reliable. Can compound errors when it is not. +- **`-profile minimal`**: ignores pairwise registration entirely + (`apply_pairwise_transforms = false`). Most stable, fastest. Use when motor + positions are reliable and registration repeatedly fails. + +If the conservative profile produces a clean reconstruction, stop. Most of the +parameters below exist to handle specific failure modes you may not have. + +--- + +## Stage 1: Tile stitching + +Assembles the tiles within each slice into a single 2D image. The dominant +choice here is whether to trust motor positions or fit a transform from the +images. + +| Parameter | Default | Effect | +|---|---|---| +| `use_motor_positions_for_stitching` | `true` | Use motor encoder positions for tile placement. Recommended — analysis showed image-based stitching introduces ~3% systematic compression. | +| `stitch_overlap_fraction` | `0.2` | Expected fraction of overlap between adjacent tiles. **Must match the acquisition setting.** | +| `stitch_blending_method` | `'diffusion'` | Tile-seam blending. `'diffusion'` is the smoothest, `'average'` is faster, `'none'` shows raw seams (useful when debugging tile placement). | +| `max_blend_refinement_px` | `10` | Sub-pixel refinement budget when blending seams. Larger values let blending hide small motor inaccuracies but can over-smear true features. | +| `stitch_global_transform` | `false` | Pool a single 2x2 affine across many slices and reuse it. Helps when individual slices have too few tiles for a stable per-slice fit. Enable if per-slice stitch transforms jitter. | + +**If you see seams within a slice:** +1. Confirm `stitch_overlap_fraction` matches the acquisition. +2. Try `stitch_blending_method = 'diffusion'` if not already. +3. As a last resort, raise `max_blend_refinement_px` (10 → 20). + +**If individual slices look fine but the whole slice is rotated/skewed +slightly compared to neighbours**: this is usually a per-slice stitch +transform problem. Try `stitch_global_transform = true`. + +--- + +## Stage 2: Slice quality assessment + +Optionally scores each slice and excludes the bad ones from common-space +alignment so they don't poison their neighbours. + +| Parameter | Default | Effect | +|---|---|---| +| `auto_assess_quality` | `false` | Enable automatic quality scoring. | +| `auto_assess_min_quality` | `0.3` | Slices scored below this are excluded. Lower = more permissive. | +| `auto_assess_exclude_first` | `1` | Always exclude the first N calibration slices. | +| `auto_exclude_enabled` | `true` | Detect runs of consecutive low-quality registrations after pairwise (different mechanism). | + +**When to enable:** if you see one or two badly degraded slices dragging the +common-space alignment of their neighbours. + +--- + +## Stage 3: Common-space alignment + +Aligns each stitched slice into a shared XY canvas using `shifts_xy.csv` +(motor positions). Most XY-misalignment problems are tuned here. + +### Encoder glitch correction + +`detect_rehoming` corrects two known motor artifacts: + +1. **Encoder glitch spikes**: a single large step that self-cancels with the + adjacent step. The motor moved cleanly; the encoder briefly reported a + wrong position. +2. **Tile-FOV expansion events** (legacy data only): adding a new column of + tiles at acquisition time produced a motor jump of exactly N × tile_FOV. + +| Parameter | Default | Effect | +|---|---|---| +| `detect_rehoming` | `true` | Enable both passes. Almost always wanted. | +| `rehoming_max_shift_mm` | `0.5` | Steps below this magnitude aren't checked. Lower to catch smaller glitches at the cost of false positives. | +| `rehoming_return_fraction` | `0.4` | Detection sensitivity. Lower = more conservative. | +| `tile_fov_mm` | `null` | Set to the tile field-of-view (e.g. `0.875`) only when re-using older `shifts_xy.csv` files that contain mosaic-expansion artifacts. New shift files generated by `linum_estimate_xy_shift_from_metadata.py` no longer need this. | +| `tile_fov_tolerance` | `0.05` | Fractional tolerance when matching steps to tile-FOV multiples. | + +### Image-based refinement of unreliable transitions + +For transitions where the motor reading is flagged unreliable +(`reliable=0` in shifts CSV), the pipeline can fall back to image-based +phase correlation between the two slices. + +| Parameter | Default | Effect | +|---|---|---| +| `common_space_refine_unreliable` | `false` | Enable image-based refinement. Turn on when slices grow significantly between acquisitions or when raw motor steps are noisy. | +| `common_space_refine_max_discrepancy_px` | `0` | Reject the image estimate if it disagrees with motor by more than this many pixels. `0` = accept all. Recommended `50` if you see image refinement producing wild values. | +| `common_space_refine_min_correlation` | `0.0` | Reject refinements with low correlation quality. Recommended `0.15`–`0.3`. | + +**Recipe — large XY jumps between specific slices:** + +1. Inspect `shifts_xy.csv` for the affected slice IDs. Look at `x_shift_mm` + and `y_shift_mm`. Steps > 0.5 mm are suspect. +2. Check `output/detect_rehoming_events/shifts_xy_clean.csv`. If `reliable=0` + for those rows, the corrected file already accounts for them. +3. If the misalignment persists, enable + `common_space_refine_unreliable = true` and re-run. + +### Excluded-slice interpolation + +| Parameter | Default | Effect | +|---|---|---| +| `common_space_excluded_slice_mode` | `'local_median'` | How to fill XY positions for excluded slices. `'local_median'` averages neighbour shifts. | +| `common_space_excluded_slice_window` | `2` | Number of neighbours considered. | + +--- + +## Stage 4: Missing-slice interpolation + +Fills single-slice gaps so the stack has a continuous Z. The default is +`zmorph` — a 2D registration of the boundary planes is used to morph +fractional-affine intermediate planes between the two neighbours. + +| Parameter | Default | Effect | +|---|---|---| +| `interpolate_missing_slices` | `true` | Enable interpolation. Disable to keep gaps explicit. | +| `interpolation_method` | `'zmorph'` | `'zmorph'` (registration-based morph), `'weighted'` (z-smoothed linear blend), `'average'` (50/50). | +| `interpolation_blend_method` | `'gaussian'` | `'gaussian'` (feathered) or `'linear'`. | +| `interpolation_min_overlap_correlation` | `0.3` | If the boundary-plane NCC is below this, falls back to `'weighted'`. | +| `interpolation_min_ncc_improvement` | `0.05` | If post-registration NCC doesn't improve by this much, falls back to `'weighted'`. | + +**When to lower `interpolation_min_overlap_correlation`:** if zmorph keeps +falling back to weighted on slices that visually look fine, lower to e.g. +`0.2`. Watch for spurious deformations on noisier boundaries. + +**When to disable interpolation:** if you specifically want to see where the +missing slices are (e.g. for QC), set `interpolate_missing_slices = false`. + +--- + +## Stage 5: Pairwise registration + +Computes small inter-slice corrections (rotation, sub-pixel translation). The +*main* alignment comes from motor positions; pairwise transforms are +refinements applied on top during stacking. + +| Parameter | Default | Effect | +|---|---|---| +| `registration_transform` | `'euler'` | `'translation'` (XY only) or `'euler'` (XY + rotation). Use `'euler'` for any sample where slices are rotated relative to each other. | +| `registration_max_translation` | `200.0` | Optimizer bound on translation (px). Keep large; the actual *applied* translation is governed in stacking. Increase only if pairwise metrics report frequent boundary hits. | +| `registration_max_rotation` | `5.0` | Optimizer bound on rotation (deg). Raise (e.g. to `35.0`) for obliquely-mounted samples where slice-to-slice rotation can be large. | +| `registration_initial_alignment` | `'both'` | Initialization. `'both'` runs centre-of-mass and gradient inits and picks the better one. | + +**Recipe — pairwise registration looks "stuck":** + +1. Open a pairwise metrics JSON in `output/register_pairwise/`. Look for + `mag` (translation magnitude in px) and `rotation`. +2. If many slices have `mag` ≈ `registration_max_translation` × 0.95+, the + optimizer is hitting the boundary. Raise the bound (e.g. 200 → 400) but + inspect: usually this means the input alignment is wrong upstream, not + that the bound is too tight. +3. Keep `skip_warning_transforms = true` (in stacking) so unreliable + boundary-hit transforms aren't applied. + +--- + +## Stage 6: Stacking + +Where pairwise corrections actually get applied to the volume. This is the +single most-tuned stage and the source of most reconstruction drift. + +### Whether to apply transforms at all + +| Parameter | Default | Effect | +|---|---|---| +| `apply_pairwise_transforms` | `true` | Master switch. Set `false` to stack motor-only (this is what `-profile minimal` does). | +| `apply_rotation_only` | `false` | Apply only the rotation component, keep XY from motor positions. The conservative profile sets this `true`. | +| `use_expected_z_overlap` | `true` | Use the expected slice thickness for Z spacing instead of correlation-based matching. Recommended; correlation matching is brittle at slice boundaries. | +| `max_rotation_deg` | `5.0` | Clamp applied rotations larger than this. Prevents single bad slices from rotating the entire stack downstream. | + +### Per-slice transform gating + +Each pairwise registration produces a status (ok/warning/error) and metrics. +The pipeline can refuse to apply low-quality transforms. + +| Parameter | Default | Effect | +|---|---|---| +| `skip_error_transforms` | `true` | Skip transforms registered against interpolated slices etc. **Keep enabled.** | +| `skip_warning_transforms` | `true` | Skip transforms that hit the optimizer boundary. **Keep enabled.** Disabling causes Z-positioning errors. | +| `transform_confidence_high` | `0.6` | Above this, full transform applied. | +| `transform_confidence_low` | `0.3` | Below this, transform skipped entirely. Between low and high, rotation-only. | +| `load_transform_max_rotation` | `0.0` | Metric-based rotation gate. Set to e.g. `4.0` for noisy data — transforms with rotation > 4° are not loaded at all. | +| `load_transform_min_zcorr` | `0.0` | Metric-based Z-correlation gate. Pairs with `load_transform_max_rotation`. | + +### Translation accumulation + +Pairwise translations can be accumulated as cumulative canvas offsets — this +"steers the viewing plane" through the volume. Useful when the sample drifts +slowly across many slices. + +| Parameter | Default | Effect | +|---|---|---| +| `stack_accumulate_translations` | `true` | Enable cumulative-offset accumulation. | +| `stack_confidence_weight_translations` | `true` | Weight each translation by its confidence before accumulating. Reduces noise from low-quality slices. | +| `stack_translation_smooth_sigma` | `3.0` | Gaussian smoothing (sigma in slices) over accumulated translations. Higher removes more jitter; too high and you lose legitimate trends. | +| `stack_max_pairwise_translation` | `0` | Translations exceeding this magnitude (px) are zeroed before accumulation. **Set to 0 to disable.** Even an optimizer-boundary value carries directional information; zeroing it is usually worse than keeping it. Use a non-zero value only if specific slices have clearly erroneous translations worse than zero. | +| `stack_max_cumulative_drift_px` | `50` | Cap on total accumulated drift. **Recommended: `0` (disabled).** If common-space alignment is correct, accumulated translations should converge naturally; a cap usually hides a real upstream problem. | +| `stack_smooth_window` | `5` | Moving-average window over per-slice rotations. Reduces visible jumps from outlier slices. | + +**Recipe — slow XY drift across many slices in XZ/YZ view:** + +1. Plot `output/stack/translation_per_slice.csv` (or read it). If translations + trend monotonically, that's drift. +2. First check that common-space alignment is correct (XY view of each slice + aligns). If common-space is the problem, fix it there first. +3. If common-space looks fine, raise `stack_translation_smooth_sigma` + (3 → 5) to wash out noise. +4. Disable `stack_max_cumulative_drift_px` (set `0`) if it's clamping a real + trend. + +**Recipe — sudden tilt or jump at one specific slice:** + +1. Open `output/stack/stacking_decisions.csv`. The affected row will usually + show `transform_loaded=False` (a gap) or the pairwise registration metric + will show a high rotation/translation outlier. +2. If `transform_loaded=False`: check `output/register_pairwise/` metrics for + why it was rejected. Often `overall_status="error"` because a neighbour + was interpolated. +3. If a noisy rotation: lower `load_transform_max_rotation` to a tighter + value (e.g. 4.0) or raise the gate. + +--- + +## Stage 7: Bias field correction (optional) + +N4 bias correction applied after stacking. Removes depth-dependent attenuation +and slow inter-slice intensity drift. + +| Parameter | Default | Effect | +|---|---|---| +| `correct_bias_field` | `false` | Master switch. | +| `bias_mode` | `'two_pass'` | `'per_section'`, `'global'`, or `'two_pass'` (per-section then global). `'two_pass'` is the recommended default. | +| `bias_strength` | `1.0` | Mixing strength. `0.0` = passthrough, `1.0` = full correction. Lower if N4 is over-correcting tissue features. | +| `bias_histogram_match_per_zplane` | `true` | Match each Z-plane to the global tissue distribution before N4. **Strongly reduces inter-slice intensity steps.** Roughly an order of magnitude better than chunked HM in tested cases. | +| `bias_tissue_threshold` | `0.005` | Voxels at or below this intensity are considered background and excluded from histogram matching. Lower if tissue is being treated as background. | +| `bias_zprofile_smooth_sigma` | `2.0` | Gaussian smoothing (sigma in Z-planes) of a residual scalar gain after HM. Eliminates the small inter-slice steps HM cannot remove. `0` = disabled. Typical range 2–4. | + +**Recipe — visible intensity steps between slices:** + +1. Enable `correct_bias_field = true`. +2. Keep `bias_histogram_match_per_zplane = true` and + `bias_zprofile_smooth_sigma = 2.0`. +3. If steps remain, raise sigma (2 → 4). +4. If tissue features are getting flattened, lower `bias_strength` (1.0 → 0.7). + +--- + +## Stage 8: Atlas registration (optional) + +Registers the stacked volume to the Allen Mouse Brain Atlas. + +| Parameter | Default | Effect | +|---|---|---| +| `align_to_ras_enabled` | `false` | Master switch. | +| `allen_resolution` | `25` | Atlas resolution (10/25/50/100 µm). Lower = slower & higher memory. | +| `allen_metric` | `'MI'` | `'MI'`, `'MSE'`, `'CC'`, `'AntsCC'`. `'MI'` is robust across modalities. | +| `allen_registration_level` | `2` | Pyramid level used for registration. Higher = faster but coarser. Output is always written at all pyramid resolutions. | +| `ras_input_orientation` | `''` | 3-letter code (e.g. `'PIR'`) telling the registrar what orientation the input is in. Crucial for correct atlas alignment. | +| `ras_initial_rotation` | `''` | `"Rx Ry Rz"` initial rotation hint in degrees. Use when MOMENTS-based init fails. | +| `allen_preview` | `true` | Save a 3×3 input/aligned/template comparison. Always check this. | +| `ras_orientation_preview` | `false` | Save a preview after orientation/initial-rotation are applied but before registration. **Enable when tuning `ras_input_orientation` and `ras_initial_rotation`** — you can verify orientation parameters without running the full registration. | + +**Recipe — atlas overlay is mirrored or rotated 90°:** + +1. Set `ras_orientation_preview = true` and re-run `align_to_ras` only. +2. Inspect the orientation preview: if the brain is rotated, adjust + `ras_initial_rotation` (e.g. `"0.0 0.0 90.0"` for a 90° Z rotation). +3. If the brain is flipped, adjust `ras_input_orientation` (try the + neighbour code: e.g. `'PIR'` → `'AIR'` if anterior/posterior is + inverted). +4. Iterate until the orientation preview looks roughly aligned to the + atlas template, then re-run with the full registration. + +--- + +## Reference parameter sets + +The canonical config and profile blocks already encode several known-good +combinations. For per-subject overrides see the deployed configs at +`/scratch/workspace/sub-XX/nextflow.config`. Common combinations: + +**For data with mosaic-grid expansion events (legacy `shifts_xy.csv`):** +```groovy +detect_rehoming = true +tile_fov_mm = 0.875 // your acquisition tile FOV +common_space_refine_unreliable = true +common_space_refine_max_discrepancy_px = 0 +``` + +**For obliquely-mounted samples (large slice-to-slice rotations):** +```groovy +registration_transform = 'euler' +registration_max_rotation = 35.0 +load_transform_max_rotation = 4.0 +stack_smooth_window = 5 +``` + +**For visible inter-slice intensity steps:** +```groovy +correct_bias_field = true +bias_mode = 'two_pass' +bias_histogram_match_per_zplane = true +bias_zprofile_smooth_sigma = 2.0 +``` + +**For one or two bad slices contaminating the stack:** +```groovy +auto_assess_quality = true +auto_assess_min_quality = 0.3 +auto_exclude_enabled = true +auto_exclude_consecutive = 3 +auto_exclude_z_corr = 0.6 +``` + +--- + +## See also + +- {doc}`Reconstruction Tuning Quick Reference ` — + symptom→fix table for experienced users. +- {doc}`Reconstruction Diagnostics ` — diagnostic + scripts and the `diagnostic_mode` master switch. +- {doc}`Pipeline Overview ` — high-level stage descriptions. +- {doc}`Nextflow Workflows ` — running and configuring the + pipeline. +- `workflows/reconst_3d/nextflow.config` — full canonical config with + per-parameter inline comments (see {doc}`NEXTFLOW_WORKFLOWS`). diff --git a/docs/RECONSTRUCTION_TUNING_QUICKREF.md b/docs/RECONSTRUCTION_TUNING_QUICKREF.md new file mode 100644 index 00000000..b7f6f3bc --- /dev/null +++ b/docs/RECONSTRUCTION_TUNING_QUICKREF.md @@ -0,0 +1,131 @@ +# Reconstruction Tuning — Quick Reference + +Cheat sheet for tuning the 3D reconstruction pipeline. Assumes you know what +the stages do. For background and recipes, see the full {doc}`Reconstruction +Parameter Tuning Guide `. + +--- + +## Profiles first + +| Profile | When to use | +|---|---| +| `-profile conservative` | Default starting point. Motor-XY + rotation-only. | +| `-profile aggressive` | Registration is reliable; want full pairwise corrections. | +| `-profile minimal` | Motor-only. Registration consistently fails. | + +--- + +## Symptom → fix + +| Symptom | First parameters to try | +|---|---| +| Seams within a slice | `stitch_overlap_fraction` (match acquisition); `stitch_blending_method='diffusion'`; raise `max_blend_refinement_px` | +| Per-slice stitch transform jitter | `stitch_global_transform=true` | +| One/two bad slices contaminating neighbours | `auto_assess_quality=true`, `auto_assess_min_quality=0.3` | +| Large XY jump at specific slices | Check `shifts_xy.csv`; `detect_rehoming=true`; `common_space_refine_unreliable=true` | +| Repeating large XY steps near tile-FOV multiples (legacy data) | `tile_fov_mm=`, `tile_fov_tolerance=0.05` | +| Image refinement producing wild values | `common_space_refine_max_discrepancy_px=50`, `common_space_refine_min_correlation=0.15` | +| Single-slice gap visible in Z | `interpolate_missing_slices=true` (default); lower `interpolation_min_overlap_correlation` if zmorph keeps falling back | +| Slice-to-slice rotation in oblique samples | `registration_transform='euler'`, raise `registration_max_rotation` (e.g. 35.0) | +| Sudden tilt/jump at one slice | Check `output/stack/stacking_decisions.csv` for `transform_loaded=False`; tighten `load_transform_max_rotation` (e.g. 4.0) | +| Slow XY drift across many slices | Raise `stack_translation_smooth_sigma` (3→5); set `stack_max_cumulative_drift_px=0` | +| Optimizer-boundary translation hits | Keep `skip_warning_transforms=true`; **don't** zero them via `stack_max_pairwise_translation` unless they're clearly worse than zero | +| Visible inter-slice intensity steps | `correct_bias_field=true`, `bias_histogram_match_per_zplane=true`, `bias_zprofile_smooth_sigma=2.0` (raise to 4 if persists) | +| Bias correction over-flattening tissue | Lower `bias_strength` (1.0 → 0.7) | +| Atlas overlay rotated 90° / flipped | `ras_orientation_preview=true`, then tune `ras_input_orientation` and `ras_initial_rotation` | + +--- + +## Per-stage quick tables + +### Tile stitching +| Param | Default | Lever | +|---|---|---| +| `use_motor_positions_for_stitching` | `true` | Keep on. | +| `stitch_overlap_fraction` | `0.2` | Must match acquisition. | +| `stitch_blending_method` | `'diffusion'` | `'none'` to debug seams. | +| `stitch_global_transform` | `false` | Pool affine across slices. | + +### Common-space alignment +| Param | Default | Lever | +|---|---|---| +| `detect_rehoming` | `true` | Keep on. | +| `rehoming_max_shift_mm` | `0.5` | Lower to catch smaller glitches. | +| `tile_fov_mm` | `null` | Set only for legacy shift CSVs with FOV-multiple jumps. | +| `common_space_refine_unreliable` | `false` | Image-based fallback for `reliable=0` shifts. | +| `common_space_refine_max_discrepancy_px` | `0` | Recommended `50` to gate wild image estimates. | +| `common_space_refine_min_correlation` | `0.0` | Recommended `0.15`–`0.3`. | + +### Pairwise registration +| Param | Default | Lever | +|---|---|---| +| `registration_transform` | `'euler'` | `'translation'` if no rotation expected. | +| `registration_max_rotation` | `5.0` | Raise for oblique samples (e.g. 35.0). | +| `registration_max_translation` | `200.0` | Keep large; not the actual applied bound. | + +### Stacking +| Param | Default | Lever | +|---|---|---| +| `apply_pairwise_transforms` | `true` | `false` for motor-only. | +| `apply_rotation_only` | `false` | `true` (conservative): keep XY from motor. | +| `skip_warning_transforms` | `true` | **Keep on.** Disabling causes Z errors. | +| `skip_error_transforms` | `true` | **Keep on.** | +| `max_rotation_deg` | `5.0` | Clamp applied rotation. | +| `load_transform_max_rotation` | `0.0` | Tighten gate (e.g. 4.0) for noisy pairwise rotations. | +| `stack_accumulate_translations` | `true` | Cumulative canvas offsets. | +| `stack_translation_smooth_sigma` | `3.0` | Higher = smoother accumulated drift. | +| `stack_max_cumulative_drift_px` | `50` | Set `0` to disable cap (recommended). | +| `stack_max_pairwise_translation` | `0` | Set `0` to disable boundary zeroing (recommended). | +| `stack_smooth_window` | `5` | Moving avg over rotations. | +| `transform_confidence_high` | `0.6` | Full transform threshold. | +| `transform_confidence_low` | `0.3` | Skip threshold. Between = rotation-only. | + +### Bias correction +| Param | Default | Lever | +|---|---|---| +| `correct_bias_field` | `false` | Master switch. | +| `bias_mode` | `'two_pass'` | `'per_section'` if per-slice issues only. | +| `bias_strength` | `1.0` | Lower if over-flattening. | +| `bias_histogram_match_per_zplane` | `true` | Keep on. | +| `bias_zprofile_smooth_sigma` | `2.0` | Range 2–4. | +| `bias_tissue_threshold` | `0.005` | Lower if tissue mistaken for bg. | + +### Atlas registration +| Param | Default | Lever | +|---|---|---| +| `align_to_ras_enabled` | `false` | Master switch. | +| `allen_resolution` | `25` | Atlas µm. | +| `allen_registration_level` | `2` | Higher = faster, coarser. | +| `ras_input_orientation` | `''` | 3-letter code (e.g. `'PIR'`). | +| `ras_initial_rotation` | `''` | `"Rx Ry Rz"` deg hint. | +| `ras_orientation_preview` | `false` | Turn on while tuning orientation. | + +--- + +## Diagnostic file map + +| File | What to look at | +|---|---| +| `output/common_space_previews/*.png` | Per-slice XY/XZ/YZ — most artifacts visible here. | +| `output/stack/stacking_decisions.csv` | `transform_loaded` flags, applied rotations/translations. | +| `output/stack/z_matches.csv` | Z-overlap correlation per pair. | +| `output/register_pairwise/*_metrics.json` | Per-pair `mag`, `rotation`, `overall_status`, `z_correlation`. | +| `output/detect_rehoming_events/shifts_xy_clean.csv` | Corrected shifts; `reliable` column. | +| `shifts_xy.csv` | Raw motor steps (`x_shift_mm`, `y_shift_mm`). Steps > 0.5mm are suspect. | + +--- + +## Rerun cascade rules + +A change to a parameter invalidates that stage and all downstream stages. +Use `-resume` — Nextflow caches everything upstream automatically. + +| Change to | Re-runs from | +|---|---| +| `detect_rehoming`, `tile_fov_mm` | `detect_rehoming` → all downstream | +| `common_space_*` | `common_space` → all downstream | +| `registration_*` | `register_pairwise` → all downstream | +| `apply_*`, `stack_*`, `load_transform_*`, `transform_confidence_*` | `stack` → all downstream | +| `bias_*`, `correct_bias_field` | `correct_bias_field` → atlas | +| `align_to_ras_*`, `allen_*`, `ras_*` | `align_to_ras` only | diff --git a/docs/SCRIPTS_REFERENCE.md b/docs/SCRIPTS_REFERENCE.md index 4450867f..1d2e97f5 100644 --- a/docs/SCRIPTS_REFERENCE.md +++ b/docs/SCRIPTS_REFERENCE.md @@ -593,7 +593,7 @@ genuine gap (no zarr output) and the failure is stamped into `slice_config_final.csv`. `average` and `weighted` are available as explicit baselines. -See [SLICE_INTERPOLATION_FEATURE.md](SLICE_INTERPOLATION_FEATURE.md) for the +See {doc}`SLICE_INTERPOLATION_FEATURE` for the physical model and parameter-tuning guidance. ```bash diff --git a/docs/SHIFTS_FILE_FORMAT.md b/docs/SHIFTS_FILE_FORMAT.md index c563c740..8b93b440 100644 --- a/docs/SHIFTS_FILE_FORMAT.md +++ b/docs/SHIFTS_FILE_FORMAT.md @@ -126,7 +126,7 @@ If some slices are excluded from reconstruction, the pairwise shifts must be acc ### Solution -See [SLICE_CONFIG_FEATURE.md](SLICE_CONFIG_FEATURE.md) for the slice configuration system that handles this. +See {doc}`SLICE_CONFIG_FEATURE` for the slice configuration system that handles this. ### Example diff --git a/docs/SLICE_CONFIG_FEATURE.md b/docs/SLICE_CONFIG_FEATURE.md index 77ea5d6c..1a726607 100644 --- a/docs/SLICE_CONFIG_FEATURE.md +++ b/docs/SLICE_CONFIG_FEATURE.md @@ -132,7 +132,7 @@ After `finalise_interpolation`, any slice that reached `linum_interpolate_missin - `interpolation_method_used` empty - `interpolation_fallback_reason` = one of `low_overlap_ncc`, `no_foreground_planes`, `registration_exception`, `reg_did_not_improve`, `affine_determinant_non_positive` -The pipeline never fabricates a slice from a weighted blend when registration fails; blending two neighbours that could not be registered introduces ghost contours and would also be made-up data. See [`SLICE_INTERPOLATION_FEATURE.md`](SLICE_INTERPOLATION_FEATURE.md) for the interpolation algorithm details and rationale. +The pipeline never fabricates a slice from a weighted blend when registration fails; blending two neighbours that could not be registered introduces ghost contours and would also be made-up data. See {doc}`SLICE_INTERPOLATION_FEATURE` for the interpolation algorithm details and rationale. --- diff --git a/docs/SLICE_INTERPOLATION_FEATURE.md b/docs/SLICE_INTERPOLATION_FEATURE.md index 42b86034..6d8f69e1 100644 --- a/docs/SLICE_INTERPOLATION_FEATURE.md +++ b/docs/SLICE_INTERPOLATION_FEATURE.md @@ -25,7 +25,7 @@ survives to the final quality report. therefore fabricated slice. - Per-slice JSON diagnostics and per-slice manifest fragments are merged directly into `slice_config.csv` — the single source of truth for per-slice - decisions, see [`SLICE_CONFIG_FEATURE.md`](SLICE_CONFIG_FEATURE.md). + decisions, see {doc}`SLICE_CONFIG_FEATURE`. Successful interpolations stamp `interpolated=true`; failures stamp `interpolation_failed=true` plus the specific `fallback_reason`. - Downstream propagation: `linum_register_pairwise.py` automatically marks @@ -60,7 +60,7 @@ constraints. ### Driving the interpolator from `slice_config.csv` The preprocessing pipeline generates an initial `slice_config.csv` -(see [`SLICE_CONFIG_FEATURE.md`](SLICE_CONFIG_FEATURE.md)). The user +(see {doc}`SLICE_CONFIG_FEATURE`). The user (or the automated quality assessment) may mark additional slices with `use=false`. In the reconstruction pipeline those slices are filtered out *before* common-space alignment, which leaves a gap in the numeric slice ID diff --git a/docs/pipelines.rst b/docs/pipelines.rst index 25d6c691..c8363062 100644 --- a/docs/pipelines.rst +++ b/docs/pipelines.rst @@ -7,4 +7,6 @@ Pipelines PIPELINE_OVERVIEW NEXTFLOW_WORKFLOWS RECONST_2_5D_WORKFLOW + RECONSTRUCTION_TUNING + RECONSTRUCTION_TUNING_QUICKREF RECONSTRUCTION_DIAGNOSTICS From d797bbe6e2039ae50aa1f017d8689e1922b9cdac Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Fri, 1 May 2026 12:24:43 -0400 Subject: [PATCH 5/6] feat(resample): GPU round-robin assignment + Helpers.gpuPinBlock Squash of resample/GPU/workflow work on dev: - Round-robin GPU assignment for resample_mosaic_grid (linumpy.gpu) - Per-gpu active-slot counters via Helpers.gpuPinBlock for fair maxForks balance - CUDA device pin in prefetch worker thread + workflow CUDA_VISIBLE_DEVICES - Keeps simple per-tile CPU-read -> GPU-rescale -> CPU-write path --- linumpy/gpu/__init__.py | 47 +++++++++++++++------- linumpy/gpu/interpolation.py | 4 +- scripts/linum_resample_mosaic_grid.py | 44 ++++++++------------- workflows/reconst_3d/lib/Helpers.groovy | 52 +++++++++++++++++++++++++ workflows/reconst_3d/nextflow.config | 9 +++-- workflows/reconst_3d/soct_3d_reconst.nf | 2 + 6 files changed, 114 insertions(+), 44 deletions(-) diff --git a/linumpy/gpu/__init__.py b/linumpy/gpu/__init__.py index 499f8ead..ee83dc46 100644 --- a/linumpy/gpu/__init__.py +++ b/linumpy/gpu/__init__.py @@ -47,21 +47,43 @@ # Test if CUDA is actually available try: - # First, find the GPU with most free memory n_devices = cp.cuda.runtime.getDeviceCount() if n_devices > 0: - best_gpu_id = 0 - best_free_memory = 0 + # Pick a GPU. If CUDA_VISIBLE_DEVICES is set externally + # (e.g. by Nextflow), cupy already sees only the allowed + # cards and n_devices reflects that — just use device 0. + # Otherwise round-robin across physical devices using a + # file-locked counter. This is robust against the race + # where many forks start concurrently and would all pick + # the "GPU with most free memory" snapshot at the same + # time. It is also indifferent to which work items are in + # flight, unlike workload-id modulo schemes. + if n_devices == 1 or os.environ.get("CUDA_VISIBLE_DEVICES"): + best_gpu_id = 0 + else: + import fcntl + import tempfile + from pathlib import Path + + counter_path = Path(tempfile.gettempdir()) / f"linumpy_gpu_counter_{os.getuid()}" + # Open or create the counter, take an exclusive lock, + # increment, release. This serialises the assignment + # decision across all linumpy processes on the host. + with counter_path.open("a+") as fp: + fcntl.flock(fp.fileno(), fcntl.LOCK_EX) + fp.seek(0) + try: + counter = int((fp.read() or "0").strip()) + except ValueError: + counter = 0 + best_gpu_id = counter % n_devices + fp.seek(0) + fp.truncate() + fp.write(str(counter + 1)) + fp.flush() + fcntl.flock(fp.fileno(), fcntl.LOCK_UN) - for i in range(n_devices): - with cp.cuda.Device(i): - free, total = cp.cuda.runtime.memGetInfo() - if free > best_free_memory: - best_free_memory = free - best_gpu_id = i - - # Select the best GPU cp.cuda.Device(best_gpu_id).use() CUPY_AVAILABLE = True @@ -74,11 +96,10 @@ GPU_MEMORY_GB = mem_info[1] / (1024**3) # Total memory in GB if n_devices > 1: - # Only show message if there are multiple GPUs import sys print( - f"Auto-selected GPU {best_gpu_id}: {GPU_DEVICE_NAME} ({best_free_memory / (1024**3):.1f} GB free)", + f"Selected GPU {best_gpu_id}/{n_devices}: {GPU_DEVICE_NAME} ({mem_info[0] / (1024**3):.1f} GB free)", file=sys.stderr, ) else: diff --git a/linumpy/gpu/interpolation.py b/linumpy/gpu/interpolation.py index 97da205d..c5a52d68 100644 --- a/linumpy/gpu/interpolation.py +++ b/linumpy/gpu/interpolation.py @@ -155,7 +155,9 @@ def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) - from cupyx.scipy.ndimage import zoom as cp_zoom input_was_gpu = is_cupy_array(image) - img_gpu = cp.asarray(image if image.dtype == np.float32 else image.astype(np.float32)) + # Cast to float32 *after* H2D so PCIe carries the smaller (or unchanged) dtype. + # `copy=False` skips the device copy when the array is already float32. + img_gpu = cp.asarray(image).astype(np.float32, copy=False) # Scale factors: input/output for Gaussian sigma, output/input for zoom. scale_factors = tuple(i / o for i, o in zip(image.shape, output_shape, strict=False)) diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py index f346266a..329b0d41 100644 --- a/scripts/linum_resample_mosaic_grid.py +++ b/scripts/linum_resample_mosaic_grid.py @@ -22,7 +22,6 @@ from linumpy.geometry.resampling import resolution_is_mm from linumpy.gpu import GPU_AVAILABLE, print_gpu_info from linumpy.gpu.interpolation import resize -from linumpy.gpu.zarr_io import gpu_zarr_context from linumpy.io import OmeZarrWriter, read_omezarr @@ -67,12 +66,8 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu: def _read_tile(vol: Any, i: Any, j: Any, tile_shape: Any) -> Any: - """Read one tile from the input zarr array (I/O stage of the pipeline). - - Returns whatever array type the zarr backend yields: NumPy by default, or - CuPy when the volume was opened inside :func:`gpu_zarr_context`. - """ - return vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]] + """Read one tile from the input zarr array (I/O stage of the pipeline).""" + return np.asarray(vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]]) def _run_pipelined( @@ -95,10 +90,15 @@ def _run_pipelined( if not tile_iter: return - # Note: do NOT periodically call cp.get_default_memory_pool().free_all_blocks(). - # All tiles are the same size, so the pool reuses GPU buffers across iterations. - # Flushing forces cudaMalloc on the next tile and stalls the GPU; CuPy already - # retries with free_all_blocks() automatically on OOM. + cp: Any = None + cupy_available = False + if use_gpu: + try: + import cupy as cp + + cupy_available = True + except Exception: + pass with ThreadPoolExecutor(max_workers=1) as prefetch_executor: i0, j0 = tile_iter[0] @@ -116,6 +116,9 @@ def _run_pipelined( :, i * out_tile_shape[1] : (i + 1) * out_tile_shape[1], j * out_tile_shape[2] : (j + 1) * out_tile_shape[2] ] = resampled + if cupy_available and cp is not None and k % 10 == 9: + cp.get_default_memory_pool().free_all_blocks() + def main() -> None: """Run function.""" @@ -169,24 +172,11 @@ def main() -> None: out_shape = (out_tile_shape[0], nx * out_tile_shape[1], ny * out_tile_shape[2]) print(f" Output shape: {out_shape} ({total_tiles} tiles)") - tile_iter = list(itertools.product(range(nx), range(ny))) + out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) - if use_gpu: - # Open BOTH the input volume and the output writer inside the GPU zarr - # context. That way per-tile reads land directly on device and per-tile - # writes accept cupy arrays through zarr's GPU buffer prototype, so the - # resample loop is fully device-resident: - # read tile -> cupy gaussian + zoom -> write cupy slice - with gpu_zarr_context(): - vol_gpu, _ = read_omezarr(args.in_mosaic) - out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) - _run_pipelined(vol_gpu, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) - else: - out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) - _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + tile_iter = list(itertools.product(range(nx), range(ny))) + _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) - # Pyramid build reads the just-written level0 back via dask; run it outside - # the GPU context so dask uses the normal numpy buffer prototype. print("Building pyramid...") out_res = [target_res] * 3 out_zarr.finalize(out_res, args.n_levels) diff --git a/workflows/reconst_3d/lib/Helpers.groovy b/workflows/reconst_3d/lib/Helpers.groovy index 6e60bc3e..baac906a 100644 --- a/workflows/reconst_3d/lib/Helpers.groovy +++ b/workflows/reconst_3d/lib/Helpers.groovy @@ -300,4 +300,56 @@ class Helpers { if (params.stack_translation_min_zcorr > 0) opts += " --translation_min_zcorr ${params.stack_translation_min_zcorr}" return opts } + + // ------------------------------------------------------------------------- + // GPU pinning + // ------------------------------------------------------------------------- + + /** + * Bash block that pins the current task to the least-loaded physical GPU. + * + * In-process pinning via `cp.cuda.Device(N).use()` is leaky — zarr's GPU + * buffer prototype and the kvikio/GDS path still allocate on device 0 + * regardless. Setting `CUDA_VISIBLE_DEVICES` in the shell hides the other + * cards from the process so no library can route around the choice. + * + * The returned snippet maintains per-GPU active-slot counters under + * /tmp guarded by `flock`. On entry it picks the GPU with the fewest + * running forks; on shell exit (`trap ... EXIT`) it decrements that + * slot. This balances better than `slice_id % gpu_count` (which can + * give 3/1 splits when concurrent slices share parity) and better than + * a simple round-robin counter (which ignores fork-completion order). + * + * Pass `params` and a short `tag` used for the per-task log line. + * When `params.use_gpu` is false this returns an empty string. + */ + static String gpuPinBlock(params, String tag) { + if (!params.use_gpu) return '' + def n = params.gpu_count as int + return """ + UID_=\$(id -u) + GPU_LOCK=/tmp/linumpy_nf_gpu_slots_\${UID_}.lock + GPU_SLOT_PREFIX=/tmp/linumpy_nf_gpu_slot_\${UID_}_ + _gpu_acquire() { + exec 9>\$GPU_LOCK + flock 9 + local best=0 best_n=999999 i n + for i in \$(seq 0 \$((${n} - 1))); do + n=\$(cat \${GPU_SLOT_PREFIX}\$i 2>/dev/null || echo 0) + if [ "\$n" -lt "\$best_n" ]; then best=\$i; best_n=\$n; fi + done + echo \$((best_n + 1)) > \${GPU_SLOT_PREFIX}\$best + echo \$best + } + _gpu_release() { + exec 9>\$GPU_LOCK + flock 9 + local n=\$(cat \${GPU_SLOT_PREFIX}\$1 2>/dev/null || echo 1) + echo \$((n - 1)) > \${GPU_SLOT_PREFIX}\$1 + } + export CUDA_VISIBLE_DEVICES=\$(_gpu_acquire) + trap "_gpu_release \$CUDA_VISIBLE_DEVICES" EXIT + echo "[${tag}] CUDA_VISIBLE_DEVICES=\$CUDA_VISIBLE_DEVICES" + """ + } } diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config index 3765a1ab..ad1ac377 100644 --- a/workflows/reconst_3d/nextflow.config +++ b/workflows/reconst_3d/nextflow.config @@ -16,6 +16,8 @@ params { // COMPUTE RESOURCES // ========================================================================= use_gpu = true // Enable GPU acceleration (auto-fallback to CPU if unavailable) + gpu_count = 1 // Number of physical GPUs to round-robin across in resample_mosaic_grid. + // Override per-subject (e.g. `gpu_count = 2` on dual-A6000 hosts). processes = 8 // Number of parallel Python processes per Nextflow task // CPU resource management @@ -391,9 +393,10 @@ process { withName: "resample_mosaic_grid" { scratch = false - // Measured: ~1 GB GPU memory per fork. IO (RAID at 100%) is the - // gating factor at 6 forks, not GPU. Bumping higher won't help. - maxForks = params.use_gpu ? 6 : null + // Per-tile GPU work (~few hundred MB peak per fork). Forks are pinned + // per-GPU via Helpers.gpuPinBlock so they round-robin across gpu_count + // physical devices. + maxForks = params.use_gpu ? 4 : null } withName: "fix_focal_curvature" { diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf index 643113b0..ace9115a 100644 --- a/workflows/reconst_3d/soct_3d_reconst.nf +++ b/workflows/reconst_3d/soct_3d_reconst.nf @@ -126,7 +126,9 @@ process resample_mosaic_grid { script: def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" + def gpu_pin_block = Helpers.gpuPinBlock(params, "resample slice=${slice_id}") """ + ${gpu_pin_block} linum_resample_mosaic_grid.py ${mosaic_grid} "mosaic_grid_z${slice_id}_resampled.ome.zarr" \ -r ${params.resolution} ${gpu_flag} --n_levels 0 -v """ From ab8288483f6239204db839db5c394a37a2de67a2 Mon Sep 17 00:00:00 2001 From: Frans Irgolitsch Date: Fri, 1 May 2026 13:19:11 -0400 Subject: [PATCH 6/6] fix(resample): clamp output tile shape to >=1 to avoid zero-sized axes Heavy downsampling of small axes (e.g. Z=5 by factor 0.1) used to round to zero, producing zero-sized output and ZeroDivisionError downstream. Clamp output_shape to a minimum of 1 in both rescale() and out_tile_shape. --- scripts/linum_resample_mosaic_grid.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py index 329b0d41..2d29eee4 100644 --- a/scripts/linum_resample_mosaic_grid.py +++ b/scripts/linum_resample_mosaic_grid.py @@ -61,7 +61,9 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu: Rescaled image. """ scale_tuple = tuple([float(scale)] * image.ndim) if isinstance(scale, (int, float)) else tuple(scale) - output_shape = tuple(round(s * sc) for s, sc in zip(image.shape, scale_tuple, strict=False)) + # Clamp to >=1 so heavy downsampling of small axes (e.g. Z=5 by factor 0.1) + # doesn't produce a zero-sized output that triggers ZeroDivisionError downstream. + output_shape = tuple(max(1, round(s * sc)) for s, sc in zip(image.shape, scale_tuple, strict=False)) return resize(image, output_shape, order=order, anti_aliasing=True, use_gpu=use_gpu) @@ -163,7 +165,7 @@ def main() -> None: print(f" Target resolution: {args.resolution} µm") print(f" Scale factor: {scaling_factor}") - out_tile_shape = tuple(round(s * sc) for s, sc in zip(tile_shape, scaling_factor, strict=False)) + out_tile_shape = tuple(max(1, round(s * sc)) for s, sc in zip(tile_shape, scaling_factor, strict=False)) nx = vol.shape[1] // tile_shape[1] ny = vol.shape[2] // tile_shape[2]