From ab73f2db9178a4a121dc6ba04cd4d1bd302aee34 Mon Sep 17 00:00:00 2001 From: Glen Beane <356266+gbeane@users.noreply.github.com> Date: Fri, 19 Jun 2026 13:01:07 -0400 Subject: [PATCH 1/2] Default NWB export to per-identity files; add --multisubject mode --- docs/development/jabs-nwb-format.md | 150 ++++++------ docs/user-guide/file-formats.md | 58 ++--- docs/user-guide/nwb-export.md | 119 +++++----- packages/jabs-io/pyproject.toml | 1 + .../jabs-io/src/jabs/io/internal/pose/nwb.py | 193 ++++++++++++--- .../jabs-io/tests/internal/pose/test_nwb.py | 222 ++++++++++++++---- .../resources/docs/user_guide/file-formats.md | 106 +++++---- .../resources/docs/user_guide/nwb-export.md | 119 +++++----- src/jabs/scripts/cli/cli.py | 34 +-- src/jabs/scripts/cli/convert_to_nwb.py | 20 +- tests/scripts/test_convert_to_nwb.py | 96 +++++++- uv.lock | 25 +- 12 files changed, 777 insertions(+), 366 deletions(-) diff --git a/docs/development/jabs-nwb-format.md b/docs/development/jabs-nwb-format.md index 951db2a8..221ee8e8 100644 --- a/docs/development/jabs-nwb-format.md +++ b/docs/development/jabs-nwb-format.md @@ -1,12 +1,14 @@ # JABS NWB Format This document describes the NWB files produced by JABS. It covers the two output modes -(combined and per-identity), the full file layout, how animal pose, static objects, and -dynamic objects are stored, the `jabs_metadata` scratch field, and why the -`ndx-multisubjects` extension is not currently used. +(per-identity and multisubject), the full file layout, how animal pose, static objects, +and dynamic objects are stored, the `jabs_metadata` scratch field, and how the +`ndx-multisubjects` extension is used for multisubject files. JABS NWB files use the [ndx-pose 0.2](https://github.com/rly/ndx-pose) extension for -all pose and object data. +all pose and object data, and the +[ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) extension for +multisubject files. --- @@ -14,23 +16,10 @@ all pose and object data. 
JABS can write NWB in two modes, selectable at export time. -### Combined file (default) - -All identities from a single recording session are written into one NWB file. This is -the simplest output and potentially the most compatible with third-party NWB tooling. - -``` -session.nwb - └── all identities, all objects -``` - -**When to use:** sharing data with collaborators, archiving, downstream analysis that -needs all animals in one place. - -### Per-identity files +### Per-identity files (default) One NWB file is written per animal. The output path is used as a naming template; the -combined file is never created. Static and dynamic objects are written to every +`OUTPUT` file itself is never created. Static and dynamic objects are written to every per-identity file identically (they are session-level, not animal-level data). ``` @@ -43,8 +32,23 @@ Identity names in the filenames come from `external_ids` in the pose file (sanit for HDF5 compatibility) or fall back to `subject_1`, `subject_2`, … when no external IDs are present. -**When to use:** downstream workflows that require one NWB file per animal (e.g. tools -that expect a single `Subject` in each file). +**When to use:** this is the default mode. It suits downstream workflows that require one +NWB file per animal (e.g. tools that expect a single `Subject` in each file) and DANDI upload. + +### Multisubject single file (`--multisubject`) + +All identities from a single recording session are written into one self-contained NWB +file, an `NdxMultiSubjectsNWBFile` carrying a `SubjectsTable` (one row per animal). + +``` +session.nwb + └── all identities + all objects + SubjectsTable +``` + +**When to use:** sharing a whole session as one artifact with tools that understand the +ndx-multisubjects extension. See [How multisubject files use +ndx-multisubjects](#how-multisubject-files-use-ndx-multisubjects) below for the layout +and rationale. 
#### Reading per-identity files @@ -67,11 +71,13 @@ Validation ensures the expected number of sibling files are present before mergi ## Full NWB layout -The layout below shows a combined file containing two animal identities, two static +The layout below shows a multisubject file containing two animal identities, two static objects (`corners`, `lixit`), and one dynamic object (`fecal_boli`). ``` -NWBFile +NdxMultiSubjectsNWBFile +├── acquisition/ +│ └── SubjectsTable [DynamicTable] multisubject mode only — one row per subject ├── processing/ │ └── behavior/ [ProcessingModule] │ ├── Skeletons/ [Skeletons container] @@ -111,9 +117,10 @@ NWBFile └── jabs_metadata/ [ScratchData] JSON string (see below) ``` -In a per-identity file the layout is identical, except only one animal identity -container is present and `jabs_identity_mask` / `jabs_bounding_boxes_` cover -that identity only. +A per-identity file (the default) uses a plain `NWBFile` with the same +`processing/behavior` layout, except there is no `SubjectsTable`, only one animal +identity container is present, and `jabs_identity_mask` / `jabs_bounding_boxes_` +cover that identity only. --- @@ -145,7 +152,7 @@ each frame. | Mode | Shape stored in file | Shape returned by reader | |------------------|-------------------------------|----------------------------------| -| Combined | `(num_frames, num_identities)` | `(num_identities, num_frames)` | +| Multisubject | `(num_frames, num_identities)` | `(num_identities, num_frames)` | | Per-identity | `(num_frames,)` | `(1, num_frames)` | ### Bounding boxes (optional) @@ -159,7 +166,7 @@ self-describing — no external index mapping is required. 
| Name | `jabs_bounding_boxes_{identity_name}` (one per identity) | | Shape stored in file | `(num_frames, 2, 2)` | | Shape returned by reader | `(num_identities, num_frames, 2, 2)` (all stacked) | -| Both modes | Same per-identity shape in combined and per-identity files | +| Both modes | Same per-identity shape in multisubject and per-identity files | Format: `[[upper_left_x, upper_left_y], [lower_right_x, lower_right_y]]` in pixels. @@ -428,7 +435,6 @@ otherwise scramble the keypoint ordering. | `format_version` | `int` | Always | JABS NWB format version. Currently `1`. | | `identity_names` | `list[str]` | Always | Ordered list of `PoseEstimation` container names that are animal identities. Defines identity order on read. | | `num_identities` | `int` | Always | Total number of animal identities in the recording session. In per-identity mode this equals `split_subject_count`; the file itself contains only one identity. | -| `body_parts` | `list[str]` | Always | Ordered list of keypoint names for animal skeletons. Preserves original write order, since HDF5 returns groups alphabetically. | | `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available in the source pose file. | | `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file (e.g. mouse cage IDs). `null` if the pose file had no external IDs. | | `subjects` | `dict[str, dict] \| null` | Always | Per-identity subject metadata keyed by identity name. `null` if no subject metadata is available. Inner dict may contain `subject_id`, `sex`, `species`, `age` (ISO 8601 duration), `date_of_birth` (ISO 8601 datetime), `genotype`, `strain`, `weight`, and `description`. DANDI requires `species`, `sex`, and either `age` or `date_of_birth`. Values are `null` when not available. | @@ -436,20 +442,19 @@ otherwise scramble the keypoint ordering. 
| `static_object_names` | `list[str]` | When static objects present | Names of all `PoseEstimation` containers that are static objects. | | `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all `PoseEstimation` containers that are dynamic objects. | | `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. Required to reconstruct the 4-D `points` array `(n_predictions, max_count, n_keypoints, 2)` from the flat series list on read. | +| `multisubject` | `bool` | Multisubject mode only | `true` if this is a single multi-subject file written with the ndx-multisubjects extension. Tells the reader to return all identities directly rather than glob for siblings. | | `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | | `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file within the original multi-identity dataset. Used to restore original identity order when merging siblings. | | `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. Used to validate that all sibling files are present before merging. | -### Example — combined file with two identities, static objects, and dynamic objects +### Example — multisubject file with two identities, static objects, and dynamic objects ```json { "format_version": 1, + "multisubject": true, "identity_names": ["subject_1", "subject_2"], "num_identities": 2, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { @@ -494,7 +499,6 @@ otherwise scramble the keypoint ordering. 
"format_version": 1, "identity_names": ["subject_2"], "num_identities": 3, - "body_parts": ["nose", "left_ear", "..."], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { @@ -555,46 +559,48 @@ writing to NWB. NWB files always contain `(x, y)` order. --- -## Why ndx-multisubjects is not used +## How multisubject files use ndx-multisubjects [ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) is a NWB extension -that adds multi-subject support to NWB files through three new types: - -- **`SubjectsTable`** — a `DynamicTable` with one row per animal, storing standard - subject fields (`subject_id`, `sex`, `genotype`, `strain`, `age`, `weight`, etc.) -- **`NdxMultiSubjectsNWBFile`** — a subclass of `NWBFile` that embeds the - `SubjectsTable` in general metadata +that adds multi-subject support through three new types: + +- **`SubjectsTable`** — a `DynamicTable` (fixed name `"SubjectsTable"`) with one row per + animal, storing standard subject fields (`subject_id`, `sex`, `species`, `genotype`, + `strain`, `age`, `weight`, etc.). `subject_id`, `sex`, and `species` are required + columns. +- **`NdxMultiSubjectsNWBFile`** — a drop-in subclass of `NWBFile` (identical constructor + kwargs). - **`SelectSubjectsContainer`** — an `NWBDataInterface` that links data to a subject - subset via a `DynamicTableRegion` + subset via a `DynamicTableRegion`. Standard NWB only supports a single `Subject` on `NWBFile.subject`, so this extension -addresses a real gap for multi-animal recordings. JABS evaluated it and chose not to -adopt it for the following reasons: - -**1. `NdxMultiSubjectsNWBFile` is a non-standard `NWBFile` subclass.** -Any tool that opens a JABS NWB file without the extension installed will either fail -or silently lose the subjects table. The core value of NWB is that files are readable -by the broader ecosystem using only pynwb. This extension undermines that guarantee. - -**2. 
`SelectSubjectsContainer` does not compose cleanly with ndx-pose.** -The extension's model for associating data with subjects requires wrapping data -containers inside `SelectSubjectsContainer`. JABS `PoseEstimation` containers live -directly in `processing/behavior` and are already named by identity — adding a wrapper -layer would significantly restructure the layout without a commensurate benefit. - -**3. The extension is still Beta.** -ndx-multisubjects is published on PyPI (v0.1.1, November 2025) and sleap-io merged -support for it in December 2025, so the ecosystem is beginning to form. However, at -Beta (0.1.x) the API may still change, and broad adoption across NWB tooling has not -yet occurred. Since JABS NWB support is itself under active development, taking a -dependency on an immature extension adds unnecessary coupling at this stage. - -**What JABS does instead.** -Per-animal biological metadata (`subject_id`, `sex`, `species`, `age`, `date_of_birth`, -`genotype`, `strain`, `weight`, `description`) can be stored in the `subjects` key of -`jabs_metadata`. DANDI requires `species`, `sex`, and either `age` (ISO 8601 duration, -e.g. `"P70D"`) or `date_of_birth` (ISO 8601 datetime) on every subject. -This keeps the file readable by any standard NWB tool while preserving the metadata in -a structured, machine-readable form. If ndx-multisubjects stabilises and achieves -broader adoption, migrating to it would be straightforward since all the underlying -data is already present. \ No newline at end of file +fills a real gap for multi-animal recordings. The default per-identity mode does not need +it (each file has one `Subject`); only `--multisubject` files use it. + +**What JABS writes.** In multisubject mode the root container is an +`NdxMultiSubjectsNWBFile` and a `SubjectsTable` (one row per identity) is attached via +`nwbfile.add_acquisition(...)`, so it lands at `acquisition/SubjectsTable`. 
The +`PoseEstimation` containers stay directly in `processing/behavior`, named by identity, +exactly as in a per-identity file — JABS does **not** wrap them in a +`SelectSubjectsContainer`, since they are already identity-named and the wrapper would +restructure the layout for no read benefit. + +The table is attached with `add_acquisition` rather than a `subjects_table` attribute +because the extension's 0.1.1 release ships the `NdxMultiSubjectsNWBFile`-to-`subjects_table` +IO mapping commented out; `add_acquisition` is the path the extension's own round-trip +test exercises and is what reliably persists under pynwb 3.x / hdmf 4.x. + +**Round-trip fidelity rides on `jabs_metadata`, not the `SubjectsTable`.** On read, JABS +recovers `subjects`/`external_ids`/`metadata` from the `jabs_metadata` scratch JSON (the +`SubjectsTable` is write-only redundancy aimed at external / DANDI consumers). Because the +required `SubjectsTable` columns differ from JABS's free-form per-identity dicts, the +writer takes the union of provided keys across all identities, writes every column for +every row (a `DynamicTable` requires consistent columns), coerces each cell to text, and +defaults the required columns (`subject_id` to the identity name, `sex` to `"U"`, +`species` to `""`) when absent. + +**Reading requires the extension.** A multisubject file is an `NdxMultiSubjectsNWBFile`, +so pynwb must load the embedded namespace to reconstruct it; the reader opens the file +with `load_namespaces=True`. This is why multisubject mode is opt-in: the default +per-identity output stays a plain `NWBFile` + ndx-pose, readable by the broader NWB +ecosystem without any extra extension installed. 
\ No newline at end of file diff --git a/docs/user-guide/file-formats.md b/docs/user-guide/file-formats.md index f368fe87..fd5026c8 100644 --- a/docs/user-guide/file-formats.md +++ b/docs/user-guide/file-formats.md @@ -125,26 +125,30 @@ See [NWB Export](nwb-export.md) for the CLI command and output-mode options. JABS writes NWB in two modes: -| Mode | When to use | -|---------------------------------|---------------------------------------------------------------------------------| -| Combined (default) | Local analysis, sharing with collaborators who can parse JABS-specific fields | -| Per-identity (`--per-identity`) | DANDI archive upload; tools that expect one subject per file | - -In **combined** mode all identities are written into a single `.nwb` file. -In **per-identity** mode one file is written per animal, named -`{output_stem}_{identity_name}.nwb`. Static and dynamic objects are written to every -per-identity file identically (they are session-level data). The JABS reader -re-assembles per-identity files transparently: point it at any sibling and it merges -all siblings into a single result in the original identity order. +| Mode | When to use | +|---------------------------------|-----------------------------------------------------------------------------------| +| Per-identity (default) | DANDI archive upload; tools that expect one subject per file | +| Multisubject (`--multisubject`) | A single shareable file holding every subject (via the ndx-multisubjects extension) | + +In **per-identity** mode (the default) one file is written per animal, named +`{output_stem}_{identity_name}.nwb`; the `OUTPUT` path itself is not created. The JABS +reader re-assembles per-identity files transparently: point it at any sibling and it +merges all siblings into a single result in the original identity order. 
+In **multisubject** mode all identities are written into a single self-contained `.nwb` +file (an `NdxMultiSubjectsNWBFile` with a `SubjectsTable`) using the +[ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) extension. In per-identity +mode, static and dynamic objects are written to every per-identity file identically (they +are session-level data). ### Full file layout -The layout below shows a combined file with two animal identities, two static objects +The layout below shows a multisubject file with two animal identities, two static objects (`corners`, `lixit`), and one dynamic object (`fecal_boli`). ``` -NWBFile -├── subject/ [Subject] per-identity mode only +NdxMultiSubjectsNWBFile +├── acquisition/ +│ └── SubjectsTable [DynamicTable] multisubject mode only — one row per subject ├── processing/ │ └── behavior/ [ProcessingModule] │ ├── Skeletons/ [Skeletons container] @@ -184,9 +188,11 @@ NWBFile └── jabs_metadata/ [ScratchData] JSON string (see below) ``` -In a per-identity file the layout is identical, except `NWBFile.subject` is populated -(when subject metadata is provided), only one animal identity container is present, and -`jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity only. +A per-identity file (the default) uses a plain `NWBFile` with the same +`processing/behavior` layout, except there is no `SubjectsTable`, `NWBFile.subject` is +populated (when subject metadata is provided), only one animal identity container is +present, and `jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity +only. ### Animal pose @@ -223,7 +229,7 @@ is present in each frame. 
| Mode | Shape stored in file | Shape returned by reader | |--------------|--------------------------------|--------------------------------| -| Combined | `(num_frames, num_identities)` | `(num_identities, num_frames)` | +| Multisubject | `(num_frames, num_identities)` | `(num_identities, num_frames)` | | Per-identity | `(num_frames,)` | `(1, num_frames)` | ### Bounding boxes (optional) @@ -410,8 +416,9 @@ Every JABS NWB file contains a `ScratchData` object named `jabs_metadata` in the needed for a lossless round-trip. It is required because pynwb returns `PoseEstimationSeries` in alphabetical order from HDF5, which would otherwise scramble the keypoint ordering. Tools that do not use the JABS reader can parse this JSON -directly to recover ordered keypoint names, identity ordering, subject metadata, and -object classification. +directly to recover identity ordering, subject metadata, and object classification. +(Keypoint ordering is not stored here; the JABS reader restores it from the canonical +keypoint index.) #### Keys @@ -420,7 +427,6 @@ object classification. | `format_version` | `int` | Always | JABS NWB format version. Currently `1`. | | `identity_names` | `list[str]` | Always | Ordered list of animal identity container names. Defines identity order on read. | | `num_identities` | `int` | Always | Total number of animal identities in the recording session. | -| `body_parts` | `list[str]` | Always | Ordered list of keypoint names for animal skeletons. | | `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available. | | `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file (e.g. cage IDs). `null` if the pose file had no external IDs. | | `subjects` | `dict[str, dict] \| null` | Always | Per-identity biological metadata keyed by identity name, for all identities. Fields: `subject_id`, `sex`, `genotype`, `strain`, `age`, `weight`, `species`, `description`. 
`null` if none provided. | @@ -428,20 +434,19 @@ object classification. | `static_object_names` | `list[str]` | When static objects present | Names of all static object `PoseEstimation` containers. | | `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all dynamic object `PoseEstimation` containers. | | `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. Required to reconstruct the 4-D points array on read. | +| `multisubject` | `bool` | Multisubject mode only | `true` if this is a single multi-subject file written with the ndx-multisubjects extension. | | `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | | `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file. Used to restore original order when merging siblings. | | `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. Used to validate all siblings are present before merging. | -#### Example — combined file +#### Example — multisubject file ```json { "format_version": 1, + "multisubject": true, "identity_names": ["subject_1", "subject_2"], "num_identities": 2, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { @@ -486,9 +491,6 @@ object classification. 
"format_version": 1, "identity_names": ["subject_1"], "num_identities": 3, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { diff --git a/docs/user-guide/nwb-export.md b/docs/user-guide/nwb-export.md index b675a240..2ddd739e 100644 --- a/docs/user-guide/nwb-export.md +++ b/docs/user-guide/nwb-export.md @@ -11,14 +11,14 @@ The `jabs-cli convert-to-nwb` command converts a JABS pose estimation HDF5 file pip install "jabs-behavior-classifier[nwb]" ``` -The extra adds `pynwb` and `ndx-pose` as dependencies. +The extra adds `pynwb`, `ndx-pose`, and `ndx-multisubjects` as dependencies. Two output modes are available. **Choose the mode based on how the files will be used:** -| Mode | When to use | -|---------------------------------|-------------------------------------------------------------------------------| -| Combined (default) | Local analysis, sharing with collaborators who can parse JABS-specific fields | -| Per-identity (`--per-identity`) | DANDI archive upload, tools that expect one subject per file | +| Mode | When to use | +|---------------------------------|-----------------------------------------------------------------------------------| +| Per-identity (default) | DANDI archive upload, tools that expect one subject per file | +| Multisubject (`--multisubject`) | A single shareable file holding every subject (via the ndx-multisubjects extension) | --- @@ -31,8 +31,8 @@ jabs-cli convert-to-nwb INPUT_PATH OUTPUT [OPTIONS] | Argument / Option | Description | |------------------------------|------------------------------------------------------------------------------------------------------------------------------| | `INPUT_PATH` | JABS pose HDF5 file, any version v2–v8. Format version is inferred automatically from the filename (e.g. `_pose_est_v6.h5`). 
| -| `OUTPUT` | Destination `.nwb` file. In `--per-identity` mode, used as a naming template; the file itself is not created directly. | -| `--per-identity` | Write one NWB file per identity instead of a single combined file. | +| `OUTPUT` | Destination `.nwb` file. By default (per-identity), used as a naming template; the file itself is not created directly. With `--multisubject`, the single combined file is written directly to this path. | +| `--multisubject` | Write a single multi-subject NWB file (using the ndx-multisubjects extension) instead of the default one file per identity. | | `--session-description TEXT` | NWB session description string. Defaults to `'JABS PoseEstimation Data'`. | | `--subjects PATH` | Path to a JSON file with per-animal biological metadata. | | `--session-metadata PATH` | Path to a JSON file with NWB session-level metadata (start time, experimenter, etc.). | @@ -40,11 +40,11 @@ jabs-cli convert-to-nwb INPUT_PATH OUTPUT [OPTIONS] ### Examples ```bash -# Single combined file — all identities in one NWB file +# One NWB file per identity (default; recommended for DANDI upload) jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb -# One NWB file per identity (recommended for DANDI upload) -jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --per-identity +# A single multi-subject file holding every identity +jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --multisubject # Include per-animal metadata jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --subjects subjects.json @@ -97,11 +97,12 @@ subject.** All other fields are optional. | `weight` | string | Body weight, e.g. `"25g"` | | `description` | string | Free-text notes | -In per-identity mode, subject metadata is written to both the standard `NWBFile.subject` -field and the `jabs_metadata` scratch field. If no `--subjects` file is provided, a -minimal subject with `subject_id` set to the identity name is written automatically. 
In -combined mode, subject metadata is written only to `jabs_metadata` (see -[below](#subject-metadata-and-nwbfilesubject)). +In per-identity mode (the default), subject metadata is written to both the standard +`NWBFile.subject` field and the `jabs_metadata` scratch field. If no `--subjects` file is +provided, a minimal subject with `subject_id` set to the identity name is written +automatically. In multisubject mode, subject metadata is written to a `SubjectsTable` +(one row per subject) and to `jabs_metadata` (see +[below](#subject-metadata-by-mode)). --- @@ -137,27 +138,11 @@ All fields are optional. Unknown keys are ignored with a warning. ## Output modes -### Combined file (default) - -All identities from the recording session are written into a single NWB file. - -``` -session.nwb - └── all identities, all objects -``` - -**This is a non-standard NWB usage.** Standard NWB (`NWBFile.subject`) only supports -one subject per file, so combined files cannot populate that field. Instead, all -per-animal metadata is stored in the `jabs_metadata` scratch field (a JSON string). -Tools that do not know about `jabs_metadata` will not see subject metadata at all. - -**DANDI upload:** Combined files will fail DANDI validation because `NWBFile.subject` -is not set. Use per-identity mode for DANDI. - -### Per-identity (`--per-identity`) +### Per-identity files (default) One NWB file is written per animal. The `OUTPUT` path is used as a naming template; -files are written as `{output_stem}_{identity_name}.nwb` in the same directory. +files are written as `{output_stem}_{identity_name}.nwb` in the same directory. The +`OUTPUT` path itself is **not** created. 
``` session_subject_1.nwb ← identity 0 + all objects @@ -165,7 +150,7 @@ session_subject_2.nwb ← identity 1 + all objects session_subject_3.nwb ← identity 2 + all objects ``` -**This is the more standard output.** Each file contains exactly one animal, so +**This is the most standard output.** Each file contains exactly one animal, so `NWBFile.subject` is populated with that animal's biological metadata (when provided via `--subjects`). Any standard NWB tool — including the DANDI archive — can read the subject field directly without knowing anything about JABS. @@ -182,16 +167,38 @@ sibling file; it detects the `per_identity_files` flag in `jabs_metadata`, finds siblings, and merges them into a single result with all identities in their original order. -### Subject metadata and `NWBFile.subject` +### Multisubject single file (`--multisubject`) + +All identities from the recording session are written into a single, self-contained NWB +file at `OUTPUT`, using the +[ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) extension. + +``` +session.nwb + └── all identities + all objects + a SubjectsTable listing every subject +``` + +The file is an `NdxMultiSubjectsNWBFile` (a drop-in `NWBFile` subclass). Because standard +NWB's `NWBFile.subject` only holds one subject, multiple subjects are instead described by +a **`SubjectsTable`** (one row per animal) stored in `acquisition`. The pose data itself +is laid out in `processing/behavior` exactly as in a per-identity file, and the full, +lossless JABS round-trip still rides on the `jabs_metadata` scratch field. + +This mode is intended for sharing a whole session as one artifact. Reading it back with +the JABS reader returns all identities directly (no sibling files are involved). 
+ +### Subject metadata by mode -| Mode | NWBFile.subject | jabs_metadata.subjects | -|--------------|------------------------------|--------------------------| -| Combined | Not set | Set (all identities) | -| Per-identity | Set for this file's identity | Set (all identities) | +| Mode | NWBFile.subject | SubjectsTable | jabs_metadata.subjects | +|--------------|------------------------------|------------------------|------------------------| +| Per-identity | Set for this file's identity | — | Set (all identities) | +| Multisubject | Not set | One row per subject | Set (all identities) | -`jabs_metadata.subjects` always carries the full dict for all identities, even in -per-identity files. This makes each file self-contained: the JABS reader can recover -complete subject metadata from any sibling without loading the others. +`jabs_metadata.subjects` always carries the full dict for all identities, in both modes. +This makes each per-identity file self-contained: the JABS reader can recover complete +subject metadata from any sibling without loading the others. In multisubject mode the +`SubjectsTable` is provided for standard NWB / DANDI consumers; JABS itself recovers +subject metadata from `jabs_metadata`. --- @@ -201,12 +208,13 @@ For the full format specification — including all field definitions, `jabs_met keys, and worked examples for static and dynamic objects — see [File Formats — NWB Pose File](file-formats.md#nwb-pose-file). -The layout below shows a combined file with two animal identities, two static objects +The layout below shows a multisubject file with two animal identities, two static objects (`corners`, `lixit`), and one dynamic object (`fecal_boli`). 
``` -NWBFile -├── subject/ [Subject] per-identity mode only +NdxMultiSubjectsNWBFile +├── acquisition/ +│ └── SubjectsTable [DynamicTable] multisubject mode only — one row per subject ├── processing/ │ └── behavior/ [ProcessingModule] │ ├── Skeletons/ [Skeletons container] @@ -246,7 +254,9 @@ NWBFile └── jabs_metadata/ [ScratchData] JSON string (see below) ``` -In a per-identity file the layout is identical, except: +A per-identity file (the default) uses a plain `NWBFile` and the same +`processing/behavior` layout, except: +- The root is a standard `NWBFile`; there is no `SubjectsTable` - `NWBFile.subject` is populated (when subject metadata is provided) - Only one animal identity container is present - `jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity only @@ -283,7 +293,7 @@ each frame. | Mode | Shape stored in file | Shape returned by reader | |--------------|--------------------------------|--------------------------------| -| Combined | `(num_frames, num_identities)` | `(num_identities, num_frames)` | +| Multisubject | `(num_frames, num_identities)` | `(num_identities, num_frames)` | | Per-identity | `(num_frames,)` | `(1, num_frames)` | --- @@ -370,8 +380,9 @@ needed for a lossless round-trip. Standard NWB fields alone are insufficient bec pynwb returns `PoseEstimationSeries` in alphabetical order from HDF5, which would otherwise scramble the keypoint ordering. -Tools that do not use the JABS reader can parse this JSON directly to recover ordered -keypoint names, identity ordering, subject metadata, and object classification. +Tools that do not use the JABS reader can parse this JSON directly to recover identity +ordering, subject metadata, and object classification. (Keypoint ordering is not stored +here; the JABS reader restores it from the canonical keypoint index.) #### Keys @@ -380,7 +391,6 @@ keypoint names, identity ordering, subject metadata, and object classification. | `format_version` | `int` | Always | JABS NWB format version. 
Currently `1`. | | `identity_names` | `list[str]` | Always | Ordered list of animal identity container names. Defines identity order on read. | | `num_identities` | `int` | Always | Total number of animal identities in the recording session. | -| `body_parts` | `list[str]` | Always | Ordered list of keypoint names for animal skeletons. | | `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available. | | `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file. `null` if the pose file had no external IDs. | | `subjects` | `dict[str, dict] \| null` | Always | Per-identity subject metadata keyed by identity name, for all identities. `null` if no subject metadata is available. Fields: `subject_id`, `sex`, `species`, `age`, `date_of_birth`, `genotype`, `strain`, `weight`, `description`. DANDI requires `species`, `sex`, and either `age` or `date_of_birth`. | @@ -388,20 +398,19 @@ keypoint names, identity ordering, subject metadata, and object classification. | `static_object_names` | `list[str]` | When static objects present | Names of all static object `PoseEstimation` containers. | | `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all dynamic object `PoseEstimation` containers. | | `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. | +| `multisubject` | `bool` | Multisubject mode only | `true` if this is a single multi-subject file written with the ndx-multisubjects extension. | | `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | | `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file. | | `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. 
| -#### Example — combined file +#### Example — multisubject file ```json { "format_version": 1, + "multisubject": true, "identity_names": ["subject_1", "subject_2"], "num_identities": 2, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { diff --git a/packages/jabs-io/pyproject.toml b/packages/jabs-io/pyproject.toml index 27a9b3f1..621ed785 100644 --- a/packages/jabs-io/pyproject.toml +++ b/packages/jabs-io/pyproject.toml @@ -46,6 +46,7 @@ Issues = "https://github.com/KumarLabJax/JABS-behavior-classifier/issues" [project.optional-dependencies] nwb = [ "ndx-pose>=0.2.2", + "ndx-multisubjects>=0.1.1", "pynwb>=3.1.3", ] h5py = [ diff --git a/packages/jabs-io/src/jabs/io/internal/pose/nwb.py b/packages/jabs-io/src/jabs/io/internal/pose/nwb.py index 295fb2d2..a481e871 100644 --- a/packages/jabs-io/src/jabs/io/internal/pose/nwb.py +++ b/packages/jabs-io/src/jabs/io/internal/pose/nwb.py @@ -22,6 +22,16 @@ except ImportError: _NWB_AVAILABLE = False +try: + # Optional extension used only by the --multisubject write path. Kept in a + # separate guard so a missing extension does not disable the default + # per-identity (ndx-pose) path. 
+ from ndx_multisubjects import NdxMultiSubjectsNWBFile, SubjectsTable + + _MULTISUBJECTS_AVAILABLE = True +except ImportError: + _MULTISUBJECTS_AVAILABLE = False + from jabs.core.abstract.pose_est import PoseEstimation as _JABSPoseEstimation from jabs.core.enums import StorageFormat from jabs.core.types import DynamicObjectData, PoseData @@ -93,6 +103,15 @@ def _require_nwb() -> None: "Install with: pip install 'jabs-io[nwb]'" ) + @staticmethod + def _require_multisubjects() -> None: + """Raise a clear ImportError if the ndx-multisubjects extension is missing.""" + if not _MULTISUBJECTS_AVAILABLE: + raise ImportError( + "The ndx-multisubjects extension is required for multisubject NWB " + "output. Install with: pip install 'jabs-io[nwb]'" + ) + @classmethod def can_handle(cls, data_type): # noqa: D102 return data_type is PoseData @@ -100,35 +119,41 @@ def can_handle(cls, data_type): # noqa: D102 def write(self, data: PoseData, path: str | Path, **kwargs) -> None: """Write PoseData to NWB file(s). - By default, all identities are written to a single NWB file at ``path``. - Pass ``per_identity_files=True`` to write one file per identity instead. - In that mode, ``path`` is used as a naming template — the base file is - *not* created; instead, each identity is written to a sibling file whose - stem is ``{path.stem}_{identity_name}``. Identity names come from - ``data.external_ids`` (sanitized for HDF5 compatibility) or fall back to - ``subject_1``, ``subject_2``, … when ``external_ids`` is ``None``. + By default, one NWB file is written per identity. ``path`` is used as a + naming template — the base file is *not* created; instead, each identity + is written to a sibling file whose stem is ``{path.stem}_{identity_name}``. + Identity names come from ``data.external_ids`` (sanitized for HDF5 + compatibility) or fall back to ``subject_1``, ``subject_2``, … when + ``external_ids`` is ``None``. 
- Example — single file:: + Pass ``multisubject=True`` to instead write a single, self-contained file + at ``path`` containing every identity, using the ``ndx-multisubjects`` + extension (an :class:`NdxMultiSubjectsNWBFile` with a ``SubjectsTable`` + listing all subjects). This mode is intended for sharing/DANDI. - save(pose_data, "session.nwb") - # → session.nwb (all identities) + Example — per-identity files (default) with external IDs:: - Example — per-identity files with external IDs:: - - save(pose_data, "session.nwb", per_identity_files=True) + save(pose_data, "session.nwb") # pose_data.external_ids = ["mouse_a", "mouse_b"] # → session_mouse_a.nwb # → session_mouse_b.nwb Example — per-identity files without external IDs:: - save(pose_data, "session.nwb", per_identity_files=True) + save(pose_data, "session.nwb") # pose_data.external_ids = None # → session_subject_1.nwb # → session_subject_2.nwb + Example — single multisubject file:: + + save(pose_data, "session.nwb", multisubject=True) + # → session.nwb (all identities + SubjectsTable) + The NWB layout written by this adapter (ndx-pose 0.2):: + acquisition/ + SubjectsTable ← multisubject mode only: one row per subject processing/behavior/ Skeletons/ subject/ ← animal skeleton (keypoints + edges) @@ -145,10 +170,12 @@ def write(self, data: PoseData, path: str | Path, **kwargs) -> None: Args: data: The PoseData to write. - path: Output file path (.nwb). In per-identity mode this is a - naming template; the actual files are written alongside it. + path: Output file path (.nwb). In the default per-identity mode this + is a naming template; the actual files are written alongside it. + In multisubject mode the single file is written at this path. **kwargs: - per_identity_files (bool): Write one NWB file per identity. + multisubject (bool): Write a single multi-subject file using the + ndx-multisubjects extension instead of one file per identity. Default ``False``. 
session_description (str): NWB session description string. Default ``"JABS PoseEstimation Data"``. @@ -161,23 +188,26 @@ def write(self, data: PoseData, path: str | Path, **kwargs) -> None: Raises: ValueError: If sanitized identity names are not unique (collision after special-character replacement). + ImportError: If ``multisubject=True`` but the ndx-multisubjects + extension is not installed. """ path = Path(path) - per_identity_files = kwargs.get("per_identity_files", False) + multisubject = kwargs.get("multisubject", False) - if per_identity_files: - self._write_per_identity(data, path, **kwargs) + if multisubject: + self._write_multisubject(data, path, **kwargs) else: - self._write_single_file(data, path, **kwargs) + self._write_per_identity(data, path, **kwargs) def read(self, path: str | Path, data_type: type | None = None) -> PoseData: """Read PoseData from an NWB file. - Handles both single-file and per-identity file layouts transparently — - no kwargs are needed; the file records which layout was used. + Handles both multisubject and per-identity file layouts transparently — + no kwargs are needed; the file records which layout was used in its + embedded ``jabs_metadata``. - **Single-file layout:** point ``path`` at the file written by - ``write()``. All identities are returned in one ``PoseData``. + **Multisubject layout:** point ``path`` at a file written with + ``multisubject=True``. All identities are returned in one ``PoseData``. **Per-identity layout:** point ``path`` at *any one* of the sibling files. The reader detects the per-identity flag in the embedded @@ -187,9 +217,13 @@ def read(self, path: str | Path, data_type: type | None = None) -> PoseData: by ``source_identity_index``, and concatenates them into a single ``PoseData`` with all identities restored in their original order. 
+ The legacy combined single-file layout (a plain ``NWBFile`` with multiple + identities, written by JABS versions before this change) is no longer + supported and raises ``ValueError``. + Example:: - # Single file + # Multisubject single file pose_data = load("session.nwb", PoseData) # Per-identity — point at any sibling; result is the same @@ -204,29 +238,57 @@ def read(self, path: str | Path, data_type: type | None = None) -> PoseData: ``PoseData`` with all identities merged in their original order. Raises: - ValueError: If no ``PoseEstimation`` containers are found, or if - the expected number of sibling files cannot be located when - reading a per-identity layout. + ValueError: If no ``PoseEstimation`` containers are found, if the + expected number of sibling files cannot be located when reading a + per-identity layout, or if the file uses the unsupported legacy + single-file layout. """ path = Path(path) pose_data, jabs_meta = self._read_single(path) + if jabs_meta.get("multisubject", False): + return pose_data + if jabs_meta.get("per_identity_files", False): return self._read_merged(path, jabs_meta) - return pose_data + raise ValueError( + f"{path} uses the legacy combined single-file NWB layout, which is no " + "longer supported. Re-export with a current version of JABS " + "(per-identity files, or multisubject mode)." + ) # ------------------------------------------------------------------ # Write helpers # ------------------------------------------------------------------ - def _write_single_file(self, data: PoseData, path: Path, **kwargs) -> None: + def _write_multisubject(self, data: PoseData, path: Path, **kwargs) -> None: + """Write all identities to a single self-contained multisubject NWB file. + + Builds an :class:`NdxMultiSubjectsNWBFile` whose ``behavior`` processing + module holds one ``PoseEstimation`` per identity (identical layout to the + per-identity files), plus a ``SubjectsTable`` in ``acquisition`` listing + every subject. 
JABS round-trip fidelity rides on the ``jabs_metadata`` + scratch JSON; the ``SubjectsTable`` is for external (e.g. DANDI) consumers. + + Args: + data: The PoseData to write. + path: Output file path for the single combined .nwb file. + **kwargs: Forwarded to :meth:`_make_nwb_file` (session metadata, etc.). + Includes ``multisubject=True`` so an NdxMultiSubjectsNWBFile is built. + + Raises: + ImportError: If the ndx-multisubjects extension is not installed. + ValueError: If sanitized identity names are not unique. + """ + self._require_multisubjects() num_identities = data.points.shape[0] identity_names = [self._identity_name(data, i) for i in range(num_identities)] if len(set(identity_names)) != len(identity_names): raise ValueError(f"Identity names are not unique after sanitization: {identity_names}") nwbfile = self._make_nwb_file(**kwargs) + nwbfile.add_acquisition(self._build_subjects_table(data, identity_names)) skeleton = self._make_skeleton(data.body_parts, data.edges, **kwargs) static_skeletons = self._build_static_skeletons(data.static_objects) dynamic_skeletons = self._build_dynamic_skeletons(data.dynamic_objects) @@ -285,7 +347,7 @@ def _write_single_file(self, data: PoseData, path: Path, **kwargs) -> None: ) ) - jabs_meta = self._build_jabs_metadata(data, identity_names) + jabs_meta = self._build_jabs_metadata(data, identity_names, multisubject=True) nwbfile.add_scratch( ScratchData( name=_JABS_METADATA_KEY, @@ -391,8 +453,13 @@ def _write_per_identity(self, data: PoseData, path: Path, **kwargs) -> None: # ------------------------------------------------------------------ def _read_single(self, path: Path) -> tuple[PoseData, dict]: - """Read a single NWB file and return (PoseData, jabs_metadata_dict).""" - with NWBHDF5IO(str(path), mode="r") as io: + """Read a single NWB file and return (PoseData, jabs_metadata_dict). 
+ + ``load_namespaces=True`` lets pynwb reconstruct the + :class:`NdxMultiSubjectsNWBFile` subclass written in multisubject mode; + it is a no-op for plain ndx-pose per-identity files. + """ + with NWBHDF5IO(str(path), mode="r", load_namespaces=True) as io: nwbfile = io.read() behavior = nwbfile.processing[_PROCESSING_MODULE_NAME] @@ -671,7 +738,10 @@ def _make_nwb_file(**kwargs) -> NWBFile: nwb_kwargs[_field] = kwargs[_field] if kwargs.get("subject") is not None: nwb_kwargs["subject"] = kwargs["subject"] - return NWBFile(**nwb_kwargs) + # NdxMultiSubjectsNWBFile is a drop-in NWBFile subclass (identical + # constructor kwargs); only referenced when the extension is present. + nwb_cls = NdxMultiSubjectsNWBFile if kwargs.get("multisubject") else NWBFile + return nwb_cls(**nwb_kwargs) @staticmethod def _make_subject(subject_meta: dict) -> Subject: @@ -713,6 +783,59 @@ def _make_subject(subject_meta: dict) -> Subject: kwargs["date_of_birth"] = dob return Subject(**kwargs) + @staticmethod + def _build_subjects_table(data: PoseData, identity_names: list[str]) -> SubjectsTable: + """Build an ndx-multisubjects SubjectsTable with one row per identity. + + ``subject_id``, ``sex`` and ``species`` are required by the SubjectsTable + spec and always written (defaulting to the identity name, ``"U"`` and + ``""`` respectively when absent). Optional columns are included only when + at least one identity provides them, and every cell is coerced to ``str`` + so each column is a homogeneous HDF5 text column — a DynamicTable requires + every row to supply the same set of columns. + + Args: + data: PoseData whose ``subjects`` dict supplies per-identity metadata. + identity_names: Sanitized identity names, in order, one row each. + + Returns: + A populated SubjectsTable ready to attach via ``add_acquisition``. + """ + # Map JABS subject-dict keys -> optional SubjectsTable column names. 
+ optional_columns = { + "age": "age", + "date_of_birth": "date_of_birth", + "description": "subject_description", + "genotype": "genotype", + "strain": "strain", + "weight": "weight", + } + subjects = data.subjects or {} + + def _cell(value: object) -> str: + return "" if value is None else str(value) + + # An optional column is written for every row iff any identity provides it. + present = { + column + for jabs_key, column in optional_columns.items() + if any(jabs_key in subjects.get(name, {}) for name in identity_names) + } + + table = SubjectsTable(description="Subjects recorded in this session") + for name in identity_names: + meta = subjects.get(name, {}) + row = { + "subject_id": _cell(meta.get("subject_id")) or name, + "sex": _cell(meta.get("sex")) or "U", + "species": _cell(meta.get("species")), + } + for jabs_key, column in optional_columns.items(): + if column in present: + row[column] = _cell(meta.get(jabs_key)) + table.add_row(**row) + return table + @staticmethod def _make_skeleton( body_parts: list[str], diff --git a/packages/jabs-io/tests/internal/pose/test_nwb.py b/packages/jabs-io/tests/internal/pose/test_nwb.py index fdc7e67e..9bba7695 100644 --- a/packages/jabs-io/tests/internal/pose/test_nwb.py +++ b/packages/jabs-io/tests/internal/pose/test_nwb.py @@ -7,6 +7,7 @@ import h5py import numpy as np import pytest +from ndx_multisubjects import NdxMultiSubjectsNWBFile from ndx_pose import PoseEstimation from pynwb import NWBHDF5IO @@ -117,16 +118,16 @@ def _assert_pose_data_equal(a: PoseData, b: PoseData): # --------------------------------------------------------------------------- -# Single-file roundtrip +# Multisubject single-file roundtrip # --------------------------------------------------------------------------- def test_roundtrip_single_file(tmp_path, adapter): - """Write multi-identity PoseData to one file, read back, assert equality.""" + """Write multi-identity PoseData to one multisubject file, read back, assert equality.""" path = 
tmp_path / "pose.nwb" data = _make_pose_data() - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -137,7 +138,7 @@ def test_roundtrip_with_subjects(tmp_path, adapter): path = tmp_path / "pose_subjects.nwb" data = _make_pose_data(external_ids=["mouse_a", "mouse_b"], with_subjects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -148,7 +149,7 @@ def test_roundtrip_with_bounding_boxes(tmp_path, adapter): path = tmp_path / "pose_bb.nwb" data = _make_pose_data(with_bounding_boxes=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -159,7 +160,7 @@ def test_roundtrip_bounding_boxes_none(tmp_path, adapter): path = tmp_path / "pose_no_bb.nwb" data = _make_pose_data(with_bounding_boxes=False) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.bounding_boxes is None @@ -170,7 +171,7 @@ def test_roundtrip_single_identity(tmp_path, adapter): path = tmp_path / "pose_single.nwb" data = _make_pose_data(num_identities=1) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -181,7 +182,7 @@ def test_roundtrip_many_identities(tmp_path, adapter): path = tmp_path / "pose_many.nwb" data = _make_pose_data(num_identities=5) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -192,7 +193,7 @@ def test_external_ids_used_in_naming(tmp_path, adapter): path = tmp_path / "pose_ext.nwb" data = _make_pose_data(external_ids=["mouse_a", "mouse_b"]) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) 
_assert_pose_data_equal(data, loaded) @@ -203,7 +204,7 @@ def test_external_ids_none(tmp_path, adapter): path = tmp_path / "pose_no_ext.nwb" data = _make_pose_data(external_ids=None) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.external_ids is None @@ -215,7 +216,7 @@ def test_cm_per_pixel_none(tmp_path, adapter): path = tmp_path / "pose_no_cm.nwb" data = _make_pose_data(cm_per_pixel=None) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.cm_per_pixel is None @@ -226,7 +227,7 @@ def test_static_objects_roundtrip(tmp_path, adapter): path = tmp_path / "pose_static.nwb" data = _make_pose_data(with_static_objects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert "lixit" in loaded.static_objects @@ -265,7 +266,7 @@ def test_static_objects_multiple_points(tmp_path, adapter, obj_name, points): metadata=base.metadata, ) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert obj_name in loaded.static_objects @@ -278,9 +279,9 @@ def test_static_objects_nwb_structure(tmp_path, adapter): path = tmp_path / "pose_static_struct.nwb" data = _make_pose_data(with_static_objects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() behavior = nwb.processing["behavior"] @@ -303,9 +304,9 @@ def test_static_object_names_in_jabs_metadata_json(tmp_path, adapter): path = tmp_path / "pose_json_static.nwb" data = _make_pose_data(with_static_objects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() jabs_meta = 
json.loads(str(nwb.scratch["jabs_metadata"].data)) assert "static_object_names" in jabs_meta @@ -317,9 +318,9 @@ def test_body_parts_not_in_jabs_metadata(tmp_path, adapter): path = tmp_path / "pose_no_bp.nwb" data = _make_pose_data() - adapter.write(data, path) + adapter.write(data, path, multisubject=True) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() meta = json.loads(str(nwb.scratch["jabs_metadata"].data)) assert "body_parts" not in meta @@ -340,7 +341,7 @@ def test_missing_keypoint_padded_with_nan(tmp_path, adapter, caplog): fps=30, ) path = tmp_path / "pose_missing_kp.nwb" - adapter.write(data, path) + adapter.write(data, path, multisubject=True) # Remove the NOSE series from the written file so the reader sees a missing keypoint. # Walk the behavior processing module to find the NOSE series dynamically so this @@ -387,7 +388,7 @@ def test_keypoint_ordering_derived_from_series_names(tmp_path, adapter): ) path = tmp_path / "pose_order.nwb" - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.body_parts == canonical_names @@ -398,7 +399,7 @@ def test_empty_static_objects(tmp_path, adapter): path = tmp_path / "pose_no_static.nwb" data = _make_pose_data(with_static_objects=False) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.static_objects == {} @@ -421,7 +422,7 @@ def test_static_objects_1d_skipped_with_warning(tmp_path, adapter, caplog): path = tmp_path / "pose_1d_static.nwb" with caplog.at_level(logging.WARNING): - adapter.write(data, path) + adapter.write(data, path, multisubject=True) assert "bad_obj" in caplog.text loaded = adapter.read(path) @@ -443,13 +444,13 @@ def test_subjects_roundtrip(tmp_path, adapter): } ) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.subjects == subjects # 
Verify it's in the jabs_metadata JSON - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() meta = json.loads(str(nwb.scratch["jabs_metadata"].data)) assert meta["subjects"] == subjects @@ -460,7 +461,7 @@ def test_subjects_none_roundtrip(tmp_path, adapter): path = tmp_path / "pose_no_subjects.nwb" data = _make_pose_data() - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.subjects is None @@ -543,9 +544,9 @@ def test_bounding_boxes_per_identity_containers(tmp_path, adapter): with_bounding_boxes=True, ) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() behavior = nwb.processing["behavior"] @@ -565,7 +566,7 @@ def test_edges_roundtrip(tmp_path, adapter): path = tmp_path / "pose_edges.nwb" data = _make_pose_data(edges=[(0, 1), (1, 2)]) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.edges == [(0, 1), (1, 2)] @@ -576,7 +577,7 @@ def test_empty_edges(tmp_path, adapter): path = tmp_path / "pose_no_edges.nwb" data = _make_pose_data(edges=[]) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.edges == [] @@ -714,7 +715,7 @@ def test_write_sanitizes_external_ids(tmp_path, adapter): path = tmp_path / "pose.nwb" data = _make_pose_data(external_ids=["mouse/A", "mouse/B"]) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) # Data roundtrips correctly even though IDs were sanitized on disk @@ -742,7 +743,7 @@ def test_dynamic_objects_roundtrip(tmp_path, adapter): path = tmp_path / "pose_dyn.nwb" data = _make_pose_data(with_dynamic_objects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = 
adapter.read(path) _assert_pose_data_equal(data, loaded) @@ -764,9 +765,9 @@ def test_dynamic_objects_nwb_structure(tmp_path, adapter): path = tmp_path / "pose_dyn_struct.nwb" data = _make_pose_data(with_dynamic_objects=True) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() behavior = nwb.processing["behavior"] @@ -789,12 +790,12 @@ def test_dynamic_objects_empty(tmp_path, adapter): path = tmp_path / "pose_no_dyn.nwb" data = _make_pose_data(with_dynamic_objects=False) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert loaded.dynamic_objects == {} - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() meta = json.loads(str(nwb.scratch["jabs_metadata"].data)) assert "dynamic_object_names" not in meta @@ -825,7 +826,7 @@ def test_dynamic_objects_multi_keypoint(tmp_path, adapter): metadata=base.metadata, ) - adapter.write(data, path) + adapter.write(data, path, multisubject=True) loaded = adapter.read(path) assert "foo" in loaded.dynamic_objects @@ -834,7 +835,7 @@ def test_dynamic_objects_multi_keypoint(tmp_path, adapter): np.testing.assert_array_equal(foo.counts, counts_foo) np.testing.assert_array_equal(foo.sample_indices, sample_indices_foo) - with NWBHDF5IO(str(path), "r") as io: + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: nwb = io.read() foo_pe = nwb.processing["behavior"]["foo"] # 2 slots * 2 keypoints -> 4 series @@ -848,9 +849,9 @@ def test_session_start_time_written_to_nwb(tmp_path, adapter): data = _make_pose_data() session_time = datetime.datetime(2024, 3, 15, 10, 30, 0, tzinfo=datetime.timezone.utc) - adapter.write(data, path, session_start_time=session_time) + adapter.write(data, path, multisubject=True, session_start_time=session_time) - with NWBHDF5IO(str(path), mode="r") as 
io: + with NWBHDF5IO(str(path), mode="r", load_namespaces=True) as io: nwb = io.read() assert nwb.session_start_time == session_time @@ -863,6 +864,7 @@ def test_session_metadata_fields_written_to_nwb(tmp_path, adapter): adapter.write( data, path, + multisubject=True, lab="Kumar Lab", institution="The Jackson Laboratory", experimenter=["Jane Smith", "John Doe"], @@ -870,10 +872,148 @@ def test_session_metadata_fields_written_to_nwb(tmp_path, adapter): session_id="session_001", ) - with NWBHDF5IO(str(path), mode="r") as io: + with NWBHDF5IO(str(path), mode="r", load_namespaces=True) as io: nwb = io.read() assert nwb.lab == "Kumar Lab" assert nwb.institution == "The Jackson Laboratory" assert list(nwb.experimenter) == ["Jane Smith", "John Doe"] assert nwb.experiment_description == "Open field test" assert nwb.session_id == "session_001" + + +# --------------------------------------------------------------------------- +# Multisubject mode (ndx-multisubjects) +# --------------------------------------------------------------------------- + + +def test_multisubject_nwb_structure(tmp_path, adapter): + """multisubject=True writes one NdxMultiSubjectsNWBFile with a populated SubjectsTable.""" + path = tmp_path / "session.nwb" + data = _make_pose_data( + num_identities=2, external_ids=["mouse_a", "mouse_b"], with_subjects=True + ) + + adapter.write(data, path, multisubject=True) + + assert path.exists() # the single combined file IS created in multisubject mode + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: + nwb = io.read() + assert isinstance(nwb, NdxMultiSubjectsNWBFile) + + # SubjectsTable lives in acquisition, one row per identity. + assert "SubjectsTable" in nwb.acquisition + df = nwb.acquisition["SubjectsTable"].to_dataframe() + assert len(df) == 2 + assert list(df["subject_id"]) == ["mouse_a", "mouse_b"] + + # The behavior module still holds the per-identity pose containers. 
+ behavior = nwb.processing["behavior"] + assert "mouse_a" in behavior.data_interfaces + assert "mouse_b" in behavior.data_interfaces + + # jabs_metadata records the multisubject layout flag. + meta = json.loads(str(nwb.scratch["jabs_metadata"].data)) + assert meta["multisubject"] is True + assert "per_identity_files" not in meta + + +def test_multisubject_subject_id_and_sex_default(tmp_path, adapter): + """Required SubjectsTable columns default sensibly when subjects metadata is absent.""" + path = tmp_path / "session.nwb" + data = _make_pose_data(num_identities=2, external_ids=["mouse_a", "mouse_b"]) + + adapter.write(data, path, multisubject=True) + + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: + df = io.read().acquisition["SubjectsTable"].to_dataframe() + # subject_id defaults to the identity name; sex defaults to "U". + assert list(df["subject_id"]) == ["mouse_a", "mouse_b"] + assert list(df["sex"]) == ["U", "U"] + + +def test_multisubject_heterogeneous_subjects(tmp_path, adapter): + """Subjects with differing key sets do not raise; optional columns are unioned.""" + path = tmp_path / "session.nwb" + subjects = { + "mouse_a": {"sex": "M", "genotype": "WT", "strain": "C57BL/6"}, + "mouse_b": {"sex": "F"}, # missing genotype + strain + } + data = _make_pose_data(external_ids=["mouse_a", "mouse_b"]) + data = data.__class__( + **{**{f: getattr(data, f) for f in data.__dataclass_fields__}, "subjects": subjects} + ) + + adapter.write(data, path, multisubject=True) # must not raise + + # subjects round-trip losslessly via jabs_metadata (not the SubjectsTable). 
+ loaded = adapter.read(path) + assert loaded.subjects == subjects + + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: + df = io.read().acquisition["SubjectsTable"].to_dataframe() + assert "genotype" in df.columns and "strain" in df.columns + row_b = df[df["subject_id"] == "mouse_b"].iloc[0] + assert row_b["genotype"] == "" and row_b["strain"] == "" + + +def test_multisubject_roundtrip_full_features(tmp_path, adapter): + """A multisubject file round-trips bounding boxes, static and dynamic objects together.""" + path = tmp_path / "session.nwb" + data = _make_pose_data( + external_ids=["mouse_a", "mouse_b"], + with_bounding_boxes=True, + with_static_objects=True, + with_dynamic_objects=True, + with_subjects=True, + ) + + adapter.write(data, path, multisubject=True) + loaded = adapter.read(path) + + _assert_pose_data_equal(data, loaded) + + +def test_multisubject_requires_extension(tmp_path, adapter, monkeypatch): + """multisubject=True raises a clear ImportError when ndx-multisubjects is missing.""" + monkeypatch.setattr("jabs.io.internal.pose.nwb._MULTISUBJECTS_AVAILABLE", False) + data = _make_pose_data() + + with pytest.raises(ImportError, match="ndx-multisubjects"): + adapter.write(data, tmp_path / "session.nwb", multisubject=True) + + +def test_read_legacy_single_file_raises(tmp_path, adapter, monkeypatch): + """A file whose jabs_metadata has neither layout flag (legacy single-file) raises on read.""" + path = tmp_path / "legacy.nwb" + data = _make_pose_data(num_identities=2) + + # Simulate the removed legacy writer: a combined file whose jabs_metadata + # carries neither 'multisubject' nor 'per_identity_files' by dropping the + # **extra flags that the current writer passes in. 
+ original = PoseNWBAdapter._build_jabs_metadata + + def _no_layout_flags(data_, identity_names, **extra): + return original(data_, identity_names) + + monkeypatch.setattr(PoseNWBAdapter, "_build_jabs_metadata", staticmethod(_no_layout_flags)) + + adapter.write(data, path, multisubject=True) + + with pytest.raises(ValueError, match="legacy"): + adapter.read(path) + + +def test_multisubject_isolated_from_per_identity_siblings(tmp_path, adapter): + """A multisubject file coexists with per-identity siblings; each reads back correctly.""" + data = _make_pose_data(num_identities=2, external_ids=["mouse_a", "mouse_b"]) + + ms_path = tmp_path / "session.nwb" + adapter.write(data, ms_path, multisubject=True) # → session.nwb + adapter.write(data, tmp_path / "session.nwb") # default → session_mouse_a/b.nwb + + # Reading the multisubject file returns all identities directly. + _assert_pose_data_equal(data, adapter.read(ms_path)) + + # Reading a per-identity sibling auto-merges only the per-identity siblings; + # the multisubject session.nwb is excluded by the glob/split_subject_count filter. + _assert_pose_data_equal(data, adapter.read(tmp_path / "session_mouse_a.nwb")) diff --git a/src/jabs/resources/docs/user_guide/file-formats.md b/src/jabs/resources/docs/user_guide/file-formats.md index 90b46062..fd5026c8 100644 --- a/src/jabs/resources/docs/user_guide/file-formats.md +++ b/src/jabs/resources/docs/user_guide/file-formats.md @@ -125,26 +125,30 @@ See [NWB Export](nwb-export.md) for the CLI command and output-mode options. JABS writes NWB in two modes: -| Mode | When to use | -|---------------------------------|---------------------------------------------------------------------------------| -| Combined (default) | Local analysis, sharing with collaborators who can parse JABS-specific fields | -| Per-identity (`--per-identity`) | DANDI archive upload; tools that expect one subject per file | - -In **combined** mode all identities are written into a single `.nwb` file. 
-In **per-identity** mode one file is written per animal, named -`{output_stem}_{identity_name}.nwb`. Static and dynamic objects are written to every -per-identity file identically (they are session-level data). The JABS reader -re-assembles per-identity files transparently: point it at any sibling and it merges -all siblings into a single result in the original identity order. +| Mode | When to use | +|---------------------------------|-----------------------------------------------------------------------------------| +| Per-identity (default) | DANDI archive upload; tools that expect one subject per file | +| Multisubject (`--multisubject`) | A single shareable file holding every subject (via the ndx-multisubjects extension) | + +In **per-identity** mode (the default) one file is written per animal, named +`{output_stem}_{identity_name}.nwb`; the `OUTPUT` path itself is not created. The JABS +reader re-assembles per-identity files transparently: point it at any sibling and it +merges all siblings into a single result in the original identity order. +In **multisubject** mode all identities are written into a single self-contained `.nwb` +file (an `NdxMultiSubjectsNWBFile` with a `SubjectsTable`) using the +[ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) extension. In per-identity +mode, static and dynamic objects are written to every file identically (they are +session-level data). ### Full file layout -The layout below shows a combined file with two animal identities, two static objects +The layout below shows a multisubject file with two animal identities, two static objects (`corners`, `lixit`), and one dynamic object (`fecal_boli`). 
``` -NWBFile -├── subject/ [Subject] per-identity mode only +NdxMultiSubjectsNWBFile +├── acquisition/ +│ └── SubjectsTable [DynamicTable] multisubject mode only — one row per subject ├── processing/ │ └── behavior/ [ProcessingModule] │ ├── Skeletons/ [Skeletons container] @@ -184,9 +188,11 @@ NWBFile └── jabs_metadata/ [ScratchData] JSON string (see below) ``` -In a per-identity file the layout is identical, except `NWBFile.subject` is populated -(when subject metadata is provided), only one animal identity container is present, and -`jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity only. +A per-identity file (the default) uses a plain `NWBFile` with the same +`processing/behavior` layout, except there is no `SubjectsTable`, `NWBFile.subject` is +populated (when subject metadata is provided), only one animal identity container is +present, and `jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity +only. ### Animal pose @@ -206,12 +212,12 @@ base_tail, mid_tail, tip_tail #### PoseEstimationSeries fields (per keypoint) -| Field | Value | -|-------------------------|----------------------------------------------------------------------------------| -| `name` | Keypoint name (e.g. `"nose"`, `"left_ear"`) | +| Field | Value | +|-------------------------|---------------------------------------------------------------------------------| +| `name` | Keypoint name (e.g. `"nose"`, `"left_ear"`) | | `data` | shape `(num_frames, 2)` — `(x, y)` coordinates in pixels | -| `rate` | Frames per second (float) | -| `unit` | `"pixels"` | +| `rate` | Frames per second (float) | +| `unit` | `"pixels"` | | `reference_frame` | `"Top-left corner of video frame, x increases rightward, y increases downward"` | | `confidence` | shape `(num_frames,)` — `0.0` = missing keypoint, `> 0.0` = valid | | `confidence_definition` | `"0.0=invalid/missing keypoint, >0.0=valid keypoint"` | @@ -223,7 +229,7 @@ is present in each frame. 
| Mode | Shape stored in file | Shape returned by reader | |--------------|--------------------------------|--------------------------------| -| Combined | `(num_frames, num_identities)` | `(num_identities, num_frames)` | +| Multisubject | `(num_frames, num_identities)` | `(num_identities, num_frames)` | | Per-identity | `(num_frames,)` | `(1, num_frames)` | ### Bounding boxes (optional) @@ -304,11 +310,11 @@ Dynamic objects are introduced in JABS pose format v7. The source HDF5 pose file stores dynamic objects under `dynamic_objects/{name}/`: -| Dataset | Shape | Description | -|------------------|-----------------------------------------------------------------------------------------|-----------------------------------------------------| -| `points` | `(n_predictions, max_count, 2)` single-keypoint; `(n_predictions, max_count, n_kp, 2)` multi-keypoint | Keypoint coordinates | -| `counts` | `(n_predictions,)` | Number of valid object instances at each prediction | -| `sample_indices` | `(n_predictions,)` | Frame indices at which predictions were made | +| Dataset | Shape | Description | +|------------------|-------------------------------------------------------------------------------------------------------|-----------------------------------------------------| +| `points` | `(n_predictions, max_count, 2)` single-keypoint; `(n_predictions, max_count, n_kp, 2)` multi-keypoint | Keypoint coordinates | +| `counts` | `(n_predictions,)` | Number of valid object instances at each prediction | +| `sample_indices` | `(n_predictions,)` | Frame indices at which predictions were made | The `points` dataset carries an optional HDF5 attribute `axis_order` (`"xy"` or `"yx"`, default `"yx"`). JABS normalizes all coordinates to `(x, y)` on read. @@ -410,38 +416,37 @@ Every JABS NWB file contains a `ScratchData` object named `jabs_metadata` in the needed for a lossless round-trip. 
It is required because pynwb returns `PoseEstimationSeries` in alphabetical order from HDF5, which would otherwise scramble the keypoint ordering. Tools that do not use the JABS reader can parse this JSON -directly to recover ordered keypoint names, identity ordering, subject metadata, and -object classification. +directly to recover identity ordering, subject metadata, and object classification. +(Keypoint ordering is not stored here; the JABS reader restores it from the canonical +keypoint index.) #### Keys -| Key | Type | Present | Description | -|-------------------------|---------------------------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| -| `format_version` | `int` | Always | JABS NWB format version. Currently `1`. | -| `identity_names` | `list[str]` | Always | Ordered list of animal identity container names. Defines identity order on read. | -| `num_identities` | `int` | Always | Total number of animal identities in the recording session. | -| `body_parts` | `list[str]` | Always | Ordered list of keypoint names for animal skeletons. | -| `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available. | -| `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file (e.g. cage IDs). `null` if the pose file had no external IDs. | +| Key | Type | Present | Description | +|-------------------------|---------------------------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `format_version` | `int` | Always | JABS NWB format version. Currently `1`. | +| `identity_names` | `list[str]` | Always | Ordered list of animal identity container names. 
Defines identity order on read. | +| `num_identities` | `int` | Always | Total number of animal identities in the recording session. | +| `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available. | +| `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file (e.g. cage IDs). `null` if the pose file had no external IDs. | | `subjects` | `dict[str, dict] \| null` | Always | Per-identity biological metadata keyed by identity name, for all identities. Fields: `subject_id`, `sex`, `genotype`, `strain`, `age`, `weight`, `species`, `description`. `null` if none provided. | -| `metadata` | `dict` | Always | Provenance from the source pose file: `source_file`, `pose_format_version`, and optionally `source_file_hash`. | -| `static_object_names` | `list[str]` | When static objects present | Names of all static object `PoseEstimation` containers. | -| `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all dynamic object `PoseEstimation` containers. | -| `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. Required to reconstruct the 4-D points array on read. | -| `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | -| `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file. Used to restore original order when merging siblings. | -| `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. Used to validate all siblings are present before merging. | +| `metadata` | `dict` | Always | Provenance from the source pose file: `source_file`, `pose_format_version`, and optionally `source_file_hash`. 
| +| `static_object_names` | `list[str]` | When static objects present | Names of all static object `PoseEstimation` containers. | +| `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all dynamic object `PoseEstimation` containers. | +| `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. Required to reconstruct the 4-D points array on read. | +| `multisubject` | `bool` | Multisubject mode only | `true` if this is a single multi-subject file written with the ndx-multisubjects extension. | +| `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | +| `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file. Used to restore original order when merging siblings. | +| `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. Used to validate all siblings are present before merging. | -#### Example — combined file +#### Example — multisubject file ```json { "format_version": 1, + "multisubject": true, "identity_names": ["subject_1", "subject_2"], "num_identities": 2, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { @@ -486,9 +491,6 @@ object classification. 
"format_version": 1, "identity_names": ["subject_1"], "num_identities": 3, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { diff --git a/src/jabs/resources/docs/user_guide/nwb-export.md b/src/jabs/resources/docs/user_guide/nwb-export.md index b675a240..2ddd739e 100644 --- a/src/jabs/resources/docs/user_guide/nwb-export.md +++ b/src/jabs/resources/docs/user_guide/nwb-export.md @@ -11,14 +11,14 @@ The `jabs-cli convert-to-nwb` command converts a JABS pose estimation HDF5 file pip install "jabs-behavior-classifier[nwb]" ``` -The extra adds `pynwb` and `ndx-pose` as dependencies. +The extra adds `pynwb`, `ndx-pose`, and `ndx-multisubjects` as dependencies. Two output modes are available. **Choose the mode based on how the files will be used:** -| Mode | When to use | -|---------------------------------|-------------------------------------------------------------------------------| -| Combined (default) | Local analysis, sharing with collaborators who can parse JABS-specific fields | -| Per-identity (`--per-identity`) | DANDI archive upload, tools that expect one subject per file | +| Mode | When to use | +|---------------------------------|-----------------------------------------------------------------------------------| +| Per-identity (default) | DANDI archive upload, tools that expect one subject per file | +| Multisubject (`--multisubject`) | A single shareable file holding every subject (via the ndx-multisubjects extension) | --- @@ -31,8 +31,8 @@ jabs-cli convert-to-nwb INPUT_PATH OUTPUT [OPTIONS] | Argument / Option | Description | |------------------------------|------------------------------------------------------------------------------------------------------------------------------| | `INPUT_PATH` | JABS pose HDF5 file, any version v2–v8. 
Format version is inferred automatically from the filename (e.g. `_pose_est_v6.h5`). | -| `OUTPUT` | Destination `.nwb` file. In `--per-identity` mode, used as a naming template; the file itself is not created directly. | -| `--per-identity` | Write one NWB file per identity instead of a single combined file. | +| `OUTPUT` | Destination `.nwb` file. By default (per-identity), used as a naming template; the file itself is not created directly. With `--multisubject`, the single combined file is written directly to this path. | +| `--multisubject` | Write a single multi-subject NWB file (using the ndx-multisubjects extension) instead of the default one file per identity. | | `--session-description TEXT` | NWB session description string. Defaults to `'JABS PoseEstimation Data'`. | | `--subjects PATH` | Path to a JSON file with per-animal biological metadata. | | `--session-metadata PATH` | Path to a JSON file with NWB session-level metadata (start time, experimenter, etc.). | @@ -40,11 +40,11 @@ jabs-cli convert-to-nwb INPUT_PATH OUTPUT [OPTIONS] ### Examples ```bash -# Single combined file — all identities in one NWB file +# One NWB file per identity (default; recommended for DANDI upload) jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb -# One NWB file per identity (recommended for DANDI upload) -jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --per-identity +# A single multi-subject file holding every identity +jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --multisubject # Include per-animal metadata jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --subjects subjects.json @@ -97,11 +97,12 @@ subject.** All other fields are optional. | `weight` | string | Body weight, e.g. `"25g"` | | `description` | string | Free-text notes | -In per-identity mode, subject metadata is written to both the standard `NWBFile.subject` -field and the `jabs_metadata` scratch field. 
If no `--subjects` file is provided, a -minimal subject with `subject_id` set to the identity name is written automatically. In -combined mode, subject metadata is written only to `jabs_metadata` (see -[below](#subject-metadata-and-nwbfilesubject)). +In per-identity mode (the default), subject metadata is written to both the standard +`NWBFile.subject` field and the `jabs_metadata` scratch field. If no `--subjects` file is +provided, a minimal subject with `subject_id` set to the identity name is written +automatically. In multisubject mode, subject metadata is written to a `SubjectsTable` +(one row per subject) and to `jabs_metadata` (see +[below](#subject-metadata-by-mode)). --- @@ -137,27 +138,11 @@ All fields are optional. Unknown keys are ignored with a warning. ## Output modes -### Combined file (default) - -All identities from the recording session are written into a single NWB file. - -``` -session.nwb - └── all identities, all objects -``` - -**This is a non-standard NWB usage.** Standard NWB (`NWBFile.subject`) only supports -one subject per file, so combined files cannot populate that field. Instead, all -per-animal metadata is stored in the `jabs_metadata` scratch field (a JSON string). -Tools that do not know about `jabs_metadata` will not see subject metadata at all. - -**DANDI upload:** Combined files will fail DANDI validation because `NWBFile.subject` -is not set. Use per-identity mode for DANDI. - -### Per-identity (`--per-identity`) +### Per-identity files (default) One NWB file is written per animal. The `OUTPUT` path is used as a naming template; -files are written as `{output_stem}_{identity_name}.nwb` in the same directory. +files are written as `{output_stem}_{identity_name}.nwb` in the same directory. The +`OUTPUT` path itself is **not** created. 
``` session_subject_1.nwb ← identity 0 + all objects @@ -165,7 +150,7 @@ session_subject_2.nwb ← identity 1 + all objects session_subject_3.nwb ← identity 2 + all objects ``` -**This is the more standard output.** Each file contains exactly one animal, so +**This is the most standard output.** Each file contains exactly one animal, so `NWBFile.subject` is populated with that animal's biological metadata (when provided via `--subjects`). Any standard NWB tool — including the DANDI archive — can read the subject field directly without knowing anything about JABS. @@ -182,16 +167,38 @@ sibling file; it detects the `per_identity_files` flag in `jabs_metadata`, finds siblings, and merges them into a single result with all identities in their original order. -### Subject metadata and `NWBFile.subject` +### Multisubject single file (`--multisubject`) + +All identities from the recording session are written into a single, self-contained NWB +file at `OUTPUT`, using the +[ndx-multisubjects](https://github.com/nehatk17/ndx-multisubjects) extension. + +``` +session.nwb + └── all identities + all objects + a SubjectsTable listing every subject +``` + +The file is an `NdxMultiSubjectsNWBFile` (a drop-in `NWBFile` subclass). Because standard +NWB's `NWBFile.subject` only holds one subject, multiple subjects are instead described by +a **`SubjectsTable`** (one row per animal) stored in `acquisition`. The pose data itself +is laid out in `processing/behavior` exactly as in a per-identity file, and the full, +lossless JABS round-trip still rides on the `jabs_metadata` scratch field. + +This mode is intended for sharing a whole session as one artifact. Reading it back with +the JABS reader returns all identities directly (no sibling files are involved). 
+ +### Subject metadata by mode -| Mode | NWBFile.subject | jabs_metadata.subjects | -|--------------|------------------------------|--------------------------| -| Combined | Not set | Set (all identities) | -| Per-identity | Set for this file's identity | Set (all identities) | +| Mode | NWBFile.subject | SubjectsTable | jabs_metadata.subjects | +|--------------|------------------------------|------------------------|------------------------| +| Per-identity | Set for this file's identity | — | Set (all identities) | +| Multisubject | Not set | One row per subject | Set (all identities) | -`jabs_metadata.subjects` always carries the full dict for all identities, even in -per-identity files. This makes each file self-contained: the JABS reader can recover -complete subject metadata from any sibling without loading the others. +`jabs_metadata.subjects` always carries the full dict for all identities, in both modes. +This makes each per-identity file self-contained: the JABS reader can recover complete +subject metadata from any sibling without loading the others. In multisubject mode the +`SubjectsTable` is provided for standard NWB / DANDI consumers; JABS itself recovers +subject metadata from `jabs_metadata`. --- @@ -201,12 +208,13 @@ For the full format specification — including all field definitions, `jabs_met keys, and worked examples for static and dynamic objects — see [File Formats — NWB Pose File](file-formats.md#nwb-pose-file). -The layout below shows a combined file with two animal identities, two static objects +The layout below shows a multisubject file with two animal identities, two static objects (`corners`, `lixit`), and one dynamic object (`fecal_boli`). 
``` -NWBFile -├── subject/ [Subject] per-identity mode only +NdxMultiSubjectsNWBFile +├── acquisition/ +│ └── SubjectsTable [DynamicTable] multisubject mode only — one row per subject ├── processing/ │ └── behavior/ [ProcessingModule] │ ├── Skeletons/ [Skeletons container] @@ -246,7 +254,9 @@ NWBFile └── jabs_metadata/ [ScratchData] JSON string (see below) ``` -In a per-identity file the layout is identical, except: +A per-identity file (the default) uses a plain `NWBFile` and the same +`processing/behavior` layout, except: +- The root is a standard `NWBFile`; there is no `SubjectsTable` - `NWBFile.subject` is populated (when subject metadata is provided) - Only one animal identity container is present - `jabs_identity_mask` / `jabs_bounding_boxes_` cover that identity only @@ -283,7 +293,7 @@ each frame. | Mode | Shape stored in file | Shape returned by reader | |--------------|--------------------------------|--------------------------------| -| Combined | `(num_frames, num_identities)` | `(num_identities, num_frames)` | +| Multisubject | `(num_frames, num_identities)` | `(num_identities, num_frames)` | | Per-identity | `(num_frames,)` | `(1, num_frames)` | --- @@ -370,8 +380,9 @@ needed for a lossless round-trip. Standard NWB fields alone are insufficient bec pynwb returns `PoseEstimationSeries` in alphabetical order from HDF5, which would otherwise scramble the keypoint ordering. -Tools that do not use the JABS reader can parse this JSON directly to recover ordered -keypoint names, identity ordering, subject metadata, and object classification. +Tools that do not use the JABS reader can parse this JSON directly to recover identity +ordering, subject metadata, and object classification. (Keypoint ordering is not stored +here; the JABS reader restores it from the canonical keypoint index.) #### Keys @@ -380,7 +391,6 @@ keypoint names, identity ordering, subject metadata, and object classification. | `format_version` | `int` | Always | JABS NWB format version. 
Currently `1`. | | `identity_names` | `list[str]` | Always | Ordered list of animal identity container names. Defines identity order on read. | | `num_identities` | `int` | Always | Total number of animal identities in the recording session. | -| `body_parts` | `list[str]` | Always | Ordered list of keypoint names for animal skeletons. | | `cm_per_pixel` | `float \| null` | Always | Pixel-to-centimetre scale factor. `null` if not available. | | `external_ids` | `list[str] \| null` | Always | Original external identity names from the pose file. `null` if the pose file had no external IDs. | | `subjects` | `dict[str, dict] \| null` | Always | Per-identity subject metadata keyed by identity name, for all identities. `null` if no subject metadata is available. Fields: `subject_id`, `sex`, `species`, `age`, `date_of_birth`, `genotype`, `strain`, `weight`, `description`. DANDI requires `species`, `sex`, and either `age` or `date_of_birth`. | @@ -388,20 +398,19 @@ keypoint names, identity ordering, subject metadata, and object classification. | `static_object_names` | `list[str]` | When static objects present | Names of all static object `PoseEstimation` containers. | | `dynamic_object_names` | `list[str]` | When dynamic objects present | Names of all dynamic object `PoseEstimation` containers. | | `dynamic_object_shapes` | `dict[str, [int, int]]` | When dynamic objects present | Maps each dynamic object name to `[max_count, n_keypoints]`. | +| `multisubject` | `bool` | Multisubject mode only | `true` if this is a single multi-subject file written with the ndx-multisubjects extension. | | `per_identity_files` | `bool` | Per-identity mode only | `true` if this file is one of a set of per-identity NWB files. | | `source_identity_index` | `int` | Per-identity mode only | Zero-based index of the identity in this file. | | `split_subject_count` | `int` | Per-identity mode only | Total number of subjects in the session across all split files. 
| -#### Example — combined file +#### Example — multisubject file ```json { "format_version": 1, + "multisubject": true, "identity_names": ["subject_1", "subject_2"], "num_identities": 2, - "body_parts": ["nose", "left_ear", "right_ear", "base_neck", "left_front_paw", - "right_front_paw", "center_spine", "left_rear_paw", "right_rear_paw", - "base_tail", "mid_tail", "tip_tail"], "cm_per_pixel": 0.043, "external_ids": null, "subjects": { diff --git a/src/jabs/scripts/cli/cli.py b/src/jabs/scripts/cli/cli.py index b5bdaa18..d3f8fb8e 100644 --- a/src/jabs/scripts/cli/cli.py +++ b/src/jabs/scripts/cli/cli.py @@ -381,13 +381,13 @@ def cross_validation( type=click.Path(dir_okay=False, writable=True, path_type=Path), ) @click.option( - "--per-identity", + "--multisubject", is_flag=True, default=False, help=( - "Write one NWB file per identity instead of a single combined file. " - "OUTPUT is used as a naming template; files are written as " - "{output_stem}_{identity_name}.nwb alongside it." + "Write a single multi-subject NWB file (using the ndx-multisubjects " + "extension) instead of the default one file per identity. The combined " + "file is written directly to OUTPUT." ), ) @click.option( @@ -428,7 +428,7 @@ def convert_to_nwb( ctx: click.Context, input_path: Path, output: Path, - per_identity: bool, + multisubject: bool, session_description: str | None, subjects_path: Path | None, session_metadata_path: Path | None, @@ -438,19 +438,21 @@ def convert_to_nwb( INPUT_PATH is a JABS pose HDF5 file (any version, v2-v8). The format version is inferred automatically from the filename (e.g. _pose_est_v6.h5). - OUTPUT is the destination NWB file. In --per-identity mode, OUTPUT is a - naming template and is not created directly; instead one file per identity - is written as {output_stem}_{identity_name}.nwb in the same directory. + OUTPUT is the destination NWB file. 
By default one file per identity is + written: OUTPUT is a naming template and is not created directly; instead + one file per identity is written as {output_stem}_{identity_name}.nwb in the + same directory. With --multisubject, a single combined file is written + directly to OUTPUT. Examples: \b - # Single file, all identities + # One NWB file per identity (default) jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb \b - # One NWB file per identity - jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --per-identity + # A single multi-subject file (ndx-multisubjects) + jabs-cli convert-to-nwb session_pose_est_v6.h5 session.nwb --multisubject \b # Include per-animal metadata @@ -463,7 +465,7 @@ def convert_to_nwb( if ctx.obj["VERBOSE"]: click.echo(f"Input: {input_path}") click.echo(f"Output: {output}") - click.echo(f"Per-identity: {per_identity}") + click.echo(f"Multisubject: {multisubject}") if subjects_path: click.echo(f"Subjects: {subjects_path}") if session_metadata_path: @@ -498,7 +500,7 @@ def convert_to_nwb( run_conversion( input_path=input_path, output_path=output, - per_identity=per_identity, + multisubject=multisubject, session_description=session_description, subjects=subjects, session_metadata=session_metadata, @@ -506,10 +508,10 @@ def convert_to_nwb( except Exception as e: raise click.ClickException(str(e)) from e - if per_identity: - click.echo(f"Wrote per-identity NWB files to {output.parent}") - else: + if multisubject: click.echo(f"Wrote {output}") + else: + click.echo(f"Wrote per-identity NWB files to {output.parent}") def main(): diff --git a/src/jabs/scripts/cli/convert_to_nwb.py b/src/jabs/scripts/cli/convert_to_nwb.py index f3e110b8..8e897c26 100644 --- a/src/jabs/scripts/cli/convert_to_nwb.py +++ b/src/jabs/scripts/cli/convert_to_nwb.py @@ -152,7 +152,7 @@ def _parse_session_start_time(value: str) -> datetime.datetime: def run_conversion( input_path: Path, output_path: Path, - per_identity: bool = False, + multisubject: bool 
= False, session_description: str | None = None, subjects: dict[str, dict] | None = None, session_metadata: dict | None = None, @@ -162,12 +162,20 @@ def run_conversion( The pose format version is inferred from the filename (e.g. "_pose_est_v6.h5" → v6). Supported versions: v2-v8. + By default one NWB file is written per identity. In that mode + ``output_path`` is a naming template; actual files are written alongside it + as "{stem}_{identity_name}.nwb". Pass ``multisubject=True`` to instead write + a single combined file at ``output_path`` using the ndx-multisubjects + extension. + Args: input_path: Path to the input JABS pose HDF5 file. - output_path: Destination path for the NWB file. In per-identity mode - this is used as a naming template; actual files are written - alongside it as "{stem}_{identity_name}.nwb". - per_identity: If True, write one NWB file per identity. + output_path: Destination path for the NWB file. In the default + per-identity mode this is used as a naming template; actual files are + written alongside it as "{stem}_{identity_name}.nwb". In multisubject + mode the single combined file is written at this path. + multisubject: If True, write a single multi-subject NWB file using the + ndx-multisubjects extension instead of one file per identity. session_description: Optional NWB session description string. subjects: Optional per-animal biological metadata dict, keyed by identity name. 
See PoseData.subjects for the expected @@ -192,7 +200,7 @@ def run_conversion( pose_data = pose_to_pose_data(pose, subjects=subjects) - write_kwargs: dict = {"per_identity_files": per_identity} + write_kwargs: dict = {"multisubject": multisubject} if session_description is not None: write_kwargs["session_description"] = session_description diff --git a/tests/scripts/test_convert_to_nwb.py b/tests/scripts/test_convert_to_nwb.py index 3b745437..732268ed 100644 --- a/tests/scripts/test_convert_to_nwb.py +++ b/tests/scripts/test_convert_to_nwb.py @@ -1,10 +1,12 @@ """Tests for convert_to_nwb helper functions.""" import datetime +from unittest import mock import pytest +from click.testing import CliRunner -from jabs.scripts.cli.convert_to_nwb import _parse_session_start_time +from jabs.scripts.cli.convert_to_nwb import _parse_session_start_time, run_conversion def test_parse_utc_offset(): @@ -49,3 +51,95 @@ def test_parse_non_string_raises(value): """Test that ValueError is raised if value is not a string.""" with pytest.raises(ValueError, match="must be a string"): _parse_session_start_time(value) + + +# --------------------------------------------------------------------------- +# run_conversion write-mode wiring +# --------------------------------------------------------------------------- + + +def _patch_conversion_internals(monkeypatch): + """Patch the pose-loading and save boundaries of run_conversion; return the save mock.""" + pose = mock.Mock(num_identities=2, num_frames=10, fps=30) + monkeypatch.setattr("jabs.scripts.cli.convert_to_nwb.open_pose_file", lambda *a, **k: pose) + monkeypatch.setattr( + "jabs.scripts.cli.convert_to_nwb.pose_to_pose_data", + lambda *a, **k: mock.sentinel.pose_data, + ) + save_mock = mock.Mock() + monkeypatch.setattr("jabs.scripts.cli.convert_to_nwb.save", save_mock) + return save_mock + + +def test_run_conversion_multisubject_forwarded(monkeypatch, tmp_path): + """run_conversion(multisubject=True) forwards multisubject=True to 
save().""" + save_mock = _patch_conversion_internals(monkeypatch) + + run_conversion(tmp_path / "in_pose_est_v6.h5", tmp_path / "out.nwb", multisubject=True) + + save_mock.assert_called_once() + assert save_mock.call_args.kwargs["multisubject"] is True + + +def test_run_conversion_defaults_to_per_identity(monkeypatch, tmp_path): + """run_conversion defaults to multisubject=False (per-identity output).""" + save_mock = _patch_conversion_internals(monkeypatch) + + run_conversion(tmp_path / "in_pose_est_v6.h5", tmp_path / "out.nwb") + + assert save_mock.call_args.kwargs["multisubject"] is False + + +# --------------------------------------------------------------------------- +# convert-to-nwb CLI wiring +# --------------------------------------------------------------------------- + + +def test_cli_multisubject_flag_forwarded(monkeypatch, tmp_path): + """The --multisubject flag is forwarded to run_conversion.""" + from jabs.scripts.cli.cli import cli + + run_mock = mock.Mock() + monkeypatch.setattr("jabs.scripts.cli.cli.run_conversion", run_mock) + input_path = tmp_path / "session_pose_est_v6.h5" + input_path.write_bytes(b"") # must exist for click.Path(exists=True) + output = tmp_path / "session.nwb" + + result = CliRunner().invoke( + cli, ["convert-to-nwb", str(input_path), str(output), "--multisubject"] + ) + + assert result.exit_code == 0, result.output + assert run_mock.call_args.kwargs["multisubject"] is True + + +def test_cli_defaults_to_per_identity(monkeypatch, tmp_path): + """Without --multisubject the CLI requests per-identity output (multisubject=False).""" + from jabs.scripts.cli.cli import cli + + run_mock = mock.Mock() + monkeypatch.setattr("jabs.scripts.cli.cli.run_conversion", run_mock) + input_path = tmp_path / "session_pose_est_v6.h5" + input_path.write_bytes(b"") + output = tmp_path / "session.nwb" + + result = CliRunner().invoke(cli, ["convert-to-nwb", str(input_path), str(output)]) + + assert result.exit_code == 0, result.output + assert 
run_mock.call_args.kwargs["multisubject"] is False + + +def test_cli_per_identity_flag_removed(tmp_path): + """The old --per-identity flag no longer exists.""" + from jabs.scripts.cli.cli import cli + + input_path = tmp_path / "session_pose_est_v6.h5" + input_path.write_bytes(b"") + output = tmp_path / "session.nwb" + + result = CliRunner().invoke( + cli, ["convert-to-nwb", str(input_path), str(output), "--per-identity"] + ) + + assert result.exit_code != 0 + assert "no such option" in result.output.lower() diff --git a/uv.lock b/uv.lock index f42245a7..4cfd6a40 100644 --- a/uv.lock +++ b/uv.lock @@ -1678,7 +1678,7 @@ wheels = [ [[package]] name = "jabs-behavior" -version = "0.44.2" +version = "0.45.0" source = { editable = "packages/jabs-behavior" } dependencies = [ { name = "jsonschema" }, @@ -1722,7 +1722,7 @@ test = [ [[package]] name = "jabs-behavior-classifier" -version = "0.44.2" +version = "0.45.0" source = { editable = "." } dependencies = [ { name = "argparse-formatter" }, @@ -1834,7 +1834,7 @@ test = [ [[package]] name = "jabs-core" -version = "0.44.2" +version = "0.45.0" source = { editable = "packages/jabs-core" } dependencies = [ { name = "h5py" }, @@ -1892,7 +1892,7 @@ test = [ [[package]] name = "jabs-io" -version = "0.44.2" +version = "0.45.0" source = { editable = "packages/jabs-io" } dependencies = [ { name = "jabs-core" }, @@ -1905,6 +1905,7 @@ h5py = [ { name = "h5py" }, ] nwb = [ + { name = "ndx-multisubjects" }, { name = "ndx-pose" }, { name = "pynwb" }, ] @@ -1936,6 +1937,7 @@ test = [ requires-dist = [ { name = "h5py", marker = "extra == 'h5py'", specifier = ">=3.15.1" }, { name = "jabs-core", editable = "packages/jabs-core" }, + { name = "ndx-multisubjects", marker = "extra == 'nwb'", specifier = ">=0.1.1" }, { name = "ndx-pose", marker = "extra == 'nwb'", specifier = ">=0.2.2" }, { name = "numpy", specifier = ">=2.0.0,<3.0.0" }, { name = "pyarrow", marker = "extra == 'parquet'", specifier = ">=18.0.0" }, @@ -1961,7 +1963,7 @@ test = [ 
[[package]] name = "jabs-vision" -version = "0.44.2" +version = "0.45.0" source = { editable = "packages/jabs-vision" } dependencies = [ { name = "hydra-core" }, @@ -2757,6 +2759,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/69/f24d3d1c38ad69e256138b4ec2452a8c7cf66be49dc214771ae99dd4f0a0/narwhals-2.20.0-py3-none-any.whl", hash = "sha256:16e750ea5507d4ba6e8d03455b5f93a535e0405976561baea235bca5dc9f475d", size = 449373, upload-time = "2026-04-20T12:11:43.596Z" }, ] +[[package]] +name = "ndx-multisubjects" +version = "0.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hdmf" }, + { name = "pynwb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/93/42/f6bfac40d886234b522dde62e5e8d0c793edbbb9129265d54b834292f9ad/ndx_multisubjects-0.1.1.tar.gz", hash = "sha256:eec83f2913ca19b99563ea26aa036bc6663611632e868984fff5359d6f018f34", size = 19421, upload-time = "2025-11-25T17:01:36.999Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/74/80892f54af53f3d70eff30e3d43844e4e1f50080fa40e949d5f714d40359/ndx_multisubjects-0.1.1-py3-none-any.whl", hash = "sha256:6f1b766e7211434a5a70ee263232a7581de03cb0cb9364989c59fe589828ad1a", size = 7677, upload-time = "2025-11-25T17:01:35.518Z" }, +] + [[package]] name = "ndx-pose" version = "0.2.2" From d15b96567a7a0f17ab08a296cfcddfe8a532c87a Mon Sep 17 00:00:00 2001 From: Glen Beane <356266+gbeane@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:07:29 -0400 Subject: [PATCH 2/2] Resolve subjects by raw external ID; clarify per-identity write log --- .../jabs-io/src/jabs/io/internal/pose/nwb.py | 55 +++++++++++++++---- .../jabs-io/tests/internal/pose/test_nwb.py | 48 ++++++++++++++++ src/jabs/scripts/cli/convert_to_nwb.py | 9 ++- 3 files changed, 99 insertions(+), 13 deletions(-) diff --git a/packages/jabs-io/src/jabs/io/internal/pose/nwb.py b/packages/jabs-io/src/jabs/io/internal/pose/nwb.py index a481e871..49f32c4f 100644 --- 
a/packages/jabs-io/src/jabs/io/internal/pose/nwb.py +++ b/packages/jabs-io/src/jabs/io/internal/pose/nwb.py @@ -369,8 +369,8 @@ def _write_per_identity(self, data: PoseData, path: Path, **kwargs) -> None: identity_name = self._identity_name(data, i) identity_path = self._identity_file_path(path, identity_name) - raw_meta = (data.subjects or {}).get(identity_name, {}) - subject_meta = {**raw_meta, "subject_id": raw_meta.get("subject_id", identity_name)} + raw_key, raw_meta = self._resolve_subject(data, i, identity_name) + subject_meta = {**raw_meta, "subject_id": raw_meta.get("subject_id", raw_key)} nwbfile = self._make_nwb_file(subject=self._make_subject(subject_meta), **kwargs) skeleton = self._make_skeleton(data.body_parts, data.edges, **kwargs) # Rebuild static/dynamic skeletons each iteration: HDMF objects can @@ -783,16 +783,42 @@ def _make_subject(subject_meta: dict) -> Subject: kwargs["date_of_birth"] = dob return Subject(**kwargs) + @staticmethod + def _resolve_subject(data: PoseData, index: int, identity_name: str) -> tuple[str, dict]: + """Resolve the (default subject_id, metadata dict) for one identity. + + ``PoseData.subjects`` is keyed by the *raw* external IDs, while + ``identity_name`` is the *sanitized* NWB container name. Look metadata + up by the raw external ID first, falling back to the sanitized name, so + subject metadata survives identity names that required sanitization (e.g. + ``"mouse/A"`` -> ``"mouse_A"``). The returned default subject_id is the + raw external ID when available, otherwise the sanitized name. + + Args: + data: PoseData whose ``subjects``/``external_ids`` supply metadata. + index: Identity index into ``data.external_ids``. + identity_name: Sanitized NWB container name for this identity. + + Returns: + A ``(default_subject_id, metadata)`` tuple. 
+ """ + subjects = data.subjects or {} + raw_key = data.external_ids[index] if data.external_ids is not None else identity_name + meta = subjects.get(raw_key) + if meta is None: + meta = subjects.get(identity_name, {}) + return raw_key, meta + @staticmethod def _build_subjects_table(data: PoseData, identity_names: list[str]) -> SubjectsTable: """Build an ndx-multisubjects SubjectsTable with one row per identity. ``subject_id``, ``sex`` and ``species`` are required by the SubjectsTable - spec and always written (defaulting to the identity name, ``"U"`` and - ``""`` respectively when absent). Optional columns are included only when - at least one identity provides them, and every cell is coerced to ``str`` - so each column is a homogeneous HDF5 text column — a DynamicTable requires - every row to supply the same set of columns. + spec and always written (defaulting to the raw external ID / sanitized + name, ``"U"`` and ``""`` respectively when absent). Optional columns are + included only when at least one identity provides them, and every cell is + coerced to ``str`` so each column is a homogeneous HDF5 text column — a + DynamicTable requires every row to supply the same set of columns. Args: data: PoseData whose ``subjects`` dict supplies per-identity metadata. @@ -810,23 +836,28 @@ def _build_subjects_table(data: PoseData, identity_names: list[str]) -> Subjects "strain": "strain", "weight": "weight", } - subjects = data.subjects or {} def _cell(value: object) -> str: return "" if value is None else str(value) + # data.subjects is keyed by raw external IDs while identity_names are + # sanitized; resolve each identity's (default subject_id, metadata) so + # sanitized names (e.g. "mouse/A" -> "mouse_A") don't lose metadata. + resolved = [ + PoseNWBAdapter._resolve_subject(data, i, name) for i, name in enumerate(identity_names) + ] + # An optional column is written for every row iff any identity provides it. 
present = { column for jabs_key, column in optional_columns.items() - if any(jabs_key in subjects.get(name, {}) for name in identity_names) + if any(jabs_key in meta for _, meta in resolved) } table = SubjectsTable(description="Subjects recorded in this session") - for name in identity_names: - meta = subjects.get(name, {}) + for default_id, meta in resolved: row = { - "subject_id": _cell(meta.get("subject_id")) or name, + "subject_id": _cell(meta.get("subject_id")) or default_id, "sex": _cell(meta.get("sex")) or "U", "species": _cell(meta.get("species")), } diff --git a/packages/jabs-io/tests/internal/pose/test_nwb.py b/packages/jabs-io/tests/internal/pose/test_nwb.py index 9bba7695..10e76dbe 100644 --- a/packages/jabs-io/tests/internal/pose/test_nwb.py +++ b/packages/jabs-io/tests/internal/pose/test_nwb.py @@ -534,6 +534,30 @@ def test_per_identity_nwbfile_subject_minimal_without_subjects(tmp_path, adapter assert nwb.subject.subject_id == "mouse_a" +def test_per_identity_subjects_keyed_by_raw_external_id(tmp_path, adapter): + """Subjects keyed by raw external IDs survive identity-name sanitization (per-identity).""" + path = tmp_path / "pose.nwb" + # "mouse/A" sanitizes to "mouse_A"; subjects is keyed by the raw ID. + subjects = { + "mouse/A": {"sex": "M", "genotype": "WT", "species": "Mus musculus"}, + "mouse/B": {"sex": "F", "genotype": "KO", "species": "Mus musculus"}, + } + data = _make_pose_data(external_ids=["mouse/A", "mouse/B"]) + data = data.__class__( + **{**{f: getattr(data, f) for f in data.__dataclass_fields__}, "subjects": subjects} + ) + + adapter.write(data, path, per_identity_files=True) + + # File is named by the sanitized identity name; subject metadata must still be present. 
+ with NWBHDF5IO(str(tmp_path / "pose_mouse_A.nwb"), mode="r") as io: + nwb = io.read() + assert nwb.subject is not None + assert nwb.subject.subject_id == "mouse/A" # raw external ID, not sanitized/defaulted + assert nwb.subject.sex == "M" + assert nwb.subject.genotype == "WT" + + def test_bounding_boxes_per_identity_containers(tmp_path, adapter): """Bounding boxes are stored as one TimeSeries per identity, not a single combined array.""" path = tmp_path / "pose_bb_struct.nwb" @@ -956,6 +980,30 @@ def test_multisubject_heterogeneous_subjects(tmp_path, adapter): assert row_b["genotype"] == "" and row_b["strain"] == "" +def test_multisubject_subjects_keyed_by_raw_external_id(tmp_path, adapter): + """SubjectsTable keeps metadata keyed by raw external IDs when names are sanitized.""" + path = tmp_path / "session.nwb" + # "mouse/A" sanitizes to "mouse_A"; subjects is keyed by the raw ID. + subjects = { + "mouse/A": {"sex": "M", "genotype": "WT", "species": "Mus musculus"}, + "mouse/B": {"sex": "F", "genotype": "KO", "species": "Mus musculus"}, + } + data = _make_pose_data(external_ids=["mouse/A", "mouse/B"]) + data = data.__class__( + **{**{f: getattr(data, f) for f in data.__dataclass_fields__}, "subjects": subjects} + ) + + adapter.write(data, path, multisubject=True) + + with NWBHDF5IO(str(path), "r", load_namespaces=True) as io: + df = io.read().acquisition["SubjectsTable"].to_dataframe() + # subject_id defaults to the raw external ID; metadata is preserved, not defaulted. 
+ assert list(df["subject_id"]) == ["mouse/A", "mouse/B"] + row_a = df[df["subject_id"] == "mouse/A"].iloc[0] + assert row_a["sex"] == "M" + assert row_a["genotype"] == "WT" + + def test_multisubject_roundtrip_full_features(tmp_path, adapter): """A multisubject file round-trips bounding boxes, static and dynamic objects together.""" path = tmp_path / "session.nwb" diff --git a/src/jabs/scripts/cli/convert_to_nwb.py b/src/jabs/scripts/cli/convert_to_nwb.py index 8e897c26..7d52ac97 100644 --- a/src/jabs/scripts/cli/convert_to_nwb.py +++ b/src/jabs/scripts/cli/convert_to_nwb.py @@ -217,5 +217,12 @@ def run_conversion( if key in session_metadata: write_kwargs[key] = session_metadata[key] - logger.info("Writing NWB to %s", output_path) + if multisubject: + logger.info("Writing multisubject NWB to %s", output_path) + else: + logger.info( + "Writing per-identity NWB files in %s (using %s as a naming template)", + output_path.parent, + output_path.name, + ) save(pose_data, output_path, **write_kwargs)