Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,29 @@ ckpts/
coverage.json
.coverage*
test_assets/
.nrl_remote_map.json
.nrl_remote_state.json
# Test byproducts
tests/functional/*/

# Gym
/3rdparty/Gym-workspace/Gym/cache/uv/
/3rdparty/Gym-workspace/Gym/res*/*/.venv/
/3rdparty/Gym-workspace/Gym/res*/*/.venv/
/3rdparty/Gym-workspace/Gym/.venv/

# Cache
uv_cache/
hf_home/
hf_datasets_cache/
*logs/
datasets/
/datasets/
wandb/
checkpoints/
results/
code_snapshots/
code_snapshots*/
.cache/

# Runtime env
*runtime_env.yaml
!default_runtime_env.yaml
8 changes: 2 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,18 @@ ckpts/
# Test
coverage.json
.coverage*
unit_results.json
unit_results/
test_assets/
.nrl_remote_map.json
.nrl_remote_state.json
# Test byproducts
tests/functional/*/

# Cache
uv_cache/
hf_home/
hf_datasets_cache/
*logs/
/datasets/
docker/*
!docker/Dockerfile
!docker/Dockerfile.ngc_pytorch
!docker/README.md
wandb/
checkpoints/
results/
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Gym-workspace/Gym
Submodule Gym updated 211 files
114 changes: 70 additions & 44 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,23 @@
# Optional build args to skip vLLM or SGLang dependencies:
# --build-arg SKIP_VLLM_BUILD=1 # Skip vLLM dependencies
# --build-arg SKIP_SGLANG_BUILD=1 # Skip SGLang dependencies
#
# Custom setup (override the default apptainer install):
# docker buildx build --build-context custom-setup=my-setup-dir/ --build-arg CUSTOM_SETUP_FNAME=my_script.sh -f docker/Dockerfile ...
#
# To skip custom setup entirely:
# docker buildx build --build-arg CUSTOM_SETUP_FNAME= -f docker/Dockerfile ...

ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
FROM scratch AS nemo-rl
ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

# Default custom-setup stage: installs apptainer.
# Override with: --build-context custom-setup=<your-dir>
FROM scratch AS custom-setup
COPY docker/install_apptainer.sh /

FROM ${BASE_IMAGE} AS base
# An environment variable to indicate that we are in a container.
ENV NRL_CONTAINER=1
Expand Down Expand Up @@ -83,7 +94,7 @@ ENV RAY_USAGE_STATS_ENABLED=0
# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
ENV RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

ENV NEMO_GYM_VENV_DIR=/opt/gym_venvs

FROM base AS hermetic

Expand All @@ -105,48 +116,77 @@ ARG BUILD_CUSTOM_FLASHINFER_REF
# Skip building vLLM or SGLang dependencies (set to any non-empty value to skip)
ARG SKIP_VLLM_BUILD
ARG SKIP_SGLANG_BUILD
# Config paths (relative to repo root) whose NeMo Gym venvs should be prefetched.
# Override to prefetch venvs for different configs, or set to empty to skip.
ARG NEMO_GYM_PREFETCH_CONFIGS="examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml examples/nemo_gym/grpo_nanov3.yaml"

ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_LINK_MODE=copy

# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set)
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"

# First copy only the dependency files
COPY --from=nemo-rl pyproject.toml uv.lock ./
# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist.
COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/
COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh
COPY --from=nemo-rl tools/build-custom-flashinfer.sh ./tools/build-custom-flashinfer.sh
COPY --from=nemo-rl --link research/ ./research/
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/
# Copy in source from build context (defaults to cloned repo, can be overridden)
COPY --from=nemo-rl . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true

RUN --mount=type=ssh <<"EOF" bash -exu
uv venv --seed
# The custom build scripts will alter the pyproject.toml and uv.lock
if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then
bash tools/build-custom-vllm.sh ${BUILD_CUSTOM_VLLM_URL} ${BUILD_CUSTOM_VLLM_REF} ${BUILD_CUSTOM_VLLM_PRECOMPILED_WHEEL_LOCATION}
UV_LINK_MODE=hardlink bash tools/build-custom-vllm.sh ${BUILD_CUSTOM_VLLM_URL:-} ${BUILD_CUSTOM_VLLM_REF:-} ${BUILD_CUSTOM_VLLM_PRECOMPILED_WHEEL_LOCATION:-}
source 3rdparty/vllm/nemo-rl.env
fi
if [[ -n "${BUILD_CUSTOM_FLASHINFER:-}" ]]; then
bash tools/build-custom-flashinfer.sh ${BUILD_CUSTOM_FLASHINFER_URL} ${BUILD_CUSTOM_FLASHINFER_REF}
UV_LINK_MODE=hardlink bash tools/build-custom-flashinfer.sh ${BUILD_CUSTOM_FLASHINFER_URL:-} ${BUILD_CUSTOM_FLASHINFER_REF:-}
fi
# uv sync has a more reliable resolver than simple uv pip install which can fail

# Sync each training + inference backend one at a time (since they may conflict)
# to warm the uv cache, then at the end just sync the default dependencies.
# Do everything in one layer to prevent large layers.

# The venv is symlinked to avoid bloating the layer size
uv sync --link-mode symlink --locked --no-install-project
# The venv uses hardlinks to avoid bloating the layer size
UV_LINK_MODE=hardlink uv sync --locked --no-install-project
if [[ -z "${SKIP_VLLM_BUILD:-}" ]]; then
uv sync --link-mode symlink --locked --extra vllm --no-install-project
UV_LINK_MODE=hardlink uv sync --locked --extra vllm --no-install-project
fi
if [[ -z "${SKIP_SGLANG_BUILD:-}" ]]; then
uv sync --link-mode symlink --locked --extra sglang --no-install-project
UV_LINK_MODE=hardlink uv sync --locked --extra sglang --no-install-project
fi
UV_LINK_MODE=hardlink uv sync --locked --extra mcore --no-install-project
UV_LINK_MODE=hardlink uv sync --locked --extra automodel --no-install-project
UV_LINK_MODE=hardlink uv sync --locked --all-groups --no-install-project

# Prefetch NeMo Gym internal venvs (for gym servers like code_gen, math, etc.)
if [[ -n "${NEMO_GYM_PREFETCH_CONFIGS:-}" ]]; then
# Infer the index from RL's pyproject.toml. This ensures all venv creation that needs torch will use the RL prescribed version for consistency since RL dictates the container CUDA version.
UV_TORCH_BACKEND=$(uv run python -c "import tomllib,pathlib; indexes=tomllib.loads(pathlib.Path('pyproject.toml').read_text())['tool']['uv']['index']; print(next(i['name'].removeprefix('pytorch-') for i in indexes if i['name'].startswith('pytorch-')))") \
UV_LINK_MODE=hardlink uv run python examples/nemo_gym/prefetch_venvs.py $NEMO_GYM_PREFETCH_CONFIGS
fi

# Remove /tmp/ray because the previous script starts up a local ray cluster which creates a session
# that we can just clean up.
rm -rf /tmp/ray
Comment on lines +164 to +173
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Prefetch script and nemo_rl source are both absent in the hermetic stage — build breaks by default

Two problems combine to break the Docker build:

  1. Missing file: examples/nemo_gym/prefetch_venvs.py is never copied in the hermetic stage (the COPY commands at lines 120–126 only bring in pyproject.toml, uv.lock, two nemo_rl entry files, and tools/). Python will exit with "No such file or directory", failing the RUN layer.

  2. Incomplete nemo_rl source: Even if the file were present, the script imports nemo_rl.environments.nemo_gym, nemo_rl.distributed.virtual_cluster, etc. None of these sub-packages are present in hermetic (only nemo_rl/__init__.py and nemo_rl/package_info.py are copied), so the imports would raise ModuleNotFoundError.

Because once a build argument is declared in a stage it is automatically inherited by child stages, NEMO_GYM_PREFETCH_CONFIGS has its non-empty default value when the if condition is evaluated in hermetic, so this code path executes on every default build.

Recommended fix: Move the prefetch block to the release stage (after line 194, where the full source tree is copied), and re-declare the ARG there — matching the pattern already used for nemo_rl/utils/prefetch_venvs.py.

🐛 Proposed fix — move prefetch to `release` stage

Remove lines 155–162 from the hermetic RUN block, then in the release stage add:

 # Re-declare build args for this stage
+ARG NEMO_GYM_PREFETCH_CONFIGS
 ARG SKIP_VLLM_BUILD
 ...

 # (after the full COPY at line 194 and after uv sync installs nemo_rl)
+# Prefetch NeMo Gym internal venvs (for gym servers like code_gen, math, etc.)
+RUN if [[ -n "${NEMO_GYM_PREFETCH_CONFIGS:-}" ]]; then \
+        UV_LINK_MODE=symlink uv run python examples/nemo_gym/prefetch_venvs.py \
+            $NEMO_GYM_PREFETCH_CONFIGS; \
+        rm -rf /tmp/ray; \
+    fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@docker/Dockerfile` around lines 155 - 162, The hermetic stage runs a prefetch
step referencing examples/nemo_gym/prefetch_venvs.py and
NEMO_GYM_PREFETCH_CONFIGS but that script and nemo_rl subpackages aren't copied
into hermetic, causing the build to fail; remove the prefetch RUN block from the
hermetic stage and relocate it into the release stage (after the full source
copy) and re-declare ARG NEMO_GYM_PREFETCH_CONFIGS there so the prefetch
invocation (UV_LINK_MODE=symlink uv run python
examples/nemo_gym/prefetch_venvs.py $NEMO_GYM_PREFETCH_CONFIGS) executes only
when the full source (including nemo_rl and examples) is present.


NEGATIVE_FILTERS=""
if [[ -n "${SKIP_VLLM_BUILD:-}" ]]; then
NEGATIVE_FILTERS="$NEGATIVE_FILTERS vllm"
fi
if [[ -n "${SKIP_SGLANG_BUILD:-}" ]]; then
NEGATIVE_FILTERS="$NEGATIVE_FILTERS sglang"
fi
if [[ -n "$NEGATIVE_FILTERS" ]]; then
UV_LINK_MODE=hardlink uv run nemo_rl/utils/prefetch_venvs.py --negative-filters $NEGATIVE_FILTERS
else
UV_LINK_MODE=hardlink uv run nemo_rl/utils/prefetch_venvs.py
fi
uv sync --link-mode symlink --locked --extra mcore --no-install-project
uv sync --link-mode symlink --locked --extra automodel --no-install-project
uv sync --link-mode symlink --locked --all-groups --no-install-project

# Prune unreachable cache entries
uv cache prune

# Remove the aiohttp in this uv cache dir to fully address CVE GHSA-mqqc-3gqh-h2x8
# The ray install will include the older aiohttp version in its cache
Expand All @@ -156,13 +196,23 @@ EOF
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Custom setup layer (override with: --build-context custom-setup=<dir> --build-arg CUSTOM_SETUP_FNAME=<script>)
# To skip: --build-arg CUSTOM_SETUP_FNAME=
ARG CUSTOM_SETUP_FNAME=install_apptainer.sh
RUN --mount=from=custom-setup,src=/,dst=/tmp/custom-setup <<"EOF" bash -exu
if [[ -z "${CUSTOM_SETUP_FNAME}" ]]; then
echo "CUSTOM_SETUP_FNAME is empty, skipping custom setup"
exit 0
fi
cp /tmp/custom-setup/${CUSTOM_SETUP_FNAME} /opt/${CUSTOM_SETUP_FNAME}
bash -x /opt/${CUSTOM_SETUP_FNAME}
EOF

WORKDIR /opt/nemo-rl

FROM hermetic AS release

# Re-declare build args for this stage
ARG SKIP_VLLM_BUILD
ARG SKIP_SGLANG_BUILD
ARG NEMO_RL_COMMIT
ARG NVIDIA_BUILD_ID
ARG NVIDIA_BUILD_REF
Expand All @@ -174,30 +224,6 @@ ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh
COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
RUN <<"EOF" bash -exu
NEGATIVE_FILTERS=""
if [[ -n "${SKIP_VLLM_BUILD:-}" ]]; then
NEGATIVE_FILTERS="$NEGATIVE_FILTERS vllm"
fi
if [[ -n "${SKIP_SGLANG_BUILD:-}" ]]; then
NEGATIVE_FILTERS="$NEGATIVE_FILTERS sglang"
fi
if [[ -n "$NEGATIVE_FILTERS" ]]; then
UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py --negative-filters $NEGATIVE_FILTERS
else
UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py
fi
EOF

# Generate container fingerprint for frozen environment support
# Store outside /opt/nemo-rl to avoid being overwritten by user mounts
RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint
Expand Down
23 changes: 23 additions & 0 deletions docker/install_apptainer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install apptainer from its official PPA and expose it under the legacy
# `singularity` name. Fails fast on any error or unset variable.
set -euo pipefail

# add-apt-repository lives in software-properties-common, which is not
# present in the base image.
apt-get update
apt-get install -y --no-install-recommends software-properties-common

# Register the apptainer PPA, then refresh the package index so the
# pinned apptainer build below is resolvable.
add-apt-repository -y ppa:apptainer/ppa
apt-get update

# The PPA suffixes package versions with the Ubuntu release codename
# (e.g. 1.4.5-1~noble), so read it from /etc/os-release in a subshell
# to avoid polluting this script's environment.
os_codename=$(. /etc/os-release && echo "$VERSION_CODENAME")
apt-get install -y --no-install-recommends "apptainer=1.4.5-1~${os_codename}"

# Many workflows still invoke `singularity`; point it at apptainer.
ln -sf /usr/bin/apptainer /usr/bin/singularity

# Drop apt caches in the same layer so they never bloat the image.
apt-get clean && rm -rf /var/lib/apt/lists/*
31 changes: 31 additions & 0 deletions docs/docker.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,34 @@ When these build arguments are set, the corresponding `uv sync --extra` commands

> [!NOTE]
> If you skip vLLM or SGLang during the build but later try to use those backends at runtime, the dependencies will be fetched and built on-demand. This may add significant setup time on first use.

## Custom Setup Commands

By default, the Docker image installs [apptainer](https://apptainer.org/) (with a `singularity` symlink) via a pluggable `custom-setup` build stage. The default script is `docker/install_apptainer.sh`. You can override or skip this step at build time.

### Override with a custom script

Create a directory containing your setup script(s), then pass it as a build context along with the script filename:

```sh
# my-setup-dir/my_script.sh
#!/bin/bash
set -euo pipefail
apt-get update && apt-get install -y my-custom-package
apt-get clean && rm -rf /var/lib/apt/lists/*
```

```sh
docker buildx build \
--build-context custom-setup=my-setup-dir/ \
--build-arg CUSTOM_SETUP_FNAME=my_script.sh \
-f docker/Dockerfile --tag <registry>/nemo-rl:latest .
```

### Skip custom setup entirely

To build without any custom setup commands, set `CUSTOM_SETUP_FNAME` to empty:

```sh
docker buildx build --build-arg CUSTOM_SETUP_FNAME= -f docker/Dockerfile --tag <registry>/nemo-rl:latest .
```
Loading
Loading