diff --git a/.github/actions/docker-push/action.yml b/.github/actions/docker-push/action.yml new file mode 100644 index 0000000..763fc0a --- /dev/null +++ b/.github/actions/docker-push/action.yml @@ -0,0 +1,28 @@ +name: Docker Push +description: "Push locally-loaded images to their registry refs" + +inputs: + image-refs: + description: "JSON array of image references to push" + required: true + +runs: + using: composite + steps: + - name: Push + shell: bash + env: + IMAGE_REFS: ${{ inputs.image-refs }} + run: | + set -euo pipefail + mapfile -t refs < <(echo "${IMAGE_REFS}" | jq -r '.[]') + if [ ${#refs[@]} -eq 0 ]; then + echo "No image refs to push" + exit 1 + fi + + for ref in "${refs[@]}"; do + echo "::group::docker push — ${ref}" + docker push "${ref}" + echo "::endgroup::" + done diff --git a/.github/actions/grype/action.yml b/.github/actions/grype/action.yml index 123b27e..a593fa0 100644 --- a/.github/actions/grype/action.yml +++ b/.github/actions/grype/action.yml @@ -3,8 +3,16 @@ description: "Scan Docker images with Grype. For now it's report-only and doesn' inputs: image-refs: - description: "JSON array of image references to scan" + description: "JSON array of image refs to scan" required: true + skip-files: + description: | + Optional newline-separated list of glob patterns to pass to Trivy as + --skip-files. Use this to silence known-benign findings (e.g. demo + cert/key fixtures shipped by vendored upstream libraries in NGC base + images) without affecting the rest of the scan. 
+ required: false + default: "" runs: using: composite @@ -22,6 +30,7 @@ runs: shell: bash env: IMAGE_REFS: ${{ inputs.image-refs }} + SKIP_FILES: ${{ inputs.skip-files }} run: | set -uo pipefail mapfile -t refs < <(echo "${IMAGE_REFS}" | jq -r '.[]') @@ -30,8 +39,30 @@ runs: exit 1 fi + trivy_cmd=( + trivy image + --timeout 30m + --severity CRITICAL,HIGH + --exit-code 1 + --ignore-unfixed + --pkg-types os,library + --format table + --no-progress + ) + + skip_patterns=() + while IFS= read -r pattern; do + [ -z "$pattern" ] && continue + trivy_cmd+=(--skip-files "$pattern") + skip_patterns+=("$pattern") + done <<< "${SKIP_FILES}" + echo "Scanning ${#refs[@]} image(s):" printf ' - %s\n' "${refs[@]}" + if [ ${#skip_patterns[@]} -gt 0 ]; then + echo "Skipping file patterns:" + printf ' - %s\n' "${skip_patterns[@]}" + fi failed=() for ref in "${refs[@]}"; do diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 265ee1f..c3cb879 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -21,7 +21,7 @@ permissions: jobs: build-base: - runs-on: blacksmith-8vcpu-ubuntu-2204 + runs-on: blacksmith-16vcpu-ubuntu-2204 steps: - name: Checkout uses: actions/checkout@v6 @@ -46,7 +46,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/base/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -59,10 +60,15 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} + - name: Push images + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} + build-autoresearch: needs: build-base if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') - runs-on: blacksmith-8vcpu-ubuntu-2204 + runs-on: blacksmith-16vcpu-ubuntu-2204 steps: - name: Checkout uses: actions/checkout@v6 @@ -99,7 +105,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/autoresearch/docker-bake.hcl - push: true + load: true 
+ push: false - name: Extract image refs id: refs @@ -114,11 +121,18 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} + - name: Push images + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.autoresearch_any_changed == 'true' + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} + build-pytorch: needs: build-base # always() forces job run even if the dependant is skipped (but not if it failed) if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 + timeout-minutes: 240 steps: - name: Checkout uses: actions/checkout@v6 @@ -156,7 +170,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/pytorch/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -168,5 +183,11 @@ jobs: - name: Grype scan if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: ./.github/actions/grype + with: + image-refs: ${{ steps.refs.outputs.refs }} + + - name: Push images + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' + uses: ./.github/actions/docker-push with: image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index 277c1ae..8fb5244 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -13,12 +13,15 @@ jobs: strategy: fail-fast: false matrix: - # Add rows when new top-level Dockerfiles appear (e.g. nvidia-pytorch). 
- dockerfile: - - official-templates/base/Dockerfile - - official-templates/pytorch/Dockerfile - - official-templates/autoresearch/Dockerfile - - helper-templates/verify-nccl/Dockerfile + include: + - dockerfile: official-templates/base/Dockerfile + ignore: DL3006,DL3008,DL3013,DL3022 + - dockerfile: official-templates/pytorch/Dockerfile + ignore: "DL3013,DL3006" + - dockerfile: official-templates/autoresearch/Dockerfile + ignore: "DL3006" + - dockerfile: helper-templates/verify-nccl/Dockerfile + ignore: "DL3008" steps: - name: Checkout uses: actions/checkout@v6 @@ -31,4 +34,5 @@ jobs: dockerfile: ${{ matrix.dockerfile }} failure-threshold: warning format: tty - output-file: /dev/stdout \ No newline at end of file + output-file: /dev/stdout + ignore: ${{ matrix.ignore }} \ No newline at end of file diff --git a/.github/workflows/hadolint-push.yml b/.github/workflows/hadolint-push.yml index 781eb66..9e44634 100644 --- a/.github/workflows/hadolint-push.yml +++ b/.github/workflows/hadolint-push.yml @@ -15,12 +15,15 @@ jobs: strategy: fail-fast: false matrix: - # Add rows when new top-level Dockerfiles appear - dockerfile: - - official-templates/base/Dockerfile - - official-templates/pytorch/Dockerfile - - official-templates/autoresearch/Dockerfile - - helper-templates/verify-nccl/Dockerfile + include: + - dockerfile: official-templates/base/Dockerfile + ignore: DL3006,DL3008,DL3013,DL3022 + - dockerfile: official-templates/pytorch/Dockerfile + ignore: "DL3013,DL3006" + - dockerfile: official-templates/autoresearch/Dockerfile + ignore: "DL3006" + - dockerfile: helper-templates/verify-nccl/Dockerfile + ignore: "DL3008" steps: - name: Checkout uses: actions/checkout@v6 diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml index fbcbc35..3161650 100644 --- a/.github/workflows/nvidia.yml +++ b/.github/workflows/nvidia.yml @@ -15,7 +15,7 @@ permissions: jobs: build-nvidia: - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 
steps: - name: Checkout uses: actions/checkout@v6 @@ -40,7 +40,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/nvidia-pytorch/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -52,3 +53,13 @@ jobs: uses: ./.github/actions/grype with: image-refs: ${{ steps.refs.outputs.refs }} + skip-files: | + **/civetweb/resources/cert/* + **/civetweb/resources/ssl_cert.pem + **/civetweb/resources/server.pem + **/civetweb/resources/server_bkup.pem + + - name: Push images + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 562aaa8..44b5786 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -15,7 +15,7 @@ permissions: jobs: build-rocm: - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 @@ -40,7 +40,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/rocm/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -50,5 +51,15 @@ jobs: - name: Grype scan uses: ./.github/actions/grype + with: + image-refs: ${{ steps.refs.outputs.refs }} + skip-files: | + **/civetweb/resources/cert/* + **/civetweb/resources/ssl_cert.pem + **/civetweb/resources/server.pem + **/civetweb/resources/server_bkup.pem + + - name: Push images + uses: ./.github/actions/docker-push with: image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file diff --git a/helper-templates/verify-nccl/Dockerfile b/helper-templates/verify-nccl/Dockerfile index ca159f5..a83e79a 100644 --- a/helper-templates/verify-nccl/Dockerfile +++ b/helper-templates/verify-nccl/Dockerfile @@ -10,11 +10,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pciutils \ && rm -rf /var/lib/apt/lists/* -RUN git clone https://github.com/NVIDIA/cuda-samples.git && \ - cd 
cuda-samples/Samples/0_Introduction/simpleP2P && \ - make +RUN git clone https://github.com/NVIDIA/cuda-samples.git + +WORKDIR /verify-nccl/cuda-samples/Samples/0_Introduction/simpleP2P + +RUN make COPY --chmod=755 check_nccl.sh . # Start Container -CMD tail -f /dev/null +CMD ["tail", "-f", "/dev/null"] diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index 329630e..28664ee 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -1,8 +1,10 @@ ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + # Install runpodctl for pod management (scaling up GPUs) -ARG RUNPODCTL_VERSION=v2.1.6 +ARG RUNPODCTL_VERSION=v2.3.0 RUN wget -qO- https://github.com/runpod/runpodctl/releases/download/${RUNPODCTL_VERSION}/runpodctl-linux-amd64.tar.gz | \ tar -xz -C /usr/local/bin runpodctl @@ -14,10 +16,8 @@ RUN git clone --branch ${AUTORESEARCH_REF} --depth 1 \ WORKDIR /opt/autoresearch # Install Python dependencies -RUN uv sync - -# Download data and train tokenizer (~2 min) -RUN uv run prepare.py +RUN uv sync && \ + uv run prepare.py # Download data and train tokenizer (~2 min) # On first boot: copy source files to /workspace (lightweight, persists edits) # and symlink .venv back to /opt (12GB, stays on fast container layer) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index fcaac27..5e9bbc1 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -2,7 +2,6 @@ ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} ARG RP_SKIP_PYTHON -ARG RP_SKIP_JUPYTER SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -36,7 +35,7 @@ WORKDIR / RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen -# Install +# Install RUN apt-get update --yes && \ apt-get upgrade --yes && \ apt-get install --yes --no-install-recommends \ @@ -46,8 +45,15 @@ RUN apt-get update --yes && \ libgl1 libhdf5-dev libjpeg-dev 
liblapack-dev libnuma-dev libpng-dev libpostproc-dev \ libsm6 libssl-dev libswscale-dev libtiff-dev libv4l-dev libx264-dev libxrender-dev \ libxvidcore-dev lsof make mtr nano nfs-common nginx openssh-server rsync slurm-wlm \ - software-properties-common sudo tmux unzip vim wget zip zstd + software-properties-common sudo tmux unzip vim wget zip zstd && \ + rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub && \ + rm -rf /var/lib/apt/lists/* +# Strip AWS EFA Nsight plugin — AWS-only hardware, not used on RunPod. +# Its nic_sampler Go binary ships old Go stdlib that triggers HIGH Trivy CVEs. +RUN find /opt/nvidia /usr/local/cuda* -type d -name 'efa_metrics' \ + -print -exec rm -rf {} + 2>/dev/null || true + # Install Python versions RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ add-apt-repository ppa:deadsnakes/ppa -y && \ @@ -71,11 +77,11 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ # Install virtualenv RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ - python3.9 -m pip install --upgrade pip virtualenv && \ - python3.10 -m pip install --upgrade pip virtualenv && \ - python3.11 -m pip install --upgrade pip virtualenv && \ - python3.12 -m pip install --upgrade pip virtualenv && \ - python3.13 -m pip install --upgrade pip virtualenv; + python3.9 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.10 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.11 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.12 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.13 -m pip install --upgrade --no-cache-dir pip virtualenv; # Symlink default python/pip RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ @@ -85,14 +91,13 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ +COPY --from=requirements requirements.txt /requirements.txt +COPY --from=scrub_stale_metadata scrub-stale-metadata.py /tmp/scrub-stale-metadata.py + # Install Jupyter and hf_transfer packages -RUN [[ -n 
$RP_SKIP_JUPYTER ]] && exit 0; \ - python -m pip install --upgrade --no-cache-dir \ - hf_transfer \ - jupyterlab \ - ipywidgets \ - jupyter-archive \ - notebook==7.4.2; +RUN python -m pip install --upgrade --no-cache-dir -r requirements.txt && \ + python /tmp/scrub-stale-metadata.py /requirements.txt && \ + rm /tmp/scrub-stale-metadata.py # Install filebrowser RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash @@ -102,9 +107,6 @@ COPY --from=proxy nginx.conf /etc/nginx/nginx.conf COPY --from=proxy snippets /etc/nginx/snippets COPY --from=proxy readme.html /usr/share/nginx/html/readme.html -# Remove existing SSH host keys -RUN rm -f /etc/ssh/ssh_host_* - # Copy the README.md COPY README.md /usr/share/nginx/html/README.md @@ -113,7 +115,9 @@ COPY --from=scripts --chmod=755 start.sh / # Welcome Message COPY --from=logo runpod.txt /etc/runpod.txt -RUN echo 'cat /etc/runpod.txt' >> /root/.bashrc -RUN echo 'echo -e "\nFor detailed documentation and guides, please visit:\n\033[1;34mhttps://docs.runpod.io/\033[0m and \033[1;34mhttps://blog.runpod.io/\033[0m\n\n"' >> /root/.bashrc +RUN cat >> /root/.bashrc <<'EOF' +cat /etc/runpod.txt +printf "\nFor detailed documentation and guides, please visit:\n\033[1;34mhttps://docs.runpod.io/\033[0m and \033[1;34mhttps://blog.runpod.io/\033[0m\n\n" +EOF CMD ["/start.sh"] diff --git a/official-templates/base/docker-bake.hcl b/official-templates/base/docker-bake.hcl index f8904b8..a4e3723 100644 --- a/official-templates/base/docker-bake.hcl +++ b/official-templates/base/docker-bake.hcl @@ -14,6 +14,8 @@ target "common-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/base" + scrub_stale_metadata = "scripts" } } diff --git a/official-templates/base/requirements.txt b/official-templates/base/requirements.txt new file mode 100644 index 0000000..2feebf1 --- /dev/null +++ b/official-templates/base/requirements.txt @@ 
-0,0 +1,5 @@ +hf_transfer +jupyterlab==4.5.7 +ipywidgets +jupyter-archive +notebook==7.5.6 diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index 10bb396..4c8e8ff 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -10,10 +10,11 @@ target "nvidia-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/nvidia-pytorch" + scrub_stale_metadata = "scripts" } args = { - RP_SKIP_PYTHON = "1" - RP_SKIP_JUPYTER = "1" + RP_SKIP_PYTHON = "1" } } diff --git a/official-templates/nvidia-pytorch/requirements.txt b/official-templates/nvidia-pytorch/requirements.txt new file mode 100644 index 0000000..bf40672 --- /dev/null +++ b/official-templates/nvidia-pytorch/requirements.txt @@ -0,0 +1,15 @@ +aiohttp==3.13.3 +black==26.3.1 +jaraco.context==6.1.0 +jupyter_server==2.18.0 +jupyterlab==4.5.7 +mistune==3.2.1 +nbconvert==7.17.0 +notebook==7.5.6 +onnx==1.21.0 +pillow==12.2.0 +protobuf==6.33.5 +tornado==6.5.5 +urllib3==2.7.0 +wheel==0.46.2 +pyarrow==23.0.1 \ No newline at end of file diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index 13d7914..5586279 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -15,6 +15,8 @@ target "rocm-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/rocm" + scrub_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/rocm/requirements.txt b/official-templates/rocm/requirements.txt new file mode 100644 index 0000000..55e14f3 --- /dev/null +++ b/official-templates/rocm/requirements.txt @@ -0,0 +1,26 @@ +Brotli==1.2.0 +PyJWT==2.12.0 +aiohttp==3.14.0 +cryptography==46.0.7 +jaraco.context==6.1.0 +jupyterlab==4.5.7 +lxml==6.1.0 
+notebook==7.5.6 +onnx==1.21.0 +pillow==12.2.0 +protobuf==6.33.5 +tornado==6.5.5 +urllib3==2.7.0 +wheel==0.46.2 +filelock>=3.20.3 +jinja2>=3.1.6 +pygments>=2.20.0 +pynacl>=1.6.2 +requests>=2.33.0 +werkzeug>=3.1.6 +idna>=3.15 +fonttools>=4.60.2 +h2>=4.3.0 +marshmallow>=3.26.2 +pytest>=9.0.3 +pip>=26.1 \ No newline at end of file diff --git a/official-templates/shared/versions.hcl b/official-templates/shared/versions.hcl index ba5e158..60af80f 100644 --- a/official-templates/shared/versions.hcl +++ b/official-templates/shared/versions.hcl @@ -1,4 +1,4 @@ -RELEASE_VERSION = "1.0.4" +RELEASE_VERSION = "1.0.5" variable "RELEASE_SUFFIX" { default = "" # Set by CI, not used by humans. diff --git a/scripts/scrub-stale-metadata.py b/scripts/scrub-stale-metadata.py new file mode 100644 index 0000000..03747c0 --- /dev/null +++ b/scripts/scrub-stale-metadata.py @@ -0,0 +1,129 @@
+"""Remove stale .dist-info/.egg-info metadata for pinned packages.
+
+NGC base images bundle several Python packages as in-tree source builds
+that carry their own egg-info next to the source. `pip install --upgrade`
+upgrades the wheel install but cannot reach those bundled trees, so
+Trivy keeps reporting the (now-unused) older version. This script removes
+the orphaned metadata for packages listed in the supplied requirements
+file: a metadata directory under SEARCH_ROOTS is deleted when its Name:
+matches an exact (`==`) pin and its Version: differs from that pin.
+
+Usage: scrub-stale-metadata.py REQUIREMENTS_FILE
+"""
+from __future__ import annotations
+
+import pathlib
+import re
+import shutil
+import sys
+from collections.abc import Iterator
+
+NAME_RE = re.compile(r"^Name:\s*([^\n]*)$", re.MULTILINE)
+VERSION_RE = re.compile(r"^Version:\s*([^\n]*)$", re.MULTILINE)
+# An exact pin: project name, optional [extras], `==` or `===`, version.
+# The version ends at whitespace, `;` (environment marker) or `,`.
+PIN_RE = re.compile(
+    r"^([A-Za-z0-9][A-Za-z0-9._-]*)\s*(?:\[[^\]]*\])?\s*===?\s*([^\s;,]+)"
+)
+SEARCH_ROOTS = (pathlib.Path("/usr"), pathlib.Path("/opt"))
+
+
+def canonical(name: str) -> str:
+    """Return the normalized project name (lower-case, runs of -_. become -)."""
+    return re.sub(r"[-_.]+", "-", name).strip().lower()
+
+
+def normalize_version(version: str) -> str:
+    """Return *version* reduced to a form that is safe to compare with ==.
+
+    Whitespace, letter case, a leading "v" and trailing ".0" components are
+    ignored, so "1.0", "1.0.0" and "v1.0" all compare equal. Without this a
+    pin such as `pkg==1.0` would not match an installed "Version: 1.0.0" and
+    the metadata of the live install would be deleted.
+    """
+    text = version.strip().lower()
+    if text.startswith("v"):
+        text = text[1:]
+    return re.sub(r"(\.0)+$", "", text)
+
+
+def parse_pinned(requirements_path: str) -> dict[str, str]:
+    """Read a requirements file, return {canonical_name: version} for `==` pins.
+
+    Comments, extras (`pkg[extra]`) and environment markers (`; ...`) are
+    stripped. Option lines, range specifiers and wildcard pins (`==1.2.*`)
+    are skipped because they do not name one exact version.
+    """
+    pinned: dict[str, str] = {}
+    text = pathlib.Path(requirements_path).read_text(encoding="utf-8")
+    for raw in text.splitlines():
+        line = raw.split("#", 1)[0].strip()
+        match = PIN_RE.match(line)
+        if match is None:
+            continue
+        name, version = match.groups()
+        if "*" in version:
+            # A prefix match can never equal a concrete Version: string;
+            # treating it as a pin would scrub the package just installed.
+            continue
+        pinned[canonical(name)] = version
+    return pinned
+
+
+def read_meta(meta_dir: pathlib.Path) -> tuple[str, str] | None:
+    """Return (canonical_name, version) for a metadata dir, or None if unreadable."""
+    metadata = meta_dir / "METADATA"
+    if not metadata.exists():
+        metadata = meta_dir / "PKG-INFO"
+    if not metadata.exists():
+        return None
+    try:
+        text = metadata.read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        return None
+    name_match = NAME_RE.search(text)
+    version_match = VERSION_RE.search(text)
+    if not name_match or not version_match:
+        return None
+    return canonical(name_match.group(1)), version_match.group(1).strip()
+
+
+def iter_meta_dirs() -> Iterator[pathlib.Path]:
+    """Yield every *.dist-info / *.egg-info directory under SEARCH_ROOTS."""
+    for root in SEARCH_ROOTS:
+        if not root.is_dir():
+            continue
+        # Both globs are fully materialized before the first yield, so the
+        # caller may delete yielded directories without disturbing the walk.
+        for meta_dir in (*root.rglob("*.dist-info"), *root.rglob("*.egg-info")):
+            if meta_dir.is_dir():
+                yield meta_dir
+
+
+def main(requirements_path: str) -> None:
+    """Delete metadata dirs whose version disagrees with the matching pin."""
+    pinned = parse_pinned(requirements_path)
+    for meta_dir in iter_meta_dirs():
+        meta = read_meta(meta_dir)
+        if meta is None:
+            continue
+        pkg, ver = meta
+        expected = pinned.get(pkg)
+        if expected is None:
+            continue
+        if normalize_version(ver) == normalize_version(expected):
+            continue
+        print(f"scrub-stale-metadata: removing {meta_dir} (Version: {ver}, pinned {expected})")
+        # Best effort: keep going on errors, but do not hide a failed removal.
+        shutil.rmtree(meta_dir, ignore_errors=True)
+        if meta_dir.exists():
+            print(
+                f"scrub-stale-metadata: warning: could not remove {meta_dir}",
+                file=sys.stderr,
+            )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        sys.exit(f"usage: {sys.argv[0]} REQUIREMENTS_FILE")
+    main(sys.argv[1])