From 20681836aa6c49a1da807f7bc0c516fe1bec3571 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 12:28:08 +0300 Subject: [PATCH 01/36] fix: jupyterlab, notebook, ssh vulnerabilities --- official-templates/base/Dockerfile | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index fcaac27..9b34b40 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -36,7 +36,7 @@ WORKDIR / RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen -# Install +# Install RUN apt-get update --yes && \ apt-get upgrade --yes && \ apt-get install --yes --no-install-recommends \ @@ -46,8 +46,9 @@ RUN apt-get update --yes && \ libgl1 libhdf5-dev libjpeg-dev liblapack-dev libnuma-dev libpng-dev libpostproc-dev \ libsm6 libssl-dev libswscale-dev libtiff-dev libv4l-dev libx264-dev libxrender-dev \ libxvidcore-dev lsof make mtr nano nfs-common nginx openssh-server rsync slurm-wlm \ - software-properties-common sudo tmux unzip vim wget zip zstd - + software-properties-common sudo tmux unzip vim wget zip zstd && \ + rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub + # Install Python versions RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ add-apt-repository ppa:deadsnakes/ppa -y && \ @@ -89,10 +90,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ python -m pip install --upgrade --no-cache-dir \ hf_transfer \ - jupyterlab \ + jupyterlab==4.5.7 \ ipywidgets \ jupyter-archive \ - notebook==7.4.2; + notebook==7.5.6; # Install filebrowser RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash @@ -102,9 +103,6 @@ COPY --from=proxy nginx.conf /etc/nginx/nginx.conf COPY --from=proxy snippets /etc/nginx/snippets COPY --from=proxy readme.html /usr/share/nginx/html/readme.html -# Remove existing SSH host keys -RUN rm -f /etc/ssh/ssh_host_* - # Copy the README.md COPY README.md /usr/share/nginx/html/README.md From 3f2d76962ae6cf69c48a1c9343dd3f5db5b9b957 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 15:52:17 +0300 Subject: [PATCH 02/36] fix: hadolint findings --- .github/workflows/hadolint-pr.yml | 3 ++- official-templates/autoresearch/Dockerfile | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index e1da38b..cd0b3db 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -31,4 +31,5 @@ jobs: dockerfile: ${{ matrix.dockerfile }} failure-threshold: warning format: tty - output-file: /dev/stdout \ No newline at end of file + output-file: /dev/stdout + ignore: DL3022,DL3006 \ No newline at end of file diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index 329630e..50c2945 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -11,6 +11,10 @@ ARG AUTORESEARCH_REF=master RUN git clone --branch ${AUTORESEARCH_REF} --depth 1 \ https://github.com/runpod/autoresearch.git /opt/autoresearch +RUN python -m pip install --upgrade --no-cache-dir \ + pillow==12.2.0\ + urllib3==2.7.0 \ + WORKDIR /opt/autoresearch # Install Python dependencies From a70220f71289407a28d5fc4dccd0132c3a1ccfc1 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 16:28:52 +0300 Subject: [PATCH 03/36] fix: autoresearch hadolint --- official-templates/autoresearch/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index 50c2945..a63778c 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -9,9 +9,8 @@ RUN wget -qO- https://github.com/runpod/runpodctl/releases/download/${RUNPODCTL_ # Clone autoresearch to /opt (safe from volume mounts) ARG AUTORESEARCH_REF=master RUN git clone --branch ${AUTORESEARCH_REF} --depth 1 \ - https://github.com/runpod/autoresearch.git /opt/autoresearch - -RUN python -m pip install --upgrade --no-cache-dir \ + https://github.com/runpod/autoresearch.git /opt/autoresearch && \ + python -m pip install --upgrade --no-cache-dir \ pillow==12.2.0\ urllib3==2.7.0 \ From 69641b2c0b76c6074d77687794a4dedea5b13b76 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 19:15:20 +0300 Subject: [PATCH 04/36] fix: trivy vulnerabilities; bake push: true --- .github/workflows/base.yml | 36 +++++++++---------- official-templates/autoresearch/Dockerfile | 7 ++-- official-templates/base/Dockerfile | 9 ++--- official-templates/base/docker-bake.hcl | 1 + official-templates/base/requirements.txt | 5 +++ .../nvidia-pytorch/docker-bake.hcl | 9 ++--- .../nvidia-pytorch/requirements.txt | 14 ++++++++ 7 files changed, 49 insertions(+), 32 deletions(-) create mode 100644 official-templates/base/requirements.txt create mode 100644 official-templates/nvidia-pytorch/requirements.txt diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 1619f3b..8c20fd2 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -46,8 +46,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/base/docker-bake.hcl - push: false - load: true + push: true #false + # load: true - name: Extract image refs id: refs @@ -60,10 +60,10 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} - - name: Push base images - uses: ./.github/actions/docker-push - with: - image-refs: ${{ steps.refs.outputs.refs }} + # - name: Push base images + # uses: ./.github/actions/docker-push + # with: + # image-refs: ${{ steps.refs.outputs.refs }} build-autoresearch: @@ -106,8 +106,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/autoresearch/docker-bake.hcl - push: false - load: true + push: true #false + # load: true - name: Extract image refs id: refs @@ -120,10 +120,10 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} - - name: Push autoresearch images - uses: ./.github/actions/docker-push - with: - image-refs: ${{ steps.refs.outputs.refs }} + # - name: Push autoresearch images + # uses: ./.github/actions/docker-push + # with: + # image-refs: ${{ steps.refs.outputs.refs }} build-pytorch: needs: build-base @@ -167,8 +167,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/pytorch/docker-bake.hcl - push: false - load: true + push: true #false + # load: true - name: Extract image refs id: refs @@ -181,7 +181,7 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} - - name: Push pytorch images - uses: ./.github/actions/docker-push - with: - image-refs: ${{ steps.refs.outputs.refs }} + # - name: Push pytorch images + # uses: ./.github/actions/docker-push + # with: + # image-refs: ${{ steps.refs.outputs.refs }} diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index a63778c..40e809a 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -9,16 +9,15 @@ RUN wget -qO- https://github.com/runpod/runpodctl/releases/download/${RUNPODCTL_ # Clone autoresearch to /opt (safe from volume mounts) ARG AUTORESEARCH_REF=master RUN git clone --branch ${AUTORESEARCH_REF} --depth 1 \ - https://github.com/runpod/autoresearch.git /opt/autoresearch && \ - python -m pip install --upgrade --no-cache-dir \ - pillow==12.2.0\ - urllib3==2.7.0 \ + https://github.com/runpod/autoresearch.git /opt/autoresearch WORKDIR /opt/autoresearch # Install Python dependencies RUN uv sync +RUN uv pip install --upgrade "pillow==12.2.0" "urllib3==2.7.0" + # Download data and train tokenizer (~2 min) RUN uv run prepare.py diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 9b34b40..5f07208 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -86,14 +86,11 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ +COPY --from=requirements requirements.txt /requirements.txt + # Install Jupyter and hf_transfer packages RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ - python -m pip install --upgrade --no-cache-dir \ - hf_transfer \ - jupyterlab==4.5.7 \ - ipywidgets \ - jupyter-archive \ - notebook==7.5.6; + python -m pip install --upgrade --no-cache-dir -r requirements.txt # Install filebrowser RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash diff --git a/official-templates/base/docker-bake.hcl b/official-templates/base/docker-bake.hcl index f8904b8..6c79e42 100644 --- a/official-templates/base/docker-bake.hcl +++ b/official-templates/base/docker-bake.hcl @@ -14,6 +14,7 @@ target "common-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/base" } } diff --git a/official-templates/base/requirements.txt b/official-templates/base/requirements.txt new file mode 100644 index 0000000..b0b5ea8 --- /dev/null +++ b/official-templates/base/requirements.txt @@ -0,0 +1,5 @@ +hf_transfer +jupyterlab==4.5.7 +ipywidgets +jupyter-archive +notebook==7.5.6 \ No newline at end of file diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index 10bb396..dc8bcd3 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -10,11 +10,12 @@ target "nvidia-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/nvidia-pytorch" } - args = { - RP_SKIP_PYTHON = "1" - RP_SKIP_JUPYTER = "1" - } + // args = { + // RP_SKIP_PYTHON = "1" + // RP_SKIP_JUPYTER = "1" + // } } target "pytorch-2511" { diff --git a/official-templates/nvidia-pytorch/requirements.txt b/official-templates/nvidia-pytorch/requirements.txt new file mode 100644 index 0000000..0b07ad1 --- /dev/null +++ b/official-templates/nvidia-pytorch/requirements.txt @@ -0,0 +1,14 @@ +aiohttp==3.13.3 +black==26.3.1 +jaraco.context==6.1.0 +jupyter_server==2.18.0 +jupyterlab==4.5.7 +mistune==3.2.1 +nbconvert==7.17.0 +notebook==7.5.6 +onnx==1.21.0 +pillow==12.2.0 +protobuf==6.33.5 +tornado==6.5.5 +urllib3==2.7.0 +wheel==0.46.2 \ No newline at end of file From e562b5caabcceba639894119dcb1dce8b3ed1ed4 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 19:51:12 +0300 Subject: [PATCH 05/36] fix: pytorch max-parallelism: 4 --- .github/workflows/base.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 8c20fd2..9c3f89f 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -153,7 +153,7 @@ jobs: with: dockerhub-username: ${{ secrets.DOCKERHUB_USERNAME }} dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }} - max-parallelism: 8 + max-parallelism: 4 - name: Build pytorch images id: build From de920b4aa55c726678f3108aafc702d1c584d71e Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 18 May 2026 19:52:38 +0300 Subject: [PATCH 06/36] fix: pytorch max-parallelism: 3 --- .github/workflows/base.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 9c3f89f..d34613a 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -153,7 +153,7 @@ jobs: with: dockerhub-username: ${{ secrets.DOCKERHUB_USERNAME }} dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }} - max-parallelism: 4 + max-parallelism: 3 - name: Build pytorch images id: build From d3a7e1e97ef312de629444434978ca15b558f610 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 11:02:07 +0300 Subject: [PATCH 07/36] fix: autoresearch linter --- official-templates/autoresearch/Dockerfile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index 40e809a..a8d7317 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -1,6 +1,8 @@ ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + # Install runpodctl for pod management (scaling up GPUs) ARG RUNPODCTL_VERSION=v2.1.6 RUN wget -qO- https://github.com/runpod/runpodctl/releases/download/${RUNPODCTL_VERSION}/runpodctl-linux-amd64.tar.gz | \ @@ -14,12 +16,9 @@ RUN git clone --branch ${AUTORESEARCH_REF} --depth 1 \ WORKDIR /opt/autoresearch # Install Python dependencies -RUN uv sync - -RUN uv pip install --upgrade "pillow==12.2.0" "urllib3==2.7.0" - -# Download data and train tokenizer (~2 min) -RUN uv run prepare.py +RUN uv sync && \ + uv pip install --upgrade "pillow==12.2.0" "urllib3==2.7.0" && \ + uv run prepare.py # Download data and train tokenizer (~2 min) # On first boot: copy source files to /workspace (lightweight, persists edits) # and symlink .venv back to /opt (12GB, stays on fast container layer) From 72ae1a4ef77fc3b3d0cc6c4855e42d72b9de5b99 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 14:01:29 +0300 Subject: [PATCH 08/36] fix: hadolint findings and RUNPODCTL_VERSION=v2.3.0 --- .github/workflows/hadolint-pr.yml | 17 ++++++++++------- helper-templates/verify-nccl/Dockerfile | 10 ++++++---- official-templates/autoresearch/Dockerfile | 3 +-- .../nvidia-pytorch/docker-bake.hcl | 7 +++---- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index cd0b3db..ede49a8 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -13,12 +13,15 @@ jobs: strategy: fail-fast: false matrix: - # Add rows when new top-level Dockerfiles appear (e.g. nvidia-pytorch). - dockerfile: - - official-templates/base/Dockerfile - - official-templates/pytorch/Dockerfile - - official-templates/autoresearch/Dockerfile - - helper-templates/verify-nccl/Dockerfile + include: + - dockerfile: official-templates/base/Dockerfile + ignore: DL3022,DL3006 + - dockerfile: official-templates/pytorch/Dockerfile + ignore: DL3013 + - dockerfile: official-templates/autoresearch/Dockerfile + ignore: "" + - dockerfile: helper-templates/verify-nccl/Dockerfile + ignore: "DL3008" steps: - name: Checkout uses: actions/checkout@v4 @@ -32,4 +35,4 @@ jobs: failure-threshold: warning format: tty output-file: /dev/stdout - ignore: DL3022,DL3006 \ No newline at end of file + ignore: ${{ matrix.ignore }} \ No newline at end of file diff --git a/helper-templates/verify-nccl/Dockerfile b/helper-templates/verify-nccl/Dockerfile index ca159f5..a83e79a 100644 --- a/helper-templates/verify-nccl/Dockerfile +++ b/helper-templates/verify-nccl/Dockerfile @@ -10,11 +10,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pciutils \ && rm -rf /var/lib/apt/lists/* -RUN git clone https://github.com/NVIDIA/cuda-samples.git && \ - cd cuda-samples/Samples/0_Introduction/simpleP2P && \ - make +RUN git clone https://github.com/NVIDIA/cuda-samples.git + +WORKDIR /verify-nccl/cuda-samples/Samples/0_Introduction/simpleP2P + +RUN make COPY --chmod=755 check_nccl.sh . # Start Container -CMD tail -f /dev/null +CMD ["tail", "-f", "/dev/null"] diff --git a/official-templates/autoresearch/Dockerfile b/official-templates/autoresearch/Dockerfile index a8d7317..28664ee 100644 --- a/official-templates/autoresearch/Dockerfile +++ b/official-templates/autoresearch/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} SHELL ["/bin/bash", "-o", "pipefail", "-c"] # Install runpodctl for pod management (scaling up GPUs) -ARG RUNPODCTL_VERSION=v2.1.6 +ARG RUNPODCTL_VERSION=v2.3.0 RUN wget -qO- https://github.com/runpod/runpodctl/releases/download/${RUNPODCTL_VERSION}/runpodctl-linux-amd64.tar.gz | \ tar -xz -C /usr/local/bin runpodctl @@ -17,7 +17,6 @@ WORKDIR /opt/autoresearch # Install Python dependencies RUN uv sync && \ - uv pip install --upgrade "pillow==12.2.0" "urllib3==2.7.0" && \ uv run prepare.py # Download data and train tokenizer (~2 min) # On first boot: copy source files to /workspace (lightweight, persists edits) diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index dc8bcd3..bb385ff 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -12,10 +12,9 @@ target "nvidia-base" { logo = "container-template" requirements = "official-templates/nvidia-pytorch" } - // args = { - // RP_SKIP_PYTHON = "1" - // RP_SKIP_JUPYTER = "1" - // } + args = { + RP_SKIP_PYTHON = "1" + } } target "pytorch-2511" { From d6049ac3f4d49e0c39aa5626c73f79e5626269f5 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 14:46:28 +0300 Subject: [PATCH 09/36] fix: hadolint findings --- .github/workflows/hadolint-pr.yml | 4 ++-- official-templates/base/Dockerfile | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index ede49a8..511122f 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -17,9 +17,9 @@ jobs: - dockerfile: official-templates/base/Dockerfile ignore: DL3022,DL3006 - dockerfile: official-templates/pytorch/Dockerfile - ignore: DL3013 + ignore: "DL3013,DL3006" - dockerfile: official-templates/autoresearch/Dockerfile - ignore: "" + ignore: "DL3006" - dockerfile: helper-templates/verify-nccl/Dockerfile ignore: "DL3008" steps: diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 5f07208..088c678 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -72,11 +72,11 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ # Install virtualenv RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ - python3.9 -m pip install --upgrade pip virtualenv && \ - python3.10 -m pip install --upgrade pip virtualenv && \ - python3.11 -m pip install --upgrade pip virtualenv && \ - python3.12 -m pip install --upgrade pip virtualenv && \ - python3.13 -m pip install --upgrade pip virtualenv; + python3.9 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ + python3.10 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ + python3.11 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ + python3.12 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ + python3.13 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3; # Symlink default python/pip RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ @@ -108,7 +108,9 @@ COPY --from=scripts --chmod=755 start.sh / # Welcome Message COPY --from=logo runpod.txt /etc/runpod.txt -RUN echo 'cat /etc/runpod.txt' >> /root/.bashrc -RUN echo 'echo -e "\nFor detailed documentation and guides, please visit:\n\033[1;34mhttps://docs.runpod.io/\033[0m and \033[1;34mhttps://blog.runpod.io/\033[0m\n\n"' >> /root/.bashrc +RUN cat >> /root/.bashrc <<'EOF' +cat /etc/runpod.txt +printf "\nFor detailed documentation and guides, please visit:\n\033[1;34mhttps://docs.runpod.io/\033[0m and \033[1;34mhttps://blog.runpod.io/\033[0m\n\n" +EOF CMD ["/start.sh"] From 74e86de67c484deacb8c9ad44e263cc369a8fc92 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 14:53:45 +0300 Subject: [PATCH 10/36] fix: base build --- official-templates/base/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 088c678..16d06cd 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -72,11 +72,11 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ # Install virtualenv RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ - python3.9 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ - python3.10 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ - python3.11 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ - python3.12 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3 && \ - python3.13 -m pip install --upgrade --no-cache-dir pip==26.1.1 virtualenv==21.3.3; + python3.9 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ + python3.10 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ + python3.11 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ + python3.12 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ + python3.13 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3; # Symlink default python/pip RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ From 629e6c5119fa71969efa741d88d37e678dad4c11 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 15:41:54 +0300 Subject: [PATCH 11/36] fix: hadolint findings --- .github/workflows/hadolint-pr.yml | 2 +- official-templates/base/Dockerfile | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index 511122f..779b777 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -15,7 +15,7 @@ jobs: matrix: include: - dockerfile: official-templates/base/Dockerfile - ignore: DL3022,DL3006 + ignore: DL3006,DL3008,DL3013,DL3022 - dockerfile: official-templates/pytorch/Dockerfile ignore: "DL3013,DL3006" - dockerfile: official-templates/autoresearch/Dockerfile diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 16d06cd..cf739ac 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -47,7 +47,8 @@ RUN apt-get update --yes && \ libsm6 libssl-dev libswscale-dev libtiff-dev libv4l-dev libx264-dev libxrender-dev \ libxvidcore-dev lsof make mtr nano nfs-common nginx openssh-server rsync slurm-wlm \ software-properties-common sudo tmux unzip vim wget zip zstd && \ - rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub + rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub && \ + rm -rf /var/lib/apt/lists/* # Install Python versions RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ @@ -72,11 +73,11 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ # Install virtualenv RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ - python3.9 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ - python3.10 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ - python3.11 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ - python3.12 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3 && \ - python3.13 -m pip install --upgrade --no-cache-dir pip virtualenv==21.3.3; + python3.9 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.10 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.11 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.12 -m pip install --upgrade --no-cache-dir pip virtualenv && \ + python3.13 -m pip install --upgrade --no-cache-dir pip virtualenv; # Symlink default python/pip RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ From 91dcbad6799e23b931bd7f8c8b72603197d44787 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 17:26:09 +0300 Subject: [PATCH 12/36] fix: rocm vulnerabilities --- .github/workflows/base.yml | 2 + container-template/scrub-stale-metadata.py | 61 ++++++++++++++++++++++ official-templates/rocm/docker-bake.hcl | 1 + official-templates/rocm/requirements.txt | 14 +++++ 4 files changed, 78 insertions(+) create mode 100644 container-template/scrub-stale-metadata.py create mode 100644 official-templates/rocm/requirements.txt diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index d34613a..e42be41 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -6,10 +6,12 @@ on: - ".github/workflows/base.yml" - "official-templates/base/docker-bake.hcl" - "official-templates/base/Dockerfile" + - "official-templates/base/requirements.txt" - "official-templates/pytorch/docker-bake.hcl" - "official-templates/pytorch/Dockerfile" - "official-templates/autoresearch/docker-bake.hcl" - "official-templates/autoresearch/Dockerfile" + - "official-templates/autoresearch/requirements.txt" - "official-templates/shared/**" push: branches: [main] diff --git a/container-template/scrub-stale-metadata.py b/container-template/scrub-stale-metadata.py new file mode 100644 index 0000000..182f316 --- /dev/null +++ b/container-template/scrub-stale-metadata.py @@ -0,0 +1,61 @@ +"""Remove .dist-info/.egg-info directories whose Version: line disagrees +with the version pip just installed for one of our pinned packages. + +NGC base images bundle several Python packages as in-tree source builds +that carry their own egg-info next to the source. `pip install --upgrade` +upgrades the wheel install but cannot reach those bundled trees, so +Trivy keeps reporting the (now-unused) older version. This script removes +the orphaned metadata for packages listed in the supplied requirements +file.""" +import pathlib +import re +import shutil +import sys + + +def canonical(name: str) -> str: + return re.sub(r"[-_.]+", "-", name).strip().lower() + + +def main(requirements_path: str) -> None: + pinned: dict[str, str] = {} + for line in pathlib.Path(requirements_path).read_text().splitlines(): + line = line.split("#", 1)[0].strip() + if "==" not in line: + continue + name, version = line.split("==", 1) + pinned[canonical(name)] = version.strip() + + name_re = re.compile(r"^Name:\s*(.+)$", re.MULTILINE) + version_re = re.compile(r"^Version:\s*(.+)$", re.MULTILINE) + + for root in (pathlib.Path("/usr"), pathlib.Path("/opt")): + if not root.is_dir(): + continue + for meta_dir in [*root.rglob("*.dist-info"), *root.rglob("*.egg-info")]: + if not meta_dir.is_dir(): + continue + metadata = meta_dir / "METADATA" + if not metadata.exists(): + metadata = meta_dir / "PKG-INFO" + if not metadata.exists(): + continue + try: + text = metadata.read_text(errors="ignore") + except OSError: + continue + name_match = name_re.search(text) + version_match = version_re.search(text) + if not name_match or not version_match: + continue + pkg = canonical(name_match.group(1)) + ver = version_match.group(1).strip() + expected = pinned.get(pkg) + if expected is None or ver == expected: + continue + print(f"scrub-stale-metadata: removing {meta_dir} (Version: {ver}, pinned {expected})") + shutil.rmtree(meta_dir, ignore_errors=True) + + +if __name__ == "__main__": + main(sys.argv[1]) \ No newline at end of file diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index 13d7914..c897fc4 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -15,6 +15,7 @@ target "rocm-base" { scripts = "container-template" proxy = "container-template/proxy" logo = "container-template" + requirements = "official-templates/rocm" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/rocm/requirements.txt b/official-templates/rocm/requirements.txt new file mode 100644 index 0000000..55f3901 --- /dev/null +++ b/official-templates/rocm/requirements.txt @@ -0,0 +1,14 @@ +Brotli==1.2.0 +PyJWT==2.12.0 +aiohttp==3.13.3 +cryptography==46.0.5 +jaraco.context==6.1.0 +jupyterlab==4.5.7 +lxml==6.1.0 +notebook==7.5.6 +onnx==1.21.0 +pillow==12.2.0 +protobuf==6.33.5 +tornado==6.5.5 +urllib3==2.7.0 +wheel==0.46.2 \ No newline at end of file From d368a69a0d595780d2bfbf22b393cc618c142026 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 19 May 2026 20:12:47 +0300 Subject: [PATCH 13/36] fix: script to scrub stale metadata --- official-templates/base/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index cf739ac..23bd61b 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -88,10 +88,13 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=requirements requirements.txt /requirements.txt +COPY --from=scripts scrub-stale-metadata.py /tmp/scrub-stale-metadata.py # Install Jupyter and hf_transfer packages RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ - python -m pip install --upgrade --no-cache-dir -r requirements.txt + python -m pip install --upgrade --no-cache-dir -r requirements.txt && \ + python /tmp/scrub-stale-metadata.py /requirements.txt && \ + rm /tmp/scrub-stale-metadata.py # Install filebrowser RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash From eafb84391b69417603552dcaf0d4876a3891f704 Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 13 May 2026 13:32:25 +0300 Subject: [PATCH 14/36] align with base branch --- .github/actions/trivy/action.yml | 7 +++---- .github/workflows/base.yml | 31 +++++-------------------------- 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/.github/actions/trivy/action.yml b/.github/actions/trivy/action.yml index a8889d9..2199607 100644 --- a/.github/actions/trivy/action.yml +++ b/.github/actions/trivy/action.yml @@ -1,9 +1,8 @@ name: Trivy -description: "Scan Docker images with Trivy; fail on CRITICAL/HIGH fixed vulnerabilities" - +description: "Scan Docker images with Trivy" inputs: - image-refs: - description: "JSON array of image references to scan" + bake-metadata: + description: "The bake metadata to scan" required: true runs: diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index e42be41..73e4029 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -6,12 +6,10 @@ on: - ".github/workflows/base.yml" - "official-templates/base/docker-bake.hcl" - "official-templates/base/Dockerfile" - - "official-templates/base/requirements.txt" - "official-templates/pytorch/docker-bake.hcl" - "official-templates/pytorch/Dockerfile" - "official-templates/autoresearch/docker-bake.hcl" - "official-templates/autoresearch/Dockerfile" - - "official-templates/autoresearch/requirements.txt" - "official-templates/shared/**" push: branches: [main] @@ -48,8 +46,7 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/base/docker-bake.hcl - push: true #false - # load: true + push: true - name: Extract image refs id: refs @@ -62,12 +59,6 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} - # - name: Push base images - # uses: ./.github/actions/docker-push - # with: - # image-refs: ${{ steps.refs.outputs.refs }} - - build-autoresearch: needs: build-base if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') @@ -108,8 +99,7 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/autoresearch/docker-bake.hcl - push: true #false - # load: true + push: true - name: Extract image refs id: refs @@ -122,11 +112,6 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} - # - name: Push autoresearch images - # uses: ./.github/actions/docker-push - # with: - # image-refs: ${{ steps.refs.outputs.refs }} - build-pytorch: needs: build-base # always() forces job run even if the dependant is skipped (but not if it failed) @@ -155,7 +140,7 @@ jobs: with: dockerhub-username: ${{ secrets.DOCKERHUB_USERNAME }} dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }} - max-parallelism: 3 + # max-parallelism: 4 - name: Build pytorch images id: build @@ -169,8 +154,7 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/pytorch/docker-bake.hcl - push: true #false - # load: true + push: true - name: Extract image refs id: refs @@ -181,9 +165,4 @@ jobs: - name: Trivy scan uses: ./.github/actions/trivy with: - image-refs: ${{ steps.refs.outputs.refs }} - - # - name: Push pytorch images - # uses: ./.github/actions/docker-push - # with: - # image-refs: ${{ steps.refs.outputs.refs }} + image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file From cc797ec93eb093ee1c82981d6c9d607fc7033826 Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 20 May 2026 12:13:45 +0300 Subject: [PATCH 15/36] feat: upgrade github actions versions and increase pytorch timeout --- .github/actions/docker-setup/action.yml | 4 ++-- .github/workflows/base.yml | 10 +++++----- .github/workflows/hadolint-pr.yml | 2 +- .github/workflows/hadolint-push.yml | 2 +- .github/workflows/nvidia.yml | 2 +- .github/workflows/rocm.yml | 2 +- .github/workflows/shellcheck.yml | 2 +- official-templates/pytorch/Dockerfile | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/actions/docker-setup/action.yml b/.github/actions/docker-setup/action.yml index 00e7e45..2c78d63 100644 --- a/.github/actions/docker-setup/action.yml +++ b/.github/actions/docker-setup/action.yml @@ -31,7 +31,7 @@ runs: rm -rf "$AGENT_TOOLSDIRECTORY" - name: Login to Docker Hub - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: username: ${{ inputs.dockerhub-username }} password: ${{ inputs.dockerhub-token }} @@ -42,7 +42,7 @@ runs: max-parallelism: ${{ inputs.max-parallelism }} - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Determine if this is a production build id: build-type diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 73e4029..7d77dab 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -24,7 +24,7 @@ jobs: runs-on: blacksmith-8vcpu-ubuntu-2204 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 @@ -65,12 +65,12 @@ jobs: runs-on: blacksmith-8vcpu-ubuntu-2204 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 - name: Check if build is needed - uses: tj-actions/changed-files@v46 + uses: docker/setup-qemu-action@v47.0.6 id: changes with: files_yaml: | @@ -119,12 +119,12 @@ jobs: runs-on: blacksmith-16vcpu-ubuntu-2404 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 - name: Check if build is needed - uses: tj-actions/changed-files@v46 + uses: docker/setup-qemu-action@v47.0.6 id: changes with: files_yaml: | diff --git a/.github/workflows/hadolint-pr.yml b/.github/workflows/hadolint-pr.yml index 779b777..8fb5244 100644 --- a/.github/workflows/hadolint-pr.yml +++ b/.github/workflows/hadolint-pr.yml @@ -24,7 +24,7 @@ jobs: ignore: "DL3008" steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 # TTY + default stdout so hadolint-action's problem matcher surfaces # annotations on the PR "Files changed" tab (SARIF/Code Scanning is much harder to spot). diff --git a/.github/workflows/hadolint-push.yml b/.github/workflows/hadolint-push.yml index ab0cde9..781eb66 100644 --- a/.github/workflows/hadolint-push.yml +++ b/.github/workflows/hadolint-push.yml @@ -23,7 +23,7 @@ jobs: - helper-templates/verify-nccl/Dockerfile steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Lint Dockerfile uses: ./.github/actions/hadolint diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml index 782ded7..fcd1ad1 100644 --- a/.github/workflows/nvidia.yml +++ b/.github/workflows/nvidia.yml @@ -18,7 +18,7 @@ jobs: runs-on: blacksmith-16vcpu-ubuntu-2404 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 14d3fa5..77d1e4c 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -18,7 +18,7 @@ jobs: runs-on: blacksmith-16vcpu-ubuntu-2404 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml index d73a282..d2e4a88 100644 --- a/.github/workflows/shellcheck.yml +++ b/.github/workflows/shellcheck.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Shellcheck uses: reviewdog/action-shellcheck@v1.32.0 diff --git a/official-templates/pytorch/Dockerfile b/official-templates/pytorch/Dockerfile index b3841dc..8195939 100644 --- a/official-templates/pytorch/Dockerfile +++ b/official-templates/pytorch/Dockerfile @@ -5,7 +5,7 @@ ARG WHEEL_SRC ARG TORCH RUN python -m pip install \ - --timeout 120 \ + --timeout 300 \ --retries 10 \ --resume-retries 5 \ --no-cache-dir \ From e04f71af79a580593580de020dab60bd1d056b38 Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 20 May 2026 12:45:23 +0300 Subject: [PATCH 16/36] fix: docker/setup-qemu-action --- .github/workflows/base.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 7d77dab..b1c36f0 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -70,7 +70,7 @@ jobs: fetch-depth: 0 - name: Check if build is needed - uses: docker/setup-qemu-action@v47.0.6 + uses: docker/setup-qemu-action@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 id: changes with: files_yaml: | @@ -124,7 +124,7 @@ jobs: fetch-depth: 0 - name: Check if build is needed - uses: docker/setup-qemu-action@v47.0.6 + uses: docker/setup-qemu-action@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 id: changes with: files_yaml: | From d8da79a4bd03df372df6f8a7859d8533a41da126 Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 20 May 2026 14:05:01 +0300 Subject: [PATCH 17/36] fix: nvidia-pythorch vulnerabilities --- official-templates/base/Dockerfile | 8 ++++++++ official-templates/nvidia-pytorch/docker-bake.hcl | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 23bd61b..892b3d0 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -3,6 +3,7 @@ FROM ${BASE_IMAGE} ARG RP_SKIP_PYTHON ARG RP_SKIP_JUPYTER +ARG RP_STRIP_VENDORED_TEST_FIXTURES SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -85,6 +86,13 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip && \ ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3; +RUN [[ -z "$RP_STRIP_VENDORED_TEST_FIXTURES" ]] && exit 0; \ + find /opt /usr -type f \( \ + -path '*/civetweb/resources/cert/*' \ + -o -path '*/civetweb/resources/ssl_cert.pem' \ + -o -path '*/civetweb/resources/server.pem' \ + \) -print -delete + COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=requirements requirements.txt /requirements.txt diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index bb385ff..0d245e9 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -13,7 +13,8 @@ target "nvidia-base" { requirements = "official-templates/nvidia-pytorch" } args = { - RP_SKIP_PYTHON = "1" + RP_SKIP_PYTHON = "1" + RP_STRIP_VENDORED_TEST_FIXTURES = "1" } } From 1feffe971bb7886dba66dd581f8182dd4e7faaba Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 20 May 2026 14:08:26 +0300 Subject: [PATCH 18/36] fix: base workflow --- .github/workflows/base.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index b1c36f0..0c98b91 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -70,7 +70,7 @@ jobs: fetch-depth: 0 - name: Check if build is needed - uses: docker/setup-qemu-action@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 + uses: tj-actions/changed-files@v46 id: changes with: files_yaml: | @@ -124,7 +124,7 @@ jobs: fetch-depth: 0 - name: Check if build is needed - uses: docker/setup-qemu-action@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 + uses: tj-actions/changed-files@v46 id: changes with: files_yaml: | From fe0cf157c44d59ffa6817429e0cdf67d011a1e3f Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 20 May 2026 16:57:01 +0300 Subject: [PATCH 19/36] fix: ignore nvidia-pytorch trvy findings with certs --- .github/actions/trivy/action.yml | 45 ++++++++++++++----- .github/workflows/nvidia.yml | 5 +++ official-templates/base/Dockerfile | 8 ---- .../nvidia-pytorch/docker-bake.hcl | 3 +- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/actions/trivy/action.yml b/.github/actions/trivy/action.yml index 2199607..a522e57 100644 --- a/.github/actions/trivy/action.yml +++ b/.github/actions/trivy/action.yml @@ -1,9 +1,17 @@ name: Trivy description: "Scan Docker images with Trivy" inputs: - bake-metadata: - description: "The bake metadata to scan" + image-refs: + description: "JSON array of image refs to scan" required: true + skip-files: + description: | + Optional newline-separated list of glob patterns to pass to Trivy as + --skip-files. Use this to silence known-benign findings (e.g. demo + cert/key fixtures shipped by vendored upstream libraries in NGC base + images) without affecting the rest of the scan. + required: false + default: "" runs: using: composite @@ -21,6 +29,7 @@ runs: shell: bash env: IMAGE_REFS: ${{ inputs.image-refs }} + SKIP_FILES: ${{ inputs.skip-files }} run: | set -uo pipefail mapfile -t refs < <(echo "${IMAGE_REFS}" | jq -r '.[]') @@ -29,21 +38,35 @@ runs: exit 1 fi + trivy_cmd=( + trivy image + --timeout 30m + --severity CRITICAL,HIGH + --exit-code 1 + --ignore-unfixed + --pkg-types os,library + --format table + --no-progress + ) + + skip_patterns=() + while IFS= read -r pattern; do + [ -z "$pattern" ] && continue + trivy_cmd+=(--skip-files "$pattern") + skip_patterns+=("$pattern") + done <<< "${SKIP_FILES}" + echo "Scanning ${#refs[@]} image(s):" printf ' - %s\n' "${refs[@]}" + if [ ${#skip_patterns[@]} -gt 0 ]; then + echo "Skipping file patterns:" + printf ' - %s\n' "${skip_patterns[@]}" + fi failed=() for ref in "${refs[@]}"; do echo "::group::Trivy — ${ref}" - if ! trivy image \ - --timeout 30m \ - --severity CRITICAL,HIGH \ - --exit-code 1 \ - --ignore-unfixed \ - --pkg-types os,library \ - --format table \ - --no-progress \ - "${ref}"; then + if ! "${trivy_cmd[@]}" "${ref}"; then failed+=("${ref}") fi echo "::endgroup::" diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml index fcd1ad1..eb0172a 100644 --- a/.github/workflows/nvidia.yml +++ b/.github/workflows/nvidia.yml @@ -53,6 +53,11 @@ jobs: uses: ./.github/actions/trivy with: image-refs: ${{ steps.refs.outputs.refs }} + skip-files: | + **/civetweb/resources/cert/* + **/civetweb/resources/ssl_cert.pem + **/civetweb/resources/server.pem + **/civetweb/resources/server_bkup.pem - name: Push nvidia images uses: ./.github/actions/docker-push diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 892b3d0..23bd61b 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -3,7 +3,6 @@ FROM ${BASE_IMAGE} ARG RP_SKIP_PYTHON ARG RP_SKIP_JUPYTER -ARG RP_STRIP_VENDORED_TEST_FIXTURES SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -86,13 +85,6 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip && \ ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3; -RUN [[ -z "$RP_STRIP_VENDORED_TEST_FIXTURES" ]] && exit 0; \ - find /opt /usr -type f \( \ - -path '*/civetweb/resources/cert/*' \ - -o -path '*/civetweb/resources/ssl_cert.pem' \ - -o -path '*/civetweb/resources/server.pem' \ - \) -print -delete - COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=requirements requirements.txt /requirements.txt diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index 0d245e9..bb385ff 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -13,8 +13,7 @@ target "nvidia-base" { requirements = "official-templates/nvidia-pytorch" } args = { - RP_SKIP_PYTHON = "1" - RP_STRIP_VENDORED_TEST_FIXTURES = "1" + RP_SKIP_PYTHON = "1" } } From dfdcc8f75240901c465df74f455b8cf79d091016 Mon Sep 17 00:00:00 2001 From: mchekm Date: Thu, 21 May 2026 12:07:49 +0300 Subject: [PATCH 20/36] fix: rocm vulnerabilities --- .github/workflows/rocm.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index e3737fe..700a76f 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -51,4 +51,9 @@ jobs: - name: Trivy scan uses: ./.github/actions/trivy with: - image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file + image-refs: ${{ steps.refs.outputs.refs }} + skip-files: | + **/civetweb/resources/cert/* + **/civetweb/resources/ssl_cert.pem + **/civetweb/resources/server.pem + **/civetweb/resources/server_bkup.pem \ No newline at end of file From 14fccdc5efa4a0040340bf2393c2319c110dd791 Mon Sep 17 00:00:00 2001 From: mchekm Date: Thu, 21 May 2026 15:22:56 +0300 Subject: [PATCH 21/36] check if filebrowser generates vulneralities --- official-templates/base/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 23bd61b..d94a610 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -97,7 +97,7 @@ RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ rm /tmp/scrub-stale-metadata.py # Install filebrowser -RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash +# RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash # NGINX Proxy COPY --from=proxy nginx.conf /etc/nginx/nginx.conf From 69360502aaa354b77bd03c1146e2fe405bf226f9 Mon Sep 17 00:00:00 2001 From: mchekm Date: Thu, 21 May 2026 16:17:31 +0300 Subject: [PATCH 22/36] fix: nic_sampler vulnerabilities --- official-templates/base/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index d94a610..bef5a00 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -50,6 +50,9 @@ RUN apt-get update --yes && \ rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub && \ rm -rf /var/lib/apt/lists/* +RUN find /opt/nvidia /usr/local/cuda* -type d -name 'efa_metrics' \ + -print -exec rm -rf {} + 2>/dev/null || true + # Install Python versions RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ add-apt-repository ppa:deadsnakes/ppa -y && \ @@ -97,7 +100,7 @@ RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ rm /tmp/scrub-stale-metadata.py # Install filebrowser -# RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash +RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash # NGINX Proxy COPY --from=proxy nginx.conf /etc/nginx/nginx.conf From 417b29683d0cd735be22dc6b2be92f8aa95bb1df Mon Sep 17 00:00:00 2001 From: mchekm Date: Fri, 22 May 2026 11:17:33 +0300 Subject: [PATCH 23/36] chore: added comment --- official-templates/base/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index bef5a00..1daf48e 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -49,7 +49,9 @@ RUN apt-get update --yes && \ software-properties-common sudo tmux unzip vim wget zip zstd && \ rm -f /etc/ssh/ssh_host_*_key /etc/ssh/ssh_host_*_key.pub && \ rm -rf /var/lib/apt/lists/* - + +# Strip AWS EFA Nsight plugin — AWS-only hardware, not used on RunPod. +# Its nic_sampler Go binary ships old Go stdlib that triggers HIGH Trivy CVEs. RUN find /opt/nvidia /usr/local/cuda* -type d -name 'efa_metrics' \ -print -exec rm -rf {} + 2>/dev/null || true From d7a76cccb9fdea10f3392376414e3d9f06375ca9 Mon Sep 17 00:00:00 2001 From: mchekm Date: Fri, 22 May 2026 15:45:52 +0300 Subject: [PATCH 24/36] fix: ignore some of hadolint findings on push --- .github/workflows/hadolint-push.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/hadolint-push.yml b/.github/workflows/hadolint-push.yml index 781eb66..9e44634 100644 --- a/.github/workflows/hadolint-push.yml +++ b/.github/workflows/hadolint-push.yml @@ -15,12 +15,15 @@ jobs: strategy: fail-fast: false matrix: - # Add rows when new top-level Dockerfiles appear - dockerfile: - - official-templates/base/Dockerfile - - official-templates/pytorch/Dockerfile - - official-templates/autoresearch/Dockerfile - - helper-templates/verify-nccl/Dockerfile + include: + - dockerfile: official-templates/base/Dockerfile + ignore: DL3006,DL3008,DL3013,DL3022 + - dockerfile: official-templates/pytorch/Dockerfile + ignore: "DL3013,DL3006" + - dockerfile: official-templates/autoresearch/Dockerfile + ignore: "DL3006" + - dockerfile: helper-templates/verify-nccl/Dockerfile + ignore: "DL3008" steps: - name: Checkout uses: actions/checkout@v6 From 94f453fad99c18637de87c4120b5f3372f8df158 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 25 May 2026 15:06:35 +0300 Subject: [PATCH 25/36] fix: relocate scrub-stale-metadata.py --- official-templates/base/Dockerfile | 2 +- official-templates/base/docker-bake.hcl | 1 + {container-template => scripts}/scrub-stale-metadata.py | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename {container-template => scripts}/scrub-stale-metadata.py (100%) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 1daf48e..fbce320 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -93,7 +93,7 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=requirements requirements.txt /requirements.txt -COPY --from=scripts scrub-stale-metadata.py /tmp/scrub-stale-metadata.py +COPY --from=scrab_stale_metadata scrub-stale-metadata.py /tmp/scrub-stale-metadata.py # Install Jupyter and hf_transfer packages RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ diff --git a/official-templates/base/docker-bake.hcl b/official-templates/base/docker-bake.hcl index 6c79e42..28cd51c 100644 --- a/official-templates/base/docker-bake.hcl +++ b/official-templates/base/docker-bake.hcl @@ -15,6 +15,7 @@ target "common-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/base" + scrab_stale_metadata = "scripts" } } diff --git a/container-template/scrub-stale-metadata.py b/scripts/scrub-stale-metadata.py similarity index 100% rename from container-template/scrub-stale-metadata.py rename to scripts/scrub-stale-metadata.py From 318ee0abb99ee7e65a1a48a2d980ddc44e6b0243 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 25 May 2026 18:37:17 +0300 Subject: [PATCH 26/36] fix: relocate scrub-stale-metadata.py --- official-templates/nvidia-pytorch/docker-bake.hcl | 1 + official-templates/rocm/docker-bake.hcl | 1 + 2 files changed, 2 insertions(+) diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index bb385ff..fe64b0b 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -11,6 +11,7 @@ target "nvidia-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/nvidia-pytorch" + scripts_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index c897fc4..fed4530 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -16,6 +16,7 @@ target "rocm-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/rocm" + scripts_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" From 7e46af8dac9038bb8f7a121079cfa166099c3dca Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 25 May 2026 18:48:45 +0300 Subject: [PATCH 27/36] fix: scrab_stale_metadata --- official-templates/nvidia-pytorch/docker-bake.hcl | 2 +- official-templates/rocm/docker-bake.hcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index fe64b0b..2b91d9b 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -11,7 +11,7 @@ target "nvidia-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/nvidia-pytorch" - scripts_stale_metadata = "scripts" + scrab_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index fed4530..8140dce 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -16,7 +16,7 @@ target "rocm-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/rocm" - scripts_stale_metadata = "scripts" + scrab_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" From be8434269ade223604fcd52ed0d4eef83e4613ed Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 27 May 2026 16:17:03 +0300 Subject: [PATCH 28/36] fix: scrub-stale-metadata.py --- scripts/scrub-stale-metadata.py | 75 ++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/scripts/scrub-stale-metadata.py b/scripts/scrub-stale-metadata.py index 182f316..bd7a5c3 100644 --- a/scripts/scrub-stale-metadata.py +++ b/scripts/scrub-stale-metadata.py @@ -12,49 +12,66 @@ import shutil import sys +NAME_RE = re.compile(r"^Name:\s*([^\n]*)$", re.MULTILINE) +VERSION_RE = re.compile(r"^Version:\s*([^\n]*)$", re.MULTILINE) +SEARCH_ROOTS = (pathlib.Path("/usr"), pathlib.Path("/opt")) + def canonical(name: str) -> str: return re.sub(r"[-_.]+", "-", name).strip().lower() -def main(requirements_path: str) -> None: +def parse_pinned(requirements_path: str) -> dict[str, str]: + """Read a requirements file, return {canonical_name: version} for `==` pins.""" pinned: dict[str, str] = {} - for line in pathlib.Path(requirements_path).read_text().splitlines(): - line = line.split("#", 1)[0].strip() + for raw in pathlib.Path(requirements_path).read_text().splitlines(): + line = raw.split("#", 1)[0].strip() if "==" not in line: continue name, version = line.split("==", 1) pinned[canonical(name)] = version.strip() + return pinned + - name_re = re.compile(r"^Name:\s*(.+)$", re.MULTILINE) - version_re = re.compile(r"^Version:\s*(.+)$", re.MULTILINE) +def read_meta(meta_dir: pathlib.Path) -> tuple[str, str] | None: + """Return (canonical_name, version) for a metadata dir, or None if unreadable.""" + metadata = meta_dir / "METADATA" + if not metadata.exists(): + metadata = meta_dir / "PKG-INFO" + if not metadata.exists(): + return None + try: + text = metadata.read_text(errors="ignore") + except OSError: + return None + name_match = NAME_RE.search(text) + version_match = VERSION_RE.search(text) + if not name_match or not version_match: + return None + return canonical(name_match.group(1)), version_match.group(1).strip() - for root in (pathlib.Path("/usr"), pathlib.Path("/opt")): + +def iter_meta_dirs() -> "Iterator[pathlib.Path]": + for root in SEARCH_ROOTS: if not root.is_dir(): continue - for meta_dir in [*root.rglob("*.dist-info"), *root.rglob("*.egg-info")]: - if not meta_dir.is_dir(): - continue - metadata = meta_dir / "METADATA" - if not metadata.exists(): - metadata = meta_dir / "PKG-INFO" - if not metadata.exists(): - continue - try: - text = metadata.read_text(errors="ignore") - except OSError: - continue - name_match = name_re.search(text) - version_match = version_re.search(text) - if not name_match or not version_match: - continue - pkg = canonical(name_match.group(1)) - ver = version_match.group(1).strip() - expected = pinned.get(pkg) - if expected is None or ver == expected: - continue - print(f"scrub-stale-metadata: removing {meta_dir} (Version: {ver}, pinned {expected})") - shutil.rmtree(meta_dir, ignore_errors=True) + for meta_dir in (*root.rglob("*.dist-info"), *root.rglob("*.egg-info")): + if meta_dir.is_dir(): + yield meta_dir + + +def main(requirements_path: str) -> None: + pinned = parse_pinned(requirements_path) + for meta_dir in iter_meta_dirs(): + meta = read_meta(meta_dir) + if meta is None: + continue + pkg, ver = meta + expected = pinned.get(pkg) + if expected is None or ver == expected: + continue + print(f"scrub-stale-metadata: removing {meta_dir} (Version: {ver}, pinned {expected})") + shutil.rmtree(meta_dir, ignore_errors=True) if __name__ == "__main__": From 5731245e72a2b077473839f55394222b88a334ee Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 27 May 2026 17:52:53 +0300 Subject: [PATCH 29/36] feat: bump version --- official-templates/shared/versions.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official-templates/shared/versions.hcl b/official-templates/shared/versions.hcl index bd522f5..60af80f 100644 --- a/official-templates/shared/versions.hcl +++ b/official-templates/shared/versions.hcl @@ -1,4 +1,4 @@ -RELEASE_VERSION = "1.0.3" +RELEASE_VERSION = "1.0.5" variable "RELEASE_SUFFIX" { default = "" # Set by CI, not used by humans. From 044108384a039e5280a1cb982217f45ecfc4893b Mon Sep 17 00:00:00 2001 From: mchekm Date: Fri, 29 May 2026 18:30:06 +0300 Subject: [PATCH 30/36] fix: do not run trivy if no changes --- .github/workflows/base.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 5850ff4..3913f5a 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -103,11 +103,13 @@ jobs: - name: Extract image refs id: refs + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.autoresearch_any_changed == 'true' uses: ./.github/actions/image-name with: bake-metadata: ${{ steps.build.outputs.metadata }} - name: Trivy scan + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.autoresearch_any_changed == 'true' uses: ./.github/actions/trivy with: image-refs: ${{ steps.refs.outputs.refs }} @@ -158,11 +160,13 @@ jobs: - name: Extract image refs id: refs + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: ./.github/actions/image-name with: bake-metadata: ${{ steps.build.outputs.metadata }} - name: Trivy scan + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: ./.github/actions/trivy with: image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file From 0fc2cb4f84173094e55f2fad5dfae4cac534f537 Mon Sep 17 00:00:00 2001 From: mchekm Date: Mon, 8 Jun 2026 14:57:22 +0300 Subject: [PATCH 31/36] fix: comments --- official-templates/base/Dockerfile | 6 ++---- official-templates/base/docker-bake.hcl | 2 +- official-templates/base/requirements.txt | 4 ++-- official-templates/nvidia-pytorch/docker-bake.hcl | 2 +- official-templates/nvidia-pytorch/requirements.txt | 2 +- official-templates/rocm/docker-bake.hcl | 2 +- official-templates/rocm/requirements.txt | 4 ++-- scripts/scrub-stale-metadata.py | 5 ++++- 8 files changed, 14 insertions(+), 13 deletions(-) diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index fbce320..5e9bbc1 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -2,7 +2,6 @@ ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} ARG RP_SKIP_PYTHON -ARG RP_SKIP_JUPYTER SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -93,11 +92,10 @@ RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=requirements requirements.txt /requirements.txt -COPY --from=scrab_stale_metadata scrub-stale-metadata.py /tmp/scrub-stale-metadata.py +COPY --from=scrub_stale_metadata scrub-stale-metadata.py /tmp/scrub-stale-metadata.py # Install Jupyter and hf_transfer packages -RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ - python -m pip install --upgrade --no-cache-dir -r requirements.txt && \ +RUN python -m pip install --upgrade --no-cache-dir -r requirements.txt && \ python /tmp/scrub-stale-metadata.py /requirements.txt && \ rm /tmp/scrub-stale-metadata.py diff --git a/official-templates/base/docker-bake.hcl b/official-templates/base/docker-bake.hcl index 28cd51c..a4e3723 100644 --- a/official-templates/base/docker-bake.hcl +++ b/official-templates/base/docker-bake.hcl @@ -15,7 +15,7 @@ target "common-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/base" - scrab_stale_metadata = "scripts" + scrub_stale_metadata = "scripts" } } diff --git a/official-templates/base/requirements.txt b/official-templates/base/requirements.txt index b0b5ea8..2feebf1 100644 --- a/official-templates/base/requirements.txt +++ b/official-templates/base/requirements.txt @@ -1,5 +1,5 @@ -hf_transfer +hf_transfer jupyterlab==4.5.7 ipywidgets jupyter-archive -notebook==7.5.6 \ No newline at end of file +notebook==7.5.6 diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl index 2b91d9b..4c8e8ff 100644 --- a/official-templates/nvidia-pytorch/docker-bake.hcl +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -11,7 +11,7 @@ target "nvidia-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/nvidia-pytorch" - scrab_stale_metadata = "scripts" + scrub_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/nvidia-pytorch/requirements.txt b/official-templates/nvidia-pytorch/requirements.txt index 0b07ad1..27fb742 100644 --- a/official-templates/nvidia-pytorch/requirements.txt +++ b/official-templates/nvidia-pytorch/requirements.txt @@ -11,4 +11,4 @@ pillow==12.2.0 protobuf==6.33.5 tornado==6.5.5 urllib3==2.7.0 -wheel==0.46.2 \ No newline at end of file +wheel==0.46.2 diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index 8140dce..5586279 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -16,7 +16,7 @@ target "rocm-base" { proxy = "container-template/proxy" logo = "container-template" requirements = "official-templates/rocm" - scrab_stale_metadata = "scripts" + scrub_stale_metadata = "scripts" } args = { RP_SKIP_PYTHON = "1" diff --git a/official-templates/rocm/requirements.txt b/official-templates/rocm/requirements.txt index 55f3901..9313217 100644 --- a/official-templates/rocm/requirements.txt +++ b/official-templates/rocm/requirements.txt @@ -1,5 +1,5 @@ Brotli==1.2.0 -PyJWT==2.12.0 +PyJWT==2.12.0 aiohttp==3.13.3 cryptography==46.0.5 jaraco.context==6.1.0 @@ -11,4 +11,4 @@ pillow==12.2.0 protobuf==6.33.5 tornado==6.5.5 urllib3==2.7.0 -wheel==0.46.2 \ No newline at end of file +wheel==0.46.2 diff --git a/scripts/scrub-stale-metadata.py b/scripts/scrub-stale-metadata.py index bd7a5c3..03747c0 100644 --- a/scripts/scrub-stale-metadata.py +++ b/scripts/scrub-stale-metadata.py @@ -7,10 +7,13 @@ Trivy keeps reporting the (now-unused) older version. This script removes the orphaned metadata for packages listed in the supplied requirements file.""" +from __future__ import annotations + import pathlib import re import shutil import sys +from collections.abc import Iterator NAME_RE = re.compile(r"^Name:\s*([^\n]*)$", re.MULTILINE) VERSION_RE = re.compile(r"^Version:\s*([^\n]*)$", re.MULTILINE) @@ -51,7 +54,7 @@ def read_meta(meta_dir: pathlib.Path) -> tuple[str, str] | None: return canonical(name_match.group(1)), version_match.group(1).strip() -def iter_meta_dirs() -> "Iterator[pathlib.Path]": +def iter_meta_dirs() -> Iterator[pathlib.Path]: for root in SEARCH_ROOTS: if not root.is_dir(): continue From 357ec59d929ab19f15402362768642ed6f39ffe7 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 9 Jun 2026 10:52:48 +0300 Subject: [PATCH 32/36] feat: increased runners and bake-action doesn't push images --- .github/workflows/base.yml | 15 +++++++++------ .github/workflows/nvidia.yml | 5 +++-- .github/workflows/rocm.yml | 5 +++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 265ee1f..6435c5c 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -21,7 +21,7 @@ permissions: jobs: build-base: - runs-on: blacksmith-8vcpu-ubuntu-2204 + runs-on: blacksmith-16vcpu-ubuntu-2204 steps: - name: Checkout uses: actions/checkout@v6 @@ -46,7 +46,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/base/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -62,7 +63,7 @@ jobs: build-autoresearch: needs: build-base if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') - runs-on: blacksmith-8vcpu-ubuntu-2204 + runs-on: blacksmith-16vcpu-ubuntu-2204 steps: - name: Checkout uses: actions/checkout@v6 @@ -99,7 +100,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/autoresearch/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs @@ -118,7 +120,7 @@ jobs: needs: build-base # always() forces job run even if the dependant is skipped (but not if it failed) if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 @@ -156,7 +158,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/pytorch/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml index fa86d2f..451074f 100644 --- a/.github/workflows/nvidia.yml +++ b/.github/workflows/nvidia.yml @@ -15,7 +15,7 @@ permissions: jobs: build-nvidia: - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 @@ -40,7 +40,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/nvidia-pytorch/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 814595e..655b2b5 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -15,7 +15,7 @@ permissions: jobs: build-rocm: - runs-on: blacksmith-16vcpu-ubuntu-2404 + runs-on: blacksmith-32vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 @@ -40,7 +40,8 @@ jobs: files: | official-templates/shared/versions.hcl official-templates/rocm/docker-bake.hcl - push: true + load: true + push: false - name: Extract image refs id: refs From be335b019d79855e353c87f30d39edd7b6e31bf2 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 9 Jun 2026 14:08:52 +0300 Subject: [PATCH 33/36] fix: vulnerabilities in pip packages --- official-templates/rocm/requirements.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/official-templates/rocm/requirements.txt b/official-templates/rocm/requirements.txt index 9313217..55e14f3 100644 --- a/official-templates/rocm/requirements.txt +++ b/official-templates/rocm/requirements.txt @@ -1,7 +1,7 @@ Brotli==1.2.0 PyJWT==2.12.0 -aiohttp==3.13.3 -cryptography==46.0.5 +aiohttp==3.14.0 +cryptography==46.0.7 jaraco.context==6.1.0 jupyterlab==4.5.7 lxml==6.1.0 @@ -12,3 +12,15 @@ protobuf==6.33.5 tornado==6.5.5 urllib3==2.7.0 wheel==0.46.2 +filelock>=3.20.3 +jinja2>=3.1.6 +pygments>=2.20.0 +pynacl>=1.6.2 +requests>=2.33.0 +werkzeug>=3.1.6 +idna>=3.15 +fonttools>=4.60.2 +h2>=4.3.0 +marshmallow>=3.26.2 +pytest>=9.0.3 +pip>=26.1 \ No newline at end of file From 75d308e23f86490ca3f1c50571195e4b66c1ec60 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 9 Jun 2026 16:31:21 +0300 Subject: [PATCH 34/36] fix: nvidia requirements --- official-templates/nvidia-pytorch/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/official-templates/nvidia-pytorch/requirements.txt b/official-templates/nvidia-pytorch/requirements.txt index 27fb742..bf40672 100644 --- a/official-templates/nvidia-pytorch/requirements.txt +++ b/official-templates/nvidia-pytorch/requirements.txt @@ -12,3 +12,4 @@ protobuf==6.33.5 tornado==6.5.5 urllib3==2.7.0 wheel==0.46.2 +pyarrow==23.0.1 \ No newline at end of file From 19d5d7746ddcba861b7d4e90692fd4b88256d2f0 Mon Sep 17 00:00:00 2001 From: mchekm Date: Tue, 9 Jun 2026 17:43:13 +0300 Subject: [PATCH 35/36] feat: push after grype --- .github/actions/docker-push/action.yml | 28 ++++++++++++++++++++++++++ .github/workflows/base.yml | 17 ++++++++++++++++ .github/workflows/nvidia.yml | 5 +++++ .github/workflows/rocm.yml | 7 ++++++- 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 .github/actions/docker-push/action.yml diff --git a/.github/actions/docker-push/action.yml b/.github/actions/docker-push/action.yml new file mode 100644 index 0000000..763fc0a --- /dev/null +++ b/.github/actions/docker-push/action.yml @@ -0,0 +1,28 @@ +name: Docker Push +description: "Push locally-loaded images to their registry refs" + +inputs: + image-refs: + description: "JSON array of image references to push" + required: true + +runs: + using: composite + steps: + - name: Push + shell: bash + env: + IMAGE_REFS: ${{ inputs.image-refs }} + run: | + set -euo pipefail + mapfile -t refs < <(echo "${IMAGE_REFS}" | jq -r '.[]') + if [ ${#refs[@]} -eq 0 ]; then + echo "No image refs to push" + exit 1 + fi + + for ref in "${refs[@]}"; do + echo "::group::docker push — ${ref}" + docker push "${ref}" + echo "::endgroup::" + done diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 6435c5c..b252d69 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -60,6 +60,11 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} + - name: Push images + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} + build-autoresearch: needs: build-base if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') @@ -116,6 +121,12 @@ jobs: with: image-refs: ${{ steps.refs.outputs.refs }} + - name: Push images + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.autoresearch_any_changed == 'true' + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} + build-pytorch: needs: build-base # always() forces job run even if the dependant is skipped (but not if it failed) @@ -171,5 +182,11 @@ jobs: - name: Grype scan if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: ./.github/actions/grype + with: + image-refs: ${{ steps.refs.outputs.refs }} + + - name: Push images + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' + uses: ./.github/actions/docker-push with: image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml index 451074f..3161650 100644 --- a/.github/workflows/nvidia.yml +++ b/.github/workflows/nvidia.yml @@ -58,3 +58,8 @@ jobs: **/civetweb/resources/ssl_cert.pem **/civetweb/resources/server.pem **/civetweb/resources/server_bkup.pem + + - name: Push images + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 655b2b5..44b5786 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -57,4 +57,9 @@ jobs: **/civetweb/resources/cert/* **/civetweb/resources/ssl_cert.pem **/civetweb/resources/server.pem - **/civetweb/resources/server_bkup.pem \ No newline at end of file + **/civetweb/resources/server_bkup.pem + + - name: Push images + uses: ./.github/actions/docker-push + with: + image-refs: ${{ steps.refs.outputs.refs }} \ No newline at end of file From c4d999d203505637a4e048a8c76b42a752a34bf0 Mon Sep 17 00:00:00 2001 From: mchekm Date: Wed, 10 Jun 2026 09:23:42 +0300 Subject: [PATCH 36/36] feat: pytorch timeout-minutes --- .github/workflows/base.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index b252d69..c3cb879 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -132,6 +132,7 @@ jobs: # always() forces job run even if the dependant is skipped (but not if it failed) if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') runs-on: blacksmith-32vcpu-ubuntu-2404 + timeout-minutes: 240 steps: - name: Checkout uses: actions/checkout@v6