From 236397b9c91d20039b647b369939718cf4d87e79 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 4 Jun 2026 13:55:31 +0000 Subject: [PATCH 1/4] [INFRA] Fix Coursier cache for container CI jobs via volume mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pyspark, sparkr, lint, and docs CI jobs run inside Docker containers where the runner's HOME is /github/home (bind-mounted from /home/runner/work/_temp/_github_home), so ~/.cache/coursier inside the container resolves to a different physical path than the /home/runner/.cache/coursier that the host-runner precompile job writes to. This mismatch caused every container job's Coursier cache step to report "Path Validation Error: ... no cache is being saved" and to never find or populate the shared cache. Fix by adding a volume mount that binds the host's Coursier cache directory into the container's $HOME: /home/runner/.cache/coursier → /github/home/.cache/coursier With this mount, the restore step extracts the precompile-written Linux-coursier- cache directly into the path SBT reads from inside the container, and the Path Validation Error is gone. All four jobs remain restore-only (actions/cache/restore). pyspark and sparkr depend on precompile, so they always hit the precompile-written cache. lint and docs run concurrently with precompile, so keeping them restore-only avoids a race where a partial closure could be saved before precompile finishes writing the full superset. 
Generated-by: Claude Code (claude-sonnet-4-6) --- .github/workflows/build_and_test.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b36655c390f0..1ad0dcd96e0f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -658,6 +658,8 @@ jobs: options: >- --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + volumes: + - /home/runner/.cache/coursier:/github/home/.cache/coursier strategy: fail-fast: false max-parallel: 20 @@ -898,6 +900,8 @@ jobs: timeout-minutes: 120 container: image: ${{ needs.precondition.outputs.image_sparkr_url_link }} + volumes: + - /home/runner/.cache/coursier:/github/home/.cache/coursier env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 @@ -1058,6 +1062,8 @@ jobs: BRANCH: ${{ inputs.branch }} container: image: ${{ needs.precondition.outputs.image_lint_url_link }} + volumes: + - /home/runner/.cache/coursier:/github/home/.cache/coursier steps: - name: Checkout Spark repository uses: actions/checkout@v6 @@ -1257,6 +1263,8 @@ jobs: GITHUB_PREV_SHA: ${{ github.event.before }} container: image: ${{ needs.precondition.outputs.image_docs_url_link }} + volumes: + - /home/runner/.cache/coursier:/github/home/.cache/coursier steps: - name: Checkout Spark repository uses: actions/checkout@v6 From b763efdc1f23c0fd55a662bce22880ca048704fa Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 5 Jun 2026 05:01:19 +0000 Subject: [PATCH 2/4] [INFRA] Fix Coursier cache for container CI jobs via volume mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Install zstd in all CI container images so the @actions/cache toolkit uses the same compression algorithm (zstd) as host-runner jobs. Root cause: @actions/cache computes a cache "version" as SHA256(path + compression_method). Host-runner jobs (including precompile) have zstd available and save caches with zstd. 
Container images (pyspark, sparkr, lint, docs) lacked zstd, so the toolkit fell back to gzip, producing a different version hash. The cache lookup URL therefore differed and every restore reported "Cache not found" even though the key string matched an existing entry - confirmed by the fork's cache API showing Linux-coursier- present but all container jobs missing it despite looking it up 2+ minutes after it was saved. Add `zstd` to the apt-get install block of every CI Dockerfile: - dev/infra/Dockerfile (branch-3.5 and base) - dev/spark-test-image/python-{311,312,312-classic-only,312-pandas-3,313,314,314-nogil,minimum}/Dockerfile (pyspark variants) - dev/spark-test-image/docs/Dockerfile - dev/spark-test-image/lint/Dockerfile - dev/spark-test-image/sparkr/Dockerfile Remove the volume mounts added in the previous attempt (/home/runner/.cache/coursier:/github/home/.cache/coursier), which were the wrong fix — the path is correctly handled by the cache action extracting to the path inside the container; the real issue was the version mismatch preventing lookup. 
Generated-by: Claude Code (claude-sonnet-4-6) --- .github/workflows/build_and_test.yml | 8 -------- dev/infra/Dockerfile | 1 + dev/spark-test-image/docs/Dockerfile | 1 + dev/spark-test-image/lint/Dockerfile | 1 + dev/spark-test-image/python-311/Dockerfile | 3 ++- dev/spark-test-image/python-312-classic-only/Dockerfile | 1 + dev/spark-test-image/python-312-pandas-3/Dockerfile | 1 + dev/spark-test-image/python-312/Dockerfile | 1 + dev/spark-test-image/python-313/Dockerfile | 3 ++- dev/spark-test-image/python-314-nogil/Dockerfile | 3 ++- dev/spark-test-image/python-314/Dockerfile | 3 ++- dev/spark-test-image/python-minimum/Dockerfile | 3 ++- dev/spark-test-image/sparkr/Dockerfile | 1 + 13 files changed, 17 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1ad0dcd96e0f..b36655c390f0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -658,8 +658,6 @@ jobs: options: >- --cap-add=SYS_PTRACE --security-opt seccomp=unconfined - volumes: - - /home/runner/.cache/coursier:/github/home/.cache/coursier strategy: fail-fast: false max-parallel: 20 @@ -900,8 +898,6 @@ jobs: timeout-minutes: 120 container: image: ${{ needs.precondition.outputs.image_sparkr_url_link }} - volumes: - - /home/runner/.cache/coursier:/github/home/.cache/coursier env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 @@ -1062,8 +1058,6 @@ jobs: BRANCH: ${{ inputs.branch }} container: image: ${{ needs.precondition.outputs.image_lint_url_link }} - volumes: - - /home/runner/.cache/coursier:/github/home/.cache/coursier steps: - name: Checkout Spark repository uses: actions/checkout@v6 @@ -1263,8 +1257,6 @@ jobs: GITHUB_PREV_SHA: ${{ github.event.before }} container: image: ${{ needs.precondition.outputs.image_docs_url_link }} - volumes: - - /home/runner/.cache/coursier:/github/home/.cache/coursier steps: - name: Checkout Spark repository uses: actions/checkout@v6 diff --git 
a/dev/infra/Dockerfile b/dev/infra/Dockerfile index b848f8eb754d..57cde202dde8 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -70,6 +70,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ zlib1g-dev \ + zstd \ && rm -rf /var/lib/apt/lists/* diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index 7e3f63b05acd..8b13d9534670 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -70,6 +70,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ zlib1g-dev \ + zstd \ && apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index c6bf389ecb22..b7cecae654cb 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -63,6 +63,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ zlib1g-dev \ + zstd \ && apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/dev/spark-test-image/python-311/Dockerfile b/dev/spark-test-image/python-311/Dockerfile index 8e5044aeb954..e39e9fac70d2 100644 --- a/dev/spark-test-image/python-311/Dockerfile +++ b/dev/spark-test-image/python-311/Dockerfile @@ -46,7 +46,8 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev + zlib1g-dev \ + zstd # Install Python 3.11 RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/dev/spark-test-image/python-312-classic-only/Dockerfile b/dev/spark-test-image/python-312-classic-only/Dockerfile index da2b99cd7838..ceb4694b2dc9 100644 --- a/dev/spark-test-image/python-312-classic-only/Dockerfile +++ b/dev/spark-test-image/python-312-classic-only/Dockerfile @@ -49,6 +49,7 @@ RUN apt-get update && apt-get install -y \ tzdata \ software-properties-common \ zlib1g-dev \ + zstd \ && 
apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/dev/spark-test-image/python-312-pandas-3/Dockerfile b/dev/spark-test-image/python-312-pandas-3/Dockerfile index c54a8c284a6b..e2a2c189df15 100644 --- a/dev/spark-test-image/python-312-pandas-3/Dockerfile +++ b/dev/spark-test-image/python-312-pandas-3/Dockerfile @@ -52,6 +52,7 @@ RUN apt-get update && apt-get install -y \ tzdata \ software-properties-common \ zlib1g-dev \ + zstd \ && apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/dev/spark-test-image/python-312/Dockerfile b/dev/spark-test-image/python-312/Dockerfile index 8eee9e414f1d..72b40510b8db 100644 --- a/dev/spark-test-image/python-312/Dockerfile +++ b/dev/spark-test-image/python-312/Dockerfile @@ -48,6 +48,7 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ + zstd \ zlib1g-dev \ && apt-get autoremove --purge -y \ && apt-get clean \ diff --git a/dev/spark-test-image/python-313/Dockerfile b/dev/spark-test-image/python-313/Dockerfile index 6cfdd2d5a86e..c13e364f1589 100644 --- a/dev/spark-test-image/python-313/Dockerfile +++ b/dev/spark-test-image/python-313/Dockerfile @@ -46,7 +46,8 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev + zlib1g-dev \ + zstd # Install Python 3.13 RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/dev/spark-test-image/python-314-nogil/Dockerfile b/dev/spark-test-image/python-314-nogil/Dockerfile index edfea3172992..6dea9c2fc35e 100644 --- a/dev/spark-test-image/python-314-nogil/Dockerfile +++ b/dev/spark-test-image/python-314-nogil/Dockerfile @@ -46,7 +46,8 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev + zlib1g-dev \ + zstd # Install Python 3.14 (no GIL) RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/dev/spark-test-image/python-314/Dockerfile 
b/dev/spark-test-image/python-314/Dockerfile index b6bc76c6b93b..2f3570ec0687 100644 --- a/dev/spark-test-image/python-314/Dockerfile +++ b/dev/spark-test-image/python-314/Dockerfile @@ -46,7 +46,8 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev + zlib1g-dev \ + zstd # Install Python 3.14 RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 4d110600d826..89da6f618124 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -47,7 +47,8 @@ RUN apt-get update && apt-get install -y \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev + zlib1g-dev \ + zstd # Install Python 3.11 RUN add-apt-repository ppa:deadsnakes/ppa diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile index 07816add74fc..8a03095aef7e 100644 --- a/dev/spark-test-image/sparkr/Dockerfile +++ b/dev/spark-test-image/sparkr/Dockerfile @@ -62,6 +62,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ zlib1g-dev \ + zstd \ && rm -rf /var/lib/apt/lists/* RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list From d9d834456a0c2dc465c037b62bc87a2bea6a114f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 5 Jun 2026 12:16:41 +0000 Subject: [PATCH 3/4] [INFRA] Add comment explaining why zstd is required in CI container images Generated-by: Claude Code (claude-sonnet-4-6) --- dev/infra/Dockerfile | 6 ++++++ dev/spark-test-image/docs/Dockerfile | 6 ++++++ dev/spark-test-image/lint/Dockerfile | 6 ++++++ dev/spark-test-image/python-311/Dockerfile | 6 ++++++ dev/spark-test-image/python-312-classic-only/Dockerfile | 6 ++++++ dev/spark-test-image/python-312-pandas-3/Dockerfile | 6 ++++++ dev/spark-test-image/python-312/Dockerfile | 6 ++++++ 
dev/spark-test-image/python-313/Dockerfile | 6 ++++++ dev/spark-test-image/python-314-nogil/Dockerfile | 6 ++++++ dev/spark-test-image/python-314/Dockerfile | 6 ++++++ dev/spark-test-image/python-minimum/Dockerfile | 6 ++++++ dev/spark-test-image/sparkr/Dockerfile | 6 ++++++ 12 files changed, 72 insertions(+) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 57cde202dde8..46954d59bb7a 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -33,6 +33,12 @@ RUN echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy main restricted unive echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy-updates main restricted universe multiverse' >> /etc/apt/sources.list.d/mirror.list && \ echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy-security main restricted universe multiverse' >> /etc/apt/sources.list.d/mirror.list +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. 
+# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index 8b13d9534670..b2844cd97624 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index b7cecae654cb..359161bba2dd 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). 
Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-311/Dockerfile b/dev/spark-test-image/python-311/Dockerfile index e39e9fac70d2..d0c60ba4a6df 100644 --- a/dev/spark-test-image/python-311/Dockerfile +++ b/dev/spark-test-image/python-311/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. 
+# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312-classic-only/Dockerfile b/dev/spark-test-image/python-312-classic-only/Dockerfile index ceb4694b2dc9..727bc505ffe9 100644 --- a/dev/spark-test-image/python-312-classic-only/Dockerfile +++ b/dev/spark-test-image/python-312-classic-only/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. 
+# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312-pandas-3/Dockerfile b/dev/spark-test-image/python-312-pandas-3/Dockerfile index e2a2c189df15..76cf990e6f0b 100644 --- a/dev/spark-test-image/python-312-pandas-3/Dockerfile +++ b/dev/spark-test-image/python-312-pandas-3/Dockerfile @@ -34,6 +34,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312/Dockerfile b/dev/spark-test-image/python-312/Dockerfile index 72b40510b8db..d1427dea2a3a 100644 --- a/dev/spark-test-image/python-312/Dockerfile +++ b/dev/spark-test-image/python-312/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). 
The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-313/Dockerfile b/dev/spark-test-image/python-313/Dockerfile index c13e364f1589..2857e1f4fb4d 100644 --- a/dev/spark-test-image/python-313/Dockerfile +++ b/dev/spark-test-image/python-313/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. 
+# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-314-nogil/Dockerfile b/dev/spark-test-image/python-314-nogil/Dockerfile index 6dea9c2fc35e..e1e3f27f0ab5 100644 --- a/dev/spark-test-image/python-314-nogil/Dockerfile +++ b/dev/spark-test-image/python-314-nogil/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-314/Dockerfile b/dev/spark-test-image/python-314/Dockerfile index 2f3570ec0687..7caf7128a69c 100644 --- a/dev/spark-test-image/python-314/Dockerfile +++ b/dev/spark-test-image/python-314/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). 
The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 89da6f618124..9d22f0c8cbc2 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -32,6 +32,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources # Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. 
+# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile index 8a03095aef7e..f84e4c84c4a7 100644 --- a/dev/spark-test-image/sparkr/Dockerfile +++ b/dev/spark-test-image/sparkr/Dockerfile @@ -31,6 +31,12 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources +# zstd is required by @actions/cache (GitHub Actions cache action). The action computes +# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and +# use it; without zstd the container falls back to gzip, producing a different version +# hash, so cache entries saved by host jobs are invisible to container jobs even when +# the key string matches. +# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ From 54f7d76f07bdb94b8650234cf6bf153390246505 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 5 Jun 2026 12:17:28 +0000 Subject: [PATCH 4/4] [INFRA] Remove inline zstd comments from Dockerfiles The explanation is in the PR description instead. 
Generated-by: Claude Code (claude-sonnet-4-6) --- dev/infra/Dockerfile | 6 ------ dev/spark-test-image/docs/Dockerfile | 6 ------ dev/spark-test-image/lint/Dockerfile | 6 ------ dev/spark-test-image/python-311/Dockerfile | 6 ------ dev/spark-test-image/python-312-classic-only/Dockerfile | 6 ------ dev/spark-test-image/python-312-pandas-3/Dockerfile | 6 ------ dev/spark-test-image/python-312/Dockerfile | 6 ------ dev/spark-test-image/python-313/Dockerfile | 6 ------ dev/spark-test-image/python-314-nogil/Dockerfile | 6 ------ dev/spark-test-image/python-314/Dockerfile | 6 ------ dev/spark-test-image/python-minimum/Dockerfile | 6 ------ dev/spark-test-image/sparkr/Dockerfile | 6 ------ 12 files changed, 72 deletions(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 46954d59bb7a..57cde202dde8 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -33,12 +33,6 @@ RUN echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy main restricted unive echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy-updates main restricted universe multiverse' >> /etc/apt/sources.list.d/mirror.list && \ echo 'deb https://mirrors.edge.kernel.org/ubuntu jammy-security main restricted universe multiverse' >> /etc/apt/sources.list.d/mirror.list -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. 
-# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index b2844cd97624..8b13d9534670 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index 359161bba2dd..b7cecae654cb 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). 
Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-311/Dockerfile b/dev/spark-test-image/python-311/Dockerfile index d0c60ba4a6df..e39e9fac70d2 100644 --- a/dev/spark-test-image/python-311/Dockerfile +++ b/dev/spark-test-image/python-311/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. 
-# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312-classic-only/Dockerfile b/dev/spark-test-image/python-312-classic-only/Dockerfile index 727bc505ffe9..ceb4694b2dc9 100644 --- a/dev/spark-test-image/python-312-classic-only/Dockerfile +++ b/dev/spark-test-image/python-312-classic-only/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. 
-# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312-pandas-3/Dockerfile b/dev/spark-test-image/python-312-pandas-3/Dockerfile index 76cf990e6f0b..e2a2c189df15 100644 --- a/dev/spark-test-image/python-312-pandas-3/Dockerfile +++ b/dev/spark-test-image/python-312-pandas-3/Dockerfile @@ -34,12 +34,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-312/Dockerfile b/dev/spark-test-image/python-312/Dockerfile index d1427dea2a3a..72b40510b8db 100644 --- a/dev/spark-test-image/python-312/Dockerfile +++ b/dev/spark-test-image/python-312/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). 
The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-313/Dockerfile b/dev/spark-test-image/python-313/Dockerfile index 2857e1f4fb4d..c13e364f1589 100644 --- a/dev/spark-test-image/python-313/Dockerfile +++ b/dev/spark-test-image/python-313/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. 
-# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-314-nogil/Dockerfile b/dev/spark-test-image/python-314-nogil/Dockerfile index e1e3f27f0ab5..6dea9c2fc35e 100644 --- a/dev/spark-test-image/python-314-nogil/Dockerfile +++ b/dev/spark-test-image/python-314-nogil/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-314/Dockerfile b/dev/spark-test-image/python-314/Dockerfile index 7caf7128a69c..2f3570ec0687 100644 --- a/dev/spark-test-image/python-314/Dockerfile +++ b/dev/spark-test-image/python-314/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). 
The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile index 9d22f0c8cbc2..89da6f618124 100644 --- a/dev/spark-test-image/python-minimum/Dockerfile +++ b/dev/spark-test-image/python-minimum/Dockerfile @@ -32,12 +32,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources # Should keep the installation consistent with https://apache.github.io/spark/api/python/getting_started/install.html -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. 
-# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \ diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile index f84e4c84c4a7..8a03095aef7e 100644 --- a/dev/spark-test-image/sparkr/Dockerfile +++ b/dev/spark-test-image/sparkr/Dockerfile @@ -31,12 +31,6 @@ ENV DEBCONF_NONINTERACTIVE_SEEN=true RUN printf 'Types: deb\nURIs: https://mirrors.edge.kernel.org/ubuntu\nSuites: noble noble-updates noble-security\nComponents: main restricted universe multiverse\nSigned-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg\n' > /etc/apt/sources.list.d/mirror.sources -# zstd is required by @actions/cache (GitHub Actions cache action). The action computes -# a cache "version" as SHA256(paths + compression_method). Host runners have zstd and -# use it; without zstd the container falls back to gzip, producing a different version -# hash, so cache entries saved by host jobs are invisible to container jobs even when -# the key string matches. -# See: https://github.com/actions/toolkit/blob/main/packages/cache/src/internal/cacheUtils.ts RUN apt-get update && apt-get install -y \ build-essential \ ca-certificates \