diff --git a/.github/workflows/rocm-ci-dispatch.yml b/.github/workflows/rocm-ci-dispatch.yml index e679ece46..68e6c540e 100644 --- a/.github/workflows/rocm-ci-dispatch.yml +++ b/.github/workflows/rocm-ci-dispatch.yml @@ -10,6 +10,7 @@ on: permissions: contents: read + actions: read jobs: determine_level: diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index e2fb09c15..034b91139 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -109,6 +109,9 @@ jobs: needs: [select_image, build] timeout-minutes: 360 runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }} + permissions: + contents: read + actions: read strategy: fail-fast: false matrix: @@ -124,10 +127,11 @@ jobs: 3rdparty/hipify_torch - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - name: te-rocm-wheels - path: dist/ + env: + GITHUB_TOKEN: ${{ github.token }} + TE_WHEEL_ARTIFACT_ID: ${{ needs.build.outputs.wheel_artifact_id }} + TE_WHEEL_ARTIFACT_DIGEST: ${{ needs.build.outputs.wheel_artifact_digest }} + run: ci/download_wheel_artifacts.sh dist te-rocm-wheels - name: Host Diagnostics run: | @@ -311,6 +315,9 @@ jobs: needs: [select_image, build] timeout-minutes: 360 runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }} + permissions: + contents: read + actions: read strategy: fail-fast: false matrix: @@ -321,10 +328,11 @@ jobs: uses: actions/checkout@v6 - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - name: te-rocm-wheels - path: dist/ + env: + GITHUB_TOKEN: ${{ github.token }} + TE_WHEEL_ARTIFACT_ID: ${{ needs.build.outputs.wheel_artifact_id }} + TE_WHEEL_ARTIFACT_DIGEST: ${{ needs.build.outputs.wheel_artifact_digest }} + run: ci/download_wheel_artifacts.sh dist te-rocm-wheels - name: Host Diagnostics run: | diff --git a/.github/workflows/rocm-wheels-build.yml b/.github/workflows/rocm-wheels-build.yml index c1a8ea087..b613c7178 
100644 --- a/.github/workflows/rocm-wheels-build.yml +++ b/.github/workflows/rocm-wheels-build.yml @@ -65,6 +65,13 @@ on: docker_image_tag_override: type: string default: '' + outputs: + wheel_artifact_id: + description: 'GitHub Actions artifact ID for te-rocm-wheels.' + value: ${{ jobs.build-rocm-wheels.outputs.wheel_artifact_id }} + wheel_artifact_digest: + description: 'SHA-256 digest for te-rocm-wheels.' + value: ${{ jobs.build-rocm-wheels.outputs.wheel_artifact_digest }} env: DOCKER_IMAGE_NAME: te-rocm-manylinux-x86 @@ -76,6 +83,9 @@ jobs: build-rocm-wheels: name: Build ROCm Docker image and TransformerEngine wheels runs-on: build-only-te + outputs: + wheel_artifact_id: ${{ steps.upload-wheels.outputs['artifact-id'] }} + wheel_artifact_digest: ${{ steps.upload-wheels.outputs['artifact-digest'] }} steps: - name: Checkout repository @@ -213,6 +223,7 @@ jobs: fi - name: Upload wheels as GitHub Actions artifacts + id: upload-wheels if: success() uses: actions/upload-artifact@v4 with: diff --git a/ci/download_wheel_artifacts.sh b/ci/download_wheel_artifacts.sh new file mode 100755 index 000000000..16f39013d --- /dev/null +++ b/ci/download_wheel_artifacts.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# Download the te-rocm-wheels artifact, verify expected files, and retry twice if needed. 
+set -euo pipefail
+
+# Optional positional arguments: destination directory and artifact name.
+artifact_dir="${1:-dist}"
+artifact_name="${2:-te-rocm-wheels}"
+# GITHUB_REPOSITORY is mandatory; the ":?" expansion aborts the script when it is unset or empty.
+repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY is required}"
+# ARTIFACT_ID / ARTIFACT_DIGEST take precedence over the TE_WHEEL_* variables.
+artifact_id="${ARTIFACT_ID:-${TE_WHEEL_ARTIFACT_ID:-}}"
+expected_digest="${ARTIFACT_DIGEST:-${TE_WHEEL_ARTIFACT_DIGEST:-}}"
+api_url="${GITHUB_API_URL:-https://api.github.com}"
+
+# find -name globs; verify_artifacts requires at least one file per pattern.
+required_patterns=(
+  'transformer_engine_rocm[0-9]*.whl'
+  'transformer_engine_rocm_torch*.tar.gz'
+  'transformer_engine_rocm_jax*.tar.gz'
+)
+
+if [[ -z "${GITHUB_TOKEN:-}" && -z "${GH_TOKEN:-}" ]]; then
+  echo "::error::GITHUB_TOKEN or GH_TOKEN is required to download GitHub Actions artifacts"
+  exit 1
+fi
+
+# GITHUB_TOKEN wins when both tokens are set.
+token="${GITHUB_TOKEN:-${GH_TOKEN:-}}"
+
+if [[ -z "${artifact_id}" ]]; then
+  echo "::error::ARTIFACT_ID/TE_WHEEL_ARTIFACT_ID is required to download ${artifact_name}"
+  exit 1
+fi
+
+if ! command -v curl >/dev/null 2>&1; then
+  echo "::error::curl is required to download artifacts"
+  exit 1
+fi
+
+if ! command -v sha256sum >/dev/null 2>&1; then
+  echo "::error::sha256sum is required to verify artifact digests"
+  exit 1
+fi
+
+# Extract a zip archive with the first available tool (unzip, busybox unzip, bsdtar).
+#   $1 - path to the zip archive
+#   $2 - destination directory (created when missing)
+# Returns non-zero when no supported extractor is installed or extraction fails.
+extract_zip() {
+  local zip_file="$1"
+  local dest_dir="$2"
+
+  # bsdtar -C needs an existing directory, so create it up front for every extractor.
+  mkdir -p "${dest_dir}"
+
+  if command -v unzip >/dev/null 2>&1; then
+    unzip -q "${zip_file}" -d "${dest_dir}"
+  elif command -v busybox >/dev/null 2>&1 && busybox unzip --help >/dev/null 2>&1; then
+    busybox unzip -q "${zip_file}" -d "${dest_dir}"
+  elif command -v bsdtar >/dev/null 2>&1; then
+    # -C changes directory only for extraction, so a relative zip_file path still
+    # resolves against the caller's working directory (a prior "cd" would break it).
+    bsdtar -xf "${zip_file}" -C "${dest_dir}"
+  else
+    echo "::error::unzip, busybox unzip, or bsdtar is required to extract artifacts"
+    return 1
+  fi
+}
+
+# Emit a collapsible log group with the download context and a sorted listing
+# (path and size) of regular files found at most two levels below artifact_dir.
+print_manifest_group() {
+  echo "::group::Build artifact manifest"
+  echo "=== artifact download context ==="
+  echo "repository: ${repo}"
+  echo "artifact_name: ${artifact_name}"
+  echo "artifact_id: ${artifact_id}"
+  echo "artifact_dir: ${artifact_dir}"
+  echo "expected_digest: ${expected_digest:-unknown}"
+  echo ""
+  echo "=== downloaded files ==="
+  if [[ -d "${artifact_dir}" ]]; then
+    find "${artifact_dir}" -maxdepth 2 -type f -printf '%p\t%s bytes\n' | sort
+  else
+    echo "${artifact_dir} directory is missing"
+  fi
+  echo "::endgroup::"
+}
+
+# Print the manifest, then check that every entry of required_patterns matches at
+# least one regular file at most two levels below artifact_dir.
+# Returns 0 when all patterns match, 1 otherwise (one ::error:: line per missing pattern).
+verify_artifacts() {
+  local missing=0
+  local pattern first_match
+  print_manifest_group
+  for pattern in "${required_patterns[@]}"; do
+    # -print -quit stops at the first hit. Capturing the output instead of piping
+    # into "grep -q" keeps pipefail from turning an early reader exit (SIGPIPE in
+    # find) into a false "missing" report. "|| true" treats a find failure, e.g. a
+    # missing directory, as "no match" rather than tripping errexit.
+    first_match="$(find "${artifact_dir}" -maxdepth 2 -type f -name "${pattern}" -print -quit)" || true
+    if [[ -z "${first_match}" ]]; then
+      echo "::error::Missing required build artifact matching ${pattern} under ${artifact_dir}"
+      missing=1
+    fi
+  done
+  return "${missing}"
+}
+
+download_once() {
+  local attempt="$1"
+  local tmp_dir artifact_zip curl_config expected_sha actual_sha download_url
+  tmp_dir="$(mktemp -d)"
+  artifact_zip="${tmp_dir}/artifact.zip"
+  curl_config="${tmp_dir}/curl.conf"
+  download_url="${api_url}/repos/${repo}/actions/artifacts/${artifact_id}/zip"
+
+  cat > "${curl_config}" <