Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/rocm-ci-dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:

permissions:
contents: read
actions: read

jobs:
determine_level:
Expand Down
24 changes: 16 additions & 8 deletions .github/workflows/rocm-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ jobs:
needs: [select_image, build]
timeout-minutes: 360
runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }}
permissions:
contents: read
actions: read
strategy:
fail-fast: false
matrix:
Expand All @@ -124,10 +127,11 @@ jobs:
3rdparty/hipify_torch

- name: Download build artifacts
uses: actions/download-artifact@v4
with:
name: te-rocm-wheels
path: dist/
env:
GITHUB_TOKEN: ${{ github.token }}
TE_WHEEL_ARTIFACT_ID: ${{ needs.build.outputs.wheel_artifact_id }}
TE_WHEEL_ARTIFACT_DIGEST: ${{ needs.build.outputs.wheel_artifact_digest }}
run: ci/download_wheel_artifacts.sh dist te-rocm-wheels

- name: Host Diagnostics
run: |
Expand Down Expand Up @@ -311,6 +315,9 @@ jobs:
needs: [select_image, build]
timeout-minutes: 360
runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }}
permissions:
contents: read
actions: read
strategy:
fail-fast: false
matrix:
Expand All @@ -321,10 +328,11 @@ jobs:
uses: actions/checkout@v6

- name: Download build artifacts
uses: actions/download-artifact@v4
with:
name: te-rocm-wheels
path: dist/
env:
GITHUB_TOKEN: ${{ github.token }}
TE_WHEEL_ARTIFACT_ID: ${{ needs.build.outputs.wheel_artifact_id }}
TE_WHEEL_ARTIFACT_DIGEST: ${{ needs.build.outputs.wheel_artifact_digest }}
run: ci/download_wheel_artifacts.sh dist te-rocm-wheels

- name: Host Diagnostics
run: |
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/rocm-wheels-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ on:
docker_image_tag_override:
type: string
default: ''
outputs:
wheel_artifact_id:
description: 'GitHub Actions artifact ID for te-rocm-wheels.'
value: ${{ jobs.build-rocm-wheels.outputs.wheel_artifact_id }}
wheel_artifact_digest:
description: 'SHA-256 digest for te-rocm-wheels.'
value: ${{ jobs.build-rocm-wheels.outputs.wheel_artifact_digest }}

env:
DOCKER_IMAGE_NAME: te-rocm-manylinux-x86
Expand All @@ -76,6 +83,9 @@ jobs:
build-rocm-wheels:
name: Build ROCm Docker image and TransformerEngine wheels
runs-on: build-only-te
outputs:
wheel_artifact_id: ${{ steps.upload-wheels.outputs['artifact-id'] }}
wheel_artifact_digest: ${{ steps.upload-wheels.outputs['artifact-digest'] }}

steps:
- name: Checkout repository
Expand Down Expand Up @@ -213,6 +223,7 @@ jobs:
fi

- name: Upload wheels as GitHub Actions artifacts
id: upload-wheels
if: success()
uses: actions/upload-artifact@v4
with:
Expand Down
149 changes: 149 additions & 0 deletions ci/download_wheel_artifacts.sh

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This script is not part of CI itself; it belongs under .github/scripts rather than ci/.

Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env bash
# Fetch the te-rocm-wheels artifact, check that the expected files are present,
# and retry twice if needed.
#
# Positional arguments:
#   $1  directory the artifact is extracted into (default: dist)
#   $2  artifact name, used in log output (default: te-rocm-wheels)
# Environment:
#   GITHUB_REPOSITORY                     required
#   GITHUB_TOKEN or GH_TOKEN              required; GITHUB_TOKEN wins if both are set
#   ARTIFACT_ID / TE_WHEEL_ARTIFACT_ID    required; ARTIFACT_ID wins if both are set
#   ARTIFACT_DIGEST / TE_WHEEL_ARTIFACT_DIGEST   optional expected digest
#   GITHUB_API_URL                        optional (default: https://api.github.com)
set -euo pipefail

artifact_dir="${1:-dist}"
artifact_name="${2:-te-rocm-wheels}"
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY is required}"
artifact_id="${ARTIFACT_ID:-${TE_WHEEL_ARTIFACT_ID:-}}"
expected_digest="${ARTIFACT_DIGEST:-${TE_WHEEL_ARTIFACT_DIGEST:-}}"
api_url="${GITHUB_API_URL:-https://api.github.com}"

# File-name globs (find -name syntax) that must each match at least one file.
required_patterns=(
  'transformer_engine_rocm[0-9]*.whl'
  'transformer_engine_rocm_torch*.tar.gz'
  'transformer_engine_rocm_jax*.tar.gz'
)

# The resolved token is empty exactly when neither variable is set/non-empty.
token="${GITHUB_TOKEN:-${GH_TOKEN:-}}"
if [[ -z "${token}" ]]; then
  echo "::error::GITHUB_TOKEN or GH_TOKEN is required to download GitHub Actions artifacts"
  exit 1
fi

if [[ -z "${artifact_id}" ]]; then
  echo "::error::ARTIFACT_ID/TE_WHEEL_ARTIFACT_ID is required to download ${artifact_name}"
  exit 1
fi

# Abort the script unless the tool named in $1 is on PATH; $2 completes the
# "<tool> is required to ..." error message.
_require_tool() {
  command -v "$1" >/dev/null 2>&1 && return 0
  echo "::error::$1 is required to $2"
  exit 1
}

_require_tool curl "download artifacts"
_require_tool sha256sum "verify artifact digests"

#######################################
# Extract a zip archive with whichever supported extractor is installed.
# Tries unzip, then busybox unzip, then bsdtar.
# Arguments:
#   $1 - path to the zip archive (absolute or relative to the caller's cwd)
#   $2 - destination directory
# Returns:
#   The extractor's exit status, or 1 when no supported extractor is found.
#######################################
extract_zip() {
  local zip_file="$1"
  local dest_dir="$2"

  if command -v unzip >/dev/null 2>&1; then
    unzip -q "${zip_file}" -d "${dest_dir}"
  elif command -v busybox >/dev/null 2>&1 && busybox unzip --help >/dev/null 2>&1; then
    # busybox may be installed without the unzip applet, hence the --help probe.
    busybox unzip -q "${zip_file}" -d "${dest_dir}"
  elif command -v bsdtar >/dev/null 2>&1; then
    # Let bsdtar switch directory itself (-C) instead of cd-ing first, so a
    # relative archive path is still resolved against the caller's cwd.
    bsdtar -xf "${zip_file}" -C "${dest_dir}"
  else
    echo "::error::unzip, busybox unzip, or bsdtar is required to extract artifacts"
    return 1
  fi
}

#######################################
# Print the download context and a listing of the files under artifact_dir,
# wrapped in ::group:: / ::endgroup:: workflow-command markers.
# Globals (read):
#   repo, artifact_name, artifact_id, artifact_dir, expected_digest
# Outputs:
#   Writes the manifest to stdout; files are listed as "<path>\t<size> bytes",
#   sorted, down to two directory levels.
#######################################
print_manifest_group() {
  printf '%s\n' \
    "::group::Build artifact manifest" \
    "=== artifact download context ===" \
    "repository: ${repo}" \
    "artifact_name: ${artifact_name}" \
    "artifact_id: ${artifact_id}" \
    "artifact_dir: ${artifact_dir}" \
    "expected_digest: ${expected_digest:-unknown}" \
    "" \
    "=== downloaded files ==="

  if [[ ! -d "${artifact_dir}" ]]; then
    printf '%s\n' "${artifact_dir} directory is missing"
  else
    find "${artifact_dir}" -maxdepth 2 -type f -printf '%p\t%s bytes\n' | sort
  fi

  printf '%s\n' "::endgroup::"
}

#######################################
# Print the artifact manifest, then check that every glob in required_patterns
# matches at least one regular file within two levels of artifact_dir.
# Globals (read):
#   artifact_dir, required_patterns
# Outputs:
#   One ::error:: line on stdout per pattern without a match.
# Returns:
#   0 when every pattern matched, 1 otherwise.
#######################################
verify_artifacts() {
  local missing=0
  local pattern match

  print_manifest_group
  for pattern in "${required_patterns[@]}"; do
    # Capture the first match instead of piping into `grep -q`: with pipefail,
    # grep exiting early can kill find with SIGPIPE and make a present file
    # look missing. A find error (e.g. directory absent) counts as no match.
    match="$(find "${artifact_dir}" -maxdepth 2 -type f -name "${pattern}" -print -quit)" || match=""
    if [[ -z "${match}" ]]; then
      echo "::error::Missing required build artifact matching ${pattern} under ${artifact_dir}"
      missing=1
    fi
  done
  return "${missing}"
}

#######################################
# Helper for download_once: fetch the artifact zip into an existing scratch
# directory, check its digest when one is known, and unpack it into
# artifact_dir. Cleanup of the scratch directory is left to the caller.
# Globals (read):
#   api_url, repo, artifact_id, artifact_name, artifact_dir, expected_digest,
#   token
# Arguments:
#   $1 - attempt number (log output only)
#   $2 - scratch directory for the zip and the curl config
# Returns:
#   0 on success, 1 on any failure.
#######################################
_download_once_into() {
  local attempt="$1"
  local tmp_dir="$2"
  local artifact_zip="${tmp_dir}/artifact.zip"
  local curl_config="${tmp_dir}/curl.conf"
  local download_url="${api_url}/repos/${repo}/actions/artifacts/${artifact_id}/zip"
  local expected_sha actual_sha

  # Headers go through a config file rather than -H arguments, presumably so
  # the bearer token does not appear on curl's command line.
  if ! printf 'header = "%s"\n' \
    "Accept: application/vnd.github+json" \
    "Authorization: Bearer ${token}" \
    "X-GitHub-Api-Version: 2022-11-28" > "${curl_config}"; then
    echo "::error::Failed to write the curl configuration to ${curl_config}"
    return 1
  fi

  echo "Preparing to download artifact ${artifact_name} (ID: ${artifact_id}, Expected Digest: ${expected_digest:-unknown})"
  echo "Downloading artifact attempt ${attempt} to ${artifact_zip}"

  if ! curl --fail --silent --show-error --location \
    --connect-timeout 30 \
    --max-time 1800 \
    --config "${curl_config}" \
    --output "${artifact_zip}" \
    "${download_url}"; then
    return 1
  fi

  if [[ -n "${expected_digest}" ]]; then
    # Accept the digest with or without a leading "sha256:" prefix.
    expected_sha="${expected_digest#sha256:}"
    if ! actual_sha="$(sha256sum < "${artifact_zip}")"; then
      echo "::error::Failed to compute the SHA-256 digest of ${artifact_zip}"
      return 1
    fi
    # sha256sum prints "<hex>  -" when reading stdin; keep only the hex part.
    actual_sha="${actual_sha%% *}"
    echo "SHA256 digest of downloaded artifact is ${actual_sha}"
    if [[ "${actual_sha}" != "${expected_sha}" ]]; then
      echo "::error::Artifact digest mismatch: expected ${expected_sha}, got ${actual_sha}"
      return 1
    fi
  else
    echo "::warning::No expected artifact digest was provided; validating file contents only."
  fi

  # Only replace artifact_dir once the download has been accepted, and never
  # run rm -rf against an empty path.
  if [[ -z "${artifact_dir}" ]]; then
    echo "::error::Artifact directory is empty; refusing to remove it"
    return 1
  fi
  if ! rm -rf -- "${artifact_dir}" || ! mkdir -p -- "${artifact_dir}"; then
    echo "::error::Failed to recreate artifact directory ${artifact_dir}"
    return 1
  fi

  extract_zip "${artifact_zip}" "${artifact_dir}" || return 1
  return 0
}

#######################################
# Make one attempt to download the artifact zip, verify it, and unpack it into
# artifact_dir, using a private temporary directory that is always removed.
# This function is invoked from an `if` condition, where `set -e` has no
# effect, so every step that can fail is checked explicitly.
# Globals (read):
#   see _download_once_into
# Arguments:
#   $1 - attempt number (log output only)
# Returns:
#   0 on success, 1 on any failure.
#######################################
download_once() {
  local attempt="$1"
  local tmp_dir
  local status=0

  # Without this check a failed mktemp would leave tmp_dir empty and the token
  # file would be written to "/curl.conf".
  if ! tmp_dir="$(mktemp -d)" || [[ -z "${tmp_dir}" || ! -d "${tmp_dir}" ]]; then
    echo "::error::Failed to create a temporary directory for the artifact download"
    return 1
  fi

  _download_once_into "${attempt}" "${tmp_dir}" || status=1

  # Single cleanup point: the scratch directory holds the token-bearing config.
  rm -rf -- "${tmp_dir}"
  return "${status}"
}

# Attempt download + verification up to max_attempts times. Exit 0 on the first
# attempt where both succeed; exit 1 once all attempts are exhausted.
max_attempts=3
for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  if download_once "${attempt}" && verify_artifacts; then
    echo "Build artifact download and verification succeeded on attempt ${attempt}."
    exit 0
  fi
  if ((attempt < max_attempts)); then
    echo "::warning::Build artifact download/verification failed; retrying ($((max_attempts - attempt)) retries left)."
  fi
done

# Derive the retry count from max_attempts so the message cannot drift from it.
echo "::error::Build artifact download/verification failed after $((max_attempts - 1)) retries."
exit 1