Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 56 additions & 17 deletions .github/workflows/build_and_test_maxtext.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,23 @@ on:
pull_request:
workflow_call:
workflow_dispatch:
inputs:
rocm_only:
description: 'Run only ROCm jobs (manual runs only)'
type: boolean
required: false
default: false
schedule:
# Run the job every 4 hours
- cron: '0 */4 * * *'
- cron: '0 3 * * *' # Daily 03:00 UTC

concurrency:
# Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
# Cancel previous runs for same PR (all actors), scheduled runs,
# and manual runs per (branch + actor).
group: >
${{
github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) ||
github.event_name == 'schedule' && format('{0}-schedule', github.workflow) ||
github.event_name == 'workflow_dispatch' && format('{0}-manual-{1}-{2}', github.workflow, github.ref, github.actor) ||
github.run_id
}}
cancel-in-progress: true
Expand Down Expand Up @@ -118,18 +125,16 @@ jobs:
build_and_upload_maxtext_package:
needs: analyze_code_changes
# Run if either tests or notebooks need to run
if: |
needs.analyze_code_changes.outputs.run_tests == 'true' ||
needs.analyze_code_changes.outputs.run_notebooks == 'true'
if: ${{ vars.ROCM_ONLY == 'true' || needs.analyze_code_changes.outputs.run_tests == 'true' || needs.analyze_code_changes.outputs.run_notebooks == 'true' }}
uses: ./.github/workflows/build_package.yml
with:
device_type: tpu
device_name: v4-8
cloud_runner: linux-x86-n2-16-buildkit
device_type: ${{ vars.ROCM_ONLY == 'true' && 'rocm' || 'tpu' }}
device_name: ${{ vars.ROCM_ONLY == 'true' && 'mi355' || 'v4-8' }}
cloud_runner: ${{ vars.ROCM_ONLY == 'true' && 'linux-x86-64-4gpu-amd' || 'linux-x86-n2-16-buildkit' }}

maxtext_jupyter_notebooks:
needs: build_and_upload_maxtext_package
if: needs.analyze_code_changes.outputs.run_notebooks == 'true'
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && needs.analyze_code_changes.outputs.run_notebooks == 'true' }}
uses: ./.github/workflows/run_jupyter_notebooks.yml
strategy:
fail-fast: false
Expand All @@ -145,7 +150,7 @@ jobs:
tpu-tests:
name: ${{ matrix.flavor }} tests
needs: [build_and_upload_maxtext_package]
if: needs.analyze_code_changes.outputs.run_tests == 'true'
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && needs.analyze_code_changes.outputs.run_tests == 'true' }}
uses: ./.github/workflows/run_tests_coordinator.yml
strategy:
fail-fast: false
Expand All @@ -160,7 +165,7 @@ jobs:
gpu-tests:
name: ${{ matrix.flavor }} tests
needs: [build_and_upload_maxtext_package]
if: needs.analyze_code_changes.outputs.run_tests == 'true'
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && needs.analyze_code_changes.outputs.run_tests == 'true' }}
strategy:
fail-fast: false
matrix:
Expand All @@ -175,7 +180,7 @@ jobs:
cpu-tests:
name: ${{ matrix.flavor }} tests
needs: [build_and_upload_maxtext_package]
if: needs.analyze_code_changes.outputs.run_tests == 'true'
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && needs.analyze_code_changes.outputs.run_tests == 'true' }}
uses: ./.github/workflows/run_tests_coordinator.yml
strategy:
fail-fast: false
Expand All @@ -189,7 +194,7 @@ jobs:

maxtext_tpu_pathways_unit_tests:
needs: build_and_upload_maxtext_package
if: needs.analyze_code_changes.outputs.run_tests == 'true'
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && needs.analyze_code_changes.outputs.run_tests == 'true' }}
uses: ./.github/workflows/run_pathways_tests.yml
strategy:
fail-fast: false
Expand Down Expand Up @@ -224,9 +229,39 @@ jobs:
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

# ROCm unit tests: runs the `rocm-unit` flavor through the shared test coordinator workflow.
rocm-tests:
  name: ${{ matrix.flavor }} tests
  # `analyze_code_changes` is listed explicitly because the `needs` context only exposes
  # direct dependencies, and the `if:` below reads that job's `run_tests` output.
  needs: [analyze_code_changes, build_and_upload_maxtext_package]
  # Skipped when the `ROCM_ONLY` configuration variable is 'true'; otherwise runs only
  # when the change analysis reports that tests are needed.
  # NOTE(review): unlike the TPU/GPU/CPU jobs, this condition does not look at the manual
  # `inputs.rocm_only` flag - confirm that is intended.
  if: ${{ vars.ROCM_ONLY != 'true' && needs.analyze_code_changes.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/run_tests_coordinator.yml
  strategy:
    fail-fast: false
    matrix:
      flavor: [rocm-unit]
  with:
    flavor: ${{ matrix.flavor }}
    # NOTE(review): placeholder value - presumably not used to pick the ROCm container; confirm.
    base_image: 'rocm-placeholder'
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

# ROCm decoupled tests: runs the `rocm-decoupled` flavor through the shared test coordinator workflow.
rocm-decoupled-tests:
  name: ${{ matrix.flavor }} tests
  # `analyze_code_changes` is listed explicitly because the `needs` context only exposes
  # direct dependencies, and the `if:` below reads that job's `run_tests` output.
  needs: [analyze_code_changes, build_and_upload_maxtext_package]
  # Runs when any of the following holds:
  #   - the `ROCM_ONLY` configuration variable is 'true',
  #   - the run was started manually with the `rocm_only` input enabled,
  #   - the change analysis reports that tests are needed.
  if: ${{ vars.ROCM_ONLY == 'true' || (github.event_name == 'workflow_dispatch' && inputs.rocm_only) || needs.analyze_code_changes.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/run_tests_coordinator.yml
  strategy:
    fail-fast: false
    matrix:
      flavor: [rocm-decoupled]
  with:
    flavor: ${{ matrix.flavor }}
    # NOTE(review): placeholder value - presumably not used to pick the ROCm container; confirm.
    base_image: 'rocm-placeholder'
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

all_tests_passed:
name: All Required Tests Passed
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
needs: [analyze_code_changes, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, rocm-tests, rocm-decoupled-tests]
if: always()
runs-on: ubuntu-latest
steps:
Expand All @@ -244,6 +279,8 @@ jobs:
echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
echo "ROCm Tests (Matrix) result: ${NEEDS_ROCM_TESTS_RESULT}"
echo "ROCm Decoupled Tests (Matrix) result: ${NEEDS_ROCM_DECOUPLED_TESTS_RESULT}"

# Fail only if any job failed or was cancelled (skipped is OK)
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
Expand All @@ -260,11 +297,13 @@ jobs:
NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
NEEDS_ROCM_TESTS_RESULT: ${{ needs.rocm-tests.result }}
NEEDS_ROCM_DECOUPLED_TESTS_RESULT: ${{ needs.rocm-decoupled-tests.result }}

all_notebooks_passed:
name: All Notebooks Passed
needs: [analyze_code_changes, build_and_upload_maxtext_package, maxtext_jupyter_notebooks]
if: always()
if: ${{ vars.ROCM_ONLY != 'true' && !(github.event_name == 'workflow_dispatch' && inputs.rocm_only) && always() }}
runs-on: ubuntu-latest
steps:
- name: Check notebooks results
Expand Down Expand Up @@ -292,7 +331,7 @@ jobs:

notify_failure:
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, rocm-tests, rocm-decoupled-tests]
if: ${{ always() }}
runs-on: ubuntu-latest
permissions:
Expand Down
100 changes: 89 additions & 11 deletions .github/workflows/run_tests_against_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ on:
default: ''
is_scheduled_run:
required: true
type: string
type: boolean
xla_python_client_mem_fraction:
required: true
type: string
Expand All @@ -65,6 +65,17 @@ on:
description: 'Git SHA to checkout if MaxText is not pre-installed'
required: false
type: string
decoupled_mode:
  description: 'If true, sets DECOUPLE_GCLOUD/LOCAL_GCLOUD_PROJECT in the container and skips copying test assets from GCS'
  required: false
  type: boolean
  default: false
requirements_file:
  description: 'Optional requirements file installed with uv before the MaxText wheel (the wheel is then installed with --no-deps)'
  required: false
  type: string
  default: ''
extra_pip_deps_file:
  description: 'Optional requirements file installed into the venv by the "Install extra pip deps" step'
  required: false
  type: string
  # Explicit empty default, matching `requirements_file`; the install step is gated on this value being non-empty.
  default: ''
# Flag to skip source checkout and wheel installation
maxtext_installed:
description: 'If false, maxtext_sha must be provided for checkout'
Expand All @@ -78,19 +89,24 @@ permissions:
contents: read
jobs:
run:
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || (inputs.device_type == 'rocm' && fromJson('["self-hosted","linux-x86-64-4gpu-amd"]')) || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
timeout-minutes: ${{ inputs.device_type == 'rocm' && 90 || 360 }}
container:
image: gcr.io/${{ vars.PROJECT_NAME || 'tpu-prod-env-multipod' }}/${{ inputs.base_image }}
image: ${{ inputs.device_type == 'rocm' && 'ghcr.io/rocm/jax-base-ubu24.rocm720:latest' || format('gcr.io/{0}/{1}', vars.PROJECT_NAME || 'tpu-prod-env-multipod', inputs.base_image) }}
env:
XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
TPU_SKIP_MDS_QUERY: ${{ inputs.device_type == 'cpu' && '1' || '' }}
# ROCm installs the wheel with --no-deps plus a pinned requirements file; never use TPU wheel extras.
MAXTEXT_PACKAGE_EXTRA: >-
${{
!contains(inputs.pytest_marker, 'not post_training') && 'tpu-post-train'
inputs.device_type == 'rocm' && 'rocm'
|| !contains(inputs.pytest_marker, 'not post_training') && 'tpu-post-train'
|| (inputs.device_type == 'cpu' && 'tpu' || inputs.device_type)
}}
ALLOW_MULTIPLE_LIBTPU_LOAD: ${{ inputs.device_type == 'cpu' && 'true' || '' }} # bypass /tmp/libtpu_lockfile check for cpu tests, which don't actually use accelerators (to allow concurrency)
DECOUPLE_GCLOUD: ${{ inputs.decoupled_mode && 'TRUE' || '' }}
LOCAL_GCLOUD_PROJECT: ${{ inputs.decoupled_mode && 'ci-decoupled' || '' }}
options: ${{ inputs.container_resource_option }}
steps:
- name: Checkout MaxText
Expand All @@ -107,20 +123,79 @@ jobs:
if: ${{ !inputs.maxtext_installed }}
shell: bash
run: |
if [ "${{ inputs.device_type }}" = "rocm" ]; then
python3 -m pip install -U uv
fi
python3 -m uv venv --seed
source .venv/bin/activate
maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
echo "Installing ${maxtext_wheel} for ${MAXTEXT_PACKAGE_EXTRA}..."
uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
if [ "${MAXTEXT_PACKAGE_EXTRA}" == "tpu-post-train" ]; then
install_tpu_post_train_extra_deps
if [ "${{ inputs.device_type }}" = "rocm" ]; then
if [ -n "${{ inputs.requirements_file }}" ]; then
echo "Installing requirements from ${{ inputs.requirements_file }}"
uv pip install -r "${{ inputs.requirements_file }}"
fi
uv pip install ${maxtext_wheel} --no-deps
# When a requirements file is set (e.g. decoupled or rocm-unit JAX stacks), it already
# carries test/runtime pins; otherwise add GitHub-sourced pins (no TPU extra / scripts).
if [ -z "${{ inputs.requirements_file }}" ]; then
uv pip install -r src/dependencies/extra_deps/pre_train_github_deps.txt --no-deps
fi
else
install_tpu_pre_train_extra_deps
if [ -n "${{ inputs.requirements_file }}" ]; then
echo "Installing requirements from ${{ inputs.requirements_file }}"
uv pip install -r "${{ inputs.requirements_file }}"
uv pip install ${maxtext_wheel} --no-deps
else
uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
fi
if [ "${MAXTEXT_PACKAGE_EXTRA}" == "tpu-post-train" ]; then
install_tpu_post_train_extra_deps
else
install_tpu_pre_train_extra_deps
fi
fi
python3 --version
python3 -m pip freeze
# Optional step: installs the requirements file named by `extra_pip_deps_file` into the venv.
- name: Install extra pip deps
  if: inputs.extra_pip_deps_file != ''
  shell: bash
  env:
    # Passed through the environment instead of being expanded into the script text, so the
    # value is never parsed as shell code and a path containing spaces stays one argument.
    INPUTS_EXTRA_PIP_DEPS_FILE: ${{ inputs.extra_pip_deps_file }}
  run: |
    source .venv/bin/activate
    uv pip install -r "${INPUTS_EXTRA_PIP_DEPS_FILE}"

# ROCm only: asks install_te_rocm_wheel.py which arch to use and exports the answer
# as TE_WHEEL_ARCH for later steps (via GITHUB_ENV).
- name: Select ROCm arch (mi300/mi355)
  shell: bash
  if: ${{ inputs.device_type == 'rocm' }}
  run: |
    set -euo pipefail
    venv_python=.venv/bin/python3
    echo "=== ROCm arch selection (from install_te_rocm_wheel.py) ==="
    if [ ! -x "${venv_python}" ]; then
      # Best effort: without a venv interpreter nothing is exported.
      echo "No .venv python found; skipping arch selection."
    else
      detected_arch="$("${venv_python}" .github/workflows/utils/install_te_rocm_wheel.py --print-arch)"
      echo "[te detect_arch] ${detected_arch}"
      echo "TE_WHEEL_ARCH=${detected_arch}" >> "${GITHUB_ENV}"
    fi
    echo "========================================================="

# ROCm only: runs the install_te_rocm_wheel.py helper, then installs the
# Transformer Engine wheel into the test venv.
- name: Install Transformer Engine wheel (ROCm)
if: ${{ inputs.device_type == 'rocm' }}
shell: bash
env:
# Makes the workflow token available to the helper script
# (presumably for authenticated downloads - confirm against install_te_rocm_wheel.py).
GITHUB_TOKEN: ${{ github.token }}
# The venv is activated before strict mode (`set -euo pipefail`) is switched on.
# After the helper runs, every transformer_engine-*.whl in the working directory is
# force-reinstalled without dependencies (presumably the file the helper produced - confirm).
run: |
source .venv/bin/activate
set -euo pipefail

python3 .github/workflows/utils/install_te_rocm_wheel.py

uv pip install --no-deps --force-reinstall transformer_engine-*.whl

- name: Copy test assets files
if: ${{ !inputs.maxtext_installed }}
if: ${{ !inputs.maxtext_installed && !inputs.decoupled_mode }}
run : gcloud storage cp gs://maxtext-test-assets/* tests/assets
- name: Run Tests
shell: bash
Expand Down Expand Up @@ -153,8 +228,8 @@ jobs:
export MAXTEXT_ASSETS_ROOT=$(pwd)/src/maxtext/assets
export MAXTEXT_TEST_ASSETS_ROOT=$(pwd)/tests/assets
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
# omit this libtpu init args for gpu tests
if [ "${INPUTS_DEVICE_TYPE}" != "cuda12" ]; then
# Omit the libtpu init args for GPU tests (CUDA and ROCm)
if [ "${INPUTS_DEVICE_TYPE}" != "cuda12" ] && [ "${INPUTS_DEVICE_TYPE}" != "rocm" ]; then
export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
else
# For cuda12, explicitly point to the pip-installed CUDA libraries
Expand All @@ -171,6 +246,9 @@ jobs:
done
fi
fi
if [ "${INPUTS_DEVICE_TYPE}" = "rocm" ]; then
ulimit -c 0
fi
if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then
$PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist
SPLIT_ARGS="--splits ${INPUTS_TOTAL_WORKERS} --group ${INPUTS_WORKER_GROUP} -n auto"
Expand Down
Loading
Loading