Docker image workflows: build images from the installed MaxText wheel, gate
each build behind a pre_build_check job, promote images from inside the
reusable build workflow, and open an issue when a scheduled build fails.

Review notes folded into the hunks below:
  * Workflow inputs are handed to shell steps through env vars rather than
    inline expression interpolation, so input values cannot inject shell code.
  * The wheel install step quotes its argument and fails loudly unless exactly
    one wheel was downloaded.

diff --git a/.github/workflows/UploadDockerImages.yml b/.github/workflows/UploadDockerImages.yml
index ca79b8f4ba..9cadb6b338 100644
--- a/.github/workflows/UploadDockerImages.yml
+++ b/.github/workflows/UploadDockerImages.yml
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -42,27 +42,16 @@ permissions:
   contents: read
 
 jobs:
-  setup:
-    runs-on: ubuntu-latest
-    outputs:
-      maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
-      image_date: ${{ steps.vars.outputs.image_date }}
-    steps:
-      - name: Checkout MaxText
-        uses: actions/checkout@v5
-
-      - name: Get metadata
-        id: vars
-        run: |
-          # MaxText SHA
-          echo "maxtext_sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
-
-          # Image date
-          echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
+  build_and_upload_maxtext_package:
+    uses: ./.github/workflows/build_package.yml
+    with:
+      device_type: tpu
+      device_name: v4-8
+      cloud_runner: linux-x86-n2-16-buildkit
 
   build-and-push:
     name: ${{ matrix.image_name }}
-    needs: setup
+    needs: build_and_upload_maxtext_package
     strategy:
       fail-fast: false
       matrix:
@@ -71,72 +60,49 @@ jobs:
             build_mode: stable
             workflow: pre-training
             image_name: maxtext_jax_stable
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
           - device: tpu
             build_mode: nightly
             workflow: pre-training
             image_name: maxtext_jax_nightly
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
           - device: tpu
             build_mode: nightly
             workflow: post-training
             image_name: maxtext_post_training_nightly
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
           - device: gpu
             build_mode: stable
             workflow: pre-training
             image_name: maxtext_gpu_jax_stable
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
+            dockerfile: maxtext_gpu_dependencies.Dockerfile
           - device: gpu
             build_mode: nightly
             workflow: pre-training
             image_name: maxtext_gpu_jax_nightly
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
+            dockerfile: maxtext_gpu_dependencies.Dockerfile
 
     uses: ./.github/workflows/build_and_push_docker_image.yml
     with:
-      image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
+      image_name: ${{ inputs.image_suffix != '' && format('{0}_{1}', matrix.image_name, inputs.image_suffix) || matrix.image_name }}
       device: ${{ matrix.device }}
       build_mode: ${{ matrix.build_mode }}
       workflow: ${{ matrix.workflow }}
       dockerfile: ${{ matrix.dockerfile }}
-      maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
-      image_date: ${{ needs.setup.outputs.image_date }}
+      maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
+      include_test_assets: true
     secrets:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
-  promote:
-    name: promote-${{ matrix.image_name }}
-    needs: build-and-push
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - device: tpu
-            build_mode: stable
-            workflow: pre-training
-            image_name: maxtext_jax_stable
-          - device: tpu
-            build_mode: nightly
-            workflow: pre-training
-            image_name: maxtext_jax_nightly
-          - device: tpu
-            build_mode: nightly
-            workflow: post-training
-            image_name: maxtext_post_training_nightly
-          - device: gpu
-            build_mode: stable
-            workflow: pre-training
-            image_name: maxtext_gpu_jax_stable
-          - device: gpu
-            build_mode: nightly
-            workflow: pre-training
-            image_name: maxtext_gpu_jax_nightly
-
-    uses: ./.github/workflows/promote_docker_image.yml
-    with:
-      image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
-      image_tag: ${{ github.run_id }}
-      device: ${{ matrix.device }}
-      workflow: ${{ matrix.workflow }}
-    secrets:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
\ No newline at end of file
+  notify_failure:
+    name: Notify failed build
+    needs: [build-and-push]
+    if: ${{ failure() && inputs.image_suffix == '' }}
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+    steps:
+      - name: Create issue on failure
+        uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          title-template: "MaxText Docker Image Build Failure"
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index c74372ad24..5cdc2aaf41 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Google LLC
+# Copyright 2023-2026 Google LLC
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,9 +34,6 @@ on:
       maxtext_sha:
         required: true
         type: string
-      image_date:
-        required: false
-        type: string
       workflow:
         required: false
         type: string
@@ -45,6 +42,10 @@ on:
         required: false
         type: string
         default: ''
+      include_test_assets:
+        required: false
+        type: boolean
+        default: false
     secrets:
       HF_TOKEN:
         required: true
@@ -53,36 +54,46 @@ permissions:
   contents: read
 
 jobs:
-  build_and_push:
-    runs-on: linux-x86-n2-16-buildkit
-    container: google/cloud-sdk:524.0.0
-    if: >
-      github.event_name == 'release' ||
-      github.event_name == 'schedule' ||
-      github.event_name == 'pull_request' ||
-      github.event_name == 'workflow_dispatch' && (
-        github.event.inputs.target_device == 'all' ||
-        github.event.inputs.target_device == 'tpu' ||
-        github.event.inputs.target_device == 'gpu'
-      )
+  pre_build_check:
+    runs-on: ubuntu-latest
+    outputs:
+      should_run: ${{ steps.check.outputs.should_run }}
     steps:
       - name: Check if build should run
         id: check
         shell: bash
         run: |
-          if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "all" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "${INPUTS_DEVICE}" ]]; then
-            echo "should_run=false" >> $GITHUB_OUTPUT
-            echo "Skipping ${INPUTS_IMAGE_NAME} build for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
-          else
+          SHOULD_RUN="false"
+          if [[ "$EVENT_NAME" == "release" || "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "pull_request" ]]; then
+            SHOULD_RUN="true"
+          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
+            if [[ "$TARGET_DEVICE" == "all" || "$TARGET_DEVICE" == "$INPUTS_DEVICE" ]]; then
+              SHOULD_RUN="true"
+            fi
+          fi
+
+          if [[ "$SHOULD_RUN" == "true" ]]; then
             echo "should_run=true" >> $GITHUB_OUTPUT
             echo "Building ${INPUTS_IMAGE_NAME} for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
+          else
+            echo "should_run=false" >> $GITHUB_OUTPUT
+            echo "Skipping ${INPUTS_IMAGE_NAME} build for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
           fi
+        # Inputs reach the script through env vars instead of inline expression
+        # interpolation, so their values cannot be parsed as shell code.
         env:
-          GITHUB_EVENT_INPUTS_TARGET_DEVICE: ${{ github.event.inputs.target_device }}
+          EVENT_NAME: ${{ github.event_name }}
+          TARGET_DEVICE: ${{ github.event.inputs.target_device }}
           INPUTS_DEVICE: ${{ inputs.device }}
           INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
           INPUTS_BUILD_MODE: ${{ inputs.build_mode }}
 
+  build_and_push:
+    needs: pre_build_check
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
+    if: needs.pre_build_check.outputs.should_run == 'true'
+    steps:
       - name: Matrix Debugger
         run: |
           echo "device: ${{ inputs.device }}"
@@ -93,50 +104,73 @@ jobs:
 
       - name: Checkout MaxText
         uses: actions/checkout@v5
-        if: steps.check.outputs.should_run == 'true'
         with:
-          # This ensures that every job clones the exact same commit as "setup" job
           ref: ${{ inputs.maxtext_sha }}
 
       - name: Mark git repositories as safe
         run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
-        if: steps.check.outputs.should_run == 'true'
 
       - name: Configure Docker
         run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
-        if: steps.check.outputs.should_run == 'true'
 
       - name: Set up Docker BuildX
         uses: docker/setup-buildx-action@v3.11.1
-        if: steps.check.outputs.should_run == 'true'
         with:
           driver: remote
           endpoint: tcp://localhost:1234
 
+      - name: Download MaxText wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: maxtext-wheel
+
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: '3.12'
+          enable-cache: true
+
+      - name: Install MaxText wheel
+        shell: bash
+        run: |
+          uv venv --seed
+          source .venv/bin/activate
+          # Require exactly one wheel; quote it so "[runner]" is not treated as a glob.
+          maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null || true)
+          if [[ ! -f "${maxtext_wheel}" ]]; then
+            echo "Expected exactly one MaxText wheel, found: '${maxtext_wheel}'" >&2
+            exit 1
+          fi
+          uv pip install "${maxtext_wheel}[runner]" --resolution=lowest
+
+      - name: Copy tests assets to package directory
+        if: inputs.include_test_assets == true
+        shell: bash
+        run: |
+          source .venv/bin/activate
+          cp -r "${PWD}/tests" .venv/lib/python3.12/site-packages/
+          cp "${PWD}/pytest.ini" .venv/lib/python3.12/site-packages/
+
       - name: Build and push Docker image
         uses: docker/build-push-action@v6
-        if: steps.check.outputs.should_run == 'true'
         with:
           push: true
           context: .
-          file: ${{ inputs.dockerfile }}
-          tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ github.run_id }}
+          file: .venv/lib/python3.12/site-packages/dependencies/dockerfiles/${{ inputs.dockerfile }}
+          tags: gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ github.run_id }}
           cache-from: type=gha
           outputs: type=image,compression=zstd,force-compression=true
           build-args: |
             DEVICE=${{ inputs.device }}
             MODE=${{ inputs.build_mode }}
             WORKFLOW=${{ inputs.workflow }}
-            PACKAGE_DIR=./src
-            JAX_VERSION=NONE
-            LIBTPU_VERSION=NONE
-            INCLUDE_TEST_ASSETS=true
+            PACKAGE_DIR=.venv/lib/python3.12/site-packages
+            INCLUDE_TEST_ASSETS=${{ inputs.include_test_assets }}
 
       - name: Add tags to Docker image
-        if: steps.check.outputs.should_run == 'true'
         shell: bash
         run: |
-          SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
+          SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${INPUTS_IMAGE_NAME}"
           TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
 
           if [[ $INPUTS_VERSION_NAME ]]; then
@@ -146,16 +180,26 @@ jobs:
             echo "Tagging docker images corresponding to nightly release..."
 
             # Add date tag
-            gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
+            IMAGE_DATE="$(date +%Y-%m-%d)"
+            gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet
 
             # Convert date to YYYYMMDD format
-            clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
+            clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)
 
             # Add MaxText tag
+            MAXTEXT_SHA=$(git rev-parse --short HEAD)
             gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
           fi
         env:
           INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
-          INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
           INPUTS_VERSION_NAME: ${{ inputs.version_name }}
-          MAXTEXT_SHA: ${{ inputs.maxtext_sha }}
+
+  promote_image:
+    needs: [pre_build_check, build_and_push]
+    if: needs.pre_build_check.outputs.should_run == 'true' && inputs.include_test_assets == true
+    uses: ./.github/workflows/promote_docker_image.yml
+    with:
+      image_name: ${{ inputs.image_name }}
+      image_tag: ${{ github.run_id }}
+      device: ${{ inputs.device }}
+      workflow: ${{ inputs.workflow }}
diff --git a/.github/workflows/promote_docker_image.yml b/.github/workflows/promote_docker_image.yml
index 437b09218d..67daf27f2e 100644
--- a/.github/workflows/promote_docker_image.yml
+++ b/.github/workflows/promote_docker_image.yml
@@ -31,10 +31,6 @@ on:
       workflow:
         required: true
         type: string
-    secrets:
-      HF_TOKEN:
-        required: false
-
 permissions:
   contents: read
 
@@ -49,7 +45,7 @@ jobs:
         id: check
         shell: bash
         run: |
-          if gcloud container images describe "gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
+          if gcloud container images describe "gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
             echo "exists=true" >> $GITHUB_OUTPUT
           else
             echo "exists=false" >> $GITHUB_OUTPUT
@@ -87,9 +83,9 @@ jobs:
       - name: Add tags to Docker image
         shell: bash
         run: |
-          SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
-          TEMP_IMG="${SOURCE_IMAGE}:${{ inputs.image_tag }}"
-          # Latest Tag
-          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
+          # Add Latest Tag
+          SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${INPUTS_IMAGE_NAME}"
+          gcloud container images add-tag "${SOURCE_IMAGE}:${INPUTS_IMAGE_TAG}" "${SOURCE_IMAGE}:latest" --quiet
         env:
           INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
+          INPUTS_IMAGE_TAG: ${{ inputs.image_tag }}
diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml
index 495e682867..0eba10c310 100644
--- a/.github/workflows/pypi_release.yml
+++ b/.github/workflows/pypi_release.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Google LLC
+# Copyright 2023-2026 Google LLC
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -103,17 +103,17 @@ jobs:
             build_mode: stable
             image_name: maxtext_jax_stable
             workflow: pre-training
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
           - device: gpu
             build_mode: stable
             image_name: maxtext_gpu_jax_stable
             workflow: pre-training
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
+            dockerfile: maxtext_gpu_dependencies.Dockerfile
           - device: tpu
             build_mode: stable
             image_name: maxtext_post_training_stable
             workflow: post-training
-            dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
     uses: ./.github/workflows/build_and_push_docker_image.yml
     with:
       image_name: ${{ matrix.image_name }}
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
index 2955e082f0..c4027b7c70 100644
--- a/.github/workflows/run_tests_against_package.yml
+++ b/.github/workflows/run_tests_against_package.yml
@@ -74,14 +74,13 @@ on:
         required: false
         type: boolean
         default: false
-
 permissions:
   contents: read
 jobs:
   run:
     runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
     container:
-      image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
+      image: gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.base_image }}
       env:
         XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
         TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
@@ -141,7 +140,7 @@ jobs:
             uv pip install pytest-cov
             PYTEST_COV_ARGS="--cov=MaxText --cov=maxtext --cov-report=xml --cov-report=term"
           fi
-          export PYTHONPATH="${pwd}/src${PYTHONPATH:+:${PYTHONPATH}}"
+          export PYTHONPATH="${PWD}/src${PYTHONPATH:+:${PYTHONPATH}}"
 
           if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
             FINAL_PYTEST_MARKER="${INPUTS_PYTEST_MARKER}"
diff --git a/.github/workflows/update_reference_hlo.yml b/.github/workflows/update_reference_hlo.yml
index d9a473ba42..97af9f9af2 100644
--- a/.github/workflows/update_reference_hlo.yml
+++ b/.github/workflows/update_reference_hlo.yml
@@ -2,6 +2,7 @@ name: "Update HLO References (for hlo_diff_test.py)"
 
 on:
   workflow_dispatch:
+
 permissions:
   contents: read
 
diff --git a/src/dependencies/scripts/docker_build_dependency_image.sh b/src/dependencies/scripts/docker_build_dependency_image.sh
index 3705334014..bc328f21d7 100644
--- a/src/dependencies/scripts/docker_build_dependency_image.sh
+++ b/src/dependencies/scripts/docker_build_dependency_image.sh
@@ -63,6 +63,9 @@ fi
 if [[ -z ${WORKFLOW} ]]; then
   export WORKFLOW=pre-training
 fi
+if [[ -z ${INCLUDE_TEST_ASSETS} ]]; then
+  export INCLUDE_TEST_ASSETS=false
+fi
 
 # Create docker build arguments array
 docker_build_args=(
@@ -71,6 +74,7 @@ docker_build_args=(
   "MODE=${MODE}"
   "JAX_VERSION=${JAX_VERSION}"
   "PACKAGE_DIR=${PACKAGE_DIR}"
+  "INCLUDE_TEST_ASSETS=${INCLUDE_TEST_ASSETS}"
 )
 
 run_docker_build() {