Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 29 additions & 63 deletions .github/workflows/UploadDockerImages.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023–2025 Google LLC
# Copyright 2023–2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,27 +42,16 @@ permissions:
contents: read

jobs:
setup:
runs-on: ubuntu-latest
outputs:
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
image_date: ${{ steps.vars.outputs.image_date }}
steps:
- name: Checkout MaxText
uses: actions/checkout@v5

- name: Get metadata
id: vars
run: |
# MaxText SHA
echo "maxtext_sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT

# Image date
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
build_and_upload_maxtext_package:
uses: ./.github/workflows/build_package.yml
with:
device_type: tpu
device_name: v4-8
cloud_runner: linux-x86-n2-16-buildkit

build-and-push:
name: ${{ matrix.image_name }}
needs: setup
needs: build_and_upload_maxtext_package
strategy:
fail-fast: false
matrix:
Expand All @@ -71,72 +60,49 @@ jobs:
build_mode: stable
workflow: pre-training
image_name: maxtext_jax_stable
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
dockerfile: maxtext_tpu_dependencies.Dockerfile
- device: tpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_jax_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
dockerfile: maxtext_tpu_dependencies.Dockerfile
- device: tpu
build_mode: nightly
workflow: post-training
image_name: maxtext_post_training_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
dockerfile: maxtext_tpu_dependencies.Dockerfile
- device: gpu
build_mode: stable
workflow: pre-training
image_name: maxtext_gpu_jax_stable
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
dockerfile: maxtext_gpu_dependencies.Dockerfile
- device: gpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_gpu_jax_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
dockerfile: maxtext_gpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
Comment thread
SurbhiJainUSC marked this conversation as resolved.
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
image_name: ${{ inputs.image_suffix != '' && format('{0}_{1}', matrix.image_name, inputs.image_suffix) || matrix.image_name }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
workflow: ${{ matrix.workflow }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
include_test_assets: true
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

promote:
name: promote-${{ matrix.image_name }}
needs: build-and-push
strategy:
fail-fast: false
matrix:
include:
- device: tpu
build_mode: stable
workflow: pre-training
image_name: maxtext_jax_stable
- device: tpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_jax_nightly
- device: tpu
build_mode: nightly
workflow: post-training
image_name: maxtext_post_training_nightly
- device: gpu
build_mode: stable
workflow: pre-training
image_name: maxtext_gpu_jax_stable
- device: gpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_gpu_jax_nightly

uses: ./.github/workflows/promote_docker_image.yml
with:
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
image_tag: ${{ github.run_id }}
device: ${{ matrix.device }}
workflow: ${{ matrix.workflow }}
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
notify_failure:
name: Notify failed build
needs: [build-and-push]
if: ${{ failure() && inputs.image_suffix == '' }}
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Create issue on failure
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
title-template: "MaxText Docker Image Build Failure"
123 changes: 79 additions & 44 deletions .github/workflows/build_and_push_docker_image.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2025 Google LLC
# Copyright 2023-2026 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -34,9 +34,6 @@ on:
maxtext_sha:
required: true
type: string
image_date:
required: false
type: string
workflow:
required: false
type: string
Expand All @@ -45,6 +42,10 @@ on:
required: false
type: string
default: ''
include_test_assets:
required: false
type: boolean
default: false
secrets:
HF_TOKEN:
required: true
Expand All @@ -53,36 +54,42 @@ permissions:
contents: read

jobs:
build_and_push:
runs-on: linux-x86-n2-16-buildkit
container: google/cloud-sdk:524.0.0
if: >
github.event_name == 'release' ||
github.event_name == 'schedule' ||
github.event_name == 'pull_request' ||
github.event_name == 'workflow_dispatch' && (
github.event.inputs.target_device == 'all' ||
github.event.inputs.target_device == 'tpu' ||
github.event.inputs.target_device == 'gpu'
)
pre_build_check:
runs-on: ubuntu-latest
outputs:
should_run: ${{ steps.check.outputs.should_run }}
steps:
- name: Check if build should run
id: check
shell: bash
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "all" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "${INPUTS_DEVICE}" ]]; then
echo "should_run=false" >> $GITHUB_OUTPUT
echo "Skipping ${INPUTS_IMAGE_NAME} build for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
else
EVENT_NAME="${{ github.event_name }}"
TARGET_DEVICE="${{ github.event.inputs.target_device }}"
INPUT_DEVICE="${{ inputs.device }}"

SHOULD_RUN="false"
if [[ "$EVENT_NAME" == "release" || "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "pull_request" ]]; then
SHOULD_RUN="true"
elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
if [[ "$TARGET_DEVICE" == "all" || "$TARGET_DEVICE" == "$INPUT_DEVICE" ]]; then
SHOULD_RUN="true"
fi
fi

if [[ "$SHOULD_RUN" == "true" ]]; then
echo "should_run=true" >> $GITHUB_OUTPUT
echo "Building ${INPUTS_IMAGE_NAME} for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
echo "Building ${{ inputs.image_name }} for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
else
echo "should_run=false" >> $GITHUB_OUTPUT
echo "Skipping ${{ inputs.image_name }} build for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
fi
env:
GITHUB_EVENT_INPUTS_TARGET_DEVICE: ${{ github.event.inputs.target_device }}
INPUTS_DEVICE: ${{ inputs.device }}
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
INPUTS_BUILD_MODE: ${{ inputs.build_mode }}

build_and_push:
needs: pre_build_check
runs-on: linux-x86-n2-16-buildkit
container: google/cloud-sdk:524.0.0
if: needs.pre_build_check.outputs.should_run == 'true'
steps:
- name: Matrix Debugger
run: |
echo "device: ${{ inputs.device }}"
Expand All @@ -93,50 +100,68 @@ jobs:

- name: Checkout MaxText
uses: actions/checkout@v5
if: steps.check.outputs.should_run == 'true'
with:
# This ensures that every job clones the exact same commit as "setup" job
ref: ${{ inputs.maxtext_sha }}

- name: Mark git repositories as safe
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
if: steps.check.outputs.should_run == 'true'

- name: Configure Docker
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
if: steps.check.outputs.should_run == 'true'

- name: Set up Docker BuildX
uses: docker/setup-buildx-action@v3.11.1
if: steps.check.outputs.should_run == 'true'
with:
driver: remote
endpoint: tcp://localhost:1234

- name: Download MaxText wheel
uses: actions/download-artifact@v4
with:
name: maxtext-wheel

- name: Install uv and set Python version
uses: astral-sh/setup-uv@v7
with:
python-version: '3.12'
enable-cache: true

- name: Install MaxText wheel
shell: bash
run: |
uv venv --seed
source .venv/bin/activate
maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
Comment thread
SurbhiJainUSC marked this conversation as resolved.
uv pip install ${maxtext_wheel}[runner] --resolution=lowest
Comment thread
SurbhiJainUSC marked this conversation as resolved.

- name: Copy tests assets to package directory
if: inputs.include_test_assets == true
shell: bash
run: |
Comment thread
SurbhiJainUSC marked this conversation as resolved.
source .venv/bin/activate
cp -r ${PWD}/tests .venv/lib/python3.12/site-packages/
cp ${PWD}/pytest.ini .venv/lib/python3.12/site-packages/

- name: Build and push Docker image
uses: docker/build-push-action@v6
if: steps.check.outputs.should_run == 'true'
with:
push: true
context: .
file: ${{ inputs.dockerfile }}
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ github.run_id }}
file: .venv/lib/python3.12/site-packages/dependencies/dockerfiles/${{ inputs.dockerfile }}
tags: gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ github.run_id }}
cache-from: type=gha
outputs: type=image,compression=zstd,force-compression=true
build-args: |
Comment thread
SurbhiJainUSC marked this conversation as resolved.
DEVICE=${{ inputs.device }}
MODE=${{ inputs.build_mode }}
WORKFLOW=${{ inputs.workflow }}
PACKAGE_DIR=./src
JAX_VERSION=NONE
LIBTPU_VERSION=NONE
INCLUDE_TEST_ASSETS=true
PACKAGE_DIR=.venv/lib/python3.12/site-packages
INCLUDE_TEST_ASSETS=${{ inputs.include_test_assets }}

- name: Add tags to Docker image
if: steps.check.outputs.should_run == 'true'
shell: bash
run: |
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${INPUTS_IMAGE_NAME}"
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"

if [[ $INPUTS_VERSION_NAME ]]; then
Expand All @@ -146,16 +171,26 @@ jobs:
echo "Tagging docker images corresponding to nightly release..."

# Add date tag
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
IMAGE_DATE="$(date +%Y-%m-%d)"
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet

# Convert date to YYYYMMDD format
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)

# Add MaxText tag
MAXTEXT_SHA=$(git rev-parse --short HEAD)
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
fi
env:
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
MAXTEXT_SHA: ${{ inputs.maxtext_sha }}

promote_image:
needs: [pre_build_check, build_and_push]
if: needs.pre_build_check.outputs.should_run == 'true' && inputs.include_test_assets == true
uses: ./.github/workflows/promote_docker_image.yml
with:
image_name: ${{ inputs.image_name }}
image_tag: ${{ github.run_id }}
device: ${{ inputs.device }}
workflow: ${{ inputs.workflow }}
15 changes: 4 additions & 11 deletions .github/workflows/promote_docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ on:
workflow:
required: true
type: string
secrets:
HF_TOKEN:
required: false

permissions:
contents: read

Expand All @@ -49,7 +45,7 @@ jobs:
id: check
shell: bash
run: |
if gcloud container images describe "gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
if gcloud container images describe "gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
echo "exists=true" >> $GITHUB_OUTPUT
else
echo "exists=false" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -87,9 +83,6 @@ jobs:
- name: Add tags to Docker image
shell: bash
run: |
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
TEMP_IMG="${SOURCE_IMAGE}:${{ inputs.image_tag }}"
# Latest Tag
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
env:
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
# Add Latest Tag
SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}"
gcloud container images add-tag "${SOURCE_IMAGE}:${{ inputs.image_tag }}" "${SOURCE_IMAGE}:latest" --quiet
Loading
Loading