diff --git a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md new file mode 100644 index 0000000000..9a4eefe22c --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md @@ -0,0 +1,62 @@ +# HBase Snapshot Import Helper Script Usage + +This document describes the environment variables used by the `run-snapshot-import.sh` script to automate HBase snapshot imports into Cloud Bigtable using Dataflow. + +## Environment Variables + +The script relies on the following environment variables. You should set them before executing the script. + +| Variable | Description | Example / Suggested Value | +| :--- | :--- | :--- | +| `PROJECT_ID` | The Google Cloud Project ID where the Bigtable instance and Dataflow jobs reside. | `your-project-id` | +| `INSTANCE_ID` | The Bigtable Instance ID to import data into. | `your-instance-id` | +| `BUCKET` | The GCS bucket name used for Dataflow staging, temp files, and default snapshot source path. | `your-gcs-bucket` | +| `REGION` | The GCP region to run the Dataflow jobs in. | `us-central1` | +| `TABLE_NAME` | The target Bigtable table name. | `your-table-name` | +| `SNAPSHOT_NAME` | The name of the HBase snapshot to import. | `your-snapshot-name` | +| `SNAPSHOT_SOURCE_DIR` | The GCS path where the HBase snapshot export is located. | `gs://your-gcs-bucket/snapshots` | +| `SERVICE_ACCOUNT` | The service account email to run the Dataflow jobs. | `your-service-account@developer.gserviceaccount.com` | +| `NUM_SHARDS` | The number of shards to split the import into for parallel processing. | `20` | +| `MAX_INFLIGHT_RPCS` | Maximum number of inflight RPCs for Bigtable client. | `100` | +| `BULK_MUTATION_CLOSE_TIMEOUT_MINUTES` | Timeout in minutes for closing bulk mutations. | `30` | +| `NETWORK` | VPC Network name for Dataflow workers. | `your-network` | +| `SUBNETWORK` | VPC Subnetwork name for Dataflow workers. | `regions/us-central1/subnetworks/your-subnetwork` | + +## Usage + +### Run a specific shard range +```bash +./run-snapshot-import.sh +``` +Example: `./run-snapshot-import.sh 0 5` + +### Run all shards (Auto-parallel mode) +```bash +./run-snapshot-import.sh --all +``` +This mode will first run the restore step, and then launch background processes for all shards in parallel groups of 4 by default. + +## Advanced Usage + +### Manual Parallel Execution + +To run shards in parallel groups (e.g., assuming 20 shards total), you can run multiple instances of this script. + +> [!IMPORTANT] +> Shard 0 performs the restore step. You MUST run the first group (including shard 0) first and let it complete the restore step before launching other groups in parallel. Otherwise, they will fail because the restored files won't exist yet! + +Example for manual parallel execution: +```bash +./run-snapshot-import.sh 0 3 & # Run this first! +# Wait for shard 0 to finish restore, then run the rest: +./run-snapshot-import.sh 4 7 & +./run-snapshot-import.sh 8 11 & +./run-snapshot-import.sh 12 15 & +./run-snapshot-import.sh 16 19 & +``` + +## Troubleshooting + +### JDK Compatibility + +If you are running on a newer JDK (like Java 21 or 26) and hit ByteBuddy errors, you can add `-Dnet.bytebuddy.experimental=true` to the `java` command lines in the script. diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh new file mode 100755 index 0000000000..d66b042574 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# ============================================================================== +# HBase Snapshot Import Helper Script +# ============================================================================== +# This script runs a range of Dataflow snapshot import jobs sequentially or in parallel. +# Must be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory. +# +# For detailed usage and advanced options, see: SNAPSHOT_IMPORT_USAGE.md +# ============================================================================== + +# ------------------------------------------------------------------------------ +# Environment Variables +# ------------------------------------------------------------------------------ +# Most users will need to set these variables before running the script. +# See SNAPSHOT_IMPORT_USAGE.md for details and expected values. + +# --- Required / Common Configurations --- +# export PROJECT_ID="your-project-id" +# export INSTANCE_ID="your-instance-id" +# export BUCKET="your-gcs-bucket" +# export REGION="us-central1" +# +# export TABLE_NAME="your-table-name" +# export SNAPSHOT_NAME="your-snapshot-name" +# export SNAPSHOT_SOURCE_DIR="gs://your-gcs-bucket/snapshots" +# export SERVICE_ACCOUNT="your-service-account" + +# --- Sharding & Tuning --- +# export NUM_SHARDS="20" +# export MAX_INFLIGHT_RPCS="100" +# export BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="30" + +# --- Network Configurations --- +# export NETWORK="your-network" +# export SUBNETWORK="your-subnetwork" + +# ------------------------------------------------------------------------------ +# Usage +# ------------------------------------------------------------------------------ +# Usage: ./run-snapshot-import.sh +# Or: ./run-snapshot-import.sh --all +# (Runs all shards in parallel groups of 4 by default) +# +# Examples: +# ./run-snapshot-import.sh 0 3 +# ./run-snapshot-import.sh --all + +if [ "$#" -ne 2 ] && [ "$1" != "--all" ]; then + echo "Usage: $0 " + echo " Or: $0 --all" + exit 1 +fi + +START_SHARD=$1 +END_SHARD=$2 + +# Configurations +JAR_PATH="target/bigtable-beam-import-2.17.0-shaded.jar" + +# --- AUTO-PARALLEL MODE --- +if [ "$1" == "--all" ]; then + echo "🚀 Starting fully automated snapshot import..." + + # Step 1: Perform ONLY the restore step + echo "Step 1/2: Performing snapshot restore (blocking)..." + java -jar ${JAR_PATH} importsnapshot \ + --runner=DataflowRunner \ + --project=${PROJECT_ID} \ + --bigtableInstanceId=${INSTANCE_ID} \ + --bigtableTableId=${TABLE_NAME} \ + --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \ + --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \ + --stagingLocation=gs://${BUCKET}/dataflow/staging \ + --tempLocation=gs://${BUCKET}/dataflow/temp \ + --region=${REGION} \ + --performOnlyRestoreStep=true \ + --jobName="restore-job" \ + --network=${NETWORK} \ + --subnetwork=${SUBNETWORK} + + echo "Restore completed. Proceeding to data import." + + # Step 2: Launch parallel groups of 4 + echo "Step 2/2: Launching parallel groups of 4 shards..." + SHARDS_PER_GROUP=4 + + for (( start=0; start<$NUM_SHARDS; start+=$SHARDS_PER_GROUP )); do + end=$((start + SHARDS_PER_GROUP - 1)) + [ $end -ge $NUM_SHARDS ] && end=$((NUM_SHARDS - 1)) + + echo "Launching group: shards $start to $end in background" + # Call ourselves with the range! + $0 $start $end & + done + + echo "All groups launched. Waiting for all background jobs to finish..." + wait + echo "🎉 All import jobs completed!" + exit 0 +fi +# ---------------------------------------- + +# Standard Range Mode +for i in $(seq $START_SHARD $END_SHARD); do + echo "Submitting Dataflow job for shardIndex: $i" + + # We skip restore for all shards if running via --all because Step 1 handled it. + # If running manually via ranges, shard 0 will perform restore. + SKIP_RESTORE="true" + if [ $i -eq 0 ]; then + SKIP_RESTORE="false" + fi + + JOB="job-${i}" + java -jar ${JAR_PATH} importsnapshot \ + --runner=DataflowRunner \ + --project=${PROJECT_ID} \ + --bigtableInstanceId=${INSTANCE_ID} \ + --bigtableTableId=${TABLE_NAME} \ + --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \ + --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \ + --stagingLocation=gs://${BUCKET}/dataflow/staging \ + --tempLocation=gs://${BUCKET}/dataflow/temp \ + --workerMachineType=n1-highmem-4 \ + --diskSizeGb=500 \ + --maxNumWorkers=10 \ + --region=${REGION} \ + --serviceAccount=${SERVICE_ACCOUNT} \ + --usePublicIps=false \ + --enableSnappy=true \ + --skipRestoreStep=${SKIP_RESTORE} \ + --numShards=${NUM_SHARDS} \ + --shardIndex=$i \ + --jobName="${JOB}" \ + --network=${NETWORK} \ + --subnetwork=${SUBNETWORK} \ + --maxInflightRpcs=${MAX_INFLIGHT_RPCS} \ + --bulkMutationCloseTimeoutMinutes=${BULK_MUTATION_CLOSE_TIMEOUT_MINUTES} + + # Sequential within this script instance +done