From aedfbb005d4ed44e731b096932fff475f9044117 Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Thu, 30 Apr 2026 15:34:11 +0000 Subject: [PATCH 1/8] feat(import): add helper script to run dataflow snapshot import --- .../run-snapshot-import.sh | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh new file mode 100755 index 0000000000..588d290a23 --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# This script runs the Dataflow snapshot import job. +# It should be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory. + +export PROJECT_ID=db-blackbelt-cndb +export INSTANCE_ID=bench-workload +export TABLE_NAME=validation_test +export SNAPSHOT_NAME=validation_test_20200929 +export SERVICE_ACCOUNT=295490517436-compute@developer.gserviceaccount.com + +export BUCKET=jh-data-sandbox-backups +export REGION=us-west1 + +# Using version 2.17.0 as per the current project version +JAR_PATH="target/bigtable-beam-import-2.17.0-shaded.jar" + +echo "Submitting Dataflow job for shardIndex: 0. 
The initial job will restore the snapshot to the first iteration and will skip this step for subsequent jobs" +java -jar ${JAR_PATH} importsnapshot \ + --runner=DataflowRunner \ + --project=${PROJECT_ID} \ + --bigtableInstanceId=${INSTANCE_ID} \ + --bigtableTableId=${TABLE_NAME} \ + --importConfigFilePath=import-config-test.json \ + --stagingLocation=gs://${BUCKET}/dataflow/staging \ + --tempLocation=gs://${BUCKET}/dataflow/temp \ + --workerMachineType=n1-highmem-4 \ + --diskSizeGb=500 \ + --maxNumWorkers=10 \ + --region=${REGION} \ + --serviceAccount=${SERVICE_ACCOUNT} \ + --usePublicIps=false \ + --enableSnappy=true \ + --skipRestoreStep=false \ + --numShards=20 \ + --shardIndex=0 + +# Loop from 1 to 19 +for i in {1..19}; do + echo "Submitting Dataflow job for shardIndex: $i" + + JOB="job ${i}" + java -jar ${JAR_PATH} importsnapshot \ + --runner=DataflowRunner \ + --project=${PROJECT_ID} \ + --bigtableInstanceId=${INSTANCE_ID} \ + --bigtableTableId=${TABLE_NAME} \ + --importConfigFilePath=import-config-test.json \ + --stagingLocation=gs://${BUCKET}/dataflow/staging \ + --tempLocation=gs://${BUCKET}/dataflow/temp \ + --workerMachineType=n1-highmem-4 \ + --diskSizeGb=500 \ + --maxNumWorkers=10 \ + --region=${REGION} \ + --serviceAccount=${SERVICE_ACCOUNT} \ + --usePublicIps=false \ + --enableSnappy=true \ + --skipRestoreStep=true \ + --numShards=20 \ + --shardIndex=$i \ + --jobName="${JOB}" & + + # Optional: Sleep briefly between submissions to avoid API rate limits + sleep 5 +done From cc755c83115aa9b2d2ed5406fb1d65ee8598c2aa Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Thu, 30 Apr 2026 18:11:02 +0000 Subject: [PATCH 2/8] feat(import): enhance helper script with range support and auto-parallel mode --- .../run-snapshot-import.sh | 146 +++++++++++++----- 1 file changed, 110 insertions(+), 36 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 
588d290a23..6e78f7768f 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -1,45 +1,118 @@ #!/bin/bash -# This script runs the Dataflow snapshot import job. +# This script runs a range of Dataflow snapshot import jobs sequentially. # It should be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory. +# +# Usage: ./run-snapshot-import.sh +# Or: ./run-snapshot-import.sh --all +# Example: ./run-snapshot-import.sh 0 3 +# Example: ./run-snapshot-import.sh --all +# +# You can override default configurations by setting environment variables in your terminal. +# Example: TABLE_NAME="my-table" SNAPSHOT_NAME="my-snap" ./run-snapshot-import.sh 0 3 +# +# NOTE: If you are running on a newer JDK (like Java 21 or 26) and hit ByteBuddy errors, +# you can add '-Dnet.bytebuddy.experimental=true' to the java command lines below. +# +# --- Manual Parallel Execution --- +# To run shards in parallel groups of 4 (assuming 20 shards total), you can run 5 instances of this script. +# +# IMPORTANT: Shard 0 performs the restore step. You MUST run the first group (including shard 0) +# first and let it complete the restore step before launching other groups in parallel, +# otherwise they will fail because the restored files won't exist yet! +# +# Example for manual parallel execution: +# ./run-snapshot-import.sh 0 3 & # Run this first! 
+# # Wait for shard 0 to finish restore, then run the rest: +# ./run-snapshot-import.sh 4 7 & +# ./run-snapshot-import.sh 8 11 & +# ./run-snapshot-import.sh 12 15 & +# ./run-snapshot-import.sh 16 19 & +# +# --- Automated Parallel Execution --- +# Alternatively, use the --all flag to automatically handle the restore step and launch all groups: +# ./run-snapshot-import.sh --all -export PROJECT_ID=db-blackbelt-cndb -export INSTANCE_ID=bench-workload -export TABLE_NAME=validation_test -export SNAPSHOT_NAME=validation_test_20200929 -export SERVICE_ACCOUNT=295490517436-compute@developer.gserviceaccount.com +if [ "$#" -ne 2 ] && [ "$1" != "--all" ]; then + echo "Usage: $0 " + echo " Or: $0 --all" + exit 1 +fi -export BUCKET=jh-data-sandbox-backups -export REGION=us-west1 +START_SHARD=$1 +END_SHARD=$2 + +# Configurations (Uses environment variables if set, otherwise defaults) +export PROJECT_ID="${PROJECT_ID:-google.com:cloud-bigtable-dev}" +export INSTANCE_ID="${INSTANCE_ID:-tianlei-test-inst}" +export BUCKET="${BUCKET:-tianlei-beam-test-bucket}" +export REGION="${REGION:-us-central1}" + +export TABLE_NAME="${TABLE_NAME:-validation_test}" +export SNAPSHOT_NAME="${SNAPSHOT_NAME:-validation_test_20200929}" +export SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-295490517436-compute@developer.gserviceaccount.com}" + +export NUM_SHARDS="${NUM_SHARDS:-20}" + +export NETWORK="${NETWORK:-tianlei-network}" +export SUBNETWORK="${SUBNETWORK:-regions/us-central1/subnetworks/tianlei-network}" -# Using version 2.17.0 as per the current project version JAR_PATH="target/bigtable-beam-import-2.17.0-shaded.jar" -echo "Submitting Dataflow job for shardIndex: 0. 
The initial job will restore the snapshot to the first iteration and will skip this step for subsequent jobs" -java -jar ${JAR_PATH} importsnapshot \ - --runner=DataflowRunner \ - --project=${PROJECT_ID} \ - --bigtableInstanceId=${INSTANCE_ID} \ - --bigtableTableId=${TABLE_NAME} \ - --importConfigFilePath=import-config-test.json \ - --stagingLocation=gs://${BUCKET}/dataflow/staging \ - --tempLocation=gs://${BUCKET}/dataflow/temp \ - --workerMachineType=n1-highmem-4 \ - --diskSizeGb=500 \ - --maxNumWorkers=10 \ - --region=${REGION} \ - --serviceAccount=${SERVICE_ACCOUNT} \ - --usePublicIps=false \ - --enableSnappy=true \ - --skipRestoreStep=false \ - --numShards=20 \ - --shardIndex=0 +# --- AUTO-PARALLEL MODE --- +if [ "$1" == "--all" ]; then + echo "🚀 Starting fully automated snapshot import..." + + # Step 1: Perform ONLY the restore step + echo "Step 1/2: Performing snapshot restore (blocking)..." + java -jar ${JAR_PATH} importsnapshot \ + --runner=DataflowRunner \ + --project=${PROJECT_ID} \ + --bigtableInstanceId=${INSTANCE_ID} \ + --bigtableTableId=${TABLE_NAME} \ + --importConfigFilePath=import-config-test.json \ + --stagingLocation=gs://${BUCKET}/dataflow/staging \ + --tempLocation=gs://${BUCKET}/dataflow/temp \ + --region=${REGION} \ + --performOnlyRestoreStep=true \ + --jobName="restore-job" \ + --network=${NETWORK} \ + --subnetwork=${SUBNETWORK} + + echo "Restore completed. Proceeding to data import." + + # Step 2: Launch parallel groups of 4 + echo "Step 2/2: Launching parallel groups of 4 shards..." + SHARDS_PER_GROUP=4 + + for (( start=0; start<$NUM_SHARDS; start+=$SHARDS_PER_GROUP )); do + end=$((start + SHARDS_PER_GROUP - 1)) + [ $end -ge $NUM_SHARDS ] && end=$((NUM_SHARDS - 1)) + + echo "Launching group: shards $start to $end in background" + # Call ourselves with the range! + $0 $start $end & + done + + echo "All groups launched. Waiting for all background jobs to finish..." + wait + echo "🎉 All import jobs completed!" 
+ exit 0 +fi +# ---------------------------------------- -# Loop from 1 to 19 -for i in {1..19}; do +# Standard Range Mode +for i in $(seq $START_SHARD $END_SHARD); do echo "Submitting Dataflow job for shardIndex: $i" - JOB="job ${i}" + # We skip restore for all shards if running via --all because Step 1 handled it. + # If running manually via ranges, shard 0 will perform restore. + SKIP_RESTORE="true" + if [ $i -eq 0 ]; then + SKIP_RESTORE="false" + fi + + JOB="job-${i}" java -jar ${JAR_PATH} importsnapshot \ --runner=DataflowRunner \ --project=${PROJECT_ID} \ @@ -55,11 +128,12 @@ for i in {1..19}; do --serviceAccount=${SERVICE_ACCOUNT} \ --usePublicIps=false \ --enableSnappy=true \ - --skipRestoreStep=true \ - --numShards=20 \ + --skipRestoreStep=${SKIP_RESTORE} \ + --numShards=${NUM_SHARDS} \ --shardIndex=$i \ - --jobName="${JOB}" & + --jobName="${JOB}" \ + --network=${NETWORK} \ + --subnetwork=${SUBNETWORK} - # Optional: Sleep briefly between submissions to avoid API rate limits - sleep 5 + # Sequential within this script instance done From 59ae558e4f37b069b35d85860a24c7ae7af7f7e8 Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 15:00:21 +0000 Subject: [PATCH 3/8] Update helper script to use flags instead of missing config file --- .../bigtable-beam-import/run-snapshot-import.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 6e78f7768f..3e3482354f 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -50,6 +50,7 @@ export REGION="${REGION:-us-central1}" export TABLE_NAME="${TABLE_NAME:-validation_test}" export SNAPSHOT_NAME="${SNAPSHOT_NAME:-validation_test_20200929}" +export SNAPSHOT_SOURCE_DIR="${SNAPSHOT_SOURCE_DIR:-gs://${BUCKET}/snapshots}" export 
SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-295490517436-compute@developer.gserviceaccount.com}" export NUM_SHARDS="${NUM_SHARDS:-20}" @@ -70,7 +71,8 @@ if [ "$1" == "--all" ]; then --project=${PROJECT_ID} \ --bigtableInstanceId=${INSTANCE_ID} \ --bigtableTableId=${TABLE_NAME} \ - --importConfigFilePath=import-config-test.json \ + --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \ + --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \ --stagingLocation=gs://${BUCKET}/dataflow/staging \ --tempLocation=gs://${BUCKET}/dataflow/temp \ --region=${REGION} \ @@ -118,7 +120,8 @@ for i in $(seq $START_SHARD $END_SHARD); do --project=${PROJECT_ID} \ --bigtableInstanceId=${INSTANCE_ID} \ --bigtableTableId=${TABLE_NAME} \ - --importConfigFilePath=import-config-test.json \ + --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \ + --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \ --stagingLocation=gs://${BUCKET}/dataflow/staging \ --tempLocation=gs://${BUCKET}/dataflow/temp \ --workerMachineType=n1-highmem-4 \ From a862ef11d35c9fa57952bd6962172606abe37fbf Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 15:13:28 +0000 Subject: [PATCH 4/8] Expose maxInflightRpcs and bulkMutationCloseTimeoutMinutes in script --- .../bigtable-beam-import/run-snapshot-import.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 3e3482354f..4035bf1395 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -54,6 +54,8 @@ export SNAPSHOT_SOURCE_DIR="${SNAPSHOT_SOURCE_DIR:-gs://${BUCKET}/snapshots}" export SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-295490517436-compute@developer.gserviceaccount.com}" export NUM_SHARDS="${NUM_SHARDS:-20}" +export MAX_INFLIGHT_RPCS="${MAX_INFLIGHT_RPCS:-100}" +export 
BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="${BULK_MUTATION_CLOSE_TIMEOUT_MINUTES:-30}" export NETWORK="${NETWORK:-tianlei-network}" export SUBNETWORK="${SUBNETWORK:-regions/us-central1/subnetworks/tianlei-network}" @@ -136,7 +138,9 @@ for i in $(seq $START_SHARD $END_SHARD); do --shardIndex=$i \ --jobName="${JOB}" \ --network=${NETWORK} \ - --subnetwork=${SUBNETWORK} + --subnetwork=${SUBNETWORK} \ + --maxInflightRpcs=${MAX_INFLIGHT_RPCS} \ + --bulkMutationCloseTimeoutMinutes=${BULK_MUTATION_CLOSE_TIMEOUT_MINUTES} # Sequential within this script instance done From 1c83a1809ed7ad2a8d3aae94173bcb7db6727163 Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 15:16:04 +0000 Subject: [PATCH 5/8] Remove defaults from script and add SNAPSHOT_IMPORT_USAGE.md doc --- .../SNAPSHOT_IMPORT_USAGE.md | 37 +++++++++++++++++++ .../run-snapshot-import.sh | 34 +++++++++-------- 2 files changed, 56 insertions(+), 15 deletions(-) create mode 100644 bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md diff --git a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md new file mode 100644 index 0000000000..adf0275afd --- /dev/null +++ b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md @@ -0,0 +1,37 @@ +# HBase Snapshot Import Helper Script Usage + +This document describes the environment variables used by the `run-snapshot-import.sh` script to automate HBase snapshot imports into Cloud Bigtable using Dataflow. + +## Environment Variables + +The script relies on the following environment variables. You should set them before executing the script. + +| Variable | Description | Example / Suggested Value | +| :--- | :--- | :--- | +| `PROJECT_ID` | The Google Cloud Project ID where the Bigtable instance and Dataflow jobs reside. | `your-project-id` | +| `INSTANCE_ID` | The Bigtable Instance ID to import data into. 
| `your-instance-id` | +| `BUCKET` | The GCS bucket name (without the `gs://` prefix) used for the Dataflow staging and temp locations. It is not used to derive `SNAPSHOT_SOURCE_DIR`, which must be set explicitly. | `your-gcs-bucket` | +| `REGION` | The GCP region to run the Dataflow jobs in. | `us-central1` | +| `TABLE_NAME` | The target Bigtable table name. | `your-table-name` | +| `SNAPSHOT_NAME` | The name of the HBase snapshot to import. | `your-snapshot-name` | +| `SNAPSHOT_SOURCE_DIR` | The GCS path where the HBase snapshot export is located. | `gs://your-gcs-bucket/snapshots` | +| `SERVICE_ACCOUNT` | The service account email to run the Dataflow jobs. | `your-service-account@developer.gserviceaccount.com` | +| `NUM_SHARDS` | The number of shards to split the import into for parallel processing. | `20` | +| `MAX_INFLIGHT_RPCS` | Maximum number of inflight RPCs for Bigtable client. | `100` | +| `BULK_MUTATION_CLOSE_TIMEOUT_MINUTES` | Timeout in minutes for closing bulk mutations. | `30` | +| `NETWORK` | VPC Network name for Dataflow workers. | `your-network` | +| `SUBNETWORK` | VPC Subnetwork name for Dataflow workers. | `regions/us-central1/subnetworks/your-subnetwork` | + +## Usage + +### Run a specific shard range +```bash +./run-snapshot-import.sh <start_shard> <end_shard> +``` +Example: `./run-snapshot-import.sh 0 5` + +### Run all shards (Auto-parallel mode) +```bash +./run-snapshot-import.sh --all +``` +This mode will first run the restore step, and then launch background processes for all shards in parallel. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 4035bf1395..f2931ccf7e 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -43,22 +43,26 @@ START_SHARD=$1 END_SHARD=$2 # Configurations (Uses environment variables if set, otherwise defaults) -export PROJECT_ID="${PROJECT_ID:-google.com:cloud-bigtable-dev}" -export INSTANCE_ID="${INSTANCE_ID:-tianlei-test-inst}" -export BUCKET="${BUCKET:-tianlei-beam-test-bucket}" -export REGION="${REGION:-us-central1}" +# Environment variables configuration. +# Please set these variables before running the script. +# See SNAPSHOT_IMPORT_USAGE.md for details and expected values. -export TABLE_NAME="${TABLE_NAME:-validation_test}" -export SNAPSHOT_NAME="${SNAPSHOT_NAME:-validation_test_20200929}" -export SNAPSHOT_SOURCE_DIR="${SNAPSHOT_SOURCE_DIR:-gs://${BUCKET}/snapshots}" -export SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-295490517436-compute@developer.gserviceaccount.com}" - -export NUM_SHARDS="${NUM_SHARDS:-20}" -export MAX_INFLIGHT_RPCS="${MAX_INFLIGHT_RPCS:-100}" -export BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="${BULK_MUTATION_CLOSE_TIMEOUT_MINUTES:-30}" - -export NETWORK="${NETWORK:-tianlei-network}" -export SUBNETWORK="${SUBNETWORK:-regions/us-central1/subnetworks/tianlei-network}" +# export PROJECT_ID="your-project-id" +# export INSTANCE_ID="your-instance-id" +# export BUCKET="your-gcs-bucket" +# export REGION="us-central1" +# +# export TABLE_NAME="your-table-name" +# export SNAPSHOT_NAME="your-snapshot-name" +# export SNAPSHOT_SOURCE_DIR="gs://your-gcs-bucket/snapshots" +# export SERVICE_ACCOUNT="your-service-account" +# +# export NUM_SHARDS="20" +# export MAX_INFLIGHT_RPCS="100" +# export BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="30" +# +# export NETWORK="your-network" +# export SUBNETWORK="your-subnetwork" 
JAR_PATH="target/bigtable-beam-import-2.17.0-shaded.jar" From ca455c13c73811e61b357e19391cfc8173ff2c3e Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 15:34:58 +0000 Subject: [PATCH 6/8] Move advanced usage and troubleshooting comments to doc --- .../SNAPSHOT_IMPORT_USAGE.md | 25 +++++++++++++++++++ .../run-snapshot-import.sh | 22 +--------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md index adf0275afd..f1ae4df040 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md +++ b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md @@ -35,3 +35,28 @@ Example: `./run-snapshot-import.sh 0 5` ./run-snapshot-import.sh --all ``` This mode will first run the restore step, and then launch background processes for all shards in parallel. + +## Advanced Usage + +### Manual Parallel Execution + +To run shards in parallel groups (e.g., assuming 20 shards total), you can run multiple instances of this script. + +> [!IMPORTANT] +> Shard 0 performs the restore step. You MUST run the first group (including shard 0) first and let it complete the restore step before launching other groups in parallel. Otherwise, they will fail because the restored files won't exist yet! + +Example for manual parallel execution: +```bash +./run-snapshot-import.sh 0 3 & # Run this first! +# Wait for shard 0 to finish restore, then run the rest: +./run-snapshot-import.sh 4 7 & +./run-snapshot-import.sh 8 11 & +./run-snapshot-import.sh 12 15 & +./run-snapshot-import.sh 16 19 & +``` + +## Troubleshooting + +### JDK Compatibility + +If you are running on a newer JDK (like Java 21 or 26) and hit ByteBuddy errors, you can add `-Dnet.bytebuddy.experimental=true` to the `java` command lines in the script. 
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index f2931ccf7e..437b195bc5 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -11,27 +11,7 @@ # You can override default configurations by setting environment variables in your terminal. # Example: TABLE_NAME="my-table" SNAPSHOT_NAME="my-snap" ./run-snapshot-import.sh 0 3 # -# NOTE: If you are running on a newer JDK (like Java 21 or 26) and hit ByteBuddy errors, -# you can add '-Dnet.bytebuddy.experimental=true' to the java command lines below. -# -# --- Manual Parallel Execution --- -# To run shards in parallel groups of 4 (assuming 20 shards total), you can run 5 instances of this script. -# -# IMPORTANT: Shard 0 performs the restore step. You MUST run the first group (including shard 0) -# first and let it complete the restore step before launching other groups in parallel, -# otherwise they will fail because the restored files won't exist yet! -# -# Example for manual parallel execution: -# ./run-snapshot-import.sh 0 3 & # Run this first! -# # Wait for shard 0 to finish restore, then run the rest: -# ./run-snapshot-import.sh 4 7 & -# ./run-snapshot-import.sh 8 11 & -# ./run-snapshot-import.sh 12 15 & -# ./run-snapshot-import.sh 16 19 & -# -# --- Automated Parallel Execution --- -# Alternatively, use the --all flag to automatically handle the restore step and launch all groups: -# ./run-snapshot-import.sh --all +# See SNAPSHOT_IMPORT_USAGE.md for advanced usage and troubleshooting. 
if [ "$#" -ne 2 ] && [ "$1" != "--all" ]; then echo "Usage: $0 " From 1671b8d7bc784e8c7809f3dc4a8629faff9d77b6 Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 15:38:07 +0000 Subject: [PATCH 7/8] Beautify script comments and structure --- .../run-snapshot-import.sh | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 437b195bc5..76da63586e 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -1,32 +1,21 @@ #!/bin/bash -# This script runs a range of Dataflow snapshot import jobs sequentially. -# It should be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory. +# ============================================================================== +# HBase Snapshot Import Helper Script +# ============================================================================== +# This script runs a range of Dataflow snapshot import jobs sequentially or in parallel. +# Must be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory. # -# Usage: ./run-snapshot-import.sh -# Or: ./run-snapshot-import.sh --all -# Example: ./run-snapshot-import.sh 0 3 -# Example: ./run-snapshot-import.sh --all -# -# You can override default configurations by setting environment variables in your terminal. -# Example: TABLE_NAME="my-table" SNAPSHOT_NAME="my-snap" ./run-snapshot-import.sh 0 3 -# -# See SNAPSHOT_IMPORT_USAGE.md for advanced usage and troubleshooting. 
+# For detailed usage and advanced options, see: SNAPSHOT_IMPORT_USAGE.md +# ============================================================================== -if [ "$#" -ne 2 ] && [ "$1" != "--all" ]; then - echo "Usage: $0 " - echo " Or: $0 --all" - exit 1 -fi - -START_SHARD=$1 -END_SHARD=$2 - -# Configurations (Uses environment variables if set, otherwise defaults) -# Environment variables configuration. -# Please set these variables before running the script. +# ------------------------------------------------------------------------------ +# Environment Variables +# ------------------------------------------------------------------------------ +# Most users will need to set these variables before running the script. # See SNAPSHOT_IMPORT_USAGE.md for details and expected values. +# --- Required / Common Configurations --- # export PROJECT_ID="your-project-id" # export INSTANCE_ID="your-instance-id" # export BUCKET="your-gcs-bucket" @@ -36,14 +25,36 @@ END_SHARD=$2 # export SNAPSHOT_NAME="your-snapshot-name" # export SNAPSHOT_SOURCE_DIR="gs://your-gcs-bucket/snapshots" # export SERVICE_ACCOUNT="your-service-account" -# + +# --- Sharding & Tuning --- # export NUM_SHARDS="20" # export MAX_INFLIGHT_RPCS="100" # export BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="30" -# + +# --- Network Configurations --- # export NETWORK="your-network" # export SUBNETWORK="your-subnetwork" +# ------------------------------------------------------------------------------ +# Usage +# ------------------------------------------------------------------------------ +# Usage: ./run-snapshot-import.sh +# Or: ./run-snapshot-import.sh --all +# +# Examples: +# ./run-snapshot-import.sh 0 3 +# ./run-snapshot-import.sh --all + +if [ "$#" -ne 2 ] && [ "$1" != "--all" ]; then + echo "Usage: $0 " + echo " Or: $0 --all" + exit 1 +fi + +START_SHARD=$1 +END_SHARD=$2 + +# Configurations JAR_PATH="target/bigtable-beam-import-2.17.0-shaded.jar" # --- AUTO-PARALLEL MODE --- From 
b68498fe4243dbc2e1f70c7cb4d87ad37cf2e1e0 Mon Sep 17 00:00:00 2001 From: Tianlei Pan Date: Wed, 13 May 2026 19:44:59 +0000 Subject: [PATCH 8/8] Update documentation to mention --all runs shards in groups of 4 by default --- .../bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md | 2 +- .../bigtable-beam-import/run-snapshot-import.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md index f1ae4df040..9a4eefe22c 100644 --- a/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md +++ b/bigtable-dataflow-parent/bigtable-beam-import/SNAPSHOT_IMPORT_USAGE.md @@ -34,7 +34,7 @@ Example: `./run-snapshot-import.sh 0 5` ```bash ./run-snapshot-import.sh --all ``` -This mode will first run the restore step, and then launch background processes for all shards in parallel. +This mode will first run the restore step, and then launch background processes for all shards in parallel groups of 4 by default. ## Advanced Usage diff --git a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh index 76da63586e..d66b042574 100755 --- a/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh +++ b/bigtable-dataflow-parent/bigtable-beam-import/run-snapshot-import.sh @@ -40,6 +40,7 @@ # ------------------------------------------------------------------------------ # Usage: ./run-snapshot-import.sh # Or: ./run-snapshot-import.sh --all +# (Runs all shards in parallel groups of 4 by default) # # Examples: # ./run-snapshot-import.sh 0 3