From 81cfc214062af33e96c9ba3c7d47d5ef5dfb3911 Mon Sep 17 00:00:00 2001
From: shuke987
Date: Mon, 15 Jun 2026 17:54:08 +0800
Subject: [PATCH] [regression-test](backup-restore) wait for colocate group to
 stabilize before asserting COLOCATE plan

test_backup_restore_colocate_with_partition flakily fails right after a
RESTORE with `expect contains 'COLOCATE', but actual explain string is ...
INNER JOIN(BROADCAST) / HAS_COLO_PLAN_NODE: false`.

After RESTORE the restored colocate group needs time to become stable, and
the planner only emits a COLOCATE join once the group is stable. The suite
ran `explain ... contains("COLOCATE")` immediately after
waitAllRestoreFinish, racing the stabilization, while
checkColocateTabletHealth (a single-shot ColocateMismatchNum==0 assert) sat
after the assertion.

Add a bounded poll (waitColocatePlan) that waits for the explain plan to
contain COLOCATE before each contains("COLOCATE") assertion, and make
checkColocateTabletHealth a bounded poll too. If COLOCATE never shows up
within the timeout the assertion still fails, so no buggy behavior is
masked.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
Reviewer notes (placed below the "---" marker, so they are not part of the
commit message):

* The same checkColocateTabletHealth change and the same waitColocatePlan
  closure are applied to both suites in the file (the -35 and -370 hunks).
* Both polling loops make at most 60 attempts and call sleep(1000) after
  each unsuccessful attempt.
* waitColocatePlan does not fail on its own when the attempts run out; the
  explain { ... contains("COLOCATE") } block that follows every call is what
  reports the failure.
* waitColocatePlan removes a trailing ";" (and any whitespace after it) from
  the query text before prefixing it with "explain".
* NOTE(review): line breaks, indentation and blank context lines in this
  patch were reconstructed from the hunk headers. The position of the blank
  context lines in the -201, -550 and -624 hunks is an assumption; confirm
  against the target file before applying.

 .../test_backup_restore_colocate.groovy       | 55 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy b/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
index 98262234edf258..20588be01e9e4f 100644
--- a/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
+++ b/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
@@ -35,12 +35,35 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
     }
 
     def checkColocateTabletHealth = { db_name ->
-        def result = showTabletHealth.call(db_name)
+        // Poll until the colocate group has stabilized (no mismatch) instead of
+        // asserting once, so the check waits for stabilization (e.g. after a restore)
+        // rather than racing it.
+        def result = null
+        for (int i = 0; i < 60; i++) {
+            result = showTabletHealth.call(db_name)
+            if (result != null && (result.ColocateMismatchNum as int) == 0) {
+                break
+            }
+            sleep(1000)
+        }
         log.info(result as String)
         assertNotNull(result)
         assertTrue(result.ColocateMismatchNum as int == 0)
     }
 
+    // The planner only produces a COLOCATE join once the colocate group is stable.
+    // Right after a restore the restored group may still be stabilizing, so poll the
+    // explain plan until COLOCATE shows up (bounded wait) before asserting on it.
+    def waitColocatePlan = { q ->
+        def plan = q.replaceAll(/;\s*$/, "")
+        for (int i = 0; i < 60; i++) {
+            if (sql("explain ${plan}").toString().contains("COLOCATE")) {
+                break
+            }
+            sleep(1000)
+        }
+    }
+
     def syncer = getSyncer()
 
     syncer.createS3Repository(repoName)
@@ -95,6 +118,7 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
     res = sql "SELECT * FROM ${dbName}.${tableName2}"
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -201,6 +225,7 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
 
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -370,12 +395,35 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
     }
 
     def checkColocateTabletHealth = { db_name ->
-        def result = showTabletHealth.call(db_name)
+        // Poll until the colocate group has stabilized (no mismatch) instead of
+        // asserting once, so the check waits for stabilization (e.g. after a restore)
+        // rather than racing it.
+        def result = null
+        for (int i = 0; i < 60; i++) {
+            result = showTabletHealth.call(db_name)
+            if (result != null && (result.ColocateMismatchNum as int) == 0) {
+                break
+            }
+            sleep(1000)
+        }
         log.info(result as String)
         assertNotNull(result)
         assertTrue(result.ColocateMismatchNum as int == 0)
     }
 
+    // The planner only produces a COLOCATE join once the colocate group is stable.
+    // Right after a restore the restored group may still be stabilizing, so poll the
+    // explain plan until COLOCATE shows up (bounded wait) before asserting on it.
+    def waitColocatePlan = { q ->
+        def plan = q.replaceAll(/;\s*$/, "")
+        for (int i = 0; i < 60; i++) {
+            if (sql("explain ${plan}").toString().contains("COLOCATE")) {
+                break
+            }
+            sleep(1000)
+        }
+    }
+
     def syncer = getSyncer()
 
     syncer.createS3Repository(repoName)
@@ -446,6 +494,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
     res = sql "SELECT * FROM ${dbName}.${tableName2}"
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -550,6 +599,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
 
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -624,6 +674,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
 
     query = "select * from ${newDbName}.${tableName1} as t1, ${newDbName}.${tableName2} as t2 where t1.id=t2.id;"
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")