From b65e5200c246ee3843acbc36eebe2f4f86aec058 Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Tue, 30 Jun 2026 14:41:35 +0200 Subject: [PATCH 1/2] MDEV-38147 error 1950 after mariabackup SST with gtid_strict_mode=ON After a mariabackup SST the joiner could fail with ER_GTID_STRICT_OUT_OF_ORDER (error 1950) while re-binlogging transactions received over IST. The cause is that the binary log copied from the donor carries a Gtid_list whose position can be ahead of the storage-engine snapshot: BACKUP STAGE BLOCK_COMMIT blocks the engine commit (2PC step 3) but not the binary log write (step 2), so transactions can be present in the copied binlog that are not committed in the copied engine snapshot. After the SST the joiner reports the (committed) engine position to the cluster, IST resends those transactions, and re-binlogging them under gtid_strict_mode=ON collides with the ahead Gtid_list -> error 1950. (MDEV-34483 made the engine snapshot stop short of the binlog, which is what exposed this.) The copied binary log carries no transactions the joiner needs - only a Gtid_list - so instead of shipping and then having to truncate/reconcile it, the joiner now starts a fresh binary log and seeds its GTID position from the storage-engine checkpoint during recovery. That checkpoint is the committed cluster position, i.e. exactly where IST resumes, so the joiner's binary log stays in lockstep with the rest of the cluster and no out-of-order GTID can occur. This works for both wsrep_gtid_mode settings; only the binlog domain of the cluster stream differs: - wsrep_gtid_mode=ON : wsrep_gtid_domain_id (cluster writes are re-tagged to it), which is the domain stored in the checkpoint; - wsrep_gtid_mode=OFF: gtid_domain_id (cluster writes keep the node's configured domain). Async-replica positions (mysql.gtid_slave_pos) are part of the engine snapshot and survive the SST unchanged, so a Galera node can still serve as an async master or replica across the SST. 
This commit: - sql/log.cc: adds wsrep_seed_binlog_gtid_state(), called from do_binlog_recovery() when the joiner has no binary log, seeding the binlog GTID state for the cluster domain to the SE checkpoint position. - scripts/wsrep_sst_mariabackup.sh: no longer moves the donor's binary log into place on the joiner. - extra/mariabackup: stop flushing and copying the donor's current binary log under --galera-info (removed write_current_binlog_file()). Its only purpose was to ship that binary log to the joiner, which now discards it; flushing needlessly rotated the donor's binary log on every SST. xtrabackup_galera_info and xtrabackup_binlog_info are still written. - sql/wsrep_sst.cc: logs the position actually adopted from storage (the authoritative post-SST position) rather than the script-reported one. - sql/handler.cc: downgrades the "Discovered discontinuity in recovered wsrep transaction XIDs" message in wsrep_order_and_check_continuity() from warning to debug level. With parallel appliers a snapshot routinely captures prepared XIDs that are not contiguous with the engine checkpoint, so this is normal during SST recovery and of no value in regular operation; the transactions past the checkpoint are re-delivered by the cluster (IST/SST) regardless. - Adds an MDEV-38147 MTR test reproducing the issue. 
Co-Authored-By: Claude Opus 4.8 --- extra/mariabackup/backup_mysql.cc | 92 +------ extra/mariabackup/backup_mysql.h | 3 - .../suite/galera_3nodes/r/MDEV-38147.result | 43 ++++ .../suite/galera_3nodes/t/MDEV-38147.cnf | 25 ++ .../suite/galera_3nodes/t/MDEV-38147.test | 229 ++++++++++++++++++ scripts/wsrep_sst_mariabackup.sh | 32 ++- sql/handler.cc | 6 +- sql/log.cc | 56 +++++ sql/wsrep_sst.cc | 26 +- 9 files changed, 405 insertions(+), 107 deletions(-) create mode 100644 mysql-test/suite/galera_3nodes/r/MDEV-38147.result create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-38147.test diff --git a/extra/mariabackup/backup_mysql.cc b/extra/mariabackup/backup_mysql.cc index 7fa02b1128c6d..bc032d3e5f4d9 100644 --- a/extra/mariabackup/backup_mysql.cc +++ b/extra/mariabackup/backup_mysql.cc @@ -1504,8 +1504,14 @@ write_galera_info(ds_ctxt *datasink, MYSQL *connection) domain_id ? domain_id : domain_id55); } - if (result) - write_current_binlog_file(datasink, connection); + /* + MDEV-38147: Do not flush and copy the donor's current binary log here. + This used to ship the binary log to the SST joiner, but the joiner no + longer uses it - it starts a fresh binary log and seeds its GTID position + from the storage-engine checkpoint (see wsrep_seed_binlog_gtid_state() in + sql/log.cc and scripts/wsrep_sst_mariabackup.sh). Flushing here only + rotated the donor's binary log needlessly on every SST. 
+ */ if (result) msg("Writing Galera info succeeded with %s:%s %s", @@ -1522,88 +1528,6 @@ write_galera_info(ds_ctxt *datasink, MYSQL *connection) } -/*********************************************************************//** -Flush and copy the current binary log file into the backup, -if GTID is enabled */ -bool -write_current_binlog_file(ds_ctxt *datasink, MYSQL *connection) -{ - char *executed_gtid_set = NULL; - char *gtid_binlog_state = NULL; - char *log_bin_file = NULL; - char *log_bin_dir = NULL; - bool gtid_exists; - bool result = true; - char filepath[FN_REFLEN]; - - mysql_variable status[] = { - {"Executed_Gtid_Set", &executed_gtid_set}, - {NULL, NULL} - }; - - mysql_variable status_after_flush[] = { - {"File", &log_bin_file}, - {NULL, NULL} - }; - - mysql_variable vars[] = { - {"gtid_binlog_state", &gtid_binlog_state}, - {"log_bin_basename", &log_bin_dir}, - {NULL, NULL} - }; - - read_mysql_variables(connection, "SHOW MASTER STATUS", status, false); - read_mysql_variables(connection, "SHOW VARIABLES", vars, true); - - gtid_exists = (executed_gtid_set && *executed_gtid_set) - || (gtid_binlog_state && *gtid_binlog_state); - - if (gtid_exists) { - size_t log_bin_dir_length; - - xb_mysql_query(connection, "FLUSH BINARY LOGS", false); - - read_mysql_variables(connection, "SHOW MASTER STATUS", - status_after_flush, false); - - if (opt_log_bin != NULL && strchr(opt_log_bin, FN_LIBCHAR)) { - /* If log_bin is set, it has priority */ - if (log_bin_dir) { - free(log_bin_dir); - } - log_bin_dir = strdup(opt_log_bin); - } else if (log_bin_dir == NULL) { - /* Default location is MySQL datadir */ - log_bin_dir = strdup("./"); - } - - dirname_part(log_bin_dir, log_bin_dir, &log_bin_dir_length); - - /* strip final slash if it is not the only path component */ - if (log_bin_dir_length > 1 && - log_bin_dir[log_bin_dir_length - 1] == FN_LIBCHAR) { - log_bin_dir[log_bin_dir_length - 1] = 0; - } - - if (log_bin_dir == NULL || log_bin_file == NULL) { - msg("Failed to get master
binlog coordinates from " - "SHOW MASTER STATUS"); - result = false; - goto cleanup; - } - - snprintf(filepath, sizeof(filepath), "%s%c%s", - log_bin_dir, FN_LIBCHAR, log_bin_file); - result = datasink->copy_file(filepath, log_bin_file, 0); - } - -cleanup: - free_mysql_variables(status_after_flush); - free_mysql_variables(status); - free_mysql_variables(vars); - - return(result); -} /*********************************************************************//** diff --git a/extra/mariabackup/backup_mysql.h b/extra/mariabackup/backup_mysql.h index 55700dddf6d67..ac163643c103c 100644 --- a/extra/mariabackup/backup_mysql.h +++ b/extra/mariabackup/backup_mysql.h @@ -53,9 +53,6 @@ xb_mysql_query(MYSQL *connection, const char *query, bool use_result, void unlock_all(MYSQL *connection); -bool -write_current_binlog_file(ds_ctxt *datasink, MYSQL *connection); - bool write_binlog_info(ds_ctxt *datasink, MYSQL *connection); diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-38147.result b/mysql-test/suite/galera_3nodes/r/MDEV-38147.result new file mode 100644 index 0000000000000..7e7199aa32296 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-38147.result @@ -0,0 +1,43 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +# gtid_strict_mode must be enabled on all nodes +SELECT @@global.gtid_strict_mode AS gtid_strict_mode; +gtid_strict_mode +1 +connection node_2; +connection node_3; +connect node_1_load, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2_load, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1_load; +CALL p_load(); +connection node_2_load; +CALL p_load(); +connection node_1; +connection node_2; +connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1_load; +connection node_2_load; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE 
VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf new file mode 100644 index 0000000000000..a8c4cfe813755 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf @@ -0,0 +1,25 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep_gtid_mode=ON +wsrep_gtid_domain_id=100 +gtid_domain_id=10 +log_bin +log_slave_updates=ON +wsrep_slave_threads=4 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-38147.test b/mysql-test/suite/galera_3nodes/t/MDEV-38147.test new file mode 100644 index 0000000000000..8b7eb3715290a --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-38147.test @@ -0,0 +1,229 @@ +# +# MDEV-38147 +# +# Three node cluster with gtid_strict_mode enabled and mariabackup SST. +# Two nodes (node_1 and node_2) run a continuous load, inserting batches +# of rows in a throttled loop, while the third node (node_3) is repeatedly +# stopped, has its data directory purged and is started again, forcing a +# full mariabackup SST on every rejoin (8 times). +# +# At the end the cluster must reconverge to three nodes and all three +# nodes must hold identical data and GTID positions. +# + +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc + +# Number of stop/purge/start cycles for node_3. 
+--let $restarts = 8 + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values so that MTR's post-check is +# happy after node_3 has been restarted multiple times. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# +# Schema: t1 holds the load, ctrl carries the stop flag for the loaders. +# +--connection node_1 +--disable_query_log +CREATE TABLE t1 (pk BIGINT AUTO_INCREMENT PRIMARY KEY, val INT) ENGINE=InnoDB; +CREATE TABLE ctrl (id INT PRIMARY KEY, stop INT) ENGINE=InnoDB; +INSERT INTO ctrl VALUES (1, 0); + +DELIMITER |; +CREATE PROCEDURE p_load() +BEGIN + DECLARE v_stop INT DEFAULT 0; + DECLARE v_i INT; + # Keep the loop alive across transient cluster errors (BF aborts, + # certification failures, donor desync timeouts, ...). + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION + BEGIN + ROLLBACK; + END; + WHILE v_stop = 0 DO + START TRANSACTION; + SET v_i = 0; + WHILE v_i < 16 DO + INSERT INTO t1 (val) VALUES (v_i); + SET v_i = v_i + 1; + END WHILE; + COMMIT; + # Throttle slightly between transactions so that a freshly joined node can + # catch up its replication queue instead of being starved by the load. + DO SLEEP(0.01); + SELECT stop INTO v_stop FROM ctrl WHERE id = 1; + END WHILE; +END| +DELIMITER ;| +--enable_query_log + +--echo # gtid_strict_mode must be enabled on all nodes +SELECT @@global.gtid_strict_mode AS gtid_strict_mode; + +# Make sure the schema reached the other nodes before starting the load. 
+--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 't1'; +--source include/wait_condition.inc +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 't1'; +--source include/wait_condition.inc + +# +# Start the continuous load on node_1 and node_2. +# +--connect node_1_load, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2_load, 127.0.0.1, root, , test, $NODE_MYPORT_2 + +--connection node_1_load +--send CALL p_load() + +--connection node_2_load +--send CALL p_load() + +# +# While the load is running, repeatedly stop node_3, purge its data +# directory and start it again. An empty data directory forces a full +# mariabackup SST on every rejoin. +# +--disable_query_log +--let $i = $restarts +while ($i) +{ + --connection node_3 + --source include/shutdown_mysqld.inc + --disable_query_log + + # Wait until node_3 has actually left the cluster. + # (shutdown_mysqld.inc / wait_condition.inc / start_mysqld.inc / + # galera_wait_ready.inc each re-enable the query log, so re-disable it after + # every such include to keep the loop output out of the result file.) + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + # Purge node_3's data directory. + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/test + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mysql + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/performance_schema + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mtr + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data + + # Start node_3 again (rejoins via mariabackup SST). 
+ --connection node_3 + --let $restart_noprint = 2 + --source include/start_mysqld.inc + --disable_query_log + --source include/galera_wait_ready.inc + --disable_query_log + + # Wait until the cluster is back to three nodes before the next cycle. + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + --dec $i +} +--enable_query_log + +# +# Make sure the whole cluster is healthy before stopping the load, so that +# any donor that desynced during SST has resynced and the loaders can read +# the stop flag without blocking. +# +--connection node_1 +--source include/galera_wait_ready.inc +--connection node_2 +--source include/galera_wait_ready.inc +--connection node_3 +--source include/galera_wait_ready.inc + +# +# Signal the loaders to stop and collect them. +# +--connection node_1 +UPDATE ctrl SET stop = 1 WHERE id = 1; + +--connection node_1_load +--reap +--connection node_2_load +--reap + +# +# Verify reconvergence and data / GTID consistency across all nodes. 
+# +--connection node_1 +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +--let $expect_count = `SELECT COUNT(*) FROM t1` +--let $expect_sum = `SELECT COALESCE(SUM(pk), 0) + COALESCE(SUM(val), 0) FROM t1` +--let $expect_gtid = `SELECT @@global.gtid_binlog_pos` + +--connection node_2 +SET SESSION wsrep_sync_wait = 15; +--let $wait_condition = SELECT COUNT(*) = $expect_count FROM t1 +--source include/wait_condition.inc +--disable_query_log +--eval SELECT $expect_count = (SELECT COUNT(*) FROM t1) AS count_match, $expect_sum = (SELECT COALESCE(SUM(pk), 0) + COALESCE(SUM(val), 0) FROM t1) AS checksum_match, '$expect_gtid' = @@global.gtid_binlog_pos AS gtid_match +--enable_query_log + +--connection node_3 +SET SESSION wsrep_sync_wait = 15; +--let $wait_condition = SELECT COUNT(*) = $expect_count FROM t1 +--source include/wait_condition.inc +--disable_query_log +--eval SELECT $expect_count = (SELECT COUNT(*) FROM t1) AS count_match, $expect_sum = (SELECT COALESCE(SUM(pk), 0) + COALESCE(SUM(val), 0) FROM t1) AS checksum_match, '$expect_gtid' = @@global.gtid_binlog_pos AS gtid_match +--enable_query_log + +# +# Cleanup. 
+# +--connection node_1 +--disable_query_log +DROP PROCEDURE p_load; +DROP TABLE t1; +DROP TABLE ctrl; + +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); +--enable_query_log + +--connection node_2 +--disable_query_log +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("WSREP: Did not find domain ID from SST script output"); +CALL mtr.add_suppression("WSREP: Ignoring server id for non bootstrap node"); +--enable_query_log + +--connection node_3 +--disable_query_log +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("InnoDB: Table \"mysql\"\\.\"innodb_index_stats\" not found"); +CALL mtr.add_suppression("InnoDB: Table \"mysql\"\\.\"innodb_table_stats\" not found"); +CALL mtr.add_suppression("Can't open and lock time zone table"); +CALL mtr.add_suppression("Can't open and lock privilege tables"); +CALL mtr.add_suppression("Table 'mysql\\.gtid_slave_pos' doesn't exist"); +CALL mtr.add_suppression("Native table .* has the wrong structure"); +CALL mtr.add_suppression("WSREP: Did not find domain ID from SST script output"); +CALL mtr.add_suppression("WSREP: Ignoring server id for non bootstrap node"); +--enable_query_log + +# Restore original auto_increment_offset values. 
+--source ../galera/include/auto_increment_offset_restore.inc + +--source include/galera_end.inc diff --git a/scripts/wsrep_sst_mariabackup.sh b/scripts/wsrep_sst_mariabackup.sh index 6ff1728a38702..b4342ec1a6d4b 100644 --- a/scripts/wsrep_sst_mariabackup.sh +++ b/scripts/wsrep_sst_mariabackup.sh @@ -1448,18 +1448,28 @@ else # joiner if [ -n "$WSREP_SST_OPT_BINLOG" ]; then cd "$DATA" + # + # MDEV-38147: Do not ship the donor's binary log to the joiner. + # + # The backed-up binary log only carries a Gtid_list, and that + # position can be ahead of the engine snapshot (BACKUP STAGE + # BLOCK_COMMIT blocks the engine commit but not the binary log + # write). With gtid_strict_mode=ON that ahead position makes the + # joiner raise error 1950 when it re-binlogs transactions during + # IST. + # + # Instead the joiner starts a fresh binary log and seeds its GTID + # position from the storage-engine checkpoint during recovery (see + # wsrep_seed_binlog_gtid_state() in sql/log.cc) - the exact position + # from which IST resumes, which keeps the joiner's binary log in + # lockstep with the rest of the cluster. Leaving $binlogs empty skips + # the move below; the donor binary log files stay in "$DATA" and are + # removed together with it after the move stage. + # binlogs="" - if [ -f 'xtrabackup_binlog_info' ]; then - NL=$'\n' - while read bin_string || [ -n "$bin_string" ]; do - bin_file=$(echo "$bin_string" | cut -f1) - if [ -f "$bin_file" ]; then - binlogs="$binlogs${binlogs:+$NL}$bin_file" - fi - done < 'xtrabackup_binlog_info' - else - binlogs=$(ls -d -1 "$binlog_base".[0-9]* 2>/dev/null || :) - fi + wsrep_log_info "Not shipping the donor's binary log; the joiner" \ + "will start a fresh binary log seeded from the" \ + "storage-engine checkpoint (MDEV-38147)" cd "$DATA_DIR" if [ -n "$binlog_dir" -a "$binlog_dir" != '.' 
-a \ "$binlog_dir" != "$DATA_DIR" ] diff --git a/sql/handler.cc b/sql/handler.cc index af27370e26133..5ce876df7d778 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -2556,9 +2556,9 @@ static my_xid wsrep_order_and_check_continuity(XID *list, int len) if (!wsrep_is_wsrep_xid(list + i) || wsrep_xid_seqno(list + i) != cur_seqno + 1) { - WSREP_WARN("Discovered discontinuity in recovered wsrep " - "transaction XIDs. Truncating the recovery list to " - "%d entries", i); + WSREP_DEBUG("Discovered discontinuity in recovered wsrep " + "transaction XIDs. Truncating the recovery list to " + "%d entries", i); break; } ++cur_seqno; diff --git a/sql/log.cc b/sql/log.cc index 0d4764cef8105..d6e9e1a64989b 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -64,6 +64,7 @@ #ifdef WITH_WSREP #include "wsrep_trans_observer.h" #include "wsrep_status.h" +#include "wsrep_xid.h" #endif /* WITH_WSREP */ #ifdef HAVE_REPLICATION @@ -12014,6 +12015,57 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, +#if defined(WITH_WSREP) && defined(HAVE_REPLICATION) +/* + MDEV-38147: A Galera mariabackup SST no longer ships the donor's binary log + (the only thing it carried was a Gtid_list whose position was ahead of the + snapshot, causing error 1950). Instead the joiner starts a fresh binary log, + so its Gtid_list / @@gtid_binlog_pos must be seeded from the recovered wsrep + position - otherwise the joiner would report an empty binlog position until + it re-binlogs new transactions, which breaks its use as an async master. + + The wsrep cluster position lives in the storage-engine checkpoint (restored + by the SST). Async-replica source positions live in mysql.gtid_slave_pos + (also restored from the engine) and are handled separately, so they are not + seeded here. + + The whole cluster binlogs cluster writes under one consistent stream (the + seqno stays in lockstep because every node applies in the same total order). 
+ The domain of that stream depends on the mode: + - wsrep_gtid_mode=ON : wsrep_gtid_domain_id (cluster writes are re-tagged + to it, see wsrep_mysqld.cc); this is the domain in the checkpoint. + - wsrep_gtid_mode=OFF: gtid_domain_id (cluster writes keep the node's + configured domain, no re-tag). + In both modes the committed cluster seqno is the SE checkpoint seqno, so we + seed that domain's binlog state to the checkpoint position. This is also the + exact position from which IST will resume re-binlogging, so the joiner stays + in lockstep with the rest of the cluster (and, in ON mode, avoids error 1950 + from re-binlogging over an ahead position). +*/ +static void wsrep_seed_binlog_gtid_state() +{ + wsrep_server_gtid_t const eng= wsrep_get_SE_checkpoint(); + if (eng.seqno <= 0) + return; /* not a wsrep node / no position */ + + rpl_gtid eng_gtid; + eng_gtid.domain_id= wsrep_gtid_mode ? eng.domain_id + : global_system_variables.gtid_domain_id; + eng_gtid.server_id= eng.server_id; + eng_gtid.seq_no= eng.seqno; + + rpl_gtid *cur= rpl_global_gtid_binlog_state.find_most_recent(eng_gtid.domain_id); + if (cur && cur->seq_no >= eng_gtid.seq_no) + return; /* binlog state already at or ahead of the checkpoint */ + + sql_print_information("WSREP: seeding binlog GTID state to %u-%u-%llu " + "from the storage-engine checkpoint", + eng_gtid.domain_id, eng_gtid.server_id, + (unsigned long long) eng_gtid.seq_no); + rpl_global_gtid_binlog_state.update_nolock(&eng_gtid, false); +} +#endif /* WITH_WSREP && HAVE_REPLICATION */ + int MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) { @@ -12048,6 +12100,10 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) error= 0; } } +#if defined(WITH_WSREP) && defined(HAVE_REPLICATION) + if (!error && WSREP_PROVIDER_EXISTS) + wsrep_seed_binlog_gtid_state(); +#endif return error; } diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc index 813ffc24e53b0..d4397a3439f1c 100644 ---
a/sql/wsrep_sst.cc +++ b/sql/wsrep_sst.cc @@ -402,9 +402,6 @@ static bool wsrep_sst_complete (THD* thd, Wsrep_server_state& server_state= Wsrep_server_state::instance(); enum wsrep::server_state::state state= server_state.state(); bool failed= false; - char start_pos_buf[FN_REFLEN]; - ssize_t len= wsrep::print_to_c_str(sst_gtid, start_pos_buf, FN_REFLEN-1); - start_pos_buf[len]='\0'; // Do not call sst_received if we are not in joiner or // initialized state on server. This is because it @@ -419,14 +416,31 @@ static bool wsrep_sst_complete (THD* thd, } else { - WSREP_INFO("SST succeeded for position %s", start_pos_buf); + /* + Note: sst_received() does NOT use sst_gtid (the position reported by + the SST script). It determines the position internally from storage via + Wsrep_server_service::get_position(). + For physical SST methods these two may differ (e.g. the joiner's storage + recovers to an earlier position than the script reported). Log the + position actually adopted, not the script-reported one, to avoid + confusion. + */ + wsrep::gtid const received_gtid(wsrep_get_SE_checkpoint()); + char recv_pos_buf[FN_REFLEN]; + ssize_t const recv_len= + wsrep::print_to_c_str(received_gtid, recv_pos_buf, FN_REFLEN-1); + recv_pos_buf[recv_len > 0 ? 
recv_len : 0]= '\0'; + WSREP_INFO("SST succeeded for position %s", recv_pos_buf); } } else { + char start_pos_buf[FN_REFLEN]; + ssize_t const len= wsrep::print_to_c_str(sst_gtid, start_pos_buf, FN_REFLEN - 1); + start_pos_buf[len]= '\0'; + WSREP_ERROR("SST failed for position %s initialized %d server_state %s", - start_pos_buf, - server_state.is_initialized(), + start_pos_buf, server_state.is_initialized(), wsrep::to_c_string(state)); failed= true; } From 0985417e80c48aba4dfacd96519e0a9fb415e63b Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Sun, 28 Jun 2026 11:58:22 +0200 Subject: [PATCH 2/2] MDEV-40179 Found N prepared transactions after mariabackup SST With log_bin=ON a transaction is committed via two-phase commit (the binary log is the second participant), so it passes through the InnoDB XA-prepare state. While a donor is held in BLOCK_COMMIT for a mariabackup backup, its parallel appliers (wsrep_slave_threads > 1) leave one or more such writesets prepared-but-not-yet-committed, and the snapshot captures them. On a freshly SST'd joiner nothing resolves these prepared transactions: binlog crash recovery does not run (the joiner has no in-use binlog to recover from), and the wsrep continuity-based commit is inactive because wsrep_emulate_bin_log is FALSE when log_bin is ON. The leftover prepared transactions then abort startup with "Found prepared transactions!". Note this does not depend on the prepared set being non-contiguous - even a contiguous run aborts, because nothing commits or rolls it back. Roll back these transactions in xarecover_handlerton().
Co-Authored-By: Claude Opus 4.8 --- .../suite/galera_3nodes/r/MDEV-40179.result | 47 +++ .../r/MDEV-40179_nobinlog.result | 47 +++ .../suite/galera_3nodes/t/MDEV-40179.cnf | 37 ++ .../suite/galera_3nodes/t/MDEV-40179.inc | 346 ++++++++++++++++++ .../suite/galera_3nodes/t/MDEV-40179.test | 20 + .../galera_3nodes/t/MDEV-40179_nobinlog.cnf | 27 ++ .../galera_3nodes/t/MDEV-40179_nobinlog.test | 16 + sql/handler.cc | 25 ++ 8 files changed, 565 insertions(+) create mode 100644 mysql-test/suite/galera_3nodes/r/MDEV-40179.result create mode 100644 mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-40179.inc create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-40179.test create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf create mode 100644 mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-40179.result b/mysql-test/suite/galera_3nodes/r/MDEV-40179.result new file mode 100644 index 0000000000000..9d8d43bbd7345 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-40179.result @@ -0,0 +1,47 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_2; +connection node_3; +connection n1_load_1; +CALL p_load('t1_1'); +connection n2_load_1; +CALL p_load('t1_5'); +connection n1_load_2; +CALL p_load('t1_2'); +connection n2_load_2; +CALL p_load('t1_6'); +connection n1_load_3; +CALL p_load('t1_3'); +connection n2_load_3; +CALL p_load('t1_7'); +connection n1_load_4; +CALL p_load('t1_4'); +connection n2_load_4; +CALL p_load('t1_8'); +connection node_1; +connection node_2; +connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS 
WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result b/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result new file mode 100644 index 0000000000000..6ca4480ce03dd --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result @@ -0,0 +1,47 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_2; +connection node_3; +connection n1_load_1; +CALL p_load('t1_1'); +connection n2_load_1; +CALL p_load('t1_5'); +connection n1_load_2; +CALL p_load('t1_2'); +connection n2_load_2; +CALL p_load('t1_6'); +connection n1_load_3; +CALL p_load('t1_3'); +connection n2_load_3; +CALL p_load('t1_7'); +connection n1_load_4; +CALL p_load('t1_4'); +connection n2_load_4; +CALL p_load('t1_8'); +connection node_1; +connection node_2; +connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match +1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match +1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf new file mode 100644 index 0000000000000..1ce02400920ae --- /dev/null +++ 
b/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf @@ -0,0 +1,37 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep_gtid_mode=ON +wsrep_gtid_domain_id=100 +gtid_domain_id=10 +log_bin +log_slave_updates=ON +# Parallel apply so that prepared transactions can be committed out of order, +# producing a non-contiguous prepared set on the donor. +wsrep_slave_threads=8 +# Slow, durable commits widen the window during which transactions sit in the +# prepared (XA) state of two-phase commit, so the backup's BLOCK_COMMIT +# snapshot is more likely to capture in-doubt transactions. sync_binlog adds an +# fsync inside the 2PC window without exploding the binlog file count, and a +# moderate max_binlog_size keeps the binlog rotating (further widening the +# window) while avoiding the tens of thousands of tiny files that a 4 KB limit +# would create under this load. +innodb_flush_log_at_trx_commit=1 +sync_binlog=1 +max_binlog_size=16384 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc b/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc new file mode 100644 index 0000000000000..55e1b20421d1b --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc @@ -0,0 +1,346 @@ +# +# Shared body for the MDEV-40179 tests (sourced by MDEV-40179.test with +# log_bin=ON and MDEV-40179_nobinlog.test with log_bin=OFF). +# +# The bug (reproduced by the log_bin=ON variant): +# +# With log_bin=ON a transaction is committed via two-phase commit (the binary +# log is the second participant), so it passes through the InnoDB XA-prepare +# state. While a donor is held in BLOCK_COMMIT for a mariabackup backup, its +# parallel appliers (wsrep_slave_threads > 1) leave one or more such writesets +# prepared-but-not-yet-committed, and the snapshot captures them. 
On a freshly +# SST'd joiner nothing resolves these prepared transactions: binlog crash +# recovery does not run (the joiner has no in-use binlog to recover from), and +# the wsrep continuity-based commit is inactive because wsrep_emulate_bin_log +# is FALSE when log_bin is ON. The leftover prepared transactions then abort +# startup with "Found N prepared transactions!". Note this does not depend on +# the prepared set being non-contiguous - even a contiguous run aborts, because +# nothing commits or rolls it back. +# +# The log_bin=OFF variant is coverage only: with a single (InnoDB) read-write +# engine and no binary log, commits use one-phase commit, so transactions never +# enter the XA-prepared state and the snapshot has nothing in doubt. It simply +# verifies that mariabackup SST and reconvergence keep working with log_bin=OFF. +# +# To maximize parallel apply on the donor (and thus the chance of catching +# prepared transactions in the snapshot) each client thread writes to its own +# table: there are no certification conflicts between writers, so all of them +# apply concurrently. $writers client threads load on each of node_1 and node_2 +# while node_3 is repeatedly stopped, has its data directory purged and is +# started again, forcing a full mariabackup SST on every rejoin. At the end the +# cluster must reconverge to three nodes and all three nodes must hold identical +# data (and, with log_bin, identical GTID positions).
+# +# Parameters set by the including .test: +# $restarts - number of stop/purge/start cycles for node_3 +# $writers - number of concurrent loader threads per node (each gets its +# own table to avoid certification conflicts) +# $check_gtid - 1 to also compare @@global.gtid_binlog_pos across nodes +# (only meaningful with log_bin), 0 otherwise +# + +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values so that MTR's post-check is +# happy after node_3 has been restarted multiple times. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# Total number of data tables: one per writer thread across both nodes. +--let $ntables = `SELECT 2 * $writers` + +# +# Schema: t1_1 .. t1_$ntables hold the load (one table per writer thread), +# ctrl carries the stop flag for the loaders. +# +--connection node_1 +--disable_query_log +CREATE TABLE ctrl (id INT PRIMARY KEY, stop INT) ENGINE=InnoDB; +INSERT INTO ctrl VALUES (1, 0); + +--let $t = 1 +while ($t <= $ntables) +{ + --eval CREATE TABLE t1_$t (pk BIGINT AUTO_INCREMENT PRIMARY KEY, val INT) ENGINE=InnoDB + --inc $t +} + +DELIMITER |; +CREATE PROCEDURE p_load(IN tname VARCHAR(64)) +BEGIN + DECLARE v_stop INT DEFAULT 0; + DECLARE v_i INT; + # Keep the loop alive across transient cluster errors (BF aborts, + # certification failures, donor desync timeouts, ...). 
+ DECLARE CONTINUE HANDLER FOR SQLEXCEPTION + BEGIN + ROLLBACK; + END; + SET @ins_sql = CONCAT('INSERT INTO ', tname, ' (pk, val) VALUES (DEFAULT, 1)'); + PREPARE ins FROM @ins_sql; + WHILE v_stop = 0 DO + START TRANSACTION; + SET v_i = 0; + WHILE v_i < 16 DO + EXECUTE ins; + SET v_i = v_i + 1; + END WHILE; + COMMIT; + # Throttle slightly between transactions so that a freshly joined node can + # catch up its replication queue instead of being starved by the load. + DO SLEEP(0.01); + SELECT stop INTO v_stop FROM ctrl WHERE id = 1; + END WHILE; + DEALLOCATE PREPARE ins; +END| +DELIMITER ;| +--enable_query_log + +# Make sure the schema reached the other nodes before starting the load. +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = $ntables FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME LIKE 't1\_%'; +--source include/wait_condition.inc +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = $ntables FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME LIKE 't1\_%'; +--source include/wait_condition.inc + +# +# Start the continuous load: $writers threads on node_1 (tables t1_1..t1_W) +# and $writers threads on node_2 (tables t1_(W+1)..t1_2W). +# +--disable_query_log +--let $w = 1 +while ($w <= $writers) +{ + --connect (n1_load_$w, 127.0.0.1, root, , test, $NODE_MYPORT_1) + --connect (n2_load_$w, 127.0.0.1, root, , test, $NODE_MYPORT_2) + --inc $w +} +--enable_query_log + +--let $w = 1 +--let $n2 = $writers +while ($w <= $writers) +{ + --connection n1_load_$w + --send_eval CALL p_load('t1_$w') + --inc $n2 + --connection n2_load_$w + --send_eval CALL p_load('t1_$n2') + --inc $w +} + +# +# While the load is running, repeatedly stop node_3, purge its data +# directory and start it again. An empty data directory forces a full +# mariabackup SST on every rejoin. 
+# +--disable_query_log +--let $i = $restarts +while ($i) +{ + --connection node_3 + --source include/shutdown_mysqld.inc + --disable_query_log + + # Wait until node_3 has actually left the cluster. + # (shutdown_mysqld.inc / wait_condition.inc / start_mysqld.inc / + # galera_wait_ready.inc each re-enable the query log, so re-disable it after + # every such include to keep the loop output out of the result file.) + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + # Purge node_3's data directory. + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/test + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mysql + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/performance_schema + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mtr + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data + + # Start node_3 again (rejoins via mariabackup SST). + --connection node_3 + --let $restart_noprint = 2 + --source include/start_mysqld.inc + --disable_query_log + --source include/galera_wait_ready.inc + --disable_query_log + + # Wait until the cluster is back to three nodes before the next cycle. + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + --dec $i +} +--enable_query_log + +# +# Make sure the whole cluster is healthy before stopping the load, so that +# any donor that desynced during SST has resynced and the loaders can read +# the stop flag without blocking. +# +--connection node_1 +--source include/galera_wait_ready.inc +--connection node_2 +--source include/galera_wait_ready.inc +--connection node_3 +--source include/galera_wait_ready.inc + +# +# Signal the loaders to stop and collect them. 
+# +--connection node_1 +UPDATE ctrl SET stop = 1 WHERE id = 1; + +--disable_query_log +--let $w = 1 +while ($w <= $writers) +{ + --connection n1_load_$w + --reap + --connection n2_load_$w + --reap + --inc $w +} +--enable_query_log + +# +# Build the aggregate count / checksum expressions over all data tables. +# +--let $count_expr = 0 +--let $sum_expr = 0 +--let $t = 1 +while ($t <= $ntables) +{ + --let $count_expr = $count_expr + (SELECT COUNT(*) FROM t1_$t) + --let $sum_expr = $sum_expr + (SELECT COALESCE(SUM(pk),0)+COALESCE(SUM(val),0) FROM t1_$t) + --inc $t +} + +# +# Verify reconvergence and data / GTID consistency across all nodes. +# +--connection node_1 +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# The load has stopped; issue one final transaction from node_1 (sync_wait is +# on, so node_1 first applies everything else). This makes node_1 the origin of +# the cluster's highest GTID, so the checks below can wait for node_2/node_3 to +# converge *up* to node_1's position instead of comparing a single snapshot: +# @@gtid_binlog_pos is a system variable, so reading it is not covered by +# wsrep_sync_wait and a plain read can otherwise sample a position before the +# node has finished applying (a race that grows with accumulated load, e.g. +# under --repeat). +--disable_query_log +UPDATE ctrl SET stop = 2 WHERE id = 1; +--enable_query_log + +--let $expect_count = `SELECT $count_expr` +--let $expect_sum = `SELECT $sum_expr` +if ($check_gtid) +{ + # Compare only the wsrep domain (wsrep_gtid_domain_id) of gtid_binlog_pos. + # That is the part the whole cluster shares. Other domains in the position + # are node-local and legitimately differ: e.g. 
CALL mtr.add_suppression() + # below writes to the non-replicated 'mtr' database, which each node binlogs + # under its own gtid_domain_id/server_id - so those entries accumulate + # per-node across runs (visible under --repeat) and must not be compared. + --let $wsrep_dom = `SELECT @@global.wsrep_gtid_domain_id` + --let $expect_gtid = `SELECT REGEXP_SUBSTR(@@global.gtid_binlog_pos, '(? prepared transactions!". (gtid_strict_mode is enabled so any +# binlog/engine position inconsistency would also be caught.) +# +# See MDEV-40179.inc for the shared test body. +# + +--let $restarts = 8 +--let $writers = 4 +--let $check_gtid = 1 +--source MDEV-40179.inc diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf new file mode 100644 index 0000000000000..38740c2ec8702 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf @@ -0,0 +1,27 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +# No log_bin: Galera uses its emulated binlog (wsrep_emulate_bin_log), so the +# wsrep XID continuity check is what resolves prepared transactions on a joiner. +# Parallel apply so that prepared transactions can be committed out of order, +# producing a non-contiguous prepared set on the donor. +wsrep_slave_threads=8 +# Slow, durable commits widen the window during which transactions sit in the +# prepared state, so the backup's BLOCK_COMMIT snapshot is more likely to +# capture in-doubt transactions. 
+innodb_flush_log_at_trx_commit=1 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test new file mode 100644 index 0000000000000..3eb96d3defd1c --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test @@ -0,0 +1,16 @@ +# +# MDEV-40179 - prepared transactions left behind by a mariabackup SST. +# +# log_bin=OFF variant: coverage only. With a single InnoDB read-write engine +# and no binary log, commits use one-phase commit, so transactions never enter +# the XA-prepared state and a mariabackup snapshot has nothing in doubt - the +# bug cannot occur here. This variant just exercises the same load and repeated +# mariabackup SST with log_bin=OFF and checks the cluster reconverges. +# +# See MDEV-40179.inc for the shared test body. +# + +--let $restarts = 8 +--let $writers = 4 +--let $check_gtid = 0 +--source MDEV-40179.inc diff --git a/sql/handler.cc b/sql/handler.cc index 5ce876df7d778..ad71aa5e24516 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -2843,6 +2843,31 @@ static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin, x <= wsrep_limit) && info->dry_run, info->dry_run)) { +#ifdef WITH_WSREP + /* + MDEV-40179: a wsrep transaction still in the prepared state at the + final recovery pass (the dry run, commit_list == 0) is past the + storage-engine checkpoint and will be re-delivered by the cluster + (IST/SST). After a physical SST (mariabackup) the joiner runs no + binlog XA recovery to commit or roll back such transactions, so + without this they would abort startup with + "Found N prepared transactions!". Roll them back here; the cluster + re-applies them from the donor. Non-wsrep (e.g. user XA) prepared + transactions are left untouched and still reported. 
+ + The guard is WSREP_PROVIDER_EXISTS ("a Galera provider is loaded"): + a node configured with a provider will rejoin and receive + these transactions; a standalone node (no provider) cannot, so there + we keep the conservative default and still report them. + */ + if (WSREP_PROVIDER_EXISTS && wsrep_is_wsrep_xid(info->list + i)) + { + if (hton->rollback_by_xid(hton, info->list + i) == 0) + sql_print_information("Rolled back orphan prepared wsrep " + "transaction %lld", (longlong) x); + continue; + } +#endif /* WITH_WSREP */ info->found_my_xids++; continue; }