diff --git a/extra/mariabackup/backup_mysql.cc b/extra/mariabackup/backup_mysql.cc index 7fa02b1128c6d..bc032d3e5f4d9 100644 --- a/extra/mariabackup/backup_mysql.cc +++ b/extra/mariabackup/backup_mysql.cc @@ -1504,8 +1504,14 @@ write_galera_info(ds_ctxt *datasink, MYSQL *connection) domain_id ? domain_id : domain_id55); } - if (result) - write_current_binlog_file(datasink, connection); + /* + MDEV-38147: Do not flush and copy the donor's current binary log here. + This used to ship the binary log to the SST joiner, but the joiner no + longer uses it - it starts a fresh binary log and seeds its GTID position + from the storage-engine checkpoint (see wsrep_seed_binlog_gtid_state() in + sql/log.cc and scripts/wsrep_sst_mariabackup.sh). Flushing here only + rotated the donor's binary log needlessly on every SST. + */ if (result) msg("Writing Galera info succeeded with %s:%s %s", @@ -1522,88 +1528,6 @@ write_galera_info(ds_ctxt *datasink, MYSQL *connection) } -/*********************************************************************//** -Flush and copy the current binary log file into the backup, -if GTID is enabled */ -bool -write_current_binlog_file(ds_ctxt *datasink, MYSQL *connection) -{ - char *executed_gtid_set = NULL; - char *gtid_binlog_state = NULL; - char *log_bin_file = NULL; - char *log_bin_dir = NULL; - bool gtid_exists; - bool result = true; - char filepath[FN_REFLEN]; - - mysql_variable status[] = { - {"Executed_Gtid_Set", &executed_gtid_set}, - {NULL, NULL} - }; - - mysql_variable status_after_flush[] = { - {"File", &log_bin_file}, - {NULL, NULL} - }; - - mysql_variable vars[] = { - {"gtid_binlog_state", &gtid_binlog_state}, - {"log_bin_basename", &log_bin_dir}, - {NULL, NULL} - }; - - read_mysql_variables(connection, "SHOW MASTER STATUS", status, false); - read_mysql_variables(connection, "SHOW VARIABLES", vars, true); - - gtid_exists = (executed_gtid_set && *executed_gtid_set) - || (gtid_binlog_state && *gtid_binlog_state); - - if (gtid_exists) { - size_t
log_bin_dir_length; - - xb_mysql_query(connection, "FLUSH BINARY LOGS", false); - - read_mysql_variables(connection, "SHOW MASTER STATUS", - status_after_flush, false); - - if (opt_log_bin != NULL && strchr(opt_log_bin, FN_LIBCHAR)) { - /* If log_bin is set, it has priority */ - if (log_bin_dir) { - free(log_bin_dir); - } - log_bin_dir = strdup(opt_log_bin); - } else if (log_bin_dir == NULL) { - /* Default location is MySQL datadir */ - log_bin_dir = strdup("./"); - } - - dirname_part(log_bin_dir, log_bin_dir, &log_bin_dir_length); - - /* strip final slash if it is not the only path component */ - if (log_bin_dir_length > 1 && - log_bin_dir[log_bin_dir_length - 1] == FN_LIBCHAR) { - log_bin_dir[log_bin_dir_length - 1] = 0; - } - - if (log_bin_dir == NULL || log_bin_file == NULL) { - msg("Failed to get master binlog coordinates from " - "SHOW MASTER STATUS"); - result = false; - goto cleanup; - } - - snprintf(filepath, sizeof(filepath), "%s%c%s", - log_bin_dir, FN_LIBCHAR, log_bin_file); - result = datasink->copy_file(filepath, log_bin_file, 0); - } - -cleanup: - free_mysql_variables(status_after_flush); - free_mysql_variables(status); - free_mysql_variables(vars); - - return(result); -} /*********************************************************************//** diff --git a/extra/mariabackup/backup_mysql.h b/extra/mariabackup/backup_mysql.h index 55700dddf6d67..ac163643c103c 100644 --- a/extra/mariabackup/backup_mysql.h +++ b/extra/mariabackup/backup_mysql.h @@ -53,9 +53,6 @@ xb_mysql_query(MYSQL *connection, const char *query, bool use_result, void unlock_all(MYSQL *connection); -bool -write_current_binlog_file(ds_ctxt *datasink, MYSQL *connection); - bool write_binlog_info(ds_ctxt *datasink, MYSQL *connection); diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-38147.result b/mysql-test/suite/galera_3nodes/r/MDEV-38147.result new file mode 100644 index 0000000000000..7e7199aa32296 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-38147.result @@ -0,0 +1,43 
@@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +# gtid_strict_mode must be enabled on all nodes +SELECT @@global.gtid_strict_mode AS gtid_strict_mode; +gtid_strict_mode +1 +connection node_2; +connection node_3; +connect node_1_load, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2_load, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1_load; +CALL p_load(); +connection node_2_load; +CALL p_load(); +connection node_1; +connection node_2; +connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1_load; +connection node_2_load; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-40179.result b/mysql-test/suite/galera_3nodes/r/MDEV-40179.result new file mode 100644 index 0000000000000..9d8d43bbd7345 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-40179.result @@ -0,0 +1,47 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_2; +connection node_3; +connection n1_load_1; +CALL p_load('t1_1'); +connection n2_load_1; +CALL p_load('t1_5'); +connection n1_load_2; +CALL p_load('t1_2'); +connection n2_load_2; +CALL p_load('t1_6'); +connection n1_load_3; +CALL p_load('t1_3'); +connection n2_load_3; +CALL p_load('t1_7'); +connection n1_load_4; +CALL p_load('t1_4'); +connection n2_load_4; +CALL p_load('t1_8'); +connection node_1; +connection node_2; 
+connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match gtid_match +1 1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result b/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result new file mode 100644 index 0000000000000..6ca4480ce03dd --- /dev/null +++ b/mysql-test/suite/galera_3nodes/r/MDEV-40179_nobinlog.result @@ -0,0 +1,47 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_2; +connection node_3; +connection n1_load_1; +CALL p_load('t1_1'); +connection n2_load_1; +CALL p_load('t1_5'); +connection n1_load_2; +CALL p_load('t1_2'); +connection n2_load_2; +CALL p_load('t1_6'); +connection n1_load_3; +CALL p_load('t1_3'); +connection n2_load_3; +CALL p_load('t1_7'); +connection n1_load_4; +CALL p_load('t1_4'); +connection n2_load_4; +CALL p_load('t1_8'); +connection node_1; +connection node_2; +connection node_3; +connection node_1; +UPDATE ctrl SET stop = 1 WHERE id = 1; +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +wsrep_cluster_size +3 +connection node_2; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match +1 1 +connection node_3; +SET SESSION wsrep_sync_wait = 15; +count_match checksum_match +1 1 +connection node_1; +connection node_2; +connection node_3; +disconnect node_2; +disconnect node_1; 
diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf new file mode 100644 index 0000000000000..a8c4cfe813755 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-38147.cnf @@ -0,0 +1,25 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep_gtid_mode=ON +wsrep_gtid_domain_id=100 +gtid_domain_id=10 +log_bin +log_slave_updates=ON +wsrep_slave_threads=4 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-38147.test b/mysql-test/suite/galera_3nodes/t/MDEV-38147.test new file mode 100644 index 0000000000000..8b7eb3715290a --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-38147.test @@ -0,0 +1,229 @@ +# +# MDEV-38147 +# +# Three node cluster with gtid_strict_mode enabled and mariabackup SST. +# Two nodes (node_1 and node_2) run a continuous load, inserting batches +# of rows in a throttled loop, while the third node (node_3) is repeatedly +# stopped, has its data directory purged and is started again, forcing a +# full mariabackup SST on every rejoin (8 times). +# +# At the end the cluster must reconverge to three nodes and all three +# nodes must hold identical data and GTID positions. +# + +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc + +# Number of stop/purge/start cycles for node_3. +--let $restarts = 8 + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values so that MTR's post-check is +# happy after node_3 has been restarted multiple times. 
+--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# +# Schema: t1 holds the load, ctrl carries the stop flag for the loaders. +# +--connection node_1 +--disable_query_log +CREATE TABLE t1 (pk BIGINT AUTO_INCREMENT PRIMARY KEY, val INT) ENGINE=InnoDB; +CREATE TABLE ctrl (id INT PRIMARY KEY, stop INT) ENGINE=InnoDB; +INSERT INTO ctrl VALUES (1, 0); + +DELIMITER |; +CREATE PROCEDURE p_load() +BEGIN + DECLARE v_stop INT DEFAULT 0; + DECLARE v_i INT; + # Keep the loop alive across transient cluster errors (BF aborts, + # certification failures, donor desync timeouts, ...). + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION + BEGIN + ROLLBACK; + END; + WHILE v_stop = 0 DO + START TRANSACTION; + SET v_i = 0; + WHILE v_i < 16 DO + INSERT INTO t1 (val) VALUES (v_i); + SET v_i = v_i + 1; + END WHILE; + COMMIT; + # Throttle slightly between transactions so that a freshly joined node can + # catch up its replication queue instead of being starved by the load. + DO SLEEP(0.01); + SELECT stop INTO v_stop FROM ctrl WHERE id = 1; + END WHILE; +END| +DELIMITER ;| +--enable_query_log + +--echo # gtid_strict_mode must be enabled on all nodes +SELECT @@global.gtid_strict_mode AS gtid_strict_mode; + +# Make sure the schema reached the other nodes before starting the load. +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 't1'; +--source include/wait_condition.inc +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 't1'; +--source include/wait_condition.inc + +# +# Start the continuous load on node_1 and node_2. 
+# +--connect node_1_load, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2_load, 127.0.0.1, root, , test, $NODE_MYPORT_2 + +--connection node_1_load +--send CALL p_load() + +--connection node_2_load +--send CALL p_load() + +# +# While the load is running, repeatedly stop node_3, purge its data +# directory and start it again. An empty data directory forces a full +# mariabackup SST on every rejoin. +# +--disable_query_log +--let $i = $restarts +while ($i) +{ + --connection node_3 + --source include/shutdown_mysqld.inc + --disable_query_log + + # Wait until node_3 has actually left the cluster. + # (shutdown_mysqld.inc / wait_condition.inc / start_mysqld.inc / + # galera_wait_ready.inc each re-enable the query log, so re-disable it after + # every such include to keep the loop output out of the result file.) + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + # Purge node_3's data directory. + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/test + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mysql + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/performance_schema + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mtr + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data + + # Start node_3 again (rejoins via mariabackup SST). + --connection node_3 + --let $restart_noprint = 2 + --source include/start_mysqld.inc + --disable_query_log + --source include/galera_wait_ready.inc + --disable_query_log + + # Wait until the cluster is back to three nodes before the next cycle. 
+ --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + --dec $i +} +--enable_query_log + +# +# Make sure the whole cluster is healthy before stopping the load, so that +# any donor that desynced during SST has resynced and the loaders can read +# the stop flag without blocking. +# +--connection node_1 +--source include/galera_wait_ready.inc +--connection node_2 +--source include/galera_wait_ready.inc +--connection node_3 +--source include/galera_wait_ready.inc + +# +# Signal the loaders to stop and collect them. +# +--connection node_1 +UPDATE ctrl SET stop = 1 WHERE id = 1; + +--connection node_1_load +--reap +--connection node_2_load +--reap + +# +# Verify reconvergence and data / GTID consistency across all nodes. +# Only the wsrep_gtid_domain_id part of gtid_binlog_pos is compared; other domains are node-local. +--connection node_1 +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +--let $expect_count = `SELECT COUNT(*) FROM t1` +--let $expect_sum = `SELECT COALESCE(SUM(pk), 0) + COALESCE(SUM(val), 0) FROM t1` +--let $expect_gtid = `SELECT REGEXP_SUBSTR(@@global.gtid_binlog_pos, CONCAT('(?<![0-9-])', @@global.wsrep_gtid_domain_id, '-[0-9]+-[0-9]+'))` + +--connection node_2 +SET SESSION wsrep_sync_wait = 15; +--let $wait_condition = SELECT COUNT(*) = $expect_count FROM t1 +--source include/wait_condition.inc +--disable_query_log +--eval SELECT $expect_count = (SELECT COUNT(*) FROM t1) AS count_match, $expect_sum = (SELECT COALESCE(SUM(pk), 0) +
COALESCE(SUM(val), 0) FROM t1) AS checksum_match, '$expect_gtid' = REGEXP_SUBSTR(@@global.gtid_binlog_pos, CONCAT('(?<![0-9-])', @@global.wsrep_gtid_domain_id, '-[0-9]+-[0-9]+')) AS gtid_match +--enable_query_log + +--connection node_3 +SET SESSION wsrep_sync_wait = 15; +--let $wait_condition = SELECT COUNT(*) = $expect_count FROM t1 +--source include/wait_condition.inc +--disable_query_log +--eval SELECT $expect_count = (SELECT COUNT(*) FROM t1) AS count_match, $expect_sum = (SELECT COALESCE(SUM(pk), 0) + COALESCE(SUM(val), 0) FROM t1) AS checksum_match, '$expect_gtid' = REGEXP_SUBSTR(@@global.gtid_binlog_pos, CONCAT('(?<![0-9-])', @@global.wsrep_gtid_domain_id, '-[0-9]+-[0-9]+')) AS gtid_match +--enable_query_log + +# +# Cleanup. +# +--connection node_1 +--disable_query_log +DROP PROCEDURE p_load; +DROP TABLE t1; +DROP TABLE ctrl; + +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); +--enable_query_log + +--connection node_2 +--disable_query_log +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("WSREP: Did not find domain ID from SST script output"); +CALL mtr.add_suppression("WSREP: Ignoring server id for non bootstrap node"); +--enable_query_log + +--connection node_3 +--disable_query_log +CALL mtr.add_suppression("WSREP: Failed to prepare for incremental state transfer"); +CALL mtr.add_suppression("WSREP: Member .* requested state transfer"); +CALL mtr.add_suppression("InnoDB: Table \"mysql\"\\.\"innodb_index_stats\" not found"); +CALL mtr.add_suppression("InnoDB: Table \"mysql\"\\.\"innodb_table_stats\" not found"); +CALL mtr.add_suppression("Can't open and lock time zone table"); +CALL mtr.add_suppression("Can't open and lock privilege tables"); +CALL mtr.add_suppression("Table 'mysql\\.gtid_slave_pos' doesn't exist"); +CALL mtr.add_suppression("Native table .* has the wrong structure"); +CALL mtr.add_suppression("WSREP: Did not find domain ID from SST script output"); +CALL mtr.add_suppression("WSREP: Ignoring server id for non bootstrap node"); +--enable_query_log + +# Restore original auto_increment_offset values.
+--source ../galera/include/auto_increment_offset_restore.inc + +--source include/galera_end.inc diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf new file mode 100644 index 0000000000000..1ce02400920ae --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179.cnf @@ -0,0 +1,37 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep_gtid_mode=ON +wsrep_gtid_domain_id=100 +gtid_domain_id=10 +log_bin +log_slave_updates=ON +# Parallel apply so that prepared transactions can be committed out of order, +# producing a non-contiguous prepared set on the donor. +wsrep_slave_threads=8 +# Slow, durable commits widen the window during which transactions sit in the +# prepared (XA) state of two-phase commit, so the backup's BLOCK_COMMIT +# snapshot is more likely to capture in-doubt transactions. sync_binlog adds an +# fsync inside the 2PC window without exploding the binlog file count, and a +# moderate max_binlog_size keeps the binlog rotating (further widening the +# window) while avoiding the tens of thousands of tiny files that a 4 KB limit +# would create under this load. +innodb_flush_log_at_trx_commit=1 +sync_binlog=1 +max_binlog_size=16384 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc b/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc new file mode 100644 index 0000000000000..55e1b20421d1b --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179.inc @@ -0,0 +1,346 @@ +# +# Shared body for the MDEV-40179 tests (sourced by MDEV-40179.test with +# log_bin=ON and MDEV-40179_nobinlog.test with log_bin=OFF). 
+# +# The bug (reproduced by the log_bin=ON variant): +# +# With log_bin=ON a transaction is committed via two-phase commit (the binary +# log is the second participant), so it passes through the InnoDB XA-prepare +# state. While a donor is held in BLOCK_COMMIT for a mariabackup backup, its +# parallel appliers (wsrep_slave_threads > 1) leave one or more such writesets +# prepared-but-not-yet-committed, and the snapshot captures them. On a freshly +# SST'd joiner nothing resolves these prepared transactions: binlog crash +# recovery does not run (the joiner has no in-use binlog to recover from), and +# the wsrep continuity-based commit is inactive because wsrep_emulate_bin_log +# is FALSE when log_bin is ON. The leftover prepared transactions then abort +# startup with "Found prepared transactions!". Note this does not depend on +# the prepared set being non-contiguous - even a contiguous run aborts, because +# nothing commits or rolls it back. +# +# The log_bin=OFF variant is coverage only: with a single (InnoDB) read-write +# engine and no binary log, commits use one-phase commit, so transactions never +# enter the XA-prepared state and the snapshot has nothing in doubt. It simply +# verifies that mariabackup SST and reconvergence keep working with log_bin=OFF. +# +# To maximize parallel apply on the donor (and thus the chance of catching +# prepared transactions in the snapshot) each client thread writes to its own +# table: there are no certification conflicts between writers, so all of them +# apply concurrently. $writers client threads load on each of node_1 and node_2 +# while node_3 is repeatedly stopped, has its data directory purged and is +# started again, forcing a full mariabackup SST on every rejoin. At the end the +# cluster must reconverge to three nodes and all three nodes must hold identical +# data (and, with log_bin, identical GTID positions). 
+# +# Parameters set by the including .test: +# $restarts - number of stop/purge/start cycles for node_3 +# $writers - number of concurrent loader threads per node (each gets its +# own table to avoid certification conflicts) +# $check_gtid - 1 to also compare @@global.gtid_binlog_pos across nodes +# (only meaningful with log_bin), 0 otherwise +# + +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values so that MTR's post-check is +# happy after node_3 has been restarted multiple times. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# Total number of data tables: one per writer thread across both nodes. +--let $ntables = `SELECT 2 * $writers` + +# +# Schema: t1_1 .. t1_$ntables hold the load (one table per writer thread), +# ctrl carries the stop flag for the loaders. +# +--connection node_1 +--disable_query_log +CREATE TABLE ctrl (id INT PRIMARY KEY, stop INT) ENGINE=InnoDB; +INSERT INTO ctrl VALUES (1, 0); + +--let $t = 1 +while ($t <= $ntables) +{ + --eval CREATE TABLE t1_$t (pk BIGINT AUTO_INCREMENT PRIMARY KEY, val INT) ENGINE=InnoDB + --inc $t +} + +DELIMITER |; +CREATE PROCEDURE p_load(IN tname VARCHAR(64)) +BEGIN + DECLARE v_stop INT DEFAULT 0; + DECLARE v_i INT; + # Keep the loop alive across transient cluster errors (BF aborts, + # certification failures, donor desync timeouts, ...). 
+ DECLARE CONTINUE HANDLER FOR SQLEXCEPTION + BEGIN + ROLLBACK; + END; + SET @ins_sql = CONCAT('INSERT INTO ', tname, ' (pk, val) VALUES (DEFAULT, 1)'); + PREPARE ins FROM @ins_sql; + WHILE v_stop = 0 DO + START TRANSACTION; + SET v_i = 0; + WHILE v_i < 16 DO + EXECUTE ins; + SET v_i = v_i + 1; + END WHILE; + COMMIT; + # Throttle slightly between transactions so that a freshly joined node can + # catch up its replication queue instead of being starved by the load. + DO SLEEP(0.01); + SELECT stop INTO v_stop FROM ctrl WHERE id = 1; + END WHILE; + DEALLOCATE PREPARE ins; +END| +DELIMITER ;| +--enable_query_log + +# Make sure the schema reached the other nodes before starting the load. +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = $ntables FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME LIKE 't1\_%'; +--source include/wait_condition.inc +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = $ntables FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME LIKE 't1\_%'; +--source include/wait_condition.inc + +# +# Start the continuous load: $writers threads on node_1 (tables t1_1..t1_W) +# and $writers threads on node_2 (tables t1_(W+1)..t1_2W). +# +--disable_query_log +--let $w = 1 +while ($w <= $writers) +{ + --connect (n1_load_$w, 127.0.0.1, root, , test, $NODE_MYPORT_1) + --connect (n2_load_$w, 127.0.0.1, root, , test, $NODE_MYPORT_2) + --inc $w +} +--enable_query_log + +--let $w = 1 +--let $n2 = $writers +while ($w <= $writers) +{ + --connection n1_load_$w + --send_eval CALL p_load('t1_$w') + --inc $n2 + --connection n2_load_$w + --send_eval CALL p_load('t1_$n2') + --inc $w +} + +# +# While the load is running, repeatedly stop node_3, purge its data +# directory and start it again. An empty data directory forces a full +# mariabackup SST on every rejoin. 
+# +--disable_query_log +--let $i = $restarts +while ($i) +{ + --connection node_3 + --source include/shutdown_mysqld.inc + --disable_query_log + + # Wait until node_3 has actually left the cluster. + # (shutdown_mysqld.inc / wait_condition.inc / start_mysqld.inc / + # galera_wait_ready.inc each re-enable the query log, so re-disable it after + # every such include to keep the loop output out of the result file.) + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + # Purge node_3's data directory. + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/test + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mysql + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/performance_schema + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data/mtr + --remove_files_wildcard $MYSQLTEST_VARDIR/mysqld.3/data + + # Start node_3 again (rejoins via mariabackup SST). + --connection node_3 + --let $restart_noprint = 2 + --source include/start_mysqld.inc + --disable_query_log + --source include/galera_wait_ready.inc + --disable_query_log + + # Wait until the cluster is back to three nodes before the next cycle. + --connection node_1 + --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + --source include/wait_condition.inc + --disable_query_log + + --dec $i +} +--enable_query_log + +# +# Make sure the whole cluster is healthy before stopping the load, so that +# any donor that desynced during SST has resynced and the loaders can read +# the stop flag without blocking. +# +--connection node_1 +--source include/galera_wait_ready.inc +--connection node_2 +--source include/galera_wait_ready.inc +--connection node_3 +--source include/galera_wait_ready.inc + +# +# Signal the loaders to stop and collect them. 
+# +--connection node_1 +UPDATE ctrl SET stop = 1 WHERE id = 1; + +--disable_query_log +--let $w = 1 +while ($w <= $writers) +{ + --connection n1_load_$w + --reap + --connection n2_load_$w + --reap + --inc $w +} +--enable_query_log + +# +# Build the aggregate count / checksum expressions over all data tables. +# +--let $count_expr = 0 +--let $sum_expr = 0 +--let $t = 1 +while ($t <= $ntables) +{ + --let $count_expr = $count_expr + (SELECT COUNT(*) FROM t1_$t) + --let $sum_expr = $sum_expr + (SELECT COALESCE(SUM(pk),0)+COALESCE(SUM(val),0) FROM t1_$t) + --inc $t +} + +# +# Verify reconvergence and data / GTID consistency across all nodes. +# +--connection node_1 +SET SESSION wsrep_sync_wait = 15; +SELECT VARIABLE_VALUE AS wsrep_cluster_size FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# The load has stopped; issue one final transaction from node_1 (sync_wait is +# on, so node_1 first applies everything else). This makes node_1 the origin of +# the cluster's highest GTID, so the checks below can wait for node_2/node_3 to +# converge *up* to node_1's position instead of comparing a single snapshot: +# @@gtid_binlog_pos is a system variable, so reading it is not covered by +# wsrep_sync_wait and a plain read can otherwise sample a position before the +# node has finished applying (a race that grows with accumulated load, e.g. +# under --repeat). +--disable_query_log +UPDATE ctrl SET stop = 2 WHERE id = 1; +--enable_query_log + +--let $expect_count = `SELECT $count_expr` +--let $expect_sum = `SELECT $sum_expr` +if ($check_gtid) +{ + # Compare only the wsrep domain (wsrep_gtid_domain_id) of gtid_binlog_pos. + # That is the part the whole cluster shares. Other domains in the position + # are node-local and legitimately differ: e.g. 
CALL mtr.add_suppression() + # below writes to the non-replicated 'mtr' database, which each node binlogs + # under its own gtid_domain_id/server_id - so those entries accumulate + # per-node across runs (visible under --repeat) and must not be compared. + --let $wsrep_dom = `SELECT @@global.wsrep_gtid_domain_id` + --let $expect_gtid = `SELECT REGEXP_SUBSTR(@@global.gtid_binlog_pos, '(? prepared transactions!". (gtid_strict_mode is enabled so any +# binlog/engine position inconsistency would also be caught.) +# +# See MDEV-40179.inc for the shared test body. +# + +--let $restarts = 8 +--let $writers = 4 +--let $check_gtid = 1 +--source MDEV-40179.inc diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf new file mode 100644 index 0000000000000..38740c2ec8702 --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.cnf @@ -0,0 +1,27 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +# No log_bin: Galera uses its emulated binlog (wsrep_emulate_bin_log), so the +# wsrep XID continuity check is what resolves prepared transactions on a joiner. +# Parallel apply so that prepared transactions can be committed out of order, +# producing a non-contiguous prepared set on the donor. +wsrep_slave_threads=8 +# Slow, durable commits widen the window during which transactions sit in the +# prepared state, so the backup's BLOCK_COMMIT snapshot is more likely to +# capture in-doubt transactions. 
+innodb_flush_log_at_trx_commit=1 + +[mysqld.1] +server_id=11 + +[mysqld.2] +server_id=12 + +[mysqld.3] +server_id=13 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff --git a/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test new file mode 100644 index 0000000000000..3eb96d3defd1c --- /dev/null +++ b/mysql-test/suite/galera_3nodes/t/MDEV-40179_nobinlog.test @@ -0,0 +1,16 @@ +# +# MDEV-40179 - prepared transactions left behind by a mariabackup SST. +# +# log_bin=OFF variant: coverage only. With a single InnoDB read-write engine +# and no binary log, commits use one-phase commit, so transactions never enter +# the XA-prepared state and a mariabackup snapshot has nothing in doubt - the +# bug cannot occur here. This variant just exercises the same load and repeated +# mariabackup SST with log_bin=OFF and checks the cluster reconverges. +# +# See MDEV-40179.inc for the shared test body. +# + +--let $restarts = 8 +--let $writers = 4 +--let $check_gtid = 0 +--source MDEV-40179.inc diff --git a/scripts/wsrep_sst_mariabackup.sh b/scripts/wsrep_sst_mariabackup.sh index 6ff1728a38702..b4342ec1a6d4b 100644 --- a/scripts/wsrep_sst_mariabackup.sh +++ b/scripts/wsrep_sst_mariabackup.sh @@ -1448,18 +1448,28 @@ else # joiner if [ -n "$WSREP_SST_OPT_BINLOG" ]; then cd "$DATA" + # + # MDEV-38147: Do not ship the donor's binary log to the joiner. + # + # The backed-up binary log only carries a Gtid_list, and that + # position can be ahead of the engine snapshot (BACKUP STAGE + # BLOCK_COMMIT blocks the engine commit but not the binary log + # write). With gtid_strict_mode=ON that ahead position makes the + # joiner raise error 1950 when it re-binlogs transactions during + # IST. 
+ # + # Instead the joiner starts a fresh binary log and seeds its GTID + # position from the storage-engine checkpoint during recovery (see + # wsrep_seed_binlog_gtid_state() in sql/log.cc) - the exact position + # from which IST resumes, which keeps the joiner's binary log in + # lockstep with the rest of the cluster. Leaving $binlogs empty skips + # the move below; the donor binary log files stay in "$DATA" and are + # removed together with it after the move stage. + # binlogs="" - if [ -f 'xtrabackup_binlog_info' ]; then - NL=$'\n' - while read bin_string || [ -n "$bin_string" ]; do - bin_file=$(echo "$bin_string" | cut -f1) - if [ -f "$bin_file" ]; then - binlogs="$binlogs${binlogs:+$NL}$bin_file" - fi - done < 'xtrabackup_binlog_info' - else - binlogs=$(ls -d -1 "$binlog_base".[0-9]* 2>/dev/null || :) - fi + wsrep_log_info "Not shipping the donor's binary log; the joiner" \ + "will start a fresh binary log seeded from the" \ + "storage-engine checkpoint (MDEV-38147)" cd "$DATA_DIR" if [ -n "$binlog_dir" -a "$binlog_dir" != '.' -a \ "$binlog_dir" != "$DATA_DIR" ] diff --git a/sql/handler.cc b/sql/handler.cc index af27370e26133..ad71aa5e24516 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -2556,9 +2556,9 @@ static my_xid wsrep_order_and_check_continuity(XID *list, int len) if (!wsrep_is_wsrep_xid(list + i) || wsrep_xid_seqno(list + i) != cur_seqno + 1) { - WSREP_WARN("Discovered discontinuity in recovered wsrep " - "transaction XIDs. Truncating the recovery list to " - "%d entries", i); + WSREP_DEBUG("Discovered discontinuity in recovered wsrep " + "transaction XIDs. 
Truncating the recovery list to " + "%d entries", i); break; } ++cur_seqno; @@ -2843,6 +2843,31 @@ static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin, x <= wsrep_limit) && info->dry_run, info->dry_run)) { +#ifdef WITH_WSREP + /* + MDEV-40179: a wsrep transaction still in the prepared state at the + final recovery pass (the dry run, commit_list == 0) is past the + storage-engine checkpoint and will be re-delivered by the cluster + (IST/SST). After a physical SST (mariabackup) the joiner runs no + binlog XA recovery to commit or roll back such transactions, so + without this they would abort startup with + "Found N prepared transactions!". Roll them back here; the cluster + re-applies them from the donor. Non-wsrep (e.g. user XA) prepared + transactions are left untouched and still reported. + + The guard is WSREP_PROVIDER_EXISTS ("a Galera provider is loaded"): + a node configured with a provider will rejoin and receive + these transactions; a standalone node (no provider) cannot, so there + we keep the conservative default and still report them. 
+ A failed rollback falls through and is still counted and reported. */ + if (WSREP_PROVIDER_EXISTS && wsrep_is_wsrep_xid(info->list + i) && + hton->rollback_by_xid(hton, info->list + i) == 0) + { + sql_print_information("Rolled back orphan prepared wsrep " + "transaction %lld", (longlong) x); + continue; + } +#endif /* WITH_WSREP */ info->found_my_xids++; continue; } diff --git a/sql/log.cc b/sql/log.cc index 0d4764cef8105..d6e9e1a64989b 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -64,6 +64,7 @@ #ifdef WITH_WSREP #include "wsrep_trans_observer.h" #include "wsrep_status.h" +#include "wsrep_xid.h" #endif /* WITH_WSREP */ #ifdef HAVE_REPLICATION @@ -12014,6 +12015,57 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, +#if defined(WITH_WSREP) && defined(HAVE_REPLICATION) +/* + MDEV-38147: A Galera mariabackup SST no longer ships the donor's binary log + (the only thing it carried was a Gtid_list whose position was ahead of the + snapshot, causing error 1950). Instead the joiner starts a fresh binary log, + so its Gtid_list / @@gtid_binlog_pos must be seeded from the recovered wsrep + position - otherwise the joiner would report an empty binlog position until + it re-binlogs new transactions, which breaks its use as an async master. + + The wsrep cluster position lives in the storage-engine checkpoint (restored + by the SST). Async-replica source positions live in mysql.gtid_slave_pos + (also restored from the engine) and are handled separately, so they are not + seeded here. + + The whole cluster binlogs cluster writes under one consistent stream (the + seqno stays in lockstep because every node applies in the same total order). + The domain of that stream depends on the mode: + - wsrep_gtid_mode=ON : wsrep_gtid_domain_id (cluster writes are re-tagged + to it, see sql/wsrep_mysqld.cc); this is the domain in the checkpoint. + - wsrep_gtid_mode=OFF: gtid_domain_id (cluster writes keep the node's + configured domain, no re-tag). 
+ In both modes the committed cluster seqno is the SE checkpoint seqno, so we + seed that domain's binlog state to the checkpoint position. This is also the + exact position from which IST will resume re-binlogging, so the joiner stays + in lockstep with the rest of the cluster (and, in ON mode, avoids error 1950 + from re-binlogging over an ahead position). +*/ +static void wsrep_seed_binlog_gtid_state() +{ + wsrep_server_gtid_t const eng= wsrep_get_SE_checkpoint<wsrep_server_gtid_t>(); + if (eng.seqno <= 0) + return; /* not a wsrep node / no position */ + + rpl_gtid eng_gtid; + eng_gtid.domain_id= wsrep_gtid_mode ? eng.domain_id + : global_system_variables.gtid_domain_id; + eng_gtid.server_id= eng.server_id; + eng_gtid.seq_no= eng.seqno; + + rpl_gtid *cur= rpl_global_gtid_binlog_state.find_most_recent(eng_gtid.domain_id); + if (cur && cur->seq_no >= eng_gtid.seq_no) + return; /* binlog state already at or ahead of the checkpoint */ + + sql_print_information("WSREP: seeding binlog GTID state to %u-%u-%llu " + "from the storage-engine checkpoint", + eng_gtid.domain_id, eng_gtid.server_id, + (unsigned long long) eng_gtid.seq_no); + rpl_global_gtid_binlog_state.update_nolock(&eng_gtid, false); +} +#endif /* WITH_WSREP && HAVE_REPLICATION */ + int MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) { @@ -12048,6 +12100,10 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) error= 0; } } +#if defined(WITH_WSREP) && defined(HAVE_REPLICATION) + if (!error && WSREP_PROVIDER_EXISTS) + wsrep_seed_binlog_gtid_state(); +#endif return error; } diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc index 813ffc24e53b0..d4397a3439f1c 100644 --- a/sql/wsrep_sst.cc +++ b/sql/wsrep_sst.cc @@ -402,9 +402,6 @@ static bool wsrep_sst_complete (THD* thd, Wsrep_server_state& server_state= Wsrep_server_state::instance(); enum wsrep::server_state::state state= server_state.state(); bool failed= false; - char start_pos_buf[FN_REFLEN]; - ssize_t len= 
wsrep::print_to_c_str(sst_gtid, start_pos_buf, FN_REFLEN-1); - start_pos_buf[len]='\0'; // Do not call sst_received if we are not in joiner or // initialized state on server. This is because it @@ -419,14 +416,31 @@ static bool wsrep_sst_complete (THD* thd, } else { - WSREP_INFO("SST succeeded for position %s", start_pos_buf); + /* + Note: sst_received() does NOT use sst_gtid (the position reported by + the SST script). It determines the position internally from storage via + Wsrep_server_service::get_position(). + For physical SST methods these two may differ (e.g. the joiner's storage + recovers to an earlier position than the script reported). Log the + position actually adopted, not the script-reported one, to avoid + confusion. + */ + wsrep::gtid const received_gtid(wsrep_get_SE_checkpoint<wsrep::gtid>()); + char recv_pos_buf[FN_REFLEN]; + ssize_t const recv_len= + wsrep::print_to_c_str(received_gtid, recv_pos_buf, FN_REFLEN-1); + recv_pos_buf[recv_len > 0 ? recv_len : 0]= '\0'; + WSREP_INFO("SST succeeded for position %s", recv_pos_buf); } } else { + char start_pos_buf[FN_REFLEN]; + ssize_t const len= wsrep::print_to_c_str(sst_gtid, start_pos_buf, FN_REFLEN - 1); + start_pos_buf[len > 0 ? len : 0]= '\0'; + WSREP_ERROR("SST failed for position %s initialized %d server_state %s", - start_pos_buf, - server_state.is_initialized(), + start_pos_buf, server_state.is_initialized(), wsrep::to_c_string(state)); failed= true; }