diff --git a/mysql-test/suite/galera/r/galera_bf_abort_orphan_lock.result b/mysql-test/suite/galera/r/galera_bf_abort_orphan_lock.result new file mode 100644 index 0000000000000..eb30c7c7c9bf3 --- /dev/null +++ b/mysql-test/suite/galera/r/galera_bf_abort_orphan_lock.result @@ -0,0 +1,39 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE TABLE t_a (id INT PRIMARY KEY, c INT) ENGINE=InnoDB; +CREATE TABLE t_b (id INT PRIMARY KEY, c INT) ENGINE=InnoDB; +INSERT INTO t_a VALUES (1, 0); +INSERT INTO t_b VALUES (1, 0); +connect node_1_holder, 127.0.0.1, root, , test, $NODE_MYPORT_1; +BEGIN; +SELECT * FROM t_b WHERE id = 1 FOR UPDATE; +id c +1 0 +connect node_1_victim, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET SESSION tx_isolation = 'READ-COMMITTED'; +BEGIN; +UPDATE t_a SET c = c + 1 WHERE id > 0; +connection node_1; +connection node_1_victim; +SET DEBUG_SYNC = 'row_search_before_rec_lock SIGNAL victim_at_lock WAIT_FOR victim_cont'; +SET DEBUG_SYNC = 'trx_t_release_locks_enter SIGNAL victim_at_release WAIT_FOR holder_done'; +UPDATE t_b SET c = c + 1 WHERE id > 0; +connection node_1; +SET DEBUG_SYNC = 'now WAIT_FOR victim_at_lock'; +connection node_2; +UPDATE t_a SET c = c + 100 WHERE id = 1; +connection node_1; +SET DEBUG_SYNC = 'now SIGNAL victim_cont'; +SET DEBUG_SYNC = 'now WAIT_FOR victim_at_release'; +connection node_1_holder; +COMMIT; +connection node_1; +SET DEBUG_SYNC = 'now SIGNAL holder_done'; +connection node_1_victim; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +connection node_1; +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t_a, t_b; +disconnect node_1_holder; +disconnect node_1_victim; diff --git a/mysql-test/suite/galera/t/galera_bf_abort_orphan_lock.test b/mysql-test/suite/galera/t/galera_bf_abort_orphan_lock.test new file mode 100644 index 0000000000000..64c23ed94e0a5 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_bf_abort_orphan_lock.test @@ -0,0 +1,103 @@ +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc + +# +# A local transaction is BF-aborted (wsrep victim flag set) while it holds a +# granted lock and is about to enqueue a NEW waiting lock for another row. +# +# Pre-fix, the victim parked an orphaned LOCK_WAIT request (with wait_thr still +# set) that survived into rollback; another transaction granting it later +# invoked lock_wait_end() on the victim from a different thread, corrupting its +# error_state and tripping the lock_wait_end() / que_thr_step() / +# rollback_finish() assertions. With the fix, lock_rec_enqueue_waiting() +# returns DB_DEADLOCK immediately for an already-chosen victim, so no orphaned +# waiter is created. +# +# Two separate tables (t_a, t_b) are used on purpose: the victim is paused +# holding t_b's page latch, while the applier conflicts on t_a -- so the BF +# abort cannot deadlock on a page latch the victim holds. +# +# Requires a debug build: the regression manifests as an assertion failure. +# + +--connection node_1 +CREATE TABLE t_a (id INT PRIMARY KEY, c INT) ENGINE=InnoDB; +CREATE TABLE t_b (id INT PRIMARY KEY, c INT) ENGINE=InnoDB; +INSERT INTO t_a VALUES (1, 0); +INSERT INTO t_b VALUES (1, 0); + +# holder holds the lock that the victim will need on t_b +--connect node_1_holder, 127.0.0.1, root, , test, $NODE_MYPORT_1 +BEGIN; +SELECT * FROM t_b WHERE id = 1 FOR UPDATE; + +# Victim: holds a granted lock on t_a (makes it a BF target), then locks t_b +--connect node_1_victim, 127.0.0.1, root, , test, $NODE_MYPORT_1 +SET SESSION tx_isolation = 'READ-COMMITTED'; +BEGIN; +UPDATE t_a SET c = c + 1 WHERE id > 0; + +--connection node_1 +--let $bf_before = `SELECT variable_value FROM information_schema.global_status WHERE variable_name = 'wsrep_local_bf_aborts'` + +# Pause the victim just before it requests the row lock on t_b: past the +# interrupt check at the top of rec_loop, before it enqueues a waiting lock, +# and holding no lock_sys/trx mutex (only t_b's page latch). +--connection node_1_victim +# Freeze the victim at two points: +# (1) just before it requests the t_b row lock -- so it can be flagged a BF +# victim while it still holds no wait_lock, and +# (2) once it reaches release_locks() during its rollback finalization, while +# its state is already TRX_STATE_COMMITTED_IN_MEMORY and (pre-fix) it still +# owns the orphaned LOCK_WAIT request on t_b. +SET DEBUG_SYNC = 'row_search_before_rec_lock SIGNAL victim_at_lock WAIT_FOR victim_cont'; +SET DEBUG_SYNC = 'trx_t_release_locks_enter SIGNAL victim_at_release WAIT_FOR holder_done'; +# WHERE id > 0 forces a non-unique clustered-index scan, i.e. the +# semi-consistent read path (lock_trx_handle_wait), where a victim's waiting +# lock is NOT auto-cancelled the way the plain lock_wait() path cancels it. +--send UPDATE t_b SET c = c + 1 WHERE id > 0 + +--connection node_1 +SET DEBUG_SYNC = 'now WAIT_FOR victim_at_lock'; + +# node_2 writeset conflicts with the victim's granted t_a lock -> BF-aborts it. +# At this moment the victim has no wait_lock, so cancel_lock_wait_for_wsrep_bf_abort() +# only sets the wsrep victim flag (bit 2) and cancels nothing. +--connection node_2 +UPDATE t_a SET c = c + 100 WHERE id = 1; + +# Wait until the victim has actually been flagged as a BF victim +--connection node_1 +--let $wait_condition = SELECT variable_value > $bf_before FROM information_schema.global_status WHERE variable_name = 'wsrep_local_bf_aborts' +--source include/wait_condition.inc + +# Release the victim: it enqueues the t_b lock while already a victim, fails with +# deadlock and rolls back. Pre-fix it leaves an orphaned LOCK_WAIT request and +# parks in release_locks() (state COMMITTED_IN_MEMORY); the fix makes the +# enqueue return DB_DEADLOCK so no orphan is ever created. +SET DEBUG_SYNC = 'now SIGNAL victim_cont'; +SET DEBUG_SYNC = 'now WAIT_FOR victim_at_release'; + +# Grant the (pre-fix) orphaned waiter by releasing holder's lock while the victim is +# parked in commit-in-memory: this invokes lock_wait_end() on the committed +# victim from H's thread -> assertion failure without the fix. With the fix +# there is no orphan, so the commit is clean. +--connection node_1_holder +COMMIT; + +# Let the victim finish rolling back +--connection node_1 +SET DEBUG_SYNC = 'now SIGNAL holder_done'; + +--connection node_1_victim +--error ER_LOCK_DEADLOCK +--reap + +# Server is alive and consistent +--connection node_1 +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t_a, t_b; + +--disconnect node_1_holder +--disconnect node_1_victim diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 1cf4296f474a6..be1156c4c5432 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1499,6 +1499,14 @@ lock_rec_enqueue_waiting( lock, that's why we check only deadlock victim bit here. */ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + /* If this transaction has already been chosen as a victim — it must not + enqueue a new waiting lock request. + Return DB_DEADLOCK so that the caller rolls back instead of waiting. */ + if (trx->lock.was_chosen_as_deadlock_victim) { + trx->error_state = DB_DEADLOCK; + return DB_DEADLOCK; + } + if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) { trx->error_state = DB_LOCK_WAIT_TIMEOUT; return DB_LOCK_WAIT_TIMEOUT; @@ -3825,10 +3833,6 @@ lock_table_enqueue_waiting( ut_ad(trx->mutex_is_owner()); ut_ad(!trx->dict_operation_lock_mode); - /* Enqueue the lock request that will wait to be granted */ - lock_table_create(table, mode | LOCK_WAIT, trx, c_lock); - - trx->lock.wait_thr = thr; /* Apart from Galera, only transactions that have waiting lock may be chosen as deadlock victims. Only one lock can be waited for at a time, and a transaction is associated with a single thread. That is why @@ -3838,6 +3842,19 @@ lock_table_enqueue_waiting( lock, that's why we check only deadlock victim bit here. */ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + /* A transaction already chosen as a victim must not enqueue a new + waiting lock request. + Return DB_DEADLOCK so the caller rolls the transaction back. */ + if (trx->lock.was_chosen_as_deadlock_victim) { + trx->error_state = DB_DEADLOCK; + return DB_DEADLOCK; + } + + /* Enqueue the lock request that will wait to be granted */ + lock_table_create(table, mode | LOCK_WAIT, trx, c_lock); + + trx->lock.wait_thr = thr; + MONITOR_INC(MONITOR_TABLELOCK_WAIT); return(DB_LOCK_WAIT); } @@ -6702,6 +6719,28 @@ while holding a clustered index leaf page latch. dberr_t lock_trx_handle_wait(trx_t *trx) { DEBUG_SYNC_C("lock_trx_handle_wait_enter"); +#ifdef WITH_WSREP + /* A wsrep BF-abort marks the victim out-of-band: unlike native deadlock + resolution it does not cancel the victim's waiting lock atomically with + setting the flag, so the victim can reach this point still owning a live + LOCK_WAIT request (with trx->lock.wait_thr set). + So for BF-abort victm, we cancel the pending wait + here, which also clears trx->lock.wait_thr. But, this must not be done for + the native deadlock case, which cancels lock in Deadlock::report() */ + if (trx->lock.was_chosen_as_deadlock_victim && trx->is_wsrep()) + { + DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check"); + if (trx->lock.wait_lock) + { + mysql_mutex_lock(&lock_sys.wait_mutex); + if (lock_t *wait_lock= trx->lock.wait_lock) + lock_sys.cancel(trx, wait_lock); + lock_sys.deadlock_check(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + } + return DB_DEADLOCK; + } +#endif /* WITH_WSREP */ if (trx->lock.was_chosen_as_deadlock_victim) return DB_DEADLOCK; DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check"); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index c3d6e971cd734..00627e17e9fc3 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -5241,6 +5241,13 @@ row_search_mvcc( lock_type = LOCK_ORDINARY; } + /* Test hook (debug builds only): allows a BF abort to mark this + transaction a wsrep victim after the interrupt check at the top of + rec_loop but before it enqueues a waiting lock request below — the + window in which the orphaned-waiter bug forms. Inert in release + builds (DEBUG_SYNC_C compiles to nothing without ENABLED_DEBUG_SYNC). */ + DEBUG_SYNC_C("row_search_before_rec_lock"); + err = sel_set_rec_lock(pcur, rec, index, offsets, prebuilt->select_lock_type,