Skip to content

Commit b5cc47e

Browse files
blaudden authored and bjornmu committed
BUG#35164090 NDB MTA fails to retry temporary error
A temporary error that occurs while applying a transaction using the MTA (multi-threaded applier) for NDB is not properly retried. The problem is only visible when using more than one worker. The problem causes the replication applier to stop with an error similar to the one below; the "Got error -1" is significant: "Worker 1 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos 1341; Could not execute Write_rows event on table test.t1; Got error -1 - 'Unknown error -1' during COMMIT, Error_code: 1180; the event's source log master-bin.000001, end_log_pos 1341" Fix by: - returning a MySQL mapped error code, thus avoiding that -1 is used as the error code. - pushing warning(s) indicating a temporary error to the warning stack; this is important in order for the replication applier to classify the error as temporary. Add tests showing how a provoked deadlock (which is considered temporary and should be retried) stops the applier when no transaction retries are allowed and is successfully retried when they are allowed. Change-Id: I0ffc17beda65637d45fdba8c3d7dc4fe328f4527
1 parent 3ecaa12 commit b5cc47e

File tree

3 files changed

+152
-2
lines changed

3 files changed

+152
-2
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
include/master-slave.inc
2+
Warnings:
3+
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
4+
Note #### Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
5+
[connection master]
6+
##################################################################
7+
#
8+
# Check that applier retries temporary errors
9+
# - first show that applier stops when no retries are allowed.
10+
# - then reconfigure to allow transaction retries and show how
11+
# applier is retrying until the provoked error is cleared
12+
#
13+
[connection slave]
14+
call mtr.add_suppression(".*Lock timed out; Retry transaction.*");
15+
call mtr.add_suppression(".*worker thread retried transaction 1 time.*");
16+
call mtr.add_suppression(".*coordinator and worker threads are stopped.*");
17+
[connection master]
18+
CREATE TABLE t1 (
19+
nid int NOT NULL,
20+
nom char(4) default NULL,
21+
prenom char(4) default NULL,
22+
PRIMARY KEY USING HASH (nid)
23+
) ENGINE=ndbcluster DEFAULT CHARSET=latin1;
24+
INSERT INTO t1 VALUES (1,"XYZ1","ABC1");
25+
include/sync_slave_sql_with_master.inc
26+
[connection slave]
27+
# Lock the row on replica
28+
BEGIN;
29+
UPDATE t1 SET nom="LOCK" WHERE nid=1;
30+
# Set number of retries low so retries are not allowed
31+
SET GLOBAL replica_transaction_retries=1;
32+
[connection master]
33+
# Change the row on source, this will cause lock wait timeout when
34+
# applying on replica
35+
UPDATE t1 SET nom="DEAD" WHERE nid=1;
36+
[connection slave1]
37+
# Wait for deadlock to be detected and applier to stop since
38+
# retries where not allowed. Use different connection.
39+
include/wait_for_slave_sql_error.inc [errno=1205]
40+
# Reconfigure retries high enough to allow retries, start replication
41+
SET GLOBAL replica_transaction_retries=10;
42+
include/start_slave.inc
43+
# Wait for deadlock to be detected and retried at least two times.
44+
[connection slave]
45+
# Switch back to the connection holding lock on replica, commit the
46+
# transaction to release lock and thus let the currently retrying
47+
# applier succeed
48+
select * from t1 order by nid;
49+
nid nom prenom
50+
1 LOCK ABC1
51+
COMMIT;
52+
[connection master]
53+
include/sync_slave_sql_with_master.inc
54+
[connection slave]
55+
# Verify that the row was applied sucessfully on replica
56+
SELECT * FROM t1;
57+
nid nom prenom
58+
1 DEAD ABC1
59+
[connection master]
60+
# Cleanup
61+
DROP TABLE t1;
62+
include/rpl_end.inc
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
--source include/have_ndb.inc
2+
--source suite/ndb_rpl/ndb_master-slave.inc
3+
4+
--echo ##################################################################
5+
--echo #
6+
--echo # Check that applier retries temporary errors
7+
--echo # - first show that applier stops when no retries are allowed.
8+
--echo # - then reconfigure to allow transaction retries and show how
9+
--echo # applier is retrying until the provoked error is cleared
10+
--echo #
11+
12+
--source include/rpl_connection_slave.inc
13+
# Suppress warnings caused by provoked temporary error retries
14+
call mtr.add_suppression(".*Lock timed out; Retry transaction.*");
15+
call mtr.add_suppression(".*worker thread retried transaction 1 time.*");
16+
call mtr.add_suppression(".*coordinator and worker threads are stopped.*");
17+
18+
--source include/rpl_connection_master.inc
19+
CREATE TABLE t1 (
20+
nid int NOT NULL,
21+
nom char(4) default NULL,
22+
prenom char(4) default NULL,
23+
PRIMARY KEY USING HASH (nid)
24+
) ENGINE=ndbcluster DEFAULT CHARSET=latin1;
25+
INSERT INTO t1 VALUES (1,"XYZ1","ABC1");
26+
27+
--source include/sync_slave_sql_with_master.inc
28+
29+
--source include/rpl_connection_slave.inc
30+
--echo # Lock the row on replica
31+
BEGIN;
32+
UPDATE t1 SET nom="LOCK" WHERE nid=1;
33+
34+
--echo # Set number of retries low so retries are not allowed
35+
SET GLOBAL replica_transaction_retries=1;
36+
37+
--source include/rpl_connection_master.inc
38+
--echo # Change the row on source, this will cause lock wait timeout when
39+
--echo # applying on replica
40+
UPDATE t1 SET nom="DEAD" WHERE nid=1;
41+
42+
--source include/rpl_connection_slave1.inc
43+
--echo # Wait for deadlock to be detected and applier to stop since
44+
--echo # retries where not allowed. Use different connection.
45+
--let $slave_sql_errno= convert_error(ER_LOCK_WAIT_TIMEOUT)
46+
--let $show_slave_sql_error= 0
47+
--source include/wait_for_slave_sql_error.inc
48+
49+
--echo # Reconfigure retries high enough to allow retries, start replication
50+
SET GLOBAL replica_transaction_retries=10;
51+
--source include/start_slave.inc
52+
53+
--echo # Wait for deadlock to be detected and retried at least two times.
54+
let $wait_condition=
55+
SELECT APPLYING_TRANSACTION_RETRIES_COUNT >= 2 FROM
56+
performance_schema.replication_applier_status_by_worker
57+
WHERE CHANNEL_NAME = "";
58+
--source include/wait_condition.inc
59+
60+
--source include/rpl_connection_slave.inc
61+
--echo # Switch back to the connection holding lock on replica, commit the
62+
--echo # transaction to release lock and thus let the currently retrying
63+
--echo # applier succeed
64+
select * from t1 order by nid;
65+
COMMIT;
66+
67+
--source include/rpl_connection_master.inc
68+
--source include/sync_slave_sql_with_master.inc
69+
70+
--source include/rpl_connection_slave.inc
71+
--echo # Verify that the row was applied sucessfully on replica
72+
SELECT * FROM t1;
73+
74+
--source include/rpl_connection_master.inc
75+
--echo # Cleanup
76+
DROP TABLE t1;
77+
78+
--source include/rpl_end.inc

storage/ndb/plugin/ha_ndbcluster.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7784,11 +7784,21 @@ int ndbcluster_commit(handlerton *, THD *thd, bool all) {
77847784
if (applier && applier->get_num_workers() > 1) {
77857785
if (thd_ndb->m_unsent_bytes) {
77867786
DBUG_PRINT("info", ("Applier preparing defined operations"));
7787-
return execute_no_commit(thd_ndb, trans, true);
7787+
res = execute_no_commit(thd_ndb, trans, true);
7788+
if (res != 0) {
7789+
// Fatal transaction error occurred
7790+
const NdbError &trans_error = trans->getNdbError();
7791+
if (trans_error.code == 4350) { // Transaction already aborted
7792+
thd_ndb->push_ndb_error_warning(trans_error);
7793+
res = HA_ERR_ROLLED_BACK;
7794+
} else {
7795+
res = ndbcluster_print_error(trans, thd_ndb->m_handler);
7796+
}
7797+
}
77887798
}
77897799
}
77907800

7791-
return 0;
7801+
return res;
77927802
}
77937803
thd_ndb->save_point_count = 0;
77947804

0 commit comments

Comments
 (0)