Skip to content

Commit 3b59dc0

Browse files
committed
Bug#30930132 SUBSCRIPTION REPORTS SENT OUT TOO EARLY BY SUMA DURING NODE RESTART
Problem: During a node restart, the SUMA block of the restarting node sends subscription reports to the new subscriber and, if requested, informs it about all other subscribers. Currently, these reports are sent too early for the subscribers, as they are not yet connected to the restarting node. As a result, they miss out on these subscription reports during a node restart. The MySQL Server depends on these subscription reports to maintain a list of other MySQL Servers (participants) connected to the cluster. During schema distribution, the MySQL Server logs the schema change and waits for these participants to finish applying the changes before returning. If all the data nodes connected in a cluster are restarted one by one, the MySQL Servers will miss out on all the subscription reports and thus end up losing all details about the connected participants. Solution: The SUMA block is changed so that, during a node restart, it waits until all the subscribers are connected before sending out the reports. Also, the reporting mechanism is optimized to send to subscribers only the necessary information. New API-level test added to test_event. New MTR test ndb_participants added. Change-Id: I879fe0fa535f4afd503d85f926f55388ce9f6227
1 parent e1efdee commit 3b59dc0

File tree

8 files changed

+744
-15
lines changed

8 files changed

+744
-15
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Execute a DDL and verify the participant list in the servers
2+
create table t1 (a int) engine ndb;
3+
include/assert_grep.inc [Found two participants in Server 1]
4+
drop table t1;
5+
include/assert_grep.inc [Found two participants in Server 2]
6+
# Restart all nodes one by one
7+
Restart data node 1
8+
Node 1 is being restarted
9+
10+
# Verify the participants again to confirm that they were not lost after node 1 stopped
11+
create table t1 (a int) engine ndb;
12+
include/assert_grep.inc [Found two participants in Server 1]
13+
drop table t1;
14+
include/assert_grep.inc [Found two participants in Server 2]
15+
Database node 1 is being started.
16+
17+
Node 1 restarted
18+
# Verify the participants again to confirm that they were not lost after node 1 restart
19+
create table t1 (a int) engine ndb;
20+
include/assert_grep.inc [Found two participants in Server 1]
21+
drop table t1;
22+
include/assert_grep.inc [Found two participants in Server 2]
23+
Restart data node 2
24+
Node 2 is being restarted
25+
26+
# Verify the participants again to confirm that they were not lost after node 2 stopped
27+
create table t1 (a int) engine ndb;
28+
include/assert_grep.inc [Found two participants in Server 1]
29+
drop table t1;
30+
include/assert_grep.inc [Found two participants in Server 2]
31+
Database node 2 is being started.
32+
33+
Node 1 restarted
34+
# Verify the participants again to confirm that they were not lost after node 2 restart
35+
create table t1 (a int) engine ndb;
36+
include/assert_grep.inc [Found two participants in Server 1]
37+
drop table t1;
38+
include/assert_grep.inc [Found two participants in Server 2]
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
--source include/have_ndb.inc
2+
--source include/have_multi_ndb.inc
3+
4+
--echo # Execute a DDL and verify the participant list in the servers
5+
6+
--connection server1
7+
create table t1 (a int) engine ndb;
8+
9+
--let $assert_text= Found two participants in Server 1
10+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.1.err
11+
# MySQL Server prints the connected participants in the format :
12+
# [NDB] Participants: [<participant1 node id>,<participant2 node id>..]
13+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
14+
--let $assert_count= 1
15+
--source include/assert_grep.inc
16+
17+
--connection server2
18+
drop table t1;
19+
20+
--let $assert_text= Found two participants in Server 2
21+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.1.err
22+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
23+
--let $assert_count= 1
24+
--source include/assert_grep.inc
25+
26+
--echo # Restart all nodes one by one
27+
--echo Restart data node 1
28+
--exec $NDB_MGM -e "1 RESTART -n"
29+
--exec $NDB_WAITER --nowait-nodes=2 --not-started > /dev/null
30+
31+
--echo # Verify the participants again to confirm that they were not lost after node 1 stopped
32+
--connection server1
33+
create table t1 (a int) engine ndb;
34+
35+
--let $assert_text= Found two participants in Server 1
36+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.1.err
37+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
38+
--let $assert_count= 2
39+
--source include/assert_grep.inc
40+
41+
--connection server2
42+
drop table t1;
43+
44+
--let $assert_text= Found two participants in Server 2
45+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.1.err
46+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
47+
--let $assert_count= 2
48+
--source include/assert_grep.inc
49+
50+
--exec $NDB_MGM -e "1 START"
51+
--exec $NDB_WAITER > /dev/null
52+
--echo Node 1 restarted
53+
54+
--echo # Verify the participants again to confirm that they were not lost after node 1 restart
55+
--connection server1
56+
create table t1 (a int) engine ndb;
57+
58+
--let $assert_text= Found two participants in Server 1
59+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.1.err
60+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
61+
--let $assert_count= 3
62+
--source include/assert_grep.inc
63+
64+
--connection server2
65+
drop table t1;
66+
67+
--let $assert_text= Found two participants in Server 2
68+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.1.err
69+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
70+
--let $assert_count= 3
71+
--source include/assert_grep.inc
72+
73+
--echo Restart data node 2
74+
--exec $NDB_MGM -e "2 RESTART -n"
75+
--exec $NDB_WAITER --nowait-nodes=1 --not-started > /dev/null
76+
77+
--echo # Verify the participants again to confirm that they were not lost after node 2 stopped
78+
--connection server1
79+
create table t1 (a int) engine ndb;
80+
81+
--let $assert_text= Found two participants in Server 1
82+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.1.err
83+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
84+
--let $assert_count= 4
85+
--source include/assert_grep.inc
86+
87+
--connection server2
88+
drop table t1;
89+
90+
--let $assert_text= Found two participants in Server 2
91+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.1.err
92+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
93+
--let $assert_count= 4
94+
--source include/assert_grep.inc
95+
96+
--exec $NDB_MGM -e "2 START"
97+
--exec $NDB_WAITER > /dev/null
98+
--echo Node 1 restarted
99+
100+
--echo # Verify the participants again to confirm that they were not lost after node 2 restart
101+
--connection server1
102+
create table t1 (a int) engine ndb;
103+
104+
--let $assert_text= Found two participants in Server 1
105+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.1.err
106+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
107+
--let $assert_count= 5
108+
--source include/assert_grep.inc
109+
110+
--connection server2
111+
drop table t1;
112+
113+
--let $assert_text= Found two participants in Server 2
114+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.1.err
115+
--let $assert_select= \[NDB\] Participants: \[[0-9]+,[0-9]+\]
116+
--let $assert_count= 5
117+
--source include/assert_grep.inc

storage/ndb/include/kernel/signaldata/SumaImpl.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ struct SumaContinueB
593593
,WAIT_SCAN_TAB_REQ = 10
594594
,WAIT_GET_FRAGMENT = 11
595595
,SEND_SUB_GCP_COMPLETE_REP = 12
596+
,REPORT_SUBSCRIPTION_SET = 13
596597
};
597598
};
598599

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Next TRPMAN 9501
3434
Next BACKUP 10055
3535
Next PGMAN 11010
3636
Next DBTUX 12010
37-
Next SUMA 13058
37+
Next SUMA 13060
3838
Next LGMAN 15002
3939
Next TSMAN 16002
4040
Next DBSPJ 17000
@@ -855,6 +855,8 @@ SUMA:
855855
13051: Delay the DROP_TRIG_IMPL_REQ
856856
13057: Changing the ndbrequire from checking whether the gci is higher than
857857
the known gci instead of the next gci
858+
13058: Delay subscription reporting during node restart + cause API failure
859+
13059: Delay subscription reporting during node restart
858860

859861
LGMAN:
860862
-----

0 commit comments

Comments
 (0)