Skip to content

Commit 8eab5f5

Browse files
committed
Bug#35389533 Backup won't start with NOWAIT option after restarting a data node
Problem: A backup will not start if the NOWAIT option is specified after a data node has been restarted. Analysis: 1- If the restarted node was the master, the master will eventually change and start-backup signals will end up at the wrong data node (the previous master). 2- If the backup is launched with the NOWAIT flag, the Management Server does not wait for any reply from the backup block. The way the backup block reports that it is not the master is by sending a GSN_BACKUP_REF signal back to the Management Server. So when both 1 and 2 hold, the Management Server is not waiting for signals from the backup block, the REF is ignored, and consequently the Management Server does not (re)try to start the backup. Solution: On the Management Server side, always wait for BACKUP_REF/BACKUP_CONF from the backup block (even if NOWAIT is specified). On the backup side, always send exactly one signal to the Management Server: GSN_BACKUP_REF if the node is not the master, or GSN_BACKUP_CONF otherwise. Change-Id: Ie9c9163edf5b980676e72651f92cd73368d87fa9
1 parent 4a19cd4 commit 8eab5f5

File tree

4 files changed

+162
-41
lines changed

4 files changed

+162
-41
lines changed

storage/ndb/include/kernel/signaldata/BackupSignalData.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -50,6 +50,7 @@ class BackupReq {
5050
STATIC_CONST( SignalLength = 4 );
5151
STATIC_CONST( WAITCOMPLETED = 0x3 );
5252
STATIC_CONST( USE_UNDO_LOG = 0x4 );
53+
STATIC_CONST( NOWAIT_REPLY = 0x20 );
5354

5455
private:
5556
Uint32 senderData;

storage/ndb/include/ndb_version.h.in

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2004, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2004, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -1144,4 +1144,56 @@ ndbd_support_waitgcp_shutdownsync(Uint32 x)
11441144
return 1;
11451145
}
11461146

1147+
/*
1148+
* Data nodes reply to a START BACKUP NOWAIT REQ from
1149+
* MGMD with BACKUP_REF/CONF.
1150+
*/
1151+
#define NDBD_START_BACKUP_NOWAIT_REPLY_75 NDB_MAKE_VERSION(7,5,31)
1152+
#define NDBD_START_BACKUP_NOWAIT_REPLY_76 NDB_MAKE_VERSION(7,6,27)
1153+
#define NDBD_START_BACKUP_NOWAIT_REPLY_80 NDB_MAKE_VERSION(8,0,35)
1154+
#define NDBD_START_BACKUP_NOWAIT_REPLY_82 NDB_MAKE_VERSION(8,2,0)
1155+
/*
 * Tell whether the packed NDB version `x` is one in which the data node
 * replies to a START BACKUP NOWAIT request with BACKUP_REF/BACKUP_CONF.
 *
 * x: packed version number; the major part is read from bits 16..23 and
 *    the minor part from bits 8..15.
 *
 * Returns non-zero when the version has the feature, 0 otherwise:
 *   - major < 7, and 7.x with minor < 5 ......... 0
 *   - 7.5.x / 7.6.x / 8.0.x ..................... compared with the matching
 *                                                 NDBD_START_BACKUP_NOWAIT_REPLY_7x/_80
 *   - 8.1.x ..................................... 0
 *   - 8.x with minor >= 2 ....................... compared with ..._REPLY_82
 *   - anything else (7.x with minor > 6, major > 8) falls through to 1
 */
static
1156+
inline
1157+
int
1158+
ndbd_start_backup_nowait_reply(Uint32 x)
1159+
{
1160+
/* Split the packed version into its major and minor components. */
const Uint32 major = (x >> 16) & 0xFF;
1161+
const Uint32 minor = (x >> 8) & 0xFF;
1162+
if (major < 7)
1163+
{
1164+
return 0;
1165+
}
1166+
else if (major == 7)
1167+
{
1168+
if (minor < 5)
1169+
{
1170+
return 0;
1171+
}
1172+
else if (minor == 5)
1173+
{
1174+
/* 7.5 series: compare against the 7.5 threshold only. */
return x >= NDBD_START_BACKUP_NOWAIT_REPLY_75;
1175+
}
1176+
else if (minor == 6)
1177+
{
1178+
/* 7.6 series: compare against the 7.6 threshold only. */
return x >= NDBD_START_BACKUP_NOWAIT_REPLY_76;
1179+
}
1180+
/* 7.x with minor > 6 is not handled here and reaches the final return 1. */
}
1181+
else if (major == 8)
1182+
{
1183+
if (minor == 0)
1184+
{
1185+
/* 8.0 series: compare against the 8.0 threshold only. */
return x >= NDBD_START_BACKUP_NOWAIT_REPLY_80;
1186+
}
1187+
else if (minor == 1)
1188+
{
1189+
/* Every 8.1.x version is reported as not having the feature. */
return 0;
1190+
}
1191+
else
1192+
{
1193+
/* 8.2 and later 8.x: compare against the 8.2 threshold. */
return x >= NDBD_START_BACKUP_NOWAIT_REPLY_82;
1194+
}
1195+
}
1196+
/* No branch above returned: 7.x with minor > 6, or major > 8. */
return 1;
1197+
}
1198+
11471199
#endif

storage/ndb/src/kernel/blocks/backup/Backup.cpp

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -2501,7 +2501,19 @@ Backup::execBACKUP_REQ(Signal* signal)
25012501
BackupRef::IAmNotMaster);
25022502
return;
25032503
}//if
2504-
2504+
else if(flags & BackupReq::NOWAIT_REPLY)
2505+
{
2506+
/*
2507+
* Inform MgmtSrvr that I am the master node, this way MgmtSrvr can stop
2508+
* waiting for feedback from Backup when NOWAIT option is specified in the
2509+
* start backup command
2510+
*/
2511+
BackupConf * conf = (BackupConf*)signal->getDataPtrSend();
2512+
conf->backupId = 0;
2513+
conf->senderData = req->senderData;
2514+
sendSignal(senderRef, GSN_BACKUP_CONF, signal,
2515+
BackupConf::SignalLength, JBB);
2516+
}
25052517
if (c_defaults.m_diskless)
25062518
{
25072519
jam();
@@ -2642,17 +2654,25 @@ Backup::sendBackupRef(BlockReference senderRef, Uint32 flags, Signal *signal,
26422654
Uint32 senderData, Uint32 errorCode)
26432655
{
26442656
jam();
2645-
if (SEND_BACKUP_STARTED_FLAG(flags))
2657+
if (SEND_BACKUP_STARTED_FLAG(flags) ||
2658+
(flags & BackupReq::NOWAIT_REPLY && errorCode == BackupRef::IAmNotMaster))
26462659
{
26472660
jam();
2648-
BackupRef* ref = (BackupRef*)signal->getDataPtrSend();
2661+
BackupRef *ref = (BackupRef *)signal->getDataPtrSend();
26492662
ref->senderData = senderData;
26502663
ref->errorCode = errorCode;
26512664
ref->masterRef = numberToRef(BACKUP, getMasterNodeId());
26522665
sendSignal(senderRef, GSN_BACKUP_REF, signal, BackupRef::SignalLength, JBB);
26532666
}
2654-
2655-
if (errorCode != BackupRef::IAmNotMaster)
2667+
/*
2668+
* Log event if the error is other than IAmNotMaster,
2669+
* or, if error is IAmNotMaster, the NOWAIT option is set and data node
2670+
* version is >= 7.5.31 or >= 7.6.27 or >= 8.0.35 or >= 8.2.0
2671+
* (see NDBD_START_BACKUP_NOWAIT_REPLY)
2672+
*/
2673+
if ((errorCode != BackupRef::IAmNotMaster) ||
2674+
(errorCode == BackupRef::IAmNotMaster &&
2675+
!SEND_BACKUP_STARTED_FLAG(flags) && !(flags & BackupReq::NOWAIT_REPLY)))
26562676
{
26572677
jam();
26582678
signal->theData[0] = NDB_LE_BackupFailedToStart;

storage/ndb/src/mgmsrv/MgmtSrvr.cpp

Lines changed: 81 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -1396,10 +1396,10 @@ MgmtSrvr::sendall_STOP_REQ(NodeBitmask &stoppedNodes,
13961396
int
13971397
MgmtSrvr::guess_master_node(SignalSender& ss)
13981398
{
1399+
NodeId guess = m_master_node;
13991400
/**
14001401
* First check if m_master_node is started
14011402
*/
1402-
NodeId guess = m_master_node;
14031403
if (guess != 0)
14041404
{
14051405
trp_node node = ss.getNodeInfo(guess);
@@ -1408,43 +1408,65 @@ MgmtSrvr::guess_master_node(SignalSender& ss)
14081408
}
14091409

14101410
/**
1411-
* Check for any started node
1411+
* Check started nodes based on dynamicId
14121412
*/
1413-
guess = 0;
1414-
while(getNextNodeId(&guess, NDB_MGM_NODE_TYPE_NDB))
1413+
Uint32 min = UINT32_MAX;
1414+
NodeId node_id = 0;
1415+
while(getNextNodeId(&node_id, NDB_MGM_NODE_TYPE_NDB))
14151416
{
1416-
trp_node node = ss.getNodeInfo(guess);
1417-
if (node.m_state.startLevel == NodeState::SL_STARTED)
1417+
trp_node node = ss.getNodeInfo(node_id);
1418+
if(node.m_state.dynamicId < min)
14181419
{
1419-
return guess;
1420+
if(node.m_state.startLevel == NodeState::SL_STARTED)
1421+
{
1422+
min = node.m_state.dynamicId;
1423+
guess = node_id;
1424+
}
14201425
}
14211426
}
1427+
//found
1428+
if(min < UINT32_MAX)
1429+
return guess;
14221430

14231431
/**
1424-
* Check any confirmed node
1432+
* Check confirmed nodes based on dynamicId
14251433
*/
1426-
guess = 0;
1427-
while(getNextNodeId(&guess, NDB_MGM_NODE_TYPE_NDB))
1434+
node_id = 0;
1435+
while(getNextNodeId(&node_id, NDB_MGM_NODE_TYPE_NDB))
14281436
{
1429-
trp_node node = ss.getNodeInfo(guess);
1430-
if (node.is_confirmed())
1437+
trp_node node = ss.getNodeInfo(node_id);
1438+
if(node.m_state.dynamicId < min)
14311439
{
1432-
return guess;
1440+
if(node.is_confirmed())
1441+
{
1442+
min = node.m_state.dynamicId;
1443+
guess = node_id;
1444+
}
14331445
}
14341446
}
1447+
//found
1448+
if(min < UINT32_MAX)
1449+
return guess;
14351450

14361451
/**
1437-
* Check any connected node
1452+
* Check connected nodes based on dynamicId
14381453
*/
1439-
guess = 0;
1440-
while(getNextNodeId(&guess, NDB_MGM_NODE_TYPE_NDB))
1454+
node_id = 0;
1455+
while(getNextNodeId(&node_id, NDB_MGM_NODE_TYPE_NDB))
14411456
{
1442-
trp_node node = ss.getNodeInfo(guess);
1443-
if (node.is_connected())
1457+
trp_node node = ss.getNodeInfo(node_id);
1458+
if(node.m_state.dynamicId < min)
14441459
{
1445-
return guess;
1460+
if(node.is_connected())
1461+
{
1462+
min = node.m_state.dynamicId;
1463+
guess = node_id;
1464+
}
14461465
}
14471466
}
1467+
//found
1468+
if(min < UINT32_MAX)
1469+
return guess;
14481470

14491471
return 0; // give up
14501472
}
@@ -3420,8 +3442,13 @@ MgmtSrvr::trp_deliver_signal(const NdbApiSignal* signal,
34203442

34213443
/* Clear local nodeid reservation(if any) */
34223444
release_local_nodeid_reservation(i);
3423-
34243445
clear_connect_address_cache(i);
3446+
3447+
/* Clear m_master_node when master disconnects */
3448+
if(i == m_master_node)
3449+
{
3450+
m_master_node = 0;
3451+
}
34253452
}
34263453
return;
34273454
}
@@ -4331,16 +4358,7 @@ MgmtSrvr::startBackup(Uint32& backupId, int waitCompleted, Uint32 input_backupId
43314358
SignalSender ss(theFacade);
43324359
ss.lock(); // lock will be released on exit
43334360

4334-
NodeId nodeId = m_master_node;
4335-
if (okToSendTo(nodeId, false) != 0)
4336-
{
4337-
bool next;
4338-
nodeId = m_master_node = 0;
4339-
while((next = getNextNodeId(&nodeId, NDB_MGM_NODE_TYPE_NDB)) == true &&
4340-
okToSendTo(nodeId, false) != 0);
4341-
if(!next)
4342-
return NO_CONTACT_WITH_DB_NODES;
4343-
}
4361+
NodeId nodeId = guess_master_node(ss);
43444362

43454363
SimpleSignal ssig;
43464364
BackupReq* req = CAST_PTR(BackupReq, ssig.getDataPtrSend());
@@ -4370,18 +4388,48 @@ MgmtSrvr::startBackup(Uint32& backupId, int waitCompleted, Uint32 input_backupId
43704388
while (1) {
43714389
if (do_send)
43724390
{
4391+
nodeId = guess_master_node(ss);
4392+
4393+
if(waitCompleted == 0 &&
4394+
ndbd_start_backup_nowait_reply(getNodeInfo(nodeId).m_info.m_version))
4395+
{
4396+
req->flags |= BackupReq::NOWAIT_REPLY;
4397+
}
4398+
else
4399+
{
4400+
req->flags &= ~((Uint32)BackupReq::NOWAIT_REPLY);
4401+
}
4402+
43734403
if (ss.sendSignal(nodeId, &ssig) != SEND_OK) {
43744404
return SEND_OR_RECEIVE_FAILED;
43754405
}
4376-
if (waitCompleted == 0)
4377-
return 0;
4406+
if (waitCompleted == 0 &&
4407+
!ndbd_start_backup_nowait_reply(getNodeInfo(nodeId).m_info.m_version))
4408+
{
4409+
return 0;
4410+
}
43784411
do_send = 0;
43794412
}
43804413
SimpleSignal *signal = ss.waitFor();
43814414

43824415
int gsn = signal->readSignalNumber();
43834416
switch (gsn) {
43844417
case GSN_BACKUP_CONF:{
4418+
4419+
/*
4420+
* BACKUP NOWAIT case.
4421+
* BACKUP_CONF received from Backup. It is only used to confirm that
4422+
* the node is the master node and the backup can proceed.
4423+
* No more feedback expected from data node in this case.
4424+
*/
4425+
if(waitCompleted == 0)
4426+
{
4427+
return 0;
4428+
}
4429+
4430+
/*
4431+
* BACKUP WAIT case.
4432+
*/
43854433
const BackupConf * const conf =
43864434
CAST_CONSTPTR(BackupConf, signal->getDataPtr());
43874435
#ifdef VM_TRACE

0 commit comments

Comments
 (0)