Skip to content

Commit 6708e51

Browse files
author
Maitrayi Sabaratnam
committed
Bug#35641420 NDB: No warning is provided when the oldest restorable GCI reaches its maximum
The oldest restorable GCI (global checkpoint id) is limited to MaxInt32. This patch starts issuing a warning approximately 3 months before the limit is reached and then repeats the warning once a day. Reviewed by: Frazer Clement <[email protected]> (cherry picked from commit d49f5fe08b4d2ac1584ef21d6e39c0cd2151eb28) Change-Id: Iffb2da94c83b329657b1d5caf22e896c7cc033cb
1 parent 519d65f commit 6708e51

File tree

5 files changed

+138
-13
lines changed

5 files changed

+138
-13
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@ Next DBACC 3006
2727
Next DBTUP 4039
2828
Next DBLQH 5112
2929
Next DBDICT 6223
30-
Next DBDIH 7249
30+
Next DBDIH 7251
3131
Next DBTC 8123
3232
Next CMVMI 9000
3333
Next TRPMAN 9501
3434
Next BACKUP 10055
3535
Next PGMAN 11010
3636
Next DBTUX 12010
37-
Next SUMA 13057
37+
Next SUMA 13058
3838
Next LGMAN 15002
3939
Next TSMAN 16002
4040
Next DBSPJ 17000
@@ -147,6 +147,7 @@ GCP Stop so that isolation is required to remove a node.
147147
7244: Delay SUB_GCP_COMPLETE_REP processing indefinitely
148148
7247: Delay WAIT_GCP_REQ processing indefinitely
149149
7248: Skip most recent LCP during a restart
150+
7250: Boost GCI to provoke a warning
150151

151152
Related (LQH):
152153
5085 : Ignore request to kill via DUMP 2305
@@ -856,6 +857,8 @@ SUMA:
856857
13041: Crash in SUB_CREATE_REQ (no matter)
857858
13042: Crash in SUB_START_REQ (if m_restart_server_node_id != 0)
858859
13051: Delay the DROP_TRIG_IMPL_REQ
860+
13057: Relax the ndbrequire in sendSUB_GCP_COMPLETE_REP so that it only checks that the
861+
gci is higher than the last known gci, instead of requiring it to be the next gci
859862

860863
LGMAN:
861864
-----

storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -1565,6 +1565,7 @@ class Dbdih: public SimulatedBlock {
15651565
void tableUpdateLab(Signal *, TabRecordPtr regTabPtr);
15661566
void checkLcpCompletedLab(Signal *);
15671567
void initLcpLab(Signal *, Uint32 masterRef, Uint32 tableId);
1568+
void checkGCI(Uint32 gci_hi);
15681569
void startGcpLab(Signal *);
15691570
void checkGcpStopLab(Signal *);
15701571
void MASTER_GCPhandling(Signal *, Uint32 failedNodeId);

storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16659,6 +16659,49 @@ Dbdih::execUPGRADE_PROTOCOL_ORD(Signal* signal)
1665916659
}
1666016660
}
1666116661

16662+
/**
16663+
* MAX_SAFE_GCI_VALUE should ideally be MAX_UINT32, as GCI save part
16664+
* is an unsigned int32. However, it is defined as MAX_INT32 here in
16665+
* order to align with the current implementation that imposes a hard
16666+
* limit on OldestRestorableGCI. See also Bug#35749589.
16667+
*/
16668+
static const Uint32 MAX_SAFE_GCI_VALUE = 0x7fffffff; // MAX_INT32
16669+
16670+
// Initialise the following two file-local variables with placeholder values.
16671+
// The real initialisation will occur in initCommonData() at node start.
16672+
static Uint32 GCPS_PER_DAY = 1;
16673+
static Uint32 next_warn_save_gci = MAX_SAFE_GCI_VALUE;
16674+
16675+
// Issue a warning, around 3 months ahead of GCI reaching MAX_SAFE_GCI_VALUE
16676+
void Dbdih::checkGCI(Uint32 save_gci) {
16677+
if (save_gci >= next_warn_save_gci) {
16678+
jam();
16679+
16680+
// Number of days to reach MaxInt32 with projected GCPS_PER_DAY
16681+
Uint32 days_left = 0;
16682+
if (save_gci < MAX_SAFE_GCI_VALUE)
16683+
days_left = (MAX_SAFE_GCI_VALUE - save_gci) / GCPS_PER_DAY;
16684+
16685+
warningEvent("GCI (%u) is approaching the maximum value (%u).",
16686+
save_gci, MAX_SAFE_GCI_VALUE);
16687+
warningEvent("GCI projected to reach max in %u days.", days_left);
16688+
warningEvent("Reset system GCI using Backup and Restore with "
16689+
"Cluster Initial Restart.");
16690+
16691+
g_eventLogger->warning("Current Global checkpoint id (%u) "
16692+
"is approaching the maximum value (%u), "
16693+
"projected to occur in %u days."
16694+
"Backup, and Restore with cluster initial restart "
16695+
"is required to reset the system GCI.",
16696+
save_gci, MAX_SAFE_GCI_VALUE, days_left);
16697+
16698+
next_warn_save_gci += GCPS_PER_DAY;
16699+
// To test more warnings, replace the above line with the following:
16700+
// next_warn_save_gci += 1;
16701+
}
16702+
return;
16703+
}
16704+
1666216705
void
1666316706
Dbdih::startGcpLab(Signal* signal)
1666416707
{
@@ -16785,6 +16828,20 @@ Dbdih::startGcpLab(Signal* signal)
1678516828
m_gcp_save.m_master.m_start_time = now;
1678616829
m_micro_gcp.m_master.m_new_gci = Uint64((currGCI >> 32) + 1) << 32;
1678716830

16831+
if (ERROR_INSERTED(7250))
16832+
{
16833+
jam();
16834+
const Uint64 currVal = m_micro_gcp.m_master.m_new_gci >> 32;
16835+
Uint64 newVal = 0;
16836+
// Boost to just below the first warning gci
16837+
newVal = std::max(currVal, Uint64(next_warn_save_gci - 4));
16838+
m_micro_gcp.m_master.m_new_gci = newVal << 32;
16839+
g_eventLogger->info("DIH Err-Ins: Incrementing GCI from %llu to %llu. ",
16840+
currVal,
16841+
newVal);
16842+
CLEAR_ERROR_INSERT_VALUE;
16843+
}
16844+
1678816845
signal->theData[0] = NDB_LE_GlobalCheckpointStarted; //Event type
1678916846
signal->theData[1] = Uint32(currGCI >> 32);
1679016847
signal->theData[2] = Uint32(currGCI);
@@ -20467,7 +20524,7 @@ void Dbdih::execTC_CLOPSIZECONF(Signal* signal)
2046720524
/* ----------------------------------------------------------------------- */
2046820525
cnoOfActiveTables = 0;
2046920526
c_lcpState.setLcpStatus(LCP_WAIT_MUTEX, __LINE__);
20470-
ndbrequire(((int)c_lcpState.oldestRestorableGci) > 0);
20527+
ndbrequire((c_lcpState.oldestRestorableGci) < MAX_SAFE_GCI_VALUE);
2047120528

2047220529
if (ERROR_INSERTED(7011)) {
2047320530
signal->theData[0] = NDB_LE_LCPStoppedInCalcKeepGci;
@@ -20636,11 +20693,8 @@ void Dbdih::storeNewLcpIdLab(Signal* signal)
2063620693
DEB_LCP(("Set SYSFILE->keepGCI = %u", SYSFILE->keepGCI));
2063720694

2063820695
SYSFILE->oldestRestorableGCI = c_lcpState.oldestRestorableGci;
20639-
2064020696
const Uint32 oldestRestorableGCI = SYSFILE->oldestRestorableGCI;
20641-
20642-
Int32 val = oldestRestorableGCI;
20643-
ndbrequire(val > 0);
20697+
ndbrequire(oldestRestorableGCI < MAX_SAFE_GCI_VALUE);
2064420698

2064520699
/* ----------------------------------------------------------------------- */
2064620700
/* SET BIT INDICATING THAT LOCAL CHECKPOINT IS ONGOING. THIS IS CLEARED */
@@ -22684,6 +22738,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
2268422738
jam();
2268522739
m_gcp_monitor.m_gcp_save.m_gci = m_gcp_save.m_gci;
2268622740
m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
22741+
checkGCI(m_gcp_monitor.m_gcp_save.m_gci);
2268722742

2268822743
/**
2268922744
* Recalculate gcp_save.m_max_lag.
@@ -24016,6 +24071,16 @@ void Dbdih::initCommonData()
2401624071
ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &tmp);
2401724072
tmp = tmp > 60000 ? 60000 : (tmp < 10 ? 10 : tmp);
2401824073
m_gcp_save.m_master.m_time_between_gcp = tmp;
24074+
24075+
/**
24076+
* With the minimum config of 20 ms interval,
24077+
* oldestRestorableGCI will reach MaxInt32 in 1.36 years, i.e. 497 days
24078+
* ((2^31 -1) * 20 / (1000 * 60 * 60 * 24 * 365)).
24079+
* Start warning ca. 3 months ahead of expiry
24080+
* and then write a warning every day.
24081+
*/
24082+
GCPS_PER_DAY = (Uint32)((Uint64)(24*60*60*1000) / tmp); // per day
24083+
next_warn_save_gci = MAX_SAFE_GCI_VALUE - 90 * GCPS_PER_DAY; // 3 months
2401924084
}
2402024085

2402124086
Uint32 tmp = 0;

storage/ndb/src/kernel/blocks/suma/Suma.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -5267,7 +5267,13 @@ Suma::sendSUB_GCP_COMPLETE_REP(Signal* signal)
52675267
}
52685268
else
52695269
{
5270-
ndbrequire(gci_hi == Uint32(m_gcp_monitor >> 32) + 1);
5270+
if (ERROR_INSERTED(13057))
5271+
{
5272+
jam();
5273+
ndbrequire(gci_hi > Uint32(m_gcp_monitor >> 32));
5274+
} else {
5275+
ndbrequire(gci_hi == Uint32(m_gcp_monitor >> 32) + 1);
5276+
}
52715277
ndbrequire(gci_lo == 0);
52725278
}
52735279
m_gcp_monitor = gci;

storage/ndb/test/ndbapi/testBasic.cpp

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4319,7 +4319,52 @@ int testAbortIgnoreError(NDBT_Context* ctx, NDBT_Step* step)
43194319
return result;
43204320
}
43214321

4322+
/**
4323+
* Insert error in the DIH that boosts next gci a little prior to the
4324+
* warning-start-gci. This will make a warning written out to ndbd
4325+
* log and the cluster log. This was checked manually. Restart the
4326+
* cluster initial to clean up. The test will always pass. Therefore
4327+
* no need to run in autotest.
4328+
*/
4329+
int runCheckWarnMaxGci(NDBT_Context* ctx, NDBT_Step* step)
4330+
{
4331+
int result = NDBT_OK;
4332+
NdbRestarter restarter;
4333+
const int coordinator = restarter.getMasterNodeId();
4334+
4335+
// while-loop looping only once is needed by CHK1 to break and stop the test.
4336+
while (true) {
4337+
// Give some time for Ndb to get ready
4338+
NdbSleep_SecSleep(60);
4339+
4340+
// To avoid a check in Suma that Gcis increase by 1
4341+
CHK1(restarter.insertErrorInAllNodes(13057) == 0);
43224342

4343+
// Boost the next Gci near to the warning-start Gci
4344+
CHK1(restarter.insertErrorInNode(coordinator, 7250) == 0);
4345+
4346+
// Wait til some Gcps pass such that the warnings be written out
4347+
// in cluster and ndb logs
4348+
NdbSleep_SecSleep(30);
4349+
// The warnings written out are checked manually.
4350+
4351+
// Clean up EI and restart cluster initial to reset the GCI
4352+
CHK1(restarter.restartAll(true, /* initial */
4353+
true, /* nostart */
4354+
true /* abort */) == 0);
4355+
4356+
g_err << "wait nostart" << endl;
4357+
CHK1(restarter.waitClusterNoStart() == 0);
4358+
g_err << "startAll" << endl;
4359+
CHK1(restarter.startAll() == 0);
4360+
g_err << "wait started" << endl;
4361+
CHK1(restarter.waitClusterStarted() == 0);
4362+
4363+
break;
4364+
}
4365+
ctx->stopTest();
4366+
return result;
4367+
}
43234368

43244369
NDBT_TESTSUITE(testBasic);
43254370
TESTCASE("PkInsert",
@@ -4759,6 +4804,14 @@ TESTCASE("CheckCompletedLCPStats",
47594804
{
47604805
STEP(runCheckLCPStats);
47614806
}
4807+
TESTCASE("CheckWarnGCPReachMax",
4808+
"Check whether GCI reaching MaxInt32 is warned")
4809+
{
4810+
INITIALIZER(runLoadTable);
4811+
STEP(runPkUpdateUntilStopped);
4812+
STEP(runCheckWarnMaxGci);
4813+
FINALIZER(runClearTable);
4814+
}
47624815
NDBT_TESTSUITE_END(testBasic);
47634816

47644817
#if 0
@@ -4782,6 +4835,3 @@ int main(int argc, const char** argv){
47824835
NDBT_TESTSUITE_INSTANCE(testBasic);
47834836
return testBasic.execute(argc, argv);
47844837
}
4785-
4786-
4787-

0 commit comments

Comments
 (0)