Skip to content

Commit fd687df

Browse files
committed
Bug#35387076 ndb_desc_extra occasionally fails in PB2 due to content mismatch
Also fixing:
  Bug#35416104 ndb_read_local occasionally fails in PB2 due to timeout
  Bug#34944798 ndb_binlog.ndb_autodiscover3 fails sporadically in PB2
  Bug#34196275 ndb_desc_extra fails with missing hashmap
  Bug#34196250 Test metadata_immediate_sync can fail in PB2
  Bug#33809408 Failure of the ndb_restore_schema_partitions test
  Bug#32368395 NDB_DESC UTILITY CAN FAIL ON DEVBLD
  Bug#35433318 PB2: ndb_rpl.ndb_rpl_sync sporadically fails to create hashmap

Problem
- Transporters will only deliver a subset of signals until communication is enabled; other signals are dropped.
- Therefore it is important to only send general signals after communication is enabled, as otherwise they may be dropped.
- API nodes should not receive API_REGCONF from a data node until after communication is enabled, so they can use this as a 'safe to proceed' indicator.
- However, if an API_REGREQ signal is received by a data node while it is enabling communication, then it will currently send an immediate API_REGCONF - before communication is enabled.
- If the API then sends general signals to the data node, these race with the ENABLE_COMREQ and there is a chance that these signals are dropped.
- This can result in high level timeout errors such as error 4008 from early post-API_REGCONF activities (getTable, getHashmap etc).

Solution
- Add assertion to ensure that data nodes only send API_REGCONF when the API is in the correct state.
- Add test insertion to give coverage of 'slow' ENABLE_COMORD to bias the race.
- Add testcase testNdbApi -n TestSlowConnectEnable to show:
  i) 4008 type scenario with error insertion
  ii) Assertion failure with error insertion + assertion
  iii) No problem with fix
- Fix by not sending API_REGCONF when waiting for ENABLE_COMCONF.

Change-Id: I796d9e5cab04a6048afa3b08c2558667a59331f9
Change-Id: I93d39e55b2c10ead06b419b69f254b45f6ad0bde
Change-Id: I277bd1e0e984358ef4cba1f5143d8be707d84a25
1 parent 67f77c0 commit fd687df

File tree

6 files changed

+137
-4
lines changed

6 files changed

+137
-4
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Next DBDICT 6223
3030
Next DBDIH 7249
3131
Next DBTC 8123
3232
Next CMVMI 9000
33+
Next TRPMAN 9501
3334
Next BACKUP 10055
3435
Next PGMAN 11010
3536
Next DBTUX 12010

storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -249,6 +249,12 @@ void Cmvmi::execNDB_TAMPER(Signal* signal)
249249
CLEAR_ERROR_INSERT_VALUE;
250250
sendSignal(TRPMAN_REF, GSN_NDB_TAMPER, signal, signal->getLength(),JBB);
251251
}
252+
if (signal->theData[0] >= 9500 &&
253+
signal->theData[0] < 9900)
254+
{
255+
/* Subrange for TRPMAN */
256+
sendSignal(TRPMAN_REF, GSN_NDB_TAMPER, signal, signal->getLength(),JBB);
257+
}
252258
}//execNDB_TAMPER()
253259

254260
static Uint32 blocks[] =

storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -3889,6 +3889,14 @@ void Qmgr::execAPI_REGREQ(Signal* signal)
38893889
* should be able to discover what nodes that it is able to actually use.
38903890
*/
38913891
}
3892+
if (apiNodePtr.p->phase == ZAPI_ACTIVATION_ONGOING)
3893+
{
3894+
jam();
3895+
/* Waiting for TRPMAN to finish enabling communication
3896+
* Must not send conf before then.
3897+
*/
3898+
return;
3899+
}
38923900

38933901
sendApiRegConf(signal, apiNodePtr.i);
38943902
}//Qmgr::execAPI_REGREQ()
@@ -3986,6 +3994,17 @@ Qmgr::sendApiRegConf(Signal *signal, Uint32 node)
39863994
const BlockReference ref = apiNodePtr.p->blockRef;
39873995
ndbassert(ref != 0);
39883996

3997+
/* No Conf to be sent unless :
3998+
* - API node is ACTIVE
3999+
* - MGM node is ACTIVE | INACTIVE
4000+
* - Data node is shutting down
4001+
*/
4002+
ndbassert(apiNodePtr.p->phase == ZAPI_ACTIVE ||
4003+
(apiNodePtr.p->phase == ZAPI_INACTIVE &&
4004+
getNodeInfo(apiNodePtr.i).getType() == NodeInfo::MGM) ||
4005+
(apiNodePtr.p->phase == ZAPI_INACTIVE &&
4006+
getNodeState().startLevel >= NodeState::SL_STOPPING_1));
4007+
39894008
ApiRegConf * const apiRegConf = (ApiRegConf *)&signal->theData[0];
39904009
apiRegConf->qmgrRef = reference();
39914010
apiRegConf->apiHeartbeatFrequency = (chbApiDelay / 10);

storage/ndb/src/kernel/blocks/trpman.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2011, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2011, 2023, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -259,6 +259,20 @@ Trpman::execENABLE_COMREQ(Signal* signal)
259259
jamEntry();
260260
const EnableComReq *enableComReq = (const EnableComReq *)signal->getDataPtr();
261261

262+
if (ERROR_INSERTED(9500) &&
263+
signal->getSendersBlockRef() != reference())
264+
{
265+
jam();
266+
g_eventLogger->info("TRPMAN %u delaying ENABLE_COMREQ for 5s",
267+
instance());
268+
sendSignalWithDelay(reference(),
269+
GSN_ENABLE_COMREQ,
270+
signal,
271+
5000,
272+
signal->getLength());
273+
return;
274+
}
275+
262276
/* Need to copy out signal data to not clobber it with sendSignal(). */
263277
BlockReference senderRef = enableComReq->m_senderRef;
264278
Uint32 senderData = enableComReq->m_senderData;
@@ -499,6 +513,15 @@ Trpman::execNDB_TAMPER(Signal* signal)
499513
{
500514
jamEntry();
501515
#ifdef ERROR_INSERT
516+
if (signal->getLength() == 1)
517+
{
518+
SET_ERROR_INSERT_VALUE(signal->theData[0]);
519+
}
520+
else
521+
{
522+
SET_ERROR_INSERT_VALUE2(signal->theData[0], signal->theData[1]);
523+
}
524+
502525
if (signal->theData[0] == 9003)
503526
{
504527
if (MAX_RECEIVED_SIGNALS < 1024)

storage/ndb/test/ndbapi/testNdbApi.cpp

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7917,6 +7917,81 @@ int runCheckSlowCommit(NDBT_Context* ctx, NDBT_Step* step)
79177917
return NDBT_OK;
79187918
}
79197919

7920+
int
7921+
testSlowConnectEnable(NDBT_Context* ctx, NDBT_Step* step)
7922+
{
7923+
/**
7924+
* Test behaviour of API client with slow connection
7925+
* enabling at the data node
7926+
*/
7927+
NdbRestarter restarter;
7928+
7929+
ndbout_c("Delay ENABLE_COM on data nodes");
7930+
restarter.insertErrorInAllNodes(9500);
7931+
7932+
Ndb_cluster_connection* otherConnection = NULL;
7933+
Ndb* otherNdb = NULL;
7934+
int result = NDBT_FAILED;
7935+
7936+
do
7937+
{
7938+
ndbout_c("Setup new connection");
7939+
char connectString[256];
7940+
ctx->m_cluster_connection.get_connectstring(connectString,
7941+
sizeof(connectString));
7942+
otherConnection= new Ndb_cluster_connection(connectString);
7943+
if (otherConnection == NULL)
7944+
{
7945+
ndbout << "Could not create extra API connection" << endl;
7946+
break;
7947+
}
7948+
7949+
int rc = otherConnection->connect();
7950+
if (rc != 0)
7951+
{
7952+
ndbout << "Connection failed with " << rc << endl;
7953+
break;
7954+
}
7955+
7956+
if (otherConnection->wait_until_ready(30,30) != 0)
7957+
{
7958+
ndbout << "Connection wait until ready failed." << endl;
7959+
break;
7960+
}
7961+
7962+
ndbout_c("Connection ready");
7963+
7964+
otherNdb = new Ndb(otherConnection, "TEST_DB");
7965+
otherNdb->init();
7966+
7967+
if (otherNdb->waitUntilReady(30) != 0)
7968+
{
7969+
ndbout << "Ndb wait until ready failed." << endl;
7970+
break;
7971+
}
7972+
7973+
ndbout_c("Ndb ready");
7974+
7975+
const char* tabName = ctx->getTab()->getName();
7976+
if (otherNdb->getDictionary()->getTable(tabName) == NULL)
7977+
{
7978+
ndbout << "Get table failed with error "
7979+
<< otherNdb->getNdbError() << endl;
7980+
break;
7981+
}
7982+
7983+
ndbout_c("Table retrieved");
7984+
7985+
result = NDBT_OK;
7986+
} while (0);
7987+
7988+
restarter.insertErrorInAllNodes(0);
7989+
7990+
delete otherNdb;
7991+
delete otherConnection;
7992+
return result;
7993+
}
7994+
79207995

79217996
NDBT_TESTSUITE(testNdbApi);
79227997
TESTCASE("MaxNdb",
@@ -8336,7 +8411,11 @@ TESTCASE("CheckSlowCommit",
83368411
STEP(runCheckSlowCommit);
83378412
FINALIZER(runDropTable);
83388413
}
8339-
8414+
/* Registers testSlowConnectEnable as a single-step testcase */
TESTCASE("TestSlowConnectEnable",
         "Test behaviour with slow connection enable")
{
  STEP(testSlowConnectEnable);
}
83408419

83418420
NDBT_TESTSUITE_END(testNdbApi);
83428421

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,8 @@ max-time: 180
248248
cmd: testNodeRestart
249249
args: -n CheckGcpStopTimerDistributed T1
250250
max-time: 520
251+
252+
cmd: testNdbApi
253+
args: -n TestSlowConnectEnable T1
254+
max-time: 600
255+

0 commit comments

Comments
 (0)