Commit 725f3ee
Bug#25443080 MAIN THREAD CPU USAGE ON MASTER DATANODE INCREASES TO 100% WITH LATENCY BURST

The LCP is controlled by the DIH Master on one node, running on the 'main'
thread. CPU usage there was seen to rise towards the end of an LCP, leading to
increased transaction latency. The problem appears to be inefficiencies in the
algorithm the DIH Master uses to find fragment replicas to schedule for LCP:
towards the end of the LCP it can spend a lot of time scanning over the set of
all replicas.

Fixes:
- Make sure that the per-node queues are made use of
- If a fragment replica is queued, do not search for more
- When searching for more, search for all nodes in one pass
- When a node has started on all of its fragment replicas, do not include it
  in future searches
- When all nodes have started on all of their fragment replicas, there is no
  need to search for more

Testcase:
- 16 nodes, 4 LDMs/node : 64 fragments : 128 fragment replicas / table
- 88 tables
- No data
- Run LCP using ALL DUMP 7099
- Observe CPU usage and efficiency of the replica search algorithms
1 parent db32ea1 commit 725f3ee
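
The core of the fix is the early exit in the replica search: once every
participating LQH node has had all of its fragment replicas queued, the DIH
Master stops rescanning the replica set. The following is a minimal standalone
sketch of that check, with std::bitset standing in for NDB's NdbNodeBitmask;
the struct, helper, and function names below are illustrative, not the NDB
code itself.

#include <bitset>
#include <cstdio>

// Stand-in for NDB's NdbNodeBitmask; the real class offers contains(),
// bitAND(), bitOR(), bitXOR(), etc. -- here we model only what is needed.
static const size_t MAX_NODES = 256;
typedef std::bitset<MAX_NODES> NodeMask;

// True if every bit set in 'subset' is also set in 'super'
// (the check NdbNodeBitmask::contains() performs in the patch).
static bool contains(const NodeMask& super, const NodeMask& subset)
{
  return (subset & ~super).none();
}

struct LcpSchedulerState
{
  NodeMask participatingLQH;      // LQH nodes taking part in this LCP
  NodeMask allReplicasQueuedLQH;  // nodes with every replica already queued
};

// Early exit at the top of the replica search: if all participating nodes
// have had all of their fragment replicas queued, there is nothing left to
// search for and the scan over all tables/fragments can be skipped.
static bool shouldSearchForMoreReplicas(const LcpSchedulerState& s)
{
  return !contains(s.allReplicasQueuedLQH, s.participatingLQH);
}

int main()
{
  LcpSchedulerState s;
  s.participatingLQH.set(1);
  s.participatingLQH.set(2);
  s.allReplicasQueuedLQH.set(1);
  std::printf("search needed: %d\n", (int)shouldSearchForMoreReplicas(s)); // 1
  s.allReplicasQueuedLQH.set(2);
  std::printf("search needed: %d\n", (int)shouldSearchForMoreReplicas(s)); // 0
  return 0;
}

In the patch itself, the corresponding check is
c_lcpState.m_allReplicasQueuedLQH.contains(c_lcpState.m_participatingLQH)
at the top of Dbdih::startNextChkpt(); see the DbdihMain.cpp diff below.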

File tree

2 files changed (+126, -23 lines)
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -2168,6 +2168,7 @@ class Dbdih: public SimulatedBlock {
     SignalCounter m_LAST_LCP_FRAG_ORD;
     NdbNodeBitmask m_participatingLQH;
     NdbNodeBitmask m_participatingDIH;
+    NdbNodeBitmask m_allReplicasQueuedLQH;
 
     Uint32 m_masterLcpDihRef;
     bool m_MASTER_LCPREQ_Received;

storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp

Lines changed: 124 additions & 22 deletions
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -18266,6 +18266,13 @@ Dbdih::master_lcp_fragmentMutex_locked(Signal* signal,
   startLcpRoundLoopLab(signal, 0, 0);
 }
 
+
+//#define DIH_DEBUG_REPLICA_SEARCH
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+static Uint32 totalScheduled;
+static Uint32 totalExamined;
+#endif
+
 void Dbdih::startLcpRoundLoopLab(Signal* signal,
                                  Uint32 startTableId, Uint32 startFragId)
 {
@@ -18279,19 +18286,53 @@ void Dbdih::startLcpRoundLoopLab(Signal* signal,
   }//if
   c_lcpState.currentFragment.tableId = startTableId;
   c_lcpState.currentFragment.fragmentId = startFragId;
+  c_lcpState.m_allReplicasQueuedLQH.clear();
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+  totalScheduled = totalExamined = 0;
+#endif
+
   startNextChkpt(signal);
 }//Dbdih::startLcpRoundLoopLab()
 
 void Dbdih::startNextChkpt(Signal* signal)
 {
+  jam();
+  const bool allReplicaCheckpointsQueued =
+    c_lcpState.m_allReplicasQueuedLQH.
+    contains(c_lcpState.m_participatingLQH);
+
+  if (allReplicaCheckpointsQueued)
+  {
+    jam();
+
+    /**
+     * No need to find new checkpoints to start,
+     * just waiting for completion
+     */
+
+    sendLastLCP_FRAG_ORD(signal);
+    return;
+  }
+
   Uint32 lcpId = SYSFILE->latestLCP_ID;
 
-  NdbNodeBitmask busyNodes;
-  busyNodes.clear();
+  /* Initialise handledNodes with those already fully queued */
+  NdbNodeBitmask handledNodes = c_lcpState.m_allReplicasQueuedLQH;
+
+  /* Remove any that have failed in the interim */
+  handledNodes.bitAND(c_lcpState.m_participatingLQH);
+
   const Uint32 lcpNodes = c_lcpState.m_participatingLQH.count();
 
   bool save = true;
   LcpState::CurrentFragment curr = c_lcpState.currentFragment;
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+  Uint32 examined = 0;
+  Uint32 started = 0;
+  Uint32 queued = 0;
+#endif
 
   while (curr.tableId < ctabFileSize) {
     TabRecordPtr tabPtr;
@@ -18314,6 +18355,10 @@ void Dbdih::startNextChkpt(Signal* signal)
 
       jam();
      c_replicaRecordPool.getPtr(replicaPtr);
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+      examined++;
+#endif
 
       NodeRecordPtr nodePtr;
       nodePtr.i = replicaPtr.p->procNode;
@@ -18350,6 +18395,10 @@ void Dbdih::startNextChkpt(Signal* signal)
           nodePtr.p->noOfStartedChkpt = i + 1;
 
           sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+          started++;
+#endif
         }
         else if (nodePtr.p->noOfQueuedChkpt <
                  MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE)
@@ -18369,30 +18418,42 @@ void Dbdih::startNextChkpt(Signal* signal)
           nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
           nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
           nodePtr.p->noOfQueuedChkpt = i + 1;
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+          queued++;
+#endif
         }
         else
         {
           jam();
 
           if(save)
           {
-            /**
-             * Stop increasing value on first that was "full"
-             */
-            c_lcpState.currentFragment = curr;
-            save = false;
-          }
+            /**
+             * Stop increasing value on first replica that
+             * we could not enqueue, so we don't miss it
+             * next time
+             */
+            c_lcpState.currentFragment = curr;
+            save = false;
+          }
 
-          busyNodes.set(nodePtr.i);
-          if(busyNodes.count() == lcpNodes)
+          handledNodes.set(nodePtr.i);
+          if (handledNodes.count() == lcpNodes)
           {
-            /**
-             * There were no possibility to start the local checkpoint
-             * and it was not possible to queue it up. In this case we
-             * stop the start of local checkpoints until the nodes with a
-             * backlog have performed more checkpoints. We will return and
-             * will not continue the process of starting any more checkpoints.
-             */
+            /**
+             * All participating nodes have either
+             * - Full queues
+             * - All available replica checkpoints queued
+             *   (m_allReplicasQueuedLQH)
+             *
+             * Therefore, exit the search here.
+             */
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+            ndbout_c("Search : All nodes busy. Examined %u Started %u Queued %u",
+                     examined, started, queued);
+            totalExamined+= examined;
+            totalScheduled += (started + queued);
+#endif
             return;
           }//if
         }//if
@@ -18406,6 +18467,31 @@ void Dbdih::startNextChkpt(Signal* signal)
       curr.tableId++;
     }//if
   }//while
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+  ndbout_c("Search : At least one node not busy. Examined %u Started %u Queued %u",
+           examined, started, queued);
+  totalExamined+= examined;
+  totalScheduled += (started + queued);
+#endif
+
+  /**
+   * Have examined all replicas and attempted to
+   * enqueue as many replica LCPs as possible,
+   * without filling all queues.
+   * This means that some node(s) have no more
+   * replica LCPs to be enqueued.
+   * These are the node(s) which are *not* in
+   * the handled bitmap on this round.
+   * We keep track of these to allow the search
+   * to exit early on future invocations.
+   */
+
+  /* Invert handled nodes to reveal newly finished nodes */
+  handledNodes.bitXOR(c_lcpState.m_participatingLQH);
+
+  /* Add newly finished nodes to the global state */
+  c_lcpState.m_allReplicasQueuedLQH.bitOR(handledNodes);
 
   sendLastLCP_FRAG_ORD(signal);
 }//Dbdih::startNextChkpt()
@@ -18874,6 +18960,21 @@ Dbdih::checkLcpAllTablesDoneInLqh(Uint32 line){
     ndbout_c("CLEARING 7194");
     CLEAR_ERROR_INSERT_VALUE;
   }
+
+#ifdef DIH_DEBUG_REPLICA_SEARCH
+  if (totalScheduled == 0)
+  {
+    totalScheduled = 1;
+  }
+  ndbout_c("LCP complete. Examined %u replicas, scheduled %u. Ratio : %u.%u",
+           totalExamined,
+           totalScheduled,
+           totalExamined/totalScheduled,
+           (10 * (totalExamined -
                   ((totalExamined/totalScheduled) *
                    totalScheduled)))/
+           totalScheduled);
+#endif
 
   return true;
 }
@@ -19059,13 +19160,14 @@ void Dbdih::checkStartMoreLcp(Signal* signal, Uint32 nodeId)
     //-------------------------------------------------------------------
 
     sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[startIndex]);
+    return;
   }
 
   /* ----------------------------------------------------------------------- */
-  // When there are no more outstanding LCP reports and there are no one queued
-  // in at least one node, then we are ready to make sure all nodes have at
-  // least two outstanding LCP requests per node and at least two queued for
-  // sending.
+  // If this node has no checkpoints queued up, then attempt to re-fill the
+  // queues across all nodes.
+  // The search for next replicas can be expensive, so we only do it when
+  // the queues are empty.
   /* ----------------------------------------------------------------------- */
   startNextChkpt(signal);
 }//Dbdih::checkStartMoreLcp()
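
The other half of the bookkeeping happens at the end of the full scan above:
handledNodes accumulates every node whose checkpoint queue filled up during
the pass, so any participating node *not* in that set ran out of replicas to
queue and can be excluded from future searches. Below is a small sketch of
that set arithmetic, again with std::bitset standing in for NdbNodeBitmask
(bitXOR/bitOR are modelled with the corresponding operators; the function is
illustrative only, not the NDB code).

#include <bitset>

static const size_t MAX_NODES = 256;
typedef std::bitset<MAX_NODES> NodeMask;

// After a full pass over the replicas:
//   handledNodes  - participating nodes whose checkpoint queues filled up
//                   (always a subset of 'participating')
//   participating - all LQH nodes taking part in the LCP
//   allQueued     - accumulated set of nodes with every replica queued
// Nodes that participate but were not 'handled' had no more replicas to
// enqueue, so they are remembered and skipped on later invocations.
static void recordFullyQueuedNodes(NodeMask handledNodes,
                                   const NodeMask& participating,
                                   NodeMask& allQueued)
{
  // Equivalent of handledNodes.bitXOR(participating): because handledNodes
  // is a subset of participating, XOR leaves exactly the newly finished nodes.
  handledNodes ^= participating;

  // Equivalent of allQueued.bitOR(handledNodes): remember them for next time.
  allQueued |= handledNodes;
}

int main()
{
  NodeMask participating, handled, allQueued;
  participating.set(1);
  participating.set(2);
  participating.set(3);
  handled.set(1);  // node 1's queue filled up; nodes 2 and 3 ran dry
  recordFullyQueuedNodes(handled, participating, allQueued);
  // allQueued now contains nodes 2 and 3
  return (allQueued.test(2) && allQueued.test(3)) ? 0 : 1;
}

Once allQueued covers every participating node, the early-exit check shown
after the commit header fires on the next call and the scan is skipped
altogether.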
