Bug#36495761 PB2: ndb_restore_schema_tolerance fail sporadically creating table

vinc13e · vinc13e · commit b673ec16666f · 2024-09-10T01:25:21.000+01:00
1/2

Patch for 7.6 Only

Context:
MTR test ndb_restore_schema_tolerance fails sporadically
due to an error in DBUTIL. Util has a pool of prepared
operations that stores the Prepared Operations until they
are executed.
The pool has a fixed size of 6 where 4 are 'reserved' for
some operations on special tables, so, in fact, there
are only 2 available slots in preparedOperationPool.
When UTIL handles many prepare operations simultaneously
the pool can get exhausted, and UTIL then fails to seize a
free object in the pool to store the new prepared op.

In a similar way, runningPrepares pool used by UTIL
to store the running prepare operations, can also be
exhausted. The pool size is 1 but there is nothing
preventing TRIX/DICT to send many UTIL_PREPARE_REQ
in parallel to UTIL. In that scenario UTIL will fail to
seize a free object for the new running prepare operation.

To check the described scenarios, two new API-level tests are
added to test_event.

1 - ExhaustedPreparedPoolsApiOps:
Check if DBUTIL PreparedOperationPool and runningPrepares pool
get exhausted when many getEvent operations run in parallel.

2 - ExhaustedPreparedPoolsInternalOps:
Checks that, if DBUTIL PreparedOperationPool and/or
runningPrepares get Exhausted due to events, internal
operations still succeed.
The test launches 32 threads running getEvent operations + 1 thread
running drop/create index in parallel in order to force UTIL
to handle many UTIL_PREPARE_REQ signals from both DICT and TRIX.
This way the prepared op pool and runningPrepares pool can get
exhausted, since pool sizes are small and there is nothing
preventing UTIL from handling many UTIL_PREPARE_REQ simultaneously.
The test fails only when the operation that exhausts the pool
is internal (from TRIX).

Change-Id: I12f3ff7f92ab4310dda32ecc0c825609720ce933
diff --git a/storage/ndb/src/kernel/blocks/ERROR_codes.txt b/storage/ndb/src/kernel/blocks/ERROR_codes.txt
@@ -40,7 +40,7 @@ Next LGMAN 15002
 Next TSMAN 16002
 Next DBSPJ 17000
 Next TRIX 18004
-Next DBUTIL 19001
+Next DBUTIL 19002
 
 TESTING NODE FAILURE, ARBITRATION
 ---------------------------------
@@ -910,6 +910,7 @@ Index stats:
 DBUTIL
 -------
 19000: fail UTIL_PREPARE_REQ with PREPARE_SEIZE_ERROR
+19001: crash in UTIL_PREPARE_REQ if preparedOperationPool or runningPrepares pool is exhausted
 
 NDBFS
 -----
diff --git a/storage/ndb/src/kernel/blocks/dbutil/DbUtil.cpp b/storage/ndb/src/kernel/blocks/dbutil/DbUtil.cpp
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2003, 2022, Oracle and/or its affiliates.
+   Copyright (c) 2003, 2024, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2.0,
@@ -1137,6 +1137,10 @@ DbUtil::execUTIL_PREPARE_REQ(Signal* signal)
     releaseSections(handle);
     sendUtilPrepareRef(signal, UtilPrepareRef::PREPARE_SEIZE_ERROR,
 		       senderRef, senderData);
+    if (ERROR_INSERTED(19001)) {
+      /* Should never fail to seize a record */
+      ndbrequire(false);
+    }
     return;
   };
   handle.getSection(ptr, UtilPrepareReq::PROPERTIES_SECTION);
@@ -1355,6 +1359,10 @@ DbUtil::prepareOperation(Signal* signal,
     sendUtilPrepareRef(signal, UtilPrepareRef::PREPARED_OPERATION_SEIZE_ERROR,
 		       prepPtr.p->clientRef, prepPtr.p->clientData);
     releasePrepare(prepPtr);
+    if (ERROR_INSERTED(19001)) {
+      /* Should never fail to seize a record */
+      ndbrequire(false);
+    }
     return;
   }
   prepPtr.p->prepOpPtr = prepOpPtr;
diff --git a/storage/ndb/test/ndbapi/test_event.cpp b/storage/ndb/test/ndbapi/test_event.cpp
@@ -197,6 +197,122 @@ static int runCreateEvent(NDBT_Context* ctx, NDBT_Step* step)
   return NDBT_OK;
 }
 
+
+int runInsertError(NDBT_Context *ctx, NDBT_Step *step) {
+  // The error-insert code to arm is read from the "ErrorCode" property.
+  const int errorCode = ctx->getProperty("ErrorCode");
+  NdbRestarter restarter;
+  if (restarter.insertErrorInAllNodes(errorCode) == 0) {
+    return NDBT_OK;
+  }
+  ndbout << "Could not insert error in all nodes " << endl;
+  return NDBT_FAILED;
+}
+
+int runClearError(NDBT_Context *ctx, NDBT_Step *step) {
+  // Error code 0 is inserted in all nodes to clear a previous error insert.
+  NdbRestarter restarter;
+  if (restarter.insertErrorInAllNodes(0) == 0) {
+    return NDBT_OK;
+  }
+  ndbout << "Could not clear error in all nodes " << endl;
+  return NDBT_FAILED;
+}
+
+static int runGetEvent(NDBT_Context *ctx, NDBT_Step *step) {
+  // Look up "<table>_EVENT" once per loop, then call ctx->stopTest().
+  Ndb *pNdb = GETNDB(step);
+  int loops = ctx->getNumLoops();
+  NdbDictionary::Dictionary *myDict = pNdb->getDictionary();
+  if (!myDict) {
+    g_err << "Dictionary not found " << pNdb->getNdbError().code << " "
+          << pNdb->getNdbError().message << endl;
+    return NDBT_FAILED;
+  }
+
+  char eventName[1024];
+  const NdbDictionary::Table &tab = *ctx->getTab();
+  snprintf(eventName, sizeof(eventName), "%s_EVENT", tab.getName());
+  while (loops-- && !ctx->isTestStopped()) {
+    const NdbDictionary::Event* ev = myDict->getEvent(eventName);
+    if (ev == NULL) {
+      g_err << "getEvent (" << step->getStepNo() << "): Event not found. "
+            << myDict->getNdbError().code << " "
+            << myDict->getNdbError().message << endl;
+      return NDBT_FAILED;
+    }
+    delete ev;  // getEvent() returns a new Event owned by the caller
+  }
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
+int runCreateTable(NDBT_Context *ctx, NDBT_Step *step) {
+  // Create a table (PK "a", non-nullable "b") named by property "tableName".
+  Ndb *pNdb = GETNDB(step);
+  const char *tableName = ctx->getProperty("tableName", (char *)NULL);
+  if (tableName == NULL) {
+    g_err << "Test property 'tableName' is not set" << endl;
+    return NDBT_FAILED;
+  }
+  NdbDictionary::Dictionary *pDict = pNdb->getDictionary();
+  NdbDictionary::Table tab(tableName);
+  NdbDictionary::Column pkCol("a");
+  pkCol.setType(NdbDictionary::Column::Unsigned);
+  pkCol.setPrimaryKey(true);
+  tab.addColumn(pkCol);
+  NdbDictionary::Column valCol("b");
+  valCol.setType(NdbDictionary::Column::Unsigned);
+  valCol.setNullable(false);
+  tab.addColumn(valCol);
+  if (pDict->createTable(tab) != 0) {
+    g_err << "Failed to create table : " << pDict->getNdbError() << endl;
+    return NDBT_FAILED;
+  }
+  if (!pDict->getTable(tableName)) {
+    g_err << "Failed to get table : " << pDict->getNdbError() << endl;
+    return NDBT_FAILED;
+  }
+  return NDBT_OK;
+}
+
+int runDropTable(NDBT_Context *ctx, NDBT_Step *step) {
+  // Drop the table whose name is given by the "tableName" test property.
+  const char *tableName = ctx->getProperty("tableName", (char *)NULL);
+  NdbDictionary::Dictionary *pDict = GETNDB(step)->getDictionary();
+  if (pDict->dropTable(tableName) == 0) {
+    return NDBT_OK;
+  }
+  g_err << "Failed to drop table : " << pDict->getNdbError() << endl;
+  return NDBT_FAILED;
+}
+
+int runCreateDropIndex(NDBT_Context *ctx, NDBT_Step *step) {
+  // Create and drop ordered index "idx_te" on column "b" until test stops.
+  const char *const indexName = "idx_te";
+  NdbDictionary::Dictionary *pDict = GETNDB(step)->getDictionary();
+  const char *tableName = ctx->getProperty("tableName", (char *)NULL);
+  while (!ctx->isTestStopped()) {
+    NdbDictionary::Index index(indexName);
+    index.setTable(tableName);
+    index.setType(NdbDictionary::Index::OrderedIndex);
+    index.setLogging(false);
+    index.addColumn("b");
+
+    if (pDict->createIndex(index) != 0) {
+      g_err << "Failed to create index : " << pDict->getNdbError() << endl;
+      return NDBT_FAILED;
+    }
+    g_err << "index created" << endl;
+    if (pDict->dropIndex(indexName, tableName) != 0) {
+      g_err << "Failed to drop index : " << pDict->getNdbError() << endl;
+      return NDBT_FAILED;
+    }
+    g_err << "index dropped" << endl;
+  }
+  return NDBT_OK;
+}
+
 Uint32 setAnyValue(Ndb* ndb, NdbTransaction* trans, int rowid, int updVal)
 {
   /* XOR 2 32bit words of transid together */
@@ -7735,6 +7851,33 @@ TESTCASE("DelayedEventDrop",
   STEP(runInsertDeleteUntilStopped);
   FINALIZER(runDropEvent);
 }
+TESTCASE("ExhaustedPreparedPoolsApiOps",
+         "Check that DBUTIL PreparedOperationPool and runningPrepares pool do "
+         "not get Exhausted when N (=1 for now) getEvent operation run in "
+         "parallel") {
+  TC_PROPERTY("ErrorCode", 19001);
+  INITIALIZER(runCreateEvent);
+  INITIALIZER(runInsertError);  // set error insert
+  STEPS(runGetEvent, 1);        // Only 1 parallel getEvent for now, idea is to
+                                // increase the concurrency in the future.
+  FINALIZER(runClearError);     // clear error insert
+  FINALIZER(runDropEvent);
+}
+TESTCASE("ExhaustedPreparedPoolsInternalOps",
+         "Check that when DBUTIL PreparedOperationPool and/or runningPrepares "
+         "get Exhausted due to events, internal operations (generated by "
+         "create table/create index in this case) still succeed") {
+  TC_PROPERTY("tableName", "table_te");
+  INITIALIZER(runCreateEvent);
+  INITIALIZER(runCreateTable);
+  STEPS(runGetEvent,
+        32);  // 32 parallel GetEvent just to ensure that util pools will get
+              // exhausted, 32 is higher than the pool size.
+  STEP(runCreateDropIndex);
+  FINALIZER(runDropTable);
+  FINALIZER(runDropEvent);
+}
+
 #if 0
 TESTCASE("BackwardCompatiblePollCOverflowEB",
          "Check whether backward compatibility of pollEvents  manually"
diff --git a/storage/ndb/test/run-test/daily-devel--07-tests.txt b/storage/ndb/test/run-test/daily-devel--07-tests.txt
@@ -297,3 +297,11 @@ cmd: testNdbApi
 args: -n SetVarbinaryWithSetValue WIDE_2COL
 max-time: 180
 
+cmd: test_event
+args: -n ExhaustedPreparedPoolsApiOps T1 -l 10000
+max-time: 240
+
+cmd: test_event
+args: -n ExhaustedPreparedPoolsInternalOps T1 -l 1000
+max-time: 240
+