Skip to content

Commit 9102fc5

Browse files
author
Ole John Aske
committed
Bug #25557263 'WAITING MAX 119 SEC FOR DISTRIBUTING'-SCHEMA DISTRIBUTION TIMEOUT
There is a possible race condition between the schema distribution coordinator holding a ref-lock on its schema_object, and the binlog-injector thread (participant-role) possibly being late to unref-lock the same schema_object. This could potentially result in another schema distr operation getting a ref to the not-yet-released schema_object held by the injector thread instead of having to create a new schema_object as normally expected. The SLOCK-bitmap, which keeps track of which participants the coordinator is still waiting for, was set to 'all-1' when created. However, it will be 'all-0' immediately before the injector thread is about the release (unref) it. Thus, if the coordinator managed to 're-get' this schema_object before being released (and destructed) by the injector-thread, we got a schema_object->slocks with 'all-0' instead of 'all-1' as expected. - This caused a total breakdown of the schema distribution protocol. The fix is to move the schema_object->slock 'all-1' setting from the creation of new schema_object() to the place where the schema distr coordinator initate waiting for schema operations to be distributed. This will cover both scenarios where we either had to create a new schema_object, or we reuse an existing not-yet-released schema object.
1 parent 6155309 commit 9102fc5

File tree

4 files changed

+91
-14
lines changed

4 files changed

+91
-14
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1+
Bug#25557263
2+
3+
Another variant of the 'WAITING MAX 119 SEC FOR DISTRIBUTING' timeout
4+
was sometimes provoked with this testcase. This time caused by
5+
a race condition between the schema changing client and the
6+
'coordinator' on the same mysqld.
17
DROP DATABASE IF EXISTS tmp_db;

mysql-test/suite/ndb/t/bug#24926009.test

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,46 @@ while ($1)
4545
dec $1;
4646
}
4747

48+
--echo Bug#25557263
49+
--echo
50+
--echo Another variant of the 'WAITING MAX 119 SEC FOR DISTRIBUTING' timeout
51+
--echo was sometimes provoked with this testcase. This time caused by
52+
--echo a race condition between the schema changing client and the
53+
--echo 'coordinator' on the same mysqld.
54+
55+
--connection server1
56+
set global debug='+d,ndb_binlog_schema_object_race';
57+
--connection server2
58+
set global debug='+d,ndb_binlog_schema_object_race';
59+
--connection server3
60+
set global debug='+d,ndb_binlog_schema_object_race';
61+
--connection server4
62+
set global debug='+d,ndb_binlog_schema_object_race';
63+
64+
let $1=10;
65+
while ($1)
66+
{
67+
--connection server1
68+
send CREATE DATABASE IF NOT EXISTS tmp_db;
69+
--connection server2
70+
send CREATE DATABASE IF NOT EXISTS tmp_db;
71+
--connection server3
72+
send CREATE DATABASE IF NOT EXISTS tmp_db;
73+
--connection server4
74+
send CREATE DATABASE IF NOT EXISTS tmp_db;
75+
76+
--connection server1
77+
reap;
78+
--connection server2
79+
reap;
80+
--connection server3
81+
reap;
82+
--connection server4
83+
reap;
84+
85+
dec $1;
86+
}
87+
4888
--connection server1
4989
set global debug= @save_debug;
5090
--connection server2

sql/ha_ndbcluster_binlog.cc

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006, 2016, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -1820,6 +1820,23 @@ int ndbcluster_log_schema_op(THD *thd,
18201820
char key[FN_REFLEN + 1];
18211821
build_table_filename(key, sizeof(key) - 1, db, table_name, "", 0);
18221822
ndb_schema_object= ndb_get_schema_object(key, true);
1823+
1824+
/**
1825+
* We will either get a newly created schema_object, or a
1826+
* 'all-clear' schema_object completed but still referred
1827+
* by my binlog-injector-thread. In both cases there should
1828+
* be no outstanding SLOCK's.
1829+
* See also the 'ndb_binlog_schema_object_race' error injection.
1830+
*/
1831+
DBUG_ASSERT(bitmap_is_clear_all(&ndb_schema_object->slock_bitmap));
1832+
1833+
/**
1834+
* Expect answer from all other nodes by default(those
1835+
* who are not subscribed will be filtered away by
1836+
* the Coordinator which keep track of such stuff)
1837+
*/
1838+
bitmap_set_all(&ndb_schema_object->slock_bitmap);
1839+
18231840
ndb_schema_object->table_id= ndb_table_id;
18241841
ndb_schema_object->table_version= ndb_table_version;
18251842

@@ -2045,7 +2062,13 @@ int ndbcluster_log_schema_op(THD *thd,
20452062
/*
20462063
Wait for other mysqld's to acknowledge the table operation
20472064
*/
2048-
if (ndb_error == 0 && !bitmap_is_clear_all(&ndb_schema_object->slock_bitmap))
2065+
if (unlikely(ndb_error))
2066+
{
2067+
sql_print_error("NDB %s: distributing %s err: %u",
2068+
type_str, ndb_schema_object->key,
2069+
ndb_error->code);
2070+
}
2071+
else if (!bitmap_is_clear_all(&ndb_schema_object->slock_bitmap))
20492072
{
20502073
int max_timeout= DEFAULT_SYNC_TIMEOUT;
20512074
pthread_mutex_lock(&ndb_schema_object->mutex);
@@ -2096,12 +2119,6 @@ int ndbcluster_log_schema_op(THD *thd,
20962119
}
20972120
pthread_mutex_unlock(&ndb_schema_object->mutex);
20982121
}
2099-
else if (ndb_error)
2100-
{
2101-
sql_print_error("NDB %s: distributing %s err: %u",
2102-
type_str, ndb_schema_object->key,
2103-
ndb_error->code);
2104-
}
21052122
else if (opt_ndb_extra_logging > 19)
21062123
{
21072124
sql_print_information("NDB %s: not waiting for distributing %s",
@@ -3089,6 +3106,22 @@ class Ndb_schema_event_handler {
30893106
pthread_mutex_unlock(&ndb_schema_object->mutex);
30903107
pthread_cond_signal(&ndb_schema_object->cond);
30913108

3109+
/**
3110+
* There is a possible race condition between this binlog-thread,
3111+
* which has not yet released its schema_object, and the
3112+
* coordinator which possibly release its reference
3113+
* to the same schema_object when signaled above.
3114+
*
3115+
* If the coordinator then starts yet another schema operation
3116+
* on the same schema / table, it will need a schema_object with
3117+
* the same key as the one already completed, and which this
3118+
* thread still referrs. Thus, it will get this schema_object,
3119+
* instead of creating a new one as normally expected.
3120+
*/
3121+
DBUG_EXECUTE_IF("ndb_binlog_schema_object_race",
3122+
{
3123+
NdbSleep_MilliSleep(10);
3124+
});
30923125
ndb_free_schema_object(&ndb_schema_object);
30933126
DBUG_VOID_RETURN;
30943127
}

sql/ndb_schema_object.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -82,16 +82,14 @@ NDB_SCHEMA_OBJECT *ndb_get_schema_object(const char *key,
8282
if (my_hash_insert(&ndb_schema_objects.m_hash, (uchar*) ndb_schema_object))
8383
{
8484
my_free(ndb_schema_object);
85+
ndb_schema_object= NULL;
8586
break;
8687
}
8788
pthread_mutex_init(&ndb_schema_object->mutex, MY_MUTEX_INIT_FAST);
8889
pthread_cond_init(&ndb_schema_object->cond, NULL);
8990
bitmap_init(&ndb_schema_object->slock_bitmap, ndb_schema_object->slock,
9091
sizeof(ndb_schema_object->slock)*8, FALSE);
91-
// Expect answer from all other nodes by default(those
92-
// who are not subscribed will be filtered away by
93-
// the Coordinator which keep track of that stuff)
94-
bitmap_set_all(&ndb_schema_object->slock_bitmap);
92+
//slock_bitmap is intially cleared due to 'ZEROFILL-malloc'
9593
break;
9694
}
9795
if (ndb_schema_object)
@@ -118,7 +116,7 @@ ndb_free_schema_object(NDB_SCHEMA_OBJECT **ndb_schema_object)
118116
pthread_cond_destroy(&(*ndb_schema_object)->cond);
119117
pthread_mutex_destroy(&(*ndb_schema_object)->mutex);
120118
my_free(*ndb_schema_object);
121-
*ndb_schema_object= 0;
119+
*ndb_schema_object= NULL;
122120
}
123121
else
124122
{

0 commit comments

Comments
 (0)