Skip to content

Commit d5c1264

Browse files
committed
Bug#35115601 InnoDB - Page [...] still fixed or dirty crash during shutdown
Issue: During shutdown, after all internal threads are closed; it is expected that there are no pages in the buffer pool which are dirty or buffer fixed. During concurrent DDL and DML; the DDL will fail when the online log grows too big. When the DDL exits, it leaves some pages in a buffer fixed state. These pages remain buffer fixed in the buffer pool even after all internal threads are closed. Fix: Error handling in the DDL must ensure that all pages buffer fixed by the DDL must be unfixed before the DDL exits. Background: During the DDL, the tree is built using builders - one for each index. Each builder uses a set of page loaders to handle internal page operations. The builders and page loaders will buffer fix the pages they use, to ensure that these pages are not evicted before the builder completes. This is achieved via the functions latch() and release(). The Page_load::release function will increase the buf_fix_count which is later decreased in the Page_load::latch function. Other page buffer fixing is tracked and released using MTR's mtr_memo_stack. In Builder::insert_direct, the pages at level 0 needs to be buffer fixed until the next call to insert_direct. This is done by the m_btr_load->release call before exit from insert_direct. In the next call to insert_direct, the m_btr_load->latch will "undo" the previous release call. The very first latch call will simply return as m_n_recs will be 0. So the order of latch and release will be: insert_direct(latch[], release[p0]), insert_direct(latch[p0], release[p1]), insert_direct(latch[p1], release[p2]) ... where p0, p1, p2.. are pages required for level 0. This order will continue until cursor.eof() returns true; after which we will do: insert_direct(latch[p(n-1)], release[pn]), insert_direct(latch[pn], m_btr_load->finish(DB_SUCCESS)); Hence, when the Builder::insert_direct or the Builder::add_row is handling an error in the online build log, it must ensure to call m_btr_load->latch to buffer unfix page at level 0 and it must call m_btr_load->finish to handle the last pages of each page loader. Change-Id: I956bef46b0d25fbad3b6eb92b197fe7023faf191
1 parent 94148af commit d5c1264

File tree

8 files changed

+175
-7
lines changed

8 files changed

+175
-7
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Bug scenario:
2+
CREATE TABLE t1 (c1 INT);
3+
# Create big enough table to ensure another call to Builder::insert_direct
4+
SELECT COUNT(*) FROM t1;
5+
COUNT(*)
6+
10001
7+
# Builder::insert_direct returns DB_ONLINE_LOG_TOO_BIG
8+
SET DEBUG="+d,builder_insert_direct_trigger_error";
9+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
10+
ERROR HY000: Creating index 'GEN_CLUST_INDEX' required more than 'innodb_online_alter_log_max_size' bytes of modification log. Please try again.
11+
SET DEBUG="-d,builder_insert_direct_trigger_error";
12+
# Pages still buffer fixed should assert during shutdown
13+
# restart
14+
# Builder::add_row returns DB_ONLINE_LOG_TOO_BIG
15+
SET DEBUG="+d,builder_add_row_trigger_error";
16+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
17+
ERROR HY000: Creating index 'GEN_CLUST_INDEX' required more than 'innodb_online_alter_log_max_size' bytes of modification log. Please try again.
18+
SET DEBUG="-d,builder_add_row_trigger_error";
19+
# Pages still buffer fixed should assert during shutdown
20+
# restart
21+
# Cleanup
22+
DROP TABLE t1;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Bug scenario:
2+
CREATE TABLE t1 (c1 INT);
3+
# Create big enough table to ensure another call to Builder::insert_direct
4+
SELECT COUNT(*) FROM t1;
5+
COUNT(*)
6+
10001
7+
SET DEBUG="+d,builder_insert_direct_no_builder";
8+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
9+
ERROR HY000: Got error 11 - 'InnoDB error' from storage engine
10+
DROP TABLE t1;
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
--source include/have_debug.inc
2+
# Bug scenario:
3+
# When DDL is on-going and hits an error in online build (for example when online log is too big); it must rollback
4+
# The error handling code has missed a call to m_btr_load->finish to cleanup the buffer fixed pages at each level
5+
6+
--echo # Bug scenario:
7+
CREATE TABLE t1 (c1 INT);
8+
9+
--echo # Create big enough table to ensure another call to Builder::insert_direct
10+
--disable_query_log
11+
DELIMITER |;
12+
CREATE PROCEDURE populate_t1(IN BASE INT, IN SIZE INT)
13+
BEGIN
14+
DECLARE i INT DEFAULT BASE;
15+
WHILE (i <= SIZE) DO
16+
INSERT INTO t1 values (i);
17+
SET i = i + 1;
18+
END WHILE;
19+
END|
20+
DELIMITER ;|
21+
22+
CALL populate_t1(0, 10000);
23+
DROP PROCEDURE populate_t1;
24+
--enable_query_log
25+
26+
SELECT COUNT(*) FROM t1;
27+
28+
--echo # Builder::insert_direct returns DB_ONLINE_LOG_TOO_BIG
29+
SET DEBUG="+d,builder_insert_direct_trigger_error";
30+
--error ER_INNODB_ONLINE_LOG_TOO_BIG
31+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
32+
SET DEBUG="-d,builder_insert_direct_trigger_error";
33+
34+
--echo # Pages still buffer fixed should assert during shutdown
35+
--source include/shutdown_mysqld.inc
36+
--source include/start_mysqld.inc
37+
38+
--echo # Builder::add_row returns DB_ONLINE_LOG_TOO_BIG
39+
SET DEBUG="+d,builder_add_row_trigger_error";
40+
--error ER_INNODB_ONLINE_LOG_TOO_BIG
41+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
42+
SET DEBUG="-d,builder_add_row_trigger_error";
43+
44+
--echo # Pages still buffer fixed should assert during shutdown
45+
--source include/shutdown_mysqld.inc
46+
--source include/start_mysqld.inc
47+
48+
--echo # Cleanup
49+
DROP TABLE t1;
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
--source include/have_debug.inc
2+
# Test error handling in case m_btr_load was found nullptr
3+
4+
--echo # Bug scenario:
5+
CREATE TABLE t1 (c1 INT);
6+
7+
--echo # Create big enough table to ensure another call to Builder::insert_direct
8+
--disable_query_log
9+
call mtr.add_suppression("\\[ERROR\\] .* DDL Failed as Builder is already freed");
10+
DELIMITER |;
11+
CREATE PROCEDURE populate_t1(IN BASE INT, IN SIZE INT)
12+
BEGIN
13+
DECLARE i INT DEFAULT BASE;
14+
WHILE (i <= SIZE) DO
15+
INSERT INTO t1 values (i);
16+
SET i = i + 1;
17+
END WHILE;
18+
END|
19+
DELIMITER ;|
20+
21+
CALL populate_t1(0, 10000);
22+
DROP PROCEDURE populate_t1;
23+
--enable_query_log
24+
25+
SELECT COUNT(*) FROM t1;
26+
27+
SET DEBUG="+d,builder_insert_direct_no_builder";
28+
--error ER_GET_ERRNO
29+
ALTER TABLE t1 ADD COLUMN c2 INT DEFAULT 20, ALGORITHM=INPLACE;
30+
31+
DROP TABLE t1;

share/messages_to_error_log.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12279,6 +12279,9 @@ ER_PERFORMANCE_SCHEMA_VERSION_CHANGE
1227912279
ER_WARN_DEPRECATED_OR_BLOCKED_CIPHER
1228012280
eng "Value for option '%s' contains cipher '%s' that is either blocked or deprecated (and will be removed in a future release). Please refer to the documentation for more details."
1228112281

12282+
ER_IB_MSG_DDL_FAIL_NO_BUILDER
12283+
eng "DDL Failed as Builder is already freed"
12284+
1228212285
# DO NOT add server-to-client messages here;
1228312286
# they go in messages_to_clients.txt
1228412287
# in the same directory as this file.

storage/innobase/btr/btr0load.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,14 @@ class Page_load : private ut::Non_copyable {
7878
m_page_no(page_no),
7979
m_level(level),
8080
m_is_comp(dict_table_is_comp(index->table)),
81-
m_flush_observer(observer) {
81+
m_flush_observer(observer),
82+
m_n_blocks_buf_fixed(0) {
8283
ut_ad(!dict_index_is_spatial(m_index));
8384
}
8485

8586
/** Destructor */
8687
~Page_load() noexcept {
88+
ut_a(m_n_blocks_buf_fixed == 0);
8789
if (m_heap != nullptr) {
8890
/* mtr is allocated using heap. */
8991
if (m_mtr != nullptr) {
@@ -285,6 +287,9 @@ class Page_load : private ut::Non_copyable {
285287
/** Page modified flag. */
286288
bool m_modified{};
287289

290+
/** Number of blocks which are buffer fixed but not pushed to mtr memo */
291+
int32_t m_n_blocks_buf_fixed{};
292+
288293
friend class Btree_load;
289294
};
290295

@@ -810,6 +815,7 @@ void Page_load::release() noexcept {
810815

811816
/* We fix the block because we will re-pin it soon. */
812817
buf_block_buf_fix_inc(m_block, UT_LOCATION_HERE);
818+
m_n_blocks_buf_fixed++;
813819

814820
m_modify_clock = m_block->get_modify_clock(IF_DEBUG(true));
815821

@@ -851,6 +857,8 @@ void Page_load::latch() noexcept {
851857
}
852858

853859
buf_block_buf_fix_dec(m_block);
860+
m_n_blocks_buf_fixed--;
861+
ut_a(m_n_blocks_buf_fixed >= 0);
854862

855863
/* The caller is going to use the m_block, so it needs to be buffer-fixed
856864
even after the decrement above. This works like this:

storage/innobase/ddl/ddl0builder.cc

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1217,6 +1217,23 @@ dberr_t Builder::key_buffer_sort(size_t thread_id) noexcept {
12171217
return DB_SUCCESS;
12181218
}
12191219

1220+
dberr_t Builder::online_build_handle_error(dberr_t err) noexcept {
1221+
set_error(err);
1222+
1223+
if (m_btr_load != nullptr) {
1224+
/* page_loaders[0] has increased buf_fix_count through release(). This is
1225+
decremented by calling latch(). Similar release() calls for page_loaders at
1226+
non-zero levels are handled in finish() */
1227+
m_btr_load->latch();
1228+
err = m_btr_load->finish(err);
1229+
1230+
ut::delete_(m_btr_load);
1231+
m_btr_load = nullptr;
1232+
}
1233+
1234+
return get_error();
1235+
}
1236+
12201237
dberr_t Builder::insert_direct(Cursor &cursor, size_t thread_id) noexcept {
12211238
ut_a(m_id == 0);
12221239
ut_ad(is_skip_file_sort());
@@ -1229,15 +1246,28 @@ dberr_t Builder::insert_direct(Cursor &cursor, size_t thread_id) noexcept {
12291246
{
12301247
auto err = m_ctx.check_state_of_online_build_log();
12311248

1249+
DBUG_EXECUTE_IF("builder_insert_direct_trigger_error", {
1250+
static int count = 0;
1251+
++count;
1252+
if (count > 1) {
1253+
err = DB_ONLINE_LOG_TOO_BIG;
1254+
m_ctx.m_trx->error_key_num = SERVER_CLUSTER_INDEX_ID;
1255+
}
1256+
});
1257+
12321258
if (err != DB_SUCCESS) {
1233-
set_error(err);
1234-
err = m_btr_load->finish(err);
1235-
ut::delete_(m_btr_load);
1236-
m_btr_load = nullptr;
1237-
return get_error();
1259+
return online_build_handle_error(err);
12381260
}
12391261
}
12401262

1263+
DBUG_EXECUTE_IF("builder_insert_direct_no_builder",
1264+
{ static_cast<void>(online_build_handle_error(DB_ERROR)); });
1265+
1266+
if (m_btr_load == nullptr) {
1267+
ib::error(ER_IB_MSG_DDL_FAIL_NO_BUILDER);
1268+
return DB_ERROR;
1269+
}
1270+
12411271
m_btr_load->latch();
12421272

12431273
auto thread_ctx = m_thread_ctxs[thread_id];
@@ -1544,8 +1574,13 @@ dberr_t Builder::add_row(Cursor &cursor, Row &row, size_t thread_id,
15441574
Latch_release &&latch_release) noexcept {
15451575
auto err = m_ctx.check_state_of_online_build_log();
15461576

1577+
DBUG_EXECUTE_IF("builder_add_row_trigger_error", {
1578+
err = DB_ONLINE_LOG_TOO_BIG;
1579+
m_ctx.m_trx->error_key_num = SERVER_CLUSTER_INDEX_ID;
1580+
});
1581+
15471582
if (err != DB_SUCCESS) {
1548-
set_error(err);
1583+
err = online_build_handle_error(err);
15491584
} else if (is_spatial_index()) {
15501585
if (!cursor.eof()) {
15511586
err = batch_add_row(row, thread_id);

storage/innobase/include/ddl0impl-builder.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,16 @@ struct Builder {
395395
[[nodiscard]] dberr_t check_duplicates(Thread_ctxs &dupcheck,
396396
Dup *dup) noexcept;
397397

398+
/** Cleanup DDL after error in online build
399+
Note: To be called if DDL must cleanup due to error in online build. Pages
400+
which are buffer-fixed (in Page_load::release) until the next iteration, must
401+
be unfixed (with Page_load::latch) before returning the error.
402+
@note: Assumes that either m_btr_load->release is called before or
403+
m_n_recs is 0 (no records are inserted yet).
404+
@param[in] err Error hit in online build
405+
@return the cursor error status. */
406+
[[nodiscard]] dberr_t online_build_handle_error(dberr_t err) noexcept;
407+
398408
private:
399409
/** Buffer ID. */
400410
size_t m_id{};

0 commit comments

Comments
 (0)