Skip to content

Commit 1e39733

Browse files
author
Tobias Christiani
committed
Bug#36574298 Background histogram update warnings flood error log
Several users have experienced the error log being flooded with warnings from background histogram updates. One reported case from a customer saw as many as 300 000 messages per hour. The offending warning concerned the failure to acquire metadata locks on a table. There is a 10-second cooldown on background histogram updates for any given table. Therefore the above flooding of the error log should only occur under special circumstances, for example with DDL operations on a large number of tables. Nevertheless, flooding the error log with messages from background histogram updates is not helpful for the user. This patch throttles messages written to the error log from the background histogram update operation. The rate is set to at most one message per minute. This rate should suffice for the user to identify potential problems with background histogram updates. With this patch we downgrade all error events that happen during the background histogram update from priority level error to priority level warning. The reasoning is twofold: 1) A failure to update statistics is generally ignorable, so a warning is more suitable than an error, and 2) throttling of messages in the error log only works out-of-the-box for error events that have a priority level lower than error. Change-Id: I62d6cd7acf2d0851605eec362de72e57356ef832
1 parent 0d53915 commit 1e39733

7 files changed

+140
-21
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#
2+
# Bug#36574298: Background histogram update warnings flood error log
3+
#
4+
CREATE TABLE t1 (x INT);
5+
ANALYZE TABLE t1 UPDATE HISTOGRAM ON x AUTO UPDATE;
6+
Table Op Msg_type Msg_text
7+
test.t1 histogram status Histogram statistics created for column 'x'.
8+
INSERT INTO t1 VALUES (1), (2), (3);
9+
CREATE TABLE t2 (x INT);
10+
ANALYZE TABLE t2 UPDATE HISTOGRAM ON x AUTO UPDATE;
11+
Table Op Msg_type Msg_text
12+
test.t2 histogram status Histogram statistics created for column 'x'.
13+
INSERT INTO t2 VALUES (1), (2), (3);
14+
DO SLEEP(1);
15+
FLUSH ERROR LOGS;
16+
# With throttling of background histogram update warnings we should only
17+
# see a single warning in the error log even though we have raised
18+
# multiple ER_BACKGROUND_HISTOGRAM_UPDATE warnings.
19+
SELECT COUNT(*) AS should_be_one FROM performance_schema.error_log WHERE ERROR_CODE = 'MY-015116';
20+
should_be_one
21+
1
22+
DROP TABLE t1;
23+
DROP TABLE t2;

mysql-test/suite/sys_vars/r/log_error_suppression_list_basic.result

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ Level Code Message
6767
Warning 3735 log_error_suppression_list: Could not add suppression rule for code "10001". Rule-set may be full, or code may not correspond to an error-log message.
6868
Error 1231 Variable 'log_error_suppression_list' can't be set to the value of '10000 10001'
6969
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 2), "11111");
70+
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",507),"ER_PARSER_TRACE");
7071
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",508),"ER_PARSER_TRACE");
71-
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",509),"ER_PARSER_TRACE");
7272
ERROR 42000: Variable 'log_error_suppression_list' can't be set to the value of 'ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSER_TRACE,ER_PARSE'
73+
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 507), "11111");
7374
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 508), "11111");
74-
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 509), "11111");
7575
ERROR 42000: Variable 'log_error_suppression_list' can't be set to the value of '10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10'
7676
SET @@global.log_error_suppression_list='10000.123';
7777
ERROR 42000: Variable 'log_error_suppression_list' can't be set to the value of '10000.123'

mysql-test/suite/sys_vars/t/log_error_suppression_list_basic.test

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,15 +66,16 @@ SHOW WARNINGS;
6666
SET @@global.log_error_suppression_list="10000 10001";
6767
SHOW WARNINGS;
6868

69-
# too many items
69+
# The statements below verify that we can have at most 512 rules in the log error suppression list.
70+
# This count includes the rules specified in log_builtins_filter_set_defaults().
7071
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 2), "11111");
71-
# 508 repeated + 1 tail + 2 automatic rules == 511 total
72+
# 507 repeated + 1 tail + 4 automatic rules == 512 total. This is OK.
73+
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",507),"ER_PARSER_TRACE");
74+
--error ER_WRONG_VALUE_FOR_VAR
7275
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",508),"ER_PARSER_TRACE");
76+
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 507), "11111");
7377
--error ER_WRONG_VALUE_FOR_VAR
74-
SET @@global.log_error_suppression_list= CONCAT(REPEAT("ER_PARSER_TRACE,",509),"ER_PARSER_TRACE");
7578
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 508), "11111");
76-
--error ER_WRONG_VALUE_FOR_VAR
77-
SET @@global.log_error_suppression_list= CONCAT(REPEAT("10000,", 509), "11111");
7879

7980
# Bug#28145616: PARSER ACCEPTS GARBAGE VALUES IF PREFIXED BY VALID ERROR CODES
8081
--error ER_WRONG_VALUE_FOR_VAR
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
--loose-debug='+d,background_histogram_update_errors'
2+
--force-restart
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
--source include/have_debug.inc
2+
3+
-- echo #
4+
-- echo # Bug#36574298: Background histogram update warnings flood error log
5+
-- echo #
6+
7+
# Note: this test is in a separate file because we want to use
8+
# the loose-debug='+d,background_histogram_update_errors' startup option
9+
# to trigger multiple errors during histogram updates from the background thread.
10+
# If we instead use "SET GLOBAL DEBUG = ..." we run seem to run into some
11+
# flakiness in terms of when this setting is picked up by the background thread.
12+
13+
# Create two tables with automatic histograms and insert values to trigger
14+
# automatic updates from the background thread. This will raise multiple
15+
# ER_BACKGROUND_HISTOGRAM_UPDATE warnings due to the
16+
# loose-debug='+d,background_histogram_update_errors' startup option.
17+
CREATE TABLE t1 (x INT);
18+
ANALYZE TABLE t1 UPDATE HISTOGRAM ON x AUTO UPDATE;
19+
INSERT INTO t1 VALUES (1), (2), (3);
20+
21+
CREATE TABLE t2 (x INT);
22+
ANALYZE TABLE t2 UPDATE HISTOGRAM ON x AUTO UPDATE;
23+
INSERT INTO t2 VALUES (1), (2), (3);
24+
25+
# Wait for a background histogram update error to show up in the error log.
26+
let $wait_condition = SELECT COUNT(*) > 0 FROM performance_schema.error_log WHERE ERROR_CODE = 'MY-015116';
27+
let $wait_timeout = 15;
28+
--source include/wait_condition.inc
29+
30+
# The wait_condition only waits for at least one error to show up.
31+
# We give the system an additional second to synchronize and flush the error logs.
32+
# This should ensure that all of the error messages have been written to the log.
33+
# It would be preferable to use DEBUG_SYNC for this, but it does not seem straightforward
34+
# to get it to work with a background thread.
35+
DO SLEEP(1);
36+
FLUSH ERROR LOGS;
37+
38+
--echo # With throttling of background histogram update warnings we should only
39+
--echo # see a single warning in the error log even though we have raised
40+
--echo # multiple ER_BACKGROUND_HISTOGRAM_UPDATE warnings.
41+
SELECT COUNT(*) AS should_be_one FROM performance_schema.error_log WHERE ERROR_CODE = 'MY-015116';
42+
43+
DROP TABLE t1;
44+
DROP TABLE t2;

sql/histograms/histogram.cc

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,15 +1713,21 @@ static void cleanup_session_context(THD *thd) {
17131713

17141714
/**
17151715
Custom error handling for histogram updates from the background thread.
1716-
Downgrades MDL timeout errors to warnings.
1716+
Downgrades all errors to warnings. This is done for two reasons:
1717+
1718+
1) Because errors during background histogram updates are mostly ignorable,
1719+
i.e., it is not critical that the user does something if histogram
1720+
statistics fail to be updated.
1721+
1722+
2) We wish to throttle error log entries from background histogram updates,
1723+
and this is currently only supported for a priority level of warnings.
17171724
*/
17181725
class Background_error_handler : public Internal_error_handler {
17191726
public:
1720-
bool handle_condition(THD *, uint sql_errno, const char *,
1727+
bool handle_condition(THD *, uint, const char *,
17211728
Sql_condition::enum_severity_level *level,
17221729
const char *) override {
1723-
if (sql_errno == ER_LOCK_WAIT_TIMEOUT &&
1724-
*level == Sql_condition::SL_ERROR) {
1730+
if (*level == Sql_condition::SL_ERROR) {
17251731
*level = Sql_condition::SL_WARNING;
17261732
}
17271733
return false;
@@ -1757,7 +1763,6 @@ bool auto_update_table_histograms_from_background_thread(
17571763
thd->mdl_context.release_transactional_locks();
17581764
write_diagnostics_area_to_error_log(thd, db_name, table_name);
17591765
cleanup_session_context(thd);
1760-
DEBUG_SYNC(thd, "background_histogram_update_done");
17611766
});
17621767

17631768
// It is crucial that we release objects from the dictionary cache _before_
@@ -1776,6 +1781,20 @@ bool auto_update_table_histograms_from_background_thread(
17761781
auto error_handler_guard =
17771782
create_scope_guard([&]() { thd->pop_internal_handler(); });
17781783

1784+
// Simulate multiple errors to test error log throttling. We should only see
1785+
// the first error.
1786+
DBUG_EXECUTE_IF("background_histogram_update_errors", {
1787+
my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
1788+
my_error(ER_UNABLE_TO_BUILD_HISTOGRAM, MYF(0), "field", "schema", "table");
1789+
my_error(ER_UNABLE_TO_UPDATE_COLUMN_STATISTICS, MYF(0), "field", "schema",
1790+
"table");
1791+
my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
1792+
my_error(ER_UNABLE_TO_BUILD_HISTOGRAM, MYF(0), "field", "schema", "table");
1793+
my_error(ER_UNABLE_TO_UPDATE_COLUMN_STATISTICS, MYF(0), "field", "schema",
1794+
"table");
1795+
return true;
1796+
});
1797+
17791798
// Lock the table metadata so we can check whether the table has any
17801799
// automatically updated histograms. We get the column names from the table
17811800
// definition in order to avoid opening the table until we know that it will

sql/server_component/log_builtins_filter.cc

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@
9292

9393
static bool filter_inited = false;
9494
static ulong filter_rule_uuid = 0;
95+
// The number of built-in rules in the log_filter_builtin_rules ruleset. This
96+
// counter should be incremented for each built-in rule we add in
97+
// log_builtins_filter_set_defaults().
98+
static uint32 builtin_count = 0;
9599

96100
log_filter_ruleset *log_filter_builtin_rules = nullptr;
97101
log_filter_tag rule_tag_builtin = {"log_filter_builtin", nullptr};
@@ -289,11 +293,12 @@ static void log_builtins_filter_ruleset_free(log_filter_ruleset **ruleset) {
289293
}
290294

291295
/**
292-
Defaults for when the configuration engine isn't loaded;
293-
aim for 5.7 compatibility.
296+
Defaults for when the configuration engine isn't loaded; aim for 5.7
297+
compatibility.
294298
*/
295299
static void log_builtins_filter_set_defaults(log_filter_ruleset *ruleset) {
296300
log_filter_rule *r;
301+
builtin_count = 0;
297302

298303
assert(ruleset != nullptr);
299304

@@ -308,6 +313,7 @@ static void log_builtins_filter_set_defaults(log_filter_ruleset *ruleset) {
308313
r->verb = LOG_FILTER_RETURN;
309314

310315
ruleset->count++;
316+
builtin_count++;
311317

312318
// sys_var: log_error_verbosity
313319
r = log_builtins_filter_rule_init(ruleset);
@@ -318,6 +324,7 @@ static void log_builtins_filter_set_defaults(log_filter_ruleset *ruleset) {
318324
r->verb = LOG_FILTER_DROP;
319325

320326
ruleset->count++;
327+
builtin_count++;
321328

322329
// example: remove all source-line log items
323330
// these are not desirable by default, only while debugging.
@@ -328,6 +335,29 @@ static void log_builtins_filter_set_defaults(log_filter_ruleset *ruleset) {
328335
// aux optional
329336

330337
ruleset->count++;
338+
builtin_count++;
339+
340+
// Throttle background histogram update errors.
341+
342+
// Initialize a new rule.
343+
r = log_builtins_filter_rule_init(ruleset);
344+
345+
// Condition/comparator: equal.
346+
r->cond = LOG_FILTER_COND_EQ;
347+
348+
// Match information: MySQL error code.
349+
log_item_set(&r->match, LOG_ITEM_SQL_ERRCODE)->data_integer =
350+
ER_BACKGROUND_HISTOGRAM_UPDATE;
351+
352+
// Action/verb: throttle (rate-limit).
353+
r->verb = LOG_FILTER_THROTTLE;
354+
355+
// Auxiliary information: maximum number of messages per minute.
356+
log_item_set(&r->aux, LOG_ITEM_GEN_INTEGER)->data_integer = 1;
357+
358+
// Rule complete, be counted.
359+
ruleset->count++;
360+
builtin_count++;
331361
}
332362

333363
/**
@@ -970,15 +1000,15 @@ int log_builtins_filter_parse_suppression_list(char *list, bool update) {
9701000
}
9711001

9721002
/*
973-
During check-phase, make sure the requested number of error-codes
974-
(and therefore, the requested number of DROP rules) will fit into
975-
the built-in rule-set. Reserve one rule for --log-error-verbosity
976-
and one for our "ERROR and SYSTEM always pass" shortcut.
977-
Without this check, we'd still catch the (attempted) overflow during
978-
assignment, but if we do it during the check phase, we protect the
1003+
During check-phase, make sure the requested number of error-codes (and
1004+
therefore, the requested number of DROP rules) will fit into the built-in
1005+
rule-set. Without this check, we'd still catch the (attempted) overflow
1006+
during assignment, but if we do it during the check phase, we protect the
9791007
integrity of both the current rule-set and the variable's value.
9801008
*/
981-
else if (++list_len >= (log_filter_builtin_rules->alloc - 2))
1009+
else if (uint32_t max_user_rules_in_list =
1010+
log_filter_builtin_rules->alloc - builtin_count;
1011+
++list_len > max_user_rules_in_list)
9821012
goto fail;
9831013

9841014
start = end;

0 commit comments

Comments
 (0)