Skip to content

Commit 3581ed5

Browse files
Merge pull request #13622 from rabbitmq/mergify/bp/v4.0.x/pr-13621
QQ: Revise checkpointing logic to take more frequent checkpoints for large message workloads (backport #13587) (backport #13621)
2 parents f820888 + aa0ba5f commit 3581ed5

File tree

4 files changed

+59
-26
lines changed

4 files changed

+59
-26
lines changed

deps/rabbit/src/rabbit_fifo.erl

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -929,7 +929,7 @@ which_module(5) -> ?MODULE.
929929
smallest_index :: undefined | ra:index(),
930930
messages_total :: non_neg_integer(),
931931
indexes = ?CHECK_MIN_INDEXES :: non_neg_integer(),
932-
unused_1 = ?NIL}).
932+
bytes_in = 0 :: non_neg_integer()}).
933933
-record(aux_gc, {last_raft_idx = 0 :: ra:index()}).
934934
-record(aux, {name :: atom(),
935935
capacity :: term(),
@@ -940,7 +940,9 @@ which_module(5) -> ?MODULE.
940940
gc = #aux_gc{} :: #aux_gc{},
941941
tick_pid :: undefined | pid(),
942942
cache = #{} :: map(),
943-
last_checkpoint :: #checkpoint{}}).
943+
last_checkpoint :: #checkpoint{},
944+
bytes_in = 0 :: non_neg_integer(),
945+
bytes_out = 0 :: non_neg_integer()}).
944946

945947
init_aux(Name) when is_atom(Name) ->
946948
%% TODO: catch specific exception throw if table already exists
@@ -953,7 +955,7 @@ init_aux(Name) when is_atom(Name) ->
953955
last_checkpoint = #checkpoint{index = 0,
954956
timestamp = erlang:system_time(millisecond),
955957
messages_total = 0,
956-
unused_1 = ?NIL}}.
958+
bytes_in = 0}}.
957959

958960
handle_aux(RaftState, Tag, Cmd, #aux{name = Name,
959961
capacity = Cap,
@@ -970,13 +972,14 @@ handle_aux(RaftState, Tag, Cmd, AuxV2, RaAux)
970972
handle_aux(RaftState, Tag, Cmd, AuxV3, RaAux);
971973
handle_aux(leader, cast, eval,
972974
#?AUX{last_decorators_state = LastDec,
975+
bytes_in = BytesIn,
973976
last_checkpoint = Check0} = Aux0,
974977
RaAux) ->
975978
#?STATE{cfg = #cfg{resource = QName}} = MacState =
976979
ra_aux:machine_state(RaAux),
977980

978981
Ts = erlang:system_time(millisecond),
979-
{Check, Effects0} = do_checkpoints(Ts, Check0, RaAux, false),
982+
{Check, Effects0} = do_checkpoints(Ts, Check0, RaAux, BytesIn, false),
980983

981984
%% this is called after each batch of commands have been applied
982985
%% set timer for message expire
@@ -992,11 +995,16 @@ handle_aux(leader, cast, eval,
992995
last_decorators_state = NewLast}, RaAux, Effects}
993996
end;
994997
handle_aux(_RaftState, cast, eval,
995-
#?AUX{last_checkpoint = Check0} = Aux0,
998+
#?AUX{last_checkpoint = Check0,
999+
bytes_in = BytesIn} = Aux0,
9961000
RaAux) ->
9971001
Ts = erlang:system_time(millisecond),
998-
{Check, Effects} = do_checkpoints(Ts, Check0, RaAux, false),
1002+
{Check, Effects} = do_checkpoints(Ts, Check0, RaAux, BytesIn, false),
9991003
{no_reply, Aux0#?AUX{last_checkpoint = Check}, RaAux, Effects};
1004+
handle_aux(_RaftState, cast, {bytes_in, {MetaSize, BodySize}},
1005+
#?AUX{bytes_in = Bytes} = Aux0,
1006+
RaAux) ->
1007+
{no_reply, Aux0#?AUX{bytes_in = Bytes + MetaSize + BodySize}, RaAux, []};
10001008
handle_aux(_RaftState, cast, {#return{msg_ids = MsgIds,
10011009
consumer_key = Key} = Ret, Corr, Pid},
10021010
Aux0, RaAux0) ->
@@ -1126,12 +1134,13 @@ handle_aux(_RaState, {call, _From}, {peek, Pos}, Aux0,
11261134
handle_aux(_, _, garbage_collection, Aux, RaAux) ->
11271135
{no_reply, force_eval_gc(RaAux, Aux), RaAux};
11281136
handle_aux(_RaState, _, force_checkpoint,
1129-
#?AUX{last_checkpoint = Check0} = Aux, RaAux) ->
1137+
#?AUX{last_checkpoint = Check0,
1138+
bytes_in = BytesIn} = Aux, RaAux) ->
11301139
Ts = erlang:system_time(millisecond),
11311140
#?STATE{cfg = #cfg{resource = QR}} = ra_aux:machine_state(RaAux),
11321141
rabbit_log:debug("~ts: rabbit_fifo: forcing checkpoint at ~b",
11331142
[rabbit_misc:rs(QR), ra_aux:last_applied(RaAux)]),
1134-
{Check, Effects} = do_checkpoints(Ts, Check0, RaAux, true),
1143+
{Check, Effects} = do_checkpoints(Ts, Check0, RaAux, BytesIn, true),
11351144
{no_reply, Aux#?AUX{last_checkpoint = Check}, RaAux, Effects};
11361145
handle_aux(RaState, _, {dlx, _} = Cmd, Aux0, RaAux) ->
11371146
#?STATE{dlx = DlxState,
@@ -1575,7 +1584,9 @@ maybe_return_all(#{system_time := Ts} = Meta, ConsumerKey,
15751584
apply_enqueue(#{index := RaftIdx,
15761585
system_time := Ts} = Meta, From,
15771586
Seq, RawMsg, Size, State0) ->
1578-
case maybe_enqueue(RaftIdx, Ts, From, Seq, RawMsg, Size, [], State0) of
1587+
Effects0 = [{aux, {bytes_in, Size}}],
1588+
case maybe_enqueue(RaftIdx, Ts, From, Seq, RawMsg, Size,
1589+
Effects0, State0) of
15791590
{ok, State1, Effects1} ->
15801591
checkout(Meta, State0, State1, Effects1);
15811592
{out_of_sequence, State, Effects} ->
@@ -2918,11 +2929,12 @@ priority_tag(Msg) ->
29182929
end.
29192930

29202931

2921-
do_checkpoints(Ts,
2922-
#checkpoint{index = ChIdx,
2923-
timestamp = ChTime,
2924-
smallest_index = LastSmallest,
2925-
indexes = MinIndexes} = Check0, RaAux, Force) ->
2932+
do_checkpoints(Ts, #checkpoint{index = ChIdx,
2933+
timestamp = ChTime,
2934+
smallest_index = LastSmallest,
2935+
bytes_in = LastBytesIn,
2936+
indexes = MinIndexes} = Check0,
2937+
RaAux, BytesIn, Force) ->
29262938
LastAppliedIdx = ra_aux:last_applied(RaAux),
29272939
IndexesSince = LastAppliedIdx - ChIdx,
29282940
#?STATE{} = MacState = ra_aux:machine_state(RaAux),
@@ -2934,21 +2946,35 @@ do_checkpoints(Ts,
29342946
Smallest
29352947
end,
29362948
MsgsTot = messages_total(MacState),
2949+
%% more than ?CHECK_MAX_BYTES (128MB by default) of message data has been written to the log
2950+
%% best take a checkpoint
2951+
29372952
{CheckMinInterval, CheckMinIndexes, CheckMaxIndexes} =
29382953
persistent_term:get(quorum_queue_checkpoint_config,
29392954
{?CHECK_MIN_INTERVAL_MS, ?CHECK_MIN_INDEXES,
29402955
?CHECK_MAX_INDEXES}),
2956+
2957+
%% scale the bytes limit as the backlog increases
2958+
MaxBytesFactor = max(1, MsgsTot / CheckMaxIndexes),
2959+
EnoughDataWritten = BytesIn - LastBytesIn > (?CHECK_MAX_BYTES * MaxBytesFactor),
29412960
EnoughTimeHasPassed = TimeSince > CheckMinInterval,
29422961

2943-
%% enough time has passed and enough indexes have been committed
2944-
case (IndexesSince > MinIndexes andalso
2945-
EnoughTimeHasPassed) orelse
2946-
%% the queue is empty and some commands have been
2947-
%% applied since the last checkpoint
2948-
(MsgsTot == 0 andalso
2949-
IndexesSince > CheckMinIndexes andalso
2950-
EnoughTimeHasPassed) orelse
2951-
Force of
2962+
case (EnoughTimeHasPassed andalso
2963+
(
2964+
%% condition 1: enough indexes have been committed since the last
2965+
%% checkpoint
2966+
(IndexesSince > MinIndexes) orelse
2967+
%% condition 2: the queue is empty and _some_ commands
2968+
%% have been applied since the last checkpoint
2969+
(MsgsTot == 0 andalso IndexesSince > 32)
2970+
)
2971+
) orelse
2972+
%% condition 3: enough message data has been written to warrant a new
2973+
%% checkpoint, this ignores the time windowing
2974+
EnoughDataWritten orelse
2975+
%% force was requested, e.g. after a purge
2976+
Force
2977+
of
29522978
true ->
29532979
%% take fewer checkpoints the more messages there are on queue
29542980
NextIndexes = min(max(MsgsTot, CheckMinIndexes), CheckMaxIndexes),
@@ -2957,6 +2983,7 @@ do_checkpoints(Ts,
29572983
timestamp = Ts,
29582984
smallest_index = NewSmallest,
29592985
messages_total = MsgsTot,
2986+
bytes_in = BytesIn,
29602987
indexes = NextIndexes},
29612988
[{checkpoint, LastAppliedIdx, MacState} |
29622989
release_cursor(LastSmallest, NewSmallest)]};

deps/rabbit/src/rabbit_fifo.hrl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,11 @@
100100
% represents a partially applied module call
101101

102102
-define(CHECK_MIN_INTERVAL_MS, 1000).
103-
-define(CHECK_MIN_INDEXES, 4096).
103+
-define(CHECK_MIN_INDEXES, 4096 * 2).
104104
-define(CHECK_MAX_INDEXES, 666_667).
105+
%% once this many bytes have been written since the last checkpoint
106+
%% we request a checkpoint regardless of how much time has passed
107+
-define(CHECK_MAX_BYTES, 128_000_000).
105108

106109
-define(USE_AVG_HALF_LIFE, 10000.0).
107110
%% an average QQ without any message uses about 100KB so setting this limit

deps/rabbit/src/rabbit_quorum_queue.erl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,9 @@
145145
-define(DELETE_TIMEOUT, 5000).
146146
-define(MEMBER_CHANGE_TIMEOUT, 20_000).
147147
-define(SNAPSHOT_INTERVAL, 8192). %% the ra default is 4096
148-
% -define(UNLIMITED_PREFETCH_COUNT, 2000). %% something large for ra
149-
-define(MIN_CHECKPOINT_INTERVAL, 8192). %% the ra default is 16384
148+
%% setting a low default here to allow quorum queues to better choose for themselves
149+
%% when to take a checkpoint
150+
-define(MIN_CHECKPOINT_INTERVAL, 64).
150151
-define(LEADER_HEALTH_CHECK_TIMEOUT, 5_000).
151152
-define(GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT, 60_000).
152153

deps/rabbit/test/quorum_queue_SUITE.erl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,8 @@ gh_12635(Config) ->
14881488
publish_confirm(Ch0, QQ),
14891489
publish_confirm(Ch0, QQ),
14901490

1491+
%% a QQ will not take checkpoints more frequently than every 1s
1492+
timer:sleep(1000),
14911493
%% force a checkpoint on leader
14921494
ok = rpc:call(Server0, ra, cast_aux_command, [{RaName, Server0}, force_checkpoint]),
14931495
rabbit_ct_helpers:await_condition(

0 commit comments

Comments
 (0)