|
82 | 82 | file_handle_other_reservation/0,
|
83 | 83 | file_handle_release_reservation/0]).
|
84 | 84 |
|
| 85 | +-export([leader_health_check/2, run_leader_health_check/4]). |
| 86 | + |
85 | 87 | -ifdef(TEST).
|
86 | 88 | -export([filter_promotable/2,
|
87 | 89 | ra_machine_config/1]).
|
|
144 | 146 | -define(SNAPSHOT_INTERVAL, 8192). %% the ra default is 4096
|
145 | 147 | % -define(UNLIMITED_PREFETCH_COUNT, 2000). %% something large for ra
|
146 | 148 | -define(MIN_CHECKPOINT_INTERVAL, 8192). %% the ra default is 16384
|
| 149 | +-define(QQ_LEADER_HEALTH_CHECK_TIMEOUT, 1_000). |
| 150 | +-define(QQ_GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT, 60_000). |
147 | 151 |
|
148 | 152 | %%----------- QQ policies ---------------------------------------------------
|
149 | 153 |
|
@@ -2145,3 +2149,63 @@ file_handle_other_reservation() ->
|
2145 | 2149 | file_handle_release_reservation() ->
|
2146 | 2150 | ok.
|
2147 | 2151 |
|
| 2152 | +leader_health_check(QueueNameOrRegEx, VHost) -> |
| 2153 | + %% Set a process limit threshold to 40% of ErlangVM process limit, beyond which |
| 2154 | + %% we cannot spawn any new processes for executing QQ leader health checks. |
| 2155 | + ProcessLimitThreshold = round(0.4 * erlang:system_info(process_limit)), |
| 2156 | + |
| 2157 | + HealthCheckRef = make_ref(), |
| 2158 | + HealthCheckPids = |
| 2159 | + lists:flatten( |
| 2160 | + [begin |
| 2161 | + {resource, VHost, queue, QueueName} = QResource = amqqueue:get_name(Q), |
| 2162 | + case check_process_limit_safety(ProcessLimitThreshold) of |
| 2163 | + true -> |
| 2164 | + case re:run(QueueName, QueueNameOrRegEx, [{capture, none}]) of |
| 2165 | + match -> |
| 2166 | + {ClusterName, _} = rabbit_amqqueue:pid_of(Q), |
| 2167 | + _Pid = spawn(fun() -> run_leader_health_check(ClusterName, QResource, HealthCheckRef, self()) end); |
| 2168 | + _ -> |
| 2169 | + [] |
| 2170 | + end; |
| 2171 | + false -> |
| 2172 | + [] |
| 2173 | + end |
| 2174 | + end || Q <- rabbit_amqqueue:list(VHost), amqqueue:get_type(Q) == ?MODULE]), |
| 2175 | + wait_for_leader_health_checks(HealthCheckRef, length(HealthCheckPids), []). |
| 2176 | + |
| 2177 | +run_leader_health_check(ClusterName, QResource, HealthCheckRef, From) -> |
| 2178 | + Leader = ra_leaderboard:lookup_leader(ClusterName), |
| 2179 | + case ra_server_proc:ping(Leader, ?QQ_LEADER_HEALTH_CHECK_TIMEOUT) of |
| 2180 | + {pong,leader} -> |
| 2181 | + From ! {ok, HealthCheckRef, QResource}; |
| 2182 | + _ -> |
| 2183 | + From ! {error, HealthCheckRef, QResource} |
| 2184 | + end, |
| 2185 | + ok. |
| 2186 | + |
| 2187 | +wait_for_leader_health_checks(Ref, N, UnhealthyAcc) -> |
| 2188 | + receive |
| 2189 | + {ok, Ref, _QResource} when N == 1 -> |
| 2190 | + UnhealthyAcc; |
| 2191 | + {error, Ref, QResource} when N == 1 -> |
| 2192 | + [cli_format(QResource) | UnhealthyAcc]; |
| 2193 | + {ok, Ref, _QResource} -> |
| 2194 | + wait_for_leader_health_checks(Ref, N - 1, UnhealthyAcc); |
| 2195 | + {error, Ref, QResource} -> |
| 2196 | + wait_for_leader_health_checks(Ref, N - 1, [cli_format(QResource) | UnhealthyAcc]) |
| 2197 | + after |
| 2198 | + ?QQ_GLOBAL_LEADER_HEALTH_CHECK_TIMEOUT -> |
| 2199 | + UnhealthyAcc |
| 2200 | + end. |
| 2201 | + |
| 2202 | +check_process_limit_safety(ProcessLimitThreshold) -> |
| 2203 | + erlang:system_info(process_count) < ProcessLimitThreshold. |
| 2204 | + |
| 2205 | +cli_format(QResource = {resource, VHost, queue, QName}) -> |
| 2206 | + #{ |
| 2207 | + <<"readable_name">> => rabbit_data_coercion:to_binary(rabbit_misc:rs(QResource)), |
| 2208 | + <<"name">> => QName, |
| 2209 | + <<"virtual_host">> => VHost, |
| 2210 | + <<"type">> => <<"quorum">> |
| 2211 | + }. |
0 commit comments