Khepri: use updated policy on rabbit_amqqueue_process

dcorbacho · dcorbacho · commit 7f16996123c4 · 2023-04-14T14:58:44.000+02:00
Similar to quorum queues, use the new record to update the policies in the internal
gen_server state. When using Khepri, the projections can hold stale data
until an update has been applied on all nodes. Test cases in `confirms_rejects_SUITE` show
this race condition in `rabbit_amqqueue_process` when updating policies.

This change uses the Khepri feature flag to hide the API change on `rabbit_amqqueue_process`
when handling the `policy_changed` notification.
diff --git a/deps/rabbit/src/rabbit_amqqueue_process.erl b/deps/rabbit/src/rabbit_amqqueue_process.erl
@@ -1684,6 +1684,30 @@ handle_cast(policy_changed, State = #q{q = Q0}) ->
     {ok, Q} = rabbit_amqqueue:lookup(Name),
     noreply(process_args_policy(State#q{q = Q}));
 
+handle_cast({policy_changed, Q0}, State) ->
+    Name = amqqueue:get_name(Q0),
+    PolicyVersion0 = amqqueue:get_policy_version(Q0),
+    %% We depend on the #q.q field being up to date at least WRT
+    %% policy (but not mirror pids) in various places, so when it
+    %% changes we look the queue record up again. The lookup may
+    %% return a record older than Q0, hence the version check below.
+    %% This also has the side effect of waking us up so we emit a
+    %% stats event - so event consumers see the changed policy.
+    {ok, Q} = rabbit_amqqueue:lookup(Name),
+    PolicyVersion = amqqueue:get_policy_version(Q),
+    case PolicyVersion >= PolicyVersion0 of
+        true ->
+            noreply(process_args_policy(State#q{q = Q}));
+        false ->
+            %% The looked-up record is older than Q0. Copy only the policy
+            %% fields from Q0, as pids and mirrors could have been updated
+            %% simultaneously; `confirms_rejects_SUITE` fails if `Q0` is used as is.
+            Q1 = amqqueue:set_policy(Q, amqqueue:get_policy(Q0)),
+            Q2 = amqqueue:set_operator_policy(Q1, amqqueue:get_operator_policy(Q0)),
+            Q3 = amqqueue:set_policy_version(Q2, PolicyVersion0),
+            noreply(process_args_policy(State#q{q = Q3}))
+    end;
+
 handle_cast({sync_start, _, _}, State = #q{q = Q}) ->
     Name = amqqueue:get_name(Q),
     %% Only a mirror should receive this, it means we are a duplicated master
diff --git a/deps/rabbit/src/rabbit_classic_queue.erl b/deps/rabbit/src/rabbit_classic_queue.erl
@@ -169,7 +169,19 @@ find_missing_queues([Q1|Rem1], [Q2|Rem2] = Q2s, Acc) ->
 -spec policy_changed(amqqueue:amqqueue()) -> ok.
 policy_changed(Q) ->
     QPid = amqqueue:get_pid(Q),
-    gen_server2:cast(QPid, policy_changed).
+    case rabbit_khepri:is_enabled() of
+        false ->
+            gen_server2:cast(QPid, policy_changed);
+        true ->
+            %% When using Khepri, projections are guaranteed to be atomic on
+            %% the node that processes them, but there might be a slight delay
+            %% until they're applied on other nodes. Some test suites fail
+            %% intermittently, showing that rabbit_amqqueue_process is reading
+            %% the old policy value. We use the Khepri feature flag to hide this
+            %% API change, and send the up-to-date record so that the queue
+            %% process can update the policy in its gen_server state.
+            gen_server2:cast(QPid, {policy_changed, Q})
+    end.
 
 stat(Q) ->
     delegate:invoke(amqqueue:get_pid(Q),
diff --git a/deps/rabbit/test/confirms_rejects_SUITE.erl b/deps/rabbit/test/confirms_rejects_SUITE.erl
@@ -3,6 +3,7 @@
 
 -include_lib("common_test/include/ct.hrl").
 -include_lib("amqp_client/include/amqp_client.hrl").
+-include_lib("rabbitmq_ct_helpers/include/rabbit_assert.hrl").
 -compile(export_all).
 
 all() ->
@@ -278,7 +279,7 @@ policy_resets_to_default(Config) ->
         QueueName, QueueName, <<"queues">>,
         [{<<"max-length">>, MaxLength}, {<<"overflow">>, XOverflow}]),
 
-    timer:sleep(1000),
+    ?awaitMatch([_, _], get_policy_definition(Config, QueueName), 30000),
 
     [amqp_channel:call(Ch, #'basic.publish'{routing_key = QueueName},
                            #amqp_msg{payload = <<"HI">>})
@@ -301,6 +302,8 @@ policy_resets_to_default(Config) ->
         QueueName, QueueName, <<"queues">>,
         [{<<"max-length">>, MaxLength}]),
 
+    ?awaitMatch([_], get_policy_definition(Config, QueueName), 30000),
+
     NotRejectedMessage = <<"HI-not-rejected">>,
     amqp_channel:call(Ch, #'basic.publish'{routing_key = QueueName},
                           #amqp_msg{payload = NotRejectedMessage}),
@@ -318,6 +321,16 @@ policy_resets_to_default(Config) ->
         _ -> ok
     end.
 
+get_policy_definition(Config, QueueName) -> % definition of the policy on the queue record, or undefined
+    {ok, Q} = rabbit_ct_broker_helpers:rpc(Config, 0, rabbit_amqqueue, lookup,
+                                           [rabbit_misc:r(<<"/">>, queue, QueueName)]),
+    case amqqueue:get_policy(Q) of
+        undefined ->
+            undefined; % the queue record carries no policy
+        Policy ->
+            proplists:get_value(definition, Policy, []) % [] when the policy has no definition entry
+    end.
+
 consume_all_messages(Ch, QueueName) ->
     consume_all_messages(Ch, QueueName, []).