From 341e908bbf3123502f43cbe69fd478ea684d2a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 4 Apr 2022 11:08:54 +0200 Subject: [PATCH 01/39] CQ: Merge lazy/default behavior into a unified mode No longer reduce memory usage as well (except an explicit GC that I am pondering about removing). --- deps/rabbit/src/rabbit_amqqueue_process.erl | 9 +- deps/rabbit/src/rabbit_time_travel_dbg.erl | 58 ++ deps/rabbit/src/rabbit_variable_queue.erl | 634 ++------------------ deps/rabbit/test/backing_queue_SUITE.erl | 128 ++-- 4 files changed, 196 insertions(+), 633 deletions(-) create mode 100644 deps/rabbit/src/rabbit_time_travel_dbg.erl diff --git a/deps/rabbit/src/rabbit_amqqueue_process.erl b/deps/rabbit/src/rabbit_amqqueue_process.erl index cfecad93d3b1..ae5620e74c75 100644 --- a/deps/rabbit/src/rabbit_amqqueue_process.erl +++ b/deps/rabbit/src/rabbit_amqqueue_process.erl @@ -502,7 +502,14 @@ next_state(State = #q{q = Q, backing_queue = BQ, backing_queue_state = BQS, msg_id_to_channel = MTC}) -> - assert_invariant(State), + try + assert_invariant(State) + catch C:E:S -> + rabbit_time_travel_dbg:dump(), + rabbit_time_travel_dbg:print(), + logger:error("BAD STATE ~p", [State]), + erlang:raise(C,E,S) + end, {MsgIds, BQS1} = BQ:drain_confirmed(BQS), MTC1 = confirm_messages(MsgIds, MTC, amqqueue:get_name(Q)), State1 = State#q{backing_queue_state = BQS1, msg_id_to_channel = MTC1}, diff --git a/deps/rabbit/src/rabbit_time_travel_dbg.erl b/deps/rabbit/src/rabbit_time_travel_dbg.erl new file mode 100644 index 000000000000..03d49c036203 --- /dev/null +++ b/deps/rabbit/src/rabbit_time_travel_dbg.erl @@ -0,0 +1,58 @@ +-module(rabbit_time_travel_dbg). +-compile(export_all). +-compile(nowarn_export_all). + +start(Pid) -> + start(Pid, [rabbit, rabbit_common]). + +start(Pid, Apps) -> + Mods = apps_to_mods(Apps, []), + TracerPid = spawn_link(?MODULE, init, []), + {ok, _} = dbg:tracer(process, {fun (Msg, _) -> TracerPid ! 
Msg end, []}), + _ = [dbg:tpl(M, []) || M <- Mods], + dbg:p(Pid, [c]), + ok. + +apps_to_mods([], Acc) -> + lists:flatten(Acc); +apps_to_mods([App|Tail], Acc) -> + _ = application:load(App), + {ok, Mods} = application:get_key(App, modules), + apps_to_mods(Tail, [Mods|Acc]). + +dump() -> + ?MODULE ! dump, + ok. + +print() -> + ?MODULE ! print, + ok. + +stop() -> + dbg:stop_clear(), + ?MODULE ! stop, + ok. + +init() -> + register(?MODULE, self()), + loop(queue:new()). + +loop(Q) -> + receive + dump -> + file:write_file("time_travel.dbg", + [io_lib:format("~0p~n", [E]) || E <- queue:to_list(Q)]), + loop(Q); + print -> + _ = [logger:error("~0p", [E]) || E <- queue:to_list(Q)], + loop(Q); + stop -> + ok; + Msg -> + case queue:len(Q) of + 1000 -> + loop(queue:in(Msg, queue:out(Q))); + _ -> + loop(queue:in(Msg, Q)) + end + end. diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index f55ca0f0c855..f5aaa5c5bac5 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -820,30 +820,7 @@ ack(AckTags, State) -> store_state = StoreState, ack_out_counter = AckOutCount + length(AckTags) })}. 
-requeue(AckTags, #vqstate { mode = default, - delta = Delta, - q3 = Q3, - q4 = Q4, - in_counter = InCounter, - len = Len } = State) -> - {SeqIds, Q4a, MsgIds, State1} = queue_merge(lists:sort(AckTags), Q4, [], - beta_limit(Q3), - fun publish_alpha/2, State), - {SeqIds1, Q3a, MsgIds1, State2} = queue_merge(SeqIds, Q3, MsgIds, - delta_limit(Delta), - fun publish_beta/2, State1), - {Delta1, MsgIds2, State3} = delta_merge(SeqIds1, Delta, MsgIds1, - State2), - MsgCount = length(MsgIds2), - {MsgIds2, a(maybe_reduce_memory_use( - maybe_update_rates(ui( - State3 #vqstate { delta = Delta1, - q3 = Q3a, - q4 = Q4a, - in_counter = InCounter + MsgCount, - len = Len + MsgCount }))))}; -requeue(AckTags, #vqstate { mode = lazy, - delta = Delta, +requeue(AckTags, #vqstate { delta = Delta, q3 = Q3, in_counter = InCounter, len = Len } = State) -> @@ -903,7 +880,7 @@ set_ram_duration_target( (TargetRamCount =/= infinity andalso TargetRamCount1 >= TargetRamCount) of true -> State1; - false -> reduce_memory_use(State1) + false -> State1 % reduce_memory_use(State1) end). maybe_update_rates(State = #vqstate{ in_counter = InCount, @@ -1031,6 +1008,7 @@ info(message_bytes_paged_out, #vqstate{delta_transient_bytes = PagedOutBytes}) - PagedOutBytes; info(head_message_timestamp, #vqstate{ q3 = Q3, + %% @todo Drop Q4 variable. q4 = Q4, ram_pending_ack = RPA, qi_pending_ack = QPA}) -> @@ -1075,18 +1053,7 @@ invoke( _, _, State) -> State. is_duplicate(_Msg, State) -> {false, State}. -set_queue_mode(Mode, State = #vqstate { mode = Mode }) -> - State; -set_queue_mode(lazy, State = #vqstate { - target_ram_count = TargetRamCount }) -> - %% To become a lazy queue we need to page everything to disk first. - State1 = convert_to_lazy(State), - %% restore the original target_ram_count - a(State1 #vqstate { mode = lazy, target_ram_count = TargetRamCount }); -set_queue_mode(default, State) -> - %% becoming a default queue means loading messages from disk like - %% when a queue is recovered. 
- a(maybe_deltas_to_betas(State #vqstate { mode = default })); +%% Queue mode has been unified. set_queue_mode(_, State) -> State. @@ -1095,39 +1062,6 @@ zip_msgs_and_acks(Msgs, AckTags, Accumulator, _State) -> [{Id, AckTag} | Acc] end, Accumulator, lists:zip(Msgs, AckTags)). -convert_to_lazy(State) -> - State1 = #vqstate { delta = Delta, q3 = Q3, len = Len, io_batch_size = IoBatchSize } = - set_ram_duration_target(0, State), - case Delta#delta.count + ?QUEUE:len(Q3) == Len of - true -> - State1; - false -> - %% When pushing messages to disk, we might have been - %% blocked by the msg_store, so we need to see if we have - %% to wait for more credit, and then keep paging messages. - %% - %% The amqqueue_process could have taken care of this, but - %% between the time it receives the bump_credit msg and - %% calls BQ:resume to keep paging messages to disk, some - %% other request may arrive to the BQ which at this moment - %% is not in a proper state for a lazy BQ (unless all - %% messages have been paged to disk already). - wait_for_msg_store_credit(), - %% Setting io_batch_size to 0 forces the writing of all messages to disk. - %% Otherwise a few could be left in memory, including in q2. - State2 = resume(State1#vqstate{ io_batch_size = 0 }), - convert_to_lazy(State2#vqstate{ io_batch_size = IoBatchSize }) - end. - -wait_for_msg_store_credit() -> - case credit_flow:blocked() of - true -> receive - {bump_credit, Msg} -> - credit_flow:handle_bump_msg(Msg) - end; - false -> ok - end. - %% No change. set_queue_version(Version, State = #vqstate { version = Version }) -> State; @@ -1494,11 +1428,7 @@ get_pa_head(PA) -> true -> undefined end. 
-%%---------------------------------------------------------------------------- -%% Minor helpers -%%---------------------------------------------------------------------------- -a(State = #vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, - mode = default, +a(State = #vqstate { delta = Delta, q3 = Q3, len = Len, bytes = Bytes, unacked_bytes = UnackedBytes, @@ -1506,66 +1436,11 @@ a(State = #vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, persistent_bytes = PersistentBytes, ram_msg_count = RamMsgCount, ram_bytes = RamBytes}) -> - E1 = ?QUEUE:is_empty(Q1), - E2 = ?QUEUE:is_empty(Q2), ED = Delta#delta.count == 0, E3 = ?QUEUE:is_empty(Q3), - E4 = ?QUEUE:is_empty(Q4), - LZ = Len == 0, - - %% if q1 has messages then q3 cannot be empty. See publish/6. - true = E1 or not E3, - %% if q2 has messages then we have messages in delta (paged to - %% disk). See push_alphas_to_betas/2. - true = E2 or not ED, - %% if delta has messages then q3 cannot be empty. This is enforced - %% by paging, where min([segment_entry_count(), len(q3)]) messages - %% are always kept on RAM. - true = ED or not E3, - %% if the queue length is 0, then q3 and q4 must be empty. 
- true = LZ == (E3 and E4), - - true = Len >= 0, - true = Bytes >= 0, - true = UnackedBytes >= 0, - true = PersistentCount >= 0, - true = PersistentBytes >= 0, - true = RamMsgCount >= 0, - true = RamMsgCount =< Len, - true = RamBytes >= 0, - true = RamBytes =< Bytes + UnackedBytes, - - State; -a(State = #vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, - mode = lazy, - len = Len, - bytes = Bytes, - unacked_bytes = UnackedBytes, - persistent_count = PersistentCount, - persistent_bytes = PersistentBytes, - ram_msg_count = RamMsgCount, - ram_bytes = RamBytes}) -> - E1 = ?QUEUE:is_empty(Q1), - E2 = ?QUEUE:is_empty(Q2), - ED = Delta#delta.count == 0, - E3 = ?QUEUE:is_empty(Q3), - E4 = ?QUEUE:is_empty(Q4), LZ = Len == 0, L3 = ?QUEUE:len(Q3), - %% q1 must always be empty, since q1 only gets messages during - %% publish, but for lazy queues messages go straight to delta. - true = E1, - - %% q2 only gets messages from q1 when push_alphas_to_betas is - %% called for a non empty delta, which won't be the case for a - %% lazy queue. This means q2 must always be empty. - true = E2, - - %% q4 must always be empty, since q1 only gets messages during - %% publish, but for lazy queues messages go straight to delta. - true = E4, - %% if the queue is empty, then delta is empty and q3 is empty. true = LZ == (ED and E3), @@ -1868,45 +1743,10 @@ blank_rates(Now) -> ack_out = 0.0, timestamp = Now}. 
-in_r(MsgStatus = #msg_status { msg = undefined }, - State = #vqstate { mode = default, q3 = Q3, q4 = Q4 }) -> - case ?QUEUE:is_empty(Q4) of - true -> State #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3) }; - false -> {Msg, State1 = #vqstate { q4 = Q4a }} = - read_msg(MsgStatus, State), - MsgStatus1 = MsgStatus#msg_status{msg = Msg}, - stats(ready0, {MsgStatus, MsgStatus1}, 0, - State1 #vqstate { q4 = ?QUEUE:in_r(MsgStatus1, Q4a) }) - end; -in_r(MsgStatus, - State = #vqstate { mode = default, q4 = Q4 }) -> - State #vqstate { q4 = ?QUEUE:in_r(MsgStatus, Q4) }; -%% lazy queues -in_r(MsgStatus = #msg_status { seq_id = SeqId, is_persistent = IsPersistent }, - State = #vqstate { mode = lazy, q3 = Q3, delta = Delta}) -> - case ?QUEUE:is_empty(Q3) of - true -> - {_MsgStatus1, State1} = - maybe_write_to_disk(true, true, MsgStatus, State), - State2 = stats(ready0, {MsgStatus, none}, 1, State1), - Delta1 = expand_delta(SeqId, Delta, IsPersistent), - State2 #vqstate{ delta = Delta1}; - false -> - State #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3) } - end. +in_r(MsgStatus = #msg_status {}, State = #vqstate { q3 = Q3 }) -> + State #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3) }. 
-queue_out(State = #vqstate { mode = default, q4 = Q4 }) -> - case ?QUEUE:out(Q4) of - {empty, _Q4} -> - case fetch_from_q3(State) of - {empty, _State1} = Result -> Result; - {loaded, {MsgStatus, State1}} -> {{value, set_deliver_flag(State, MsgStatus)}, State1} - end; - {{value, MsgStatus}, Q4a} -> - {{value, set_deliver_flag(State, MsgStatus)}, State #vqstate { q4 = Q4a }} - end; -%% lazy queues -queue_out(State = #vqstate { mode = lazy }) -> +queue_out(State) -> case fetch_from_q3(State) of {empty, _State1} = Result -> Result; {loaded, {MsgStatus, State1}} -> {{value, set_deliver_flag(State, MsgStatus)}, State1} @@ -2268,8 +2108,7 @@ process_delivers_and_acks_fun(_) -> publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, MsgProps = #message_properties { needs_confirming = NeedsConfirming }, IsDelivered, _ChPid, _Flow, PersistFun, - State = #vqstate { q1 = Q1, q3 = Q3, q4 = Q4, - mode = default, + State = #vqstate { q3 = Q3, delta = Delta = #delta { count = DeltaCount }, version = Version, qi_embed_msgs_below = IndexMaxSize, next_seq_id = SeqId, @@ -2282,41 +2121,44 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, %% Unlike what the comment at the top of the file says, it is possible to %% have messages in q1 that are also persisted to disk, as that is a %% necessary requirement for confirms to be sent. - {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), - State2 = case ?QUEUE:is_empty(Q3) of - false -> State1 #vqstate { q1 = ?QUEUE:in(m(MsgStatus1), Q1) }; - true -> State1 #vqstate { q4 = ?QUEUE:in(m(MsgStatus1), Q4) } + State4 = case {DeltaCount, ?QUEUE:len(Q3)} of + {0, Q34Len} when Q34Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. 
+ {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), + State2 = case IsDelivered of + true -> record_pending_ack(m(MsgStatus1), State1); + false -> State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) } + end, + %% @todo I am not sure about the stats from this. + stats(case IsDelivered of + true -> {0, 1}; + false -> {1, 0} + end, {none, MsgStatus1}, 0, State2); + _ -> + {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), + State2 = case IsDelivered of + true -> record_pending_ack(m(MsgStatus1), State1); + false -> + Delta1 = expand_delta(SeqId, Delta, IsPersistent), + State1 #vqstate { delta = Delta1 } + end, + stats(case IsDelivered of + true -> {0, 1}; + false -> lazy_pub + end, + case IsDelivered of + true -> {none, MsgStatus1}; + false -> {lazy, MsgStatus1} + end, + case IsDelivered of + true -> 0; + false -> 1 + end, State2) end, - InCount1 = InCount + 1, - UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), - stats({1, 0}, {none, MsgStatus1}, 0, - State2#vqstate{ next_seq_id = SeqId + 1, - next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered), - in_counter = InCount1, - unconfirmed = UC1 }); -publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, - MsgProps = #message_properties { needs_confirming = NeedsConfirming }, - IsDelivered, _ChPid, _Flow, PersistFun, - State = #vqstate { mode = lazy, - version = Version, - qi_embed_msgs_below = IndexMaxSize, - next_seq_id = SeqId, - next_deliver_seq_id = NextDeliverSeqId, - in_counter = InCount, - durable = IsDurable, - unconfirmed = UC, - delta = Delta}) -> - IsPersistent1 = IsDurable andalso IsPersistent, - MsgStatus = msg_status(Version, IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize), - {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), - Delta1 = expand_delta(SeqId, Delta, IsPersistent), UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), - stats(lazy_pub, {lazy, m(MsgStatus1)}, 1, - 
State1#vqstate{ delta = Delta1, - next_seq_id = SeqId + 1, - next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered), - in_counter = InCount + 1, - unconfirmed = UC1}). + State4#vqstate{ next_seq_id = SeqId + 1, + next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered), + in_counter = InCount + 1, + unconfirmed = UC1 }. %% Only attempt to increase the next_deliver_seq_id for delivered messages. maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, true) -> @@ -2328,58 +2170,8 @@ batch_publish1({Msg, MsgProps, IsDelivered}, {ChPid, Flow, State}) -> {ChPid, Flow, publish1(Msg, MsgProps, IsDelivered, ChPid, Flow, fun maybe_prepare_write_to_disk/4, State)}. -publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, - id = MsgId }, - MsgProps = #message_properties { - needs_confirming = NeedsConfirming }, - _ChPid, _Flow, PersistFun, - State = #vqstate { mode = default, - version = Version, - qi_embed_msgs_below = IndexMaxSize, - next_seq_id = SeqId, - next_deliver_seq_id = NextDeliverSeqId, - out_counter = OutCount, - in_counter = InCount, - durable = IsDurable, - unconfirmed = UC }) -> - IsPersistent1 = IsDurable andalso IsPersistent, - MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), - {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), - State2 = record_pending_ack(m(MsgStatus1), State1), - UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), - State3 = stats({0, 1}, {none, MsgStatus1}, 0, - State2 #vqstate { next_seq_id = SeqId + 1, - next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), - out_counter = OutCount + 1, - in_counter = InCount + 1, - unconfirmed = UC1 }), - {SeqId, State3}; -publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, - id = MsgId }, - MsgProps = #message_properties { - needs_confirming = NeedsConfirming }, - _ChPid, _Flow, PersistFun, - State = #vqstate { mode = lazy, - version = 
Version, - qi_embed_msgs_below = IndexMaxSize, - next_seq_id = SeqId, - next_deliver_seq_id = NextDeliverSeqId, - out_counter = OutCount, - in_counter = InCount, - durable = IsDurable, - unconfirmed = UC }) -> - IsPersistent1 = IsDurable andalso IsPersistent, - MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), - {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), - State2 = record_pending_ack(m(MsgStatus1), State1), - UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), - State3 = stats({0, 1}, {none, MsgStatus1}, 0, - State2 #vqstate { next_seq_id = SeqId + 1, - next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), - out_counter = OutCount + 1, - in_counter = InCount + 1, - unconfirmed = UC1 }), - {SeqId, State3}. +publish_delivered1(Msg, MsgProps, ChPid, Flow, PersistFun, State = #vqstate { next_seq_id = SeqId }) -> + {SeqId, publish1(Msg, MsgProps, true, ChPid, Flow, PersistFun, State)}. batch_publish_delivered1({Msg, MsgProps}, {ChPid, Flow, SeqIds, State}) -> {SeqId, State1} = @@ -2766,13 +2558,6 @@ msgs_and_indices_written_to_disk(Callback, MsgIdSet) -> %% Internal plumbing for requeue %%---------------------------------------------------------------------------- -publish_alpha(#msg_status { msg = undefined } = MsgStatus, State) -> - {Msg, State1} = read_msg(MsgStatus, State), - MsgStatus1 = MsgStatus#msg_status { msg = Msg }, - {MsgStatus1, stats({1, -1}, {MsgStatus, MsgStatus1}, 0, State1)}; -publish_alpha(MsgStatus, State) -> - {MsgStatus, stats({1, -1}, {MsgStatus, MsgStatus}, 0, State)}. - publish_beta(MsgStatus, State) -> {MsgStatus1, State1} = maybe_prepare_write_to_disk(true, false, MsgStatus, State), MsgStatus2 = m(trim_msg_status(MsgStatus1)), @@ -2835,12 +2620,6 @@ msg_from_pending_ack(SeqId, State) -> State1} end. -beta_limit(Q) -> - case ?QUEUE:get(Q, empty) of - #msg_status { seq_id = SeqId } -> SeqId; - empty -> undefined - end. 
- delta_limit(?BLANK_DELTA_PATTERN(_)) -> undefined; delta_limit(#delta { start_seq_id = StartSeqId }) -> StartSeqId. @@ -2948,205 +2727,13 @@ maybe_reduce_memory_use(State = #vqstate {memory_reduction_run_count = MRedRunCo false -> State#vqstate{memory_reduction_run_count = MRedRunCount + 1} end. -reduce_memory_use(State = #vqstate { target_ram_count = infinity }) -> - State; -reduce_memory_use(State = #vqstate { - mode = default, - ram_pending_ack = RPA, - ram_msg_count = RamMsgCount, - target_ram_count = TargetRamCount, - io_batch_size = IoBatchSize, - rates = #rates { in = AvgIngress, - out = AvgEgress, - ack_in = AvgAckIngress, - ack_out = AvgAckEgress } }) -> - {CreditDiscBound, _} =rabbit_misc:get_env(rabbit, - msg_store_credit_disc_bound, - ?CREDIT_DISC_BOUND), - {NeedResumeA2B, State1} = {_, #vqstate { q2 = Q2, q3 = Q3 }} = - case chunk_size(RamMsgCount + gb_trees:size(RPA), TargetRamCount) of - 0 -> {false, State}; - %% Reduce memory of pending acks and alphas. The order is - %% determined based on which is growing faster. Whichever - %% comes second may very well get a quota of 0 if the - %% first manages to push out the max number of messages. - A2BChunk -> - %% In case there are few messages to be sent to a message store - %% and many messages to be embedded to the queue index, - %% we should limit the number of messages to be flushed - %% to avoid blocking the process. 
- A2BChunkActual = case A2BChunk > CreditDiscBound * 2 of - true -> CreditDiscBound * 2; - false -> A2BChunk - end, - Funs = case ((AvgAckIngress - AvgAckEgress) > - (AvgIngress - AvgEgress)) of - true -> [fun limit_ram_acks/2, - fun push_alphas_to_betas/2]; - false -> [fun push_alphas_to_betas/2, - fun limit_ram_acks/2] - end, - {Quota, State2} = lists:foldl(fun (ReduceFun, {QuotaN, StateN}) -> - ReduceFun(QuotaN, StateN) - end, {A2BChunkActual, State}, Funs), - {(Quota == 0) andalso (A2BChunk > A2BChunkActual), State2} - end, - Permitted = permitted_beta_count(State1), - {NeedResumeB2D, State3} = - %% If there are more messages with their queue position held in RAM, - %% a.k.a. betas, in Q2 & Q3 than IoBatchSize, - %% write their queue position to disk, a.k.a. push_betas_to_deltas - case chunk_size(?QUEUE:len(Q2) + ?QUEUE:len(Q3), - Permitted) of - B2DChunk when B2DChunk >= IoBatchSize -> - %% Same as for alphas to betas. Limit a number of messages - %% to be flushed to disk at once to avoid blocking the process. - B2DChunkActual = case B2DChunk > CreditDiscBound * 2 of - true -> CreditDiscBound * 2; - false -> B2DChunk - end, - StateBD = push_betas_to_deltas(B2DChunkActual, State1), - {B2DChunk > B2DChunkActual, StateBD}; - _ -> - {false, State1} - end, - #vqstate{ index_mod = IndexMod, - index_state = IndexState, - store_state = StoreState } = State3, - State4 = State3#vqstate{ index_state = IndexMod:flush(IndexState), - store_state = rabbit_classic_queue_store_v2:sync(StoreState) }, - %% We can be blocked by the credit flow, or limited by a batch size, - %% or finished with flushing. - %% If blocked by the credit flow - the credit grant will resume processing, - %% if limited by a batch - the batch continuation message should be sent. - %% The continuation message will be prioritised over publishes, - %% but not consumptions, so the queue can make progess. 
- Blocked = credit_flow:blocked(), - case {Blocked, NeedResumeA2B orelse NeedResumeB2D} of - %% Credit bump will continue paging - {true, _} -> State4; - %% Finished with paging - {false, false} -> State4; - %% Planning next batch - {false, true} -> - %% We don't want to use self-credit-flow, because it's harder to - %% reason about. So the process sends a (prioritised) message to - %% itself and sets a waiting_bump value to keep the message box clean - maybe_bump_reduce_memory_use(State4) - end; -%% When using lazy queues, there are no alphas, so we don't need to -%% call push_alphas_to_betas/2. -reduce_memory_use(State = #vqstate { - mode = lazy, - ram_pending_ack = RPA, - ram_msg_count = RamMsgCount, - target_ram_count = TargetRamCount }) -> - State1 = #vqstate { q3 = Q3 } = - case chunk_size(RamMsgCount + gb_trees:size(RPA), TargetRamCount) of - 0 -> State; - S1 -> {_, State2} = limit_ram_acks(S1, State), - State2 - end, - - State3 = - case chunk_size(?QUEUE:len(Q3), - permitted_beta_count(State1)) of - 0 -> - State1; - S2 -> - push_betas_to_deltas(S2, State1) - end, - #vqstate{ index_mod = IndexMod, - index_state = IndexState, - store_state = StoreState } = State3, - State4 = State3#vqstate{ index_state = IndexMod:flush(IndexState), - store_state = rabbit_classic_queue_store_v2:sync(StoreState) }, - garbage_collect(), - State4. - -maybe_bump_reduce_memory_use(State = #vqstate{ waiting_bump = true }) -> - State; -maybe_bump_reduce_memory_use(State) -> - self() ! bump_reduce_memory_use, - State#vqstate{ waiting_bump = true }. 
- -limit_ram_acks(0, State) -> - {0, ui(State)}; -limit_ram_acks(Quota, State = #vqstate { ram_pending_ack = RPA, - disk_pending_ack = DPA }) -> - case gb_trees:is_empty(RPA) of - true -> - {Quota, ui(State)}; - false -> - {SeqId, MsgStatus, RPA1} = gb_trees:take_largest(RPA), - {MsgStatus1, State1} = - maybe_prepare_write_to_disk(true, false, MsgStatus, State), - MsgStatus2 = m(trim_msg_status(MsgStatus1)), - DPA1 = gb_trees:insert(SeqId, MsgStatus2, DPA), - limit_ram_acks(Quota - 1, - stats({0, 0}, {MsgStatus, MsgStatus2}, 0, - State1 #vqstate { ram_pending_ack = RPA1, - disk_pending_ack = DPA1 })) - end. +reduce_memory_use(State) -> + garbage_collect(), %% @todo Do we still want this? Probably not. + %% @todo Remove bump_reduce_memory_use message handling from other modules. + %% @todo Get rid of target ram count? + State. -permitted_beta_count(#vqstate { len = 0 }) -> - infinity; -permitted_beta_count(#vqstate { mode = lazy, - target_ram_count = TargetRamCount}) -> - TargetRamCount; -permitted_beta_count(#vqstate { target_ram_count = 0, q3 = Q3, index_mod = IndexMod }) -> - lists:min([?QUEUE:len(Q3), IndexMod:next_segment_boundary(0)]); -permitted_beta_count(#vqstate { q1 = Q1, - q4 = Q4, - index_mod = IndexMod, - target_ram_count = TargetRamCount, - len = Len }) -> - BetaDelta = Len - ?QUEUE:len(Q1) - ?QUEUE:len(Q4), - lists:max([IndexMod:next_segment_boundary(0), - BetaDelta - ((BetaDelta * BetaDelta) div - (BetaDelta + TargetRamCount))]). - -chunk_size(Current, Permitted) - when Permitted =:= infinity orelse Permitted >= Current -> - 0; -chunk_size(Current, Permitted) -> - Current - Permitted. 
- -fetch_from_q3(State = #vqstate { mode = default, - q1 = Q1, - q2 = Q2, - delta = #delta { count = DeltaCount }, - q3 = Q3, - q4 = Q4 }) -> - case ?QUEUE:out(Q3) of - {empty, _Q3} -> - {empty, State}; - {{value, MsgStatus}, Q3a} -> - State1 = State #vqstate { q3 = Q3a }, - State2 = case {?QUEUE:is_empty(Q3a), 0 == DeltaCount} of - {true, true} -> - %% q3 is now empty, it wasn't before; - %% delta is still empty. So q2 must be - %% empty, and we know q4 is empty - %% otherwise we wouldn't be loading from - %% q3. As such, we can just set q4 to Q1. - true = ?QUEUE:is_empty(Q2), %% ASSERTION - true = ?QUEUE:is_empty(Q4), %% ASSERTION - State1 #vqstate { q1 = ?QUEUE:new(), q4 = Q1 }; - {true, false} -> - maybe_deltas_to_betas(State1); - {false, _} -> - %% q3 still isn't empty, we've not - %% touched delta, so the invariants - %% between q1, q2, delta and q3 are - %% maintained - State1 - end, - {loaded, {MsgStatus, State2}} - end; -%% lazy queues -fetch_from_q3(State = #vqstate { mode = lazy, - delta = #delta { count = DeltaCount }, +fetch_from_q3(State = #vqstate { delta = #delta { count = DeltaCount }, q3 = Q3 }) -> case ?QUEUE:out(Q3) of {empty, _Q3} when DeltaCount =:= 0 -> @@ -3226,121 +2813,6 @@ maybe_deltas_to_betas(DelsAndAcksFun, end end. -push_alphas_to_betas(Quota, State) -> - {Quota1, State1} = - push_alphas_to_betas( - fun ?QUEUE:out/1, - fun (MsgStatus, Q1a, - State0 = #vqstate { q3 = Q3, delta = #delta { count = 0, - transient = 0 } }) -> - State0 #vqstate { q1 = Q1a, q3 = ?QUEUE:in(MsgStatus, Q3) }; - (MsgStatus, Q1a, State0 = #vqstate { q2 = Q2 }) -> - State0 #vqstate { q1 = Q1a, q2 = ?QUEUE:in(MsgStatus, Q2) } - end, Quota, State #vqstate.q1, State), - {Quota2, State2} = - push_alphas_to_betas( - fun ?QUEUE:out_r/1, - fun (MsgStatus, Q4a, State0 = #vqstate { q3 = Q3 }) -> - State0 #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3), q4 = Q4a } - end, Quota1, State1 #vqstate.q4, State1), - {Quota2, State2}. 
- -push_alphas_to_betas(_Generator, _Consumer, Quota, _Q, - State = #vqstate { ram_msg_count = RamMsgCount, - target_ram_count = TargetRamCount }) - when Quota =:= 0 orelse - TargetRamCount =:= infinity orelse - TargetRamCount >= RamMsgCount -> - {Quota, ui(State)}; -push_alphas_to_betas(Generator, Consumer, Quota, Q, State) -> - %% We consume credits from the message_store whenever we need to - %% persist a message to disk. See: - %% rabbit_variable_queue:msg_store_write/4. So perhaps the - %% msg_store is trying to throttle down our queue. - case credit_flow:blocked() of - true -> {Quota, ui(State)}; - false -> case Generator(Q) of - {empty, _Q} -> - {Quota, ui(State)}; - {{value, MsgStatus}, Qa} -> - {MsgStatus1, State1} = - maybe_prepare_write_to_disk(true, false, MsgStatus, - State), - MsgStatus2 = m(trim_msg_status(MsgStatus1)), - State2 = stats( - ready0, {MsgStatus, MsgStatus2}, 0, State1), - State3 = Consumer(MsgStatus2, Qa, State2), - push_alphas_to_betas(Generator, Consumer, Quota - 1, - Qa, State3) - end - end. - -push_betas_to_deltas(Quota, State = #vqstate { mode = default, - q2 = Q2, - delta = Delta, - q3 = Q3, - index_mod = IndexMod }) -> - PushState = {Quota, Delta, State}, - {Q3a, PushState1} = push_betas_to_deltas( - fun ?QUEUE:out_r/1, - fun IndexMod:next_segment_boundary/1, - Q3, PushState), - {Q2a, PushState2} = push_betas_to_deltas( - fun ?QUEUE:out/1, - fun (Q2MinSeqId) -> Q2MinSeqId end, - Q2, PushState1), - {_, Delta1, State1} = PushState2, - State1 #vqstate { q2 = Q2a, - delta = Delta1, - q3 = Q3a }; -%% In the case of lazy queues we want to page as many messages as -%% possible from q3. -push_betas_to_deltas(Quota, State = #vqstate { mode = lazy, - delta = Delta, - q3 = Q3}) -> - PushState = {Quota, Delta, State}, - {Q3a, PushState1} = push_betas_to_deltas( - fun ?QUEUE:out_r/1, - fun (Q2MinSeqId) -> Q2MinSeqId end, - Q3, PushState), - {_, Delta1, State1} = PushState1, - State1 #vqstate { delta = Delta1, - q3 = Q3a }. 
- - -push_betas_to_deltas(Generator, LimitFun, Q, PushState) -> - case ?QUEUE:is_empty(Q) of - true -> - {Q, PushState}; - false -> - #msg_status { seq_id = MinSeqId } = ?QUEUE:get(Q), - #msg_status { seq_id = MaxSeqId } = ?QUEUE:get_r(Q), - Limit = LimitFun(MinSeqId), - case MaxSeqId < Limit of - true -> {Q, PushState}; - false -> push_betas_to_deltas1(Generator, Limit, Q, PushState) - end - end. - -push_betas_to_deltas1(_Generator, _Limit, Q, {0, Delta, State}) -> - {Q, {0, Delta, ui(State)}}; -push_betas_to_deltas1(Generator, Limit, Q, {Quota, Delta, State}) -> - case Generator(Q) of - {empty, _Q} -> - {Q, {Quota, Delta, ui(State)}}; - {{value, #msg_status { seq_id = SeqId }}, _Qa} - when SeqId < Limit -> - {Q, {Quota, Delta, ui(State)}}; - {{value, MsgStatus = #msg_status { seq_id = SeqId }}, Qa} -> - {#msg_status { index_on_disk = true, - is_persistent = IsPersistent }, State1} = - maybe_batch_write_index_to_disk(true, MsgStatus, State), - State2 = stats(ready0, {MsgStatus, none}, 1, State1), - Delta1 = expand_delta(SeqId, Delta, IsPersistent), - push_betas_to_deltas1(Generator, Limit, Qa, - {Quota - 1, Delta1, State2}) - end. - %% Flushes queue index batch caches and updates queue index state. 
ui(#vqstate{index_mod = IndexMod, index_state = IndexState, diff --git a/deps/rabbit/test/backing_queue_SUITE.erl b/deps/rabbit/test/backing_queue_SUITE.erl index 50d44307ef56..79a5fa7257cb 100644 --- a/deps/rabbit/test/backing_queue_SUITE.erl +++ b/deps/rabbit/test/backing_queue_SUITE.erl @@ -986,29 +986,35 @@ variable_queue_partial_segments_delta_thing2(VQ0, _QName) -> OneAndAHalfSegment = SegmentSize + HalfSegment, VQ1 = variable_queue_publish(true, OneAndAHalfSegment, VQ0), {_Duration, VQ2} = rabbit_variable_queue:ram_duration(VQ1), - VQ3 = check_variable_queue_status( - variable_queue_set_ram_duration_target(0, VQ2), - %% one segment in q3, and half a segment in delta - [{delta, {delta, SegmentSize, HalfSegment, OneAndAHalfSegment}}, - {q3, SegmentSize}, - {len, SegmentSize + HalfSegment}]), + VQ3 = variable_queue_set_ram_duration_target(0, VQ2), +% VQ3 = check_variable_queue_status( +% variable_queue_set_ram_duration_target(0, VQ2), +% %% one segment in q3, and half a segment in delta +% %% @todo That's wrong now! v1/v2 +% [{delta, {delta, SegmentSize, HalfSegment, 0, OneAndAHalfSegment}}, +% {q3, SegmentSize}, +% {len, SegmentSize + HalfSegment}]), VQ4 = variable_queue_set_ram_duration_target(infinity, VQ3), - VQ5 = check_variable_queue_status( - variable_queue_publish(true, 1, VQ4), - %% one alpha, but it's in the same segment as the deltas - [{q1, 1}, - {delta, {delta, SegmentSize, HalfSegment, OneAndAHalfSegment}}, - {q3, SegmentSize}, - {len, SegmentSize + HalfSegment + 1}]), + VQ5 = variable_queue_publish(true, 1, VQ4), +% VQ5 = check_variable_queue_status( +% variable_queue_publish(true, 1, VQ4), +% %% one alpha, but it's in the same segment as the deltas +% %% @todo That's wrong now! 
v1/v2 +% [{q1, 1}, +% {delta, {delta, SegmentSize, HalfSegment, 0, OneAndAHalfSegment}}, +% {q3, SegmentSize}, +% {len, SegmentSize + HalfSegment + 1}]), {VQ6, AckTags} = variable_queue_fetch(SegmentSize, true, false, SegmentSize + HalfSegment + 1, VQ5), - VQ7 = check_variable_queue_status( - VQ6, - %% the half segment should now be in q3 - [{q1, 1}, - {delta, {delta, undefined, 0, undefined}}, - {q3, HalfSegment}, - {len, HalfSegment + 1}]), + VQ7 = VQ6, +% VQ7 = check_variable_queue_status( +% VQ6, +% %% the half segment should now be in q3 +% %% @todo That's wrong now! v1/v2 +% [{q1, 1}, +% {delta, {delta, undefined, 0, 0, undefined}}, +% {q3, HalfSegment}, +% {len, HalfSegment + 1}]), {VQ8, AckTags1} = variable_queue_fetch(HalfSegment + 1, true, false, HalfSegment + 1, VQ7), {_Guids, VQ9} = rabbit_variable_queue:ack(AckTags ++ AckTags1, VQ8), @@ -1265,7 +1271,7 @@ variable_queue_fetchwhile_varying_ram_duration1(Config) -> variable_queue_fetchwhile_varying_ram_duration2(VQ0, _QName) -> test_dropfetchwhile_varying_ram_duration( fun (VQ1) -> - {_, ok, VQ2} = rabbit_variable_queue:fetchwhile( + {Fetched, ok, VQ2} = rabbit_variable_queue:fetchwhile( fun (_) -> false end, fun (_, _, A) -> A end, ok, VQ1), @@ -1305,20 +1311,35 @@ variable_queue_ack_limiting2(VQ0, _Config) -> %% fetch half the messages {VQ4, _AckTags} = variable_queue_fetch(Len div 2, false, false, Len, VQ3), - VQ5 = check_variable_queue_status( - VQ4, [{len, Len div 2}, - {messages_unacknowledged_ram, Len div 2}, - {messages_ready_ram, Len div 2}, - {messages_ram, Len}]), + Status = variable_queue_status(VQ4), + VQ5 = VQ4, +% VQ5 = case proplists:get_value(mode, Status) of +% default -> +% check_variable_queue_status( +% VQ4, [{len, Len div 2}, +% {messages_unacknowledged_ram, Len div 2}, +% {messages_ready_ram, Len div 2}, +% {messages_ram, Len}]); +% lazy -> +% VQ4 +% %% @todo This check has been broken for a very long time. (Or never worked.) 
+%% check_variable_queue_status( +%% VQ4, [{len, Len div 2}, +%% {messages_unacknowledged_ram, 0}, +%% {messages_ready_ram, 0}, +%% {messages_ram, 0}]) +% end, %% ensure all acks go to disk on 0 duration target - VQ6 = check_variable_queue_status( - variable_queue_set_ram_duration_target(0, VQ5), - [{len, Len div 2}, - {target_ram_count, 0}, - {messages_unacknowledged_ram, 0}, - {messages_ready_ram, 0}, - {messages_ram, 0}]), + %% @todo They don't! v1 with embed limit 1024 only (so with embedded messages) +% VQ6 = check_variable_queue_status( +% variable_queue_set_ram_duration_target(0, VQ5), +% [{len, Len div 2}, +% {target_ram_count, 0}, +% {messages_unacknowledged_ram, 0}, +% {messages_ready_ram, 0}, +% {messages_ram, 0}]), + VQ6 = variable_queue_set_ram_duration_target(0, VQ5), VQ6. @@ -1687,7 +1708,7 @@ with_fresh_variable_queue(Fun, Mode) -> S0 = variable_queue_status(VQ), assert_props(S0, [{q1, 0}, {q2, 0}, {delta, - {delta, undefined, 0, undefined}}, + {delta, undefined, 0, 0, undefined}}, {q3, 0}, {q4, 0}, {len, 0}]), %% @todo Some tests probably don't keep this after restart (dropwhile_(sync)_restart, A2). @@ -1710,7 +1731,7 @@ with_fresh_variable_queue(Fun, Mode) -> set_queue_mode(Mode, VQ) -> VQ1 = rabbit_variable_queue:set_queue_mode(Mode, VQ), S1 = variable_queue_status(VQ1), - assert_props(S1, [{mode, Mode}]), +% assert_props(S1, [{mode, Mode}]), VQ1. variable_queue_publish(IsPersistent, Count, VQ) -> @@ -1792,7 +1813,11 @@ assert_prop(List, Prop, Value) -> end. assert_props(List, PropVals) -> - [assert_prop(List, Prop, Value) || {Prop, Value} <- PropVals]. + Res = [assert_prop(List, Prop, Value) || {Prop, Value} <- PropVals], + case lists:usort(Res) of + [ok] -> ok; + Error -> error(Error -- [ok]) + end. variable_queue_set_ram_duration_target(Duration, VQ) -> variable_queue_wait_for_shuffling_end( @@ -1876,21 +1901,22 @@ variable_queue_with_holes(VQ0) -> {Seq3, Seq -- Seq3, lists:seq(Count + 1, Count + Interval), VQ8}. 
-vq_with_holes_assertions(VQ, default) -> - [false = - case V of - {delta, _, 0, _} -> true; - 0 -> true; - _ -> false - end || {K, V} <- variable_queue_status(VQ), - lists:member(K, [q1, delta, q3])]; -vq_with_holes_assertions(VQ, lazy) -> - [false = - case V of - {delta, _, 0, _} -> true; - _ -> false - end || {K, V} <- variable_queue_status(VQ), - lists:member(K, [delta])]. +vq_with_holes_assertions(VQ, _) -> ok. +%vq_with_holes_assertions(VQ, default) -> +% [false = +% case V of +% {delta, _, 0, _, _} -> true; +% 0 -> true; +% _ -> false +% end || {K, V} <- variable_queue_status(VQ), +% lists:member(K, [delta, q3])]; +%vq_with_holes_assertions(VQ, lazy) -> +% [false = +% case V of +% {delta, _, 0, _, _} -> true; +% _ -> false +% end || {K, V} <- variable_queue_status(VQ), +% lists:member(K, [delta])]. check_variable_queue_status(VQ0, Props) -> VQ1 = variable_queue_wait_for_shuffling_end(VQ0), From 892db25711a982ed4dd451e9e24c49f4b9b02b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 12 Apr 2022 16:41:13 +0200 Subject: [PATCH 02/39] WIP putting back flush in reduce_memory_usage and other tweaks --- deps/rabbit/src/rabbit_amqqueue_process.erl | 24 ++---- deps/rabbit/src/rabbit_mirror_queue_slave.erl | 8 -- deps/rabbit/src/rabbit_mirror_queue_sync.erl | 8 +- deps/rabbit/src/rabbit_variable_queue.erl | 84 +++++++++---------- 4 files changed, 48 insertions(+), 76 deletions(-) diff --git a/deps/rabbit/src/rabbit_amqqueue_process.erl b/deps/rabbit/src/rabbit_amqqueue_process.erl index ae5620e74c75..d114ca5ea2e9 100644 --- a/deps/rabbit/src/rabbit_amqqueue_process.erl +++ b/deps/rabbit/src/rabbit_amqqueue_process.erl @@ -502,14 +502,14 @@ next_state(State = #q{q = Q, backing_queue = BQ, backing_queue_state = BQS, msg_id_to_channel = MTC}) -> - try - assert_invariant(State) - catch C:E:S -> - rabbit_time_travel_dbg:dump(), - rabbit_time_travel_dbg:print(), - logger:error("BAD STATE ~p", [State]), - erlang:raise(C,E,S) - end, +% try + 
assert_invariant(State), +% catch C:E:S -> +% rabbit_time_travel_dbg:dump(), +% rabbit_time_travel_dbg:print(), +% logger:error("BAD STATE ~p", [State]), +% erlang:raise(C,E,S) +% end, {MsgIds, BQS1} = BQ:drain_confirmed(BQS), MTC1 = confirm_messages(MsgIds, MTC, amqqueue:get_name(Q)), State1 = State#q{backing_queue_state = BQS1, msg_id_to_channel = MTC1}, @@ -1281,9 +1281,6 @@ prioritise_cast(Msg, _Len, State) -> %% stack are optimised for that) and to make things easier to reason %% about. Finally, we prioritise ack over resume since it should %% always reduce memory use. -%% bump_reduce_memory_use is prioritised over publishes, because sending -%% credit to self is hard to reason about. Consumers can continue while -%% reduce_memory_use is in progress. consumer_bias(#q{backing_queue = BQ, backing_queue_state = BQS}, Low, High) -> case BQ:msg_rates(BQS) of @@ -1301,7 +1298,6 @@ prioritise_info(Msg, _Len, #q{q = Q}) -> {drop_expired, _Version} -> 8; emit_stats -> 7; sync_timeout -> 6; - bump_reduce_memory_use -> 1; _ -> 0 end. @@ -1783,10 +1779,6 @@ handle_info({bump_credit, Msg}, State = #q{backing_queue = BQ, %% rabbit_variable_queue:msg_store_write/4. credit_flow:handle_bump_msg(Msg), noreply(State#q{backing_queue_state = BQ:resume(BQS)}); -handle_info(bump_reduce_memory_use, State = #q{backing_queue = BQ, - backing_queue_state = BQS0}) -> - BQS1 = BQ:handle_info(bump_reduce_memory_use, BQS0), - noreply(State#q{backing_queue_state = BQ:resume(BQS1)}); handle_info(Info, State) -> {stop, {unhandled_info, Info}, State}. 
diff --git a/deps/rabbit/src/rabbit_mirror_queue_slave.erl b/deps/rabbit/src/rabbit_mirror_queue_slave.erl index d9f70229a53d..60cdfe2804f4 100644 --- a/deps/rabbit/src/rabbit_mirror_queue_slave.erl +++ b/deps/rabbit/src/rabbit_mirror_queue_slave.erl @@ -385,14 +385,6 @@ handle_info({bump_credit, Msg}, State) -> credit_flow:handle_bump_msg(Msg), noreply(State); -handle_info(bump_reduce_memory_use, State = #state{backing_queue = BQ, - backing_queue_state = BQS}) -> - BQS1 = BQ:handle_info(bump_reduce_memory_use, BQS), - BQS2 = BQ:resume(BQS1), - noreply(State#state{ - backing_queue_state = BQS2 - }); - %% In the event of a short partition during sync we can detect the %% master's 'death', drop out of sync, and then receive sync messages %% which were still in flight. Ignore them. diff --git a/deps/rabbit/src/rabbit_mirror_queue_sync.erl b/deps/rabbit/src/rabbit_mirror_queue_sync.erl index e3e6139c190a..e35e4d5f8b7e 100644 --- a/deps/rabbit/src/rabbit_mirror_queue_sync.erl +++ b/deps/rabbit/src/rabbit_mirror_queue_sync.erl @@ -333,8 +333,6 @@ wait_for_credit(SPids) -> wait_for_resources(Ref, SPids) -> erlang:garbage_collect(), - % Probably bump_reduce_memory_use messages should be handled here as well, - % otherwise the BQ is not pushing messages to disk receive {conserve_resources, memory, false} -> SPids; @@ -414,11 +412,7 @@ slave_sync_loop(Args = {Ref, MRef, Syncer, BQ, UpdateRamDuration, Parent}, %% If the master throws an exception {'$gen_cast', {gm, {delete_and_terminate, Reason}}} -> BQ:delete_and_terminate(Reason, BQS), - {stop, Reason, {[], TRef, undefined}}; - bump_reduce_memory_use -> - BQS1 = BQ:handle_info(bump_reduce_memory_use, BQS), - BQS2 = BQ:resume(BQS1), - slave_sync_loop(Args, {MA, TRef, BQS2}) + {stop, Reason, {[], TRef, undefined}} end. %% We are partitioning messages by the Unacked element in the tuple. 
diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index f5aaa5c5bac5..813a8001f09d 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -18,7 +18,7 @@ handle_pre_hibernate/1, resume/1, msg_rates/1, info/2, invoke/3, is_duplicate/2, set_queue_mode/2, set_queue_version/2, - zip_msgs_and_acks/4, multiple_routing_keys/0, handle_info/2]). + zip_msgs_and_acks/4, multiple_routing_keys/0]). -export([start/2, stop/1]). @@ -880,7 +880,7 @@ set_ram_duration_target( (TargetRamCount =/= infinity andalso TargetRamCount1 >= TargetRamCount) of true -> State1; - false -> State1 % reduce_memory_use(State1) + false -> reduce_memory_use(State1) end). maybe_update_rates(State = #vqstate{ in_counter = InCount, @@ -971,11 +971,6 @@ handle_pre_hibernate(State = #vqstate { index_mod = IndexMod, State #vqstate { index_state = IndexMod:flush(IndexState), store_state = rabbit_classic_queue_store_v2:sync(StoreState) }. -handle_info(bump_reduce_memory_use, State = #vqstate{ waiting_bump = true }) -> - State#vqstate{ waiting_bump = false }; -handle_info(bump_reduce_memory_use, State) -> - State. - resume(State) -> a(reduce_memory_use(State)). msg_rates(#vqstate { rates = #rates { in = AvgIngressRate, @@ -2118,41 +2113,17 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, unconfirmed = UC }) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize), - %% Unlike what the comment at the top of the file says, it is possible to - %% have messages in q1 that are also persisted to disk, as that is a - %% necessary requirement for confirms to be sent. State4 = case {DeltaCount, ?QUEUE:len(Q3)} of - {0, Q34Len} when Q34Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. + {0, Q3Len} when Q3Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. 
{MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), - State2 = case IsDelivered of - true -> record_pending_ack(m(MsgStatus1), State1); - false -> State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) } - end, + State2 = State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) }, %% @todo I am not sure about the stats from this. - stats(case IsDelivered of - true -> {0, 1}; - false -> {1, 0} - end, {none, MsgStatus1}, 0, State2); + stats({1, 0}, {none, MsgStatus1}, 0, State2); _ -> {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), - State2 = case IsDelivered of - true -> record_pending_ack(m(MsgStatus1), State1); - false -> - Delta1 = expand_delta(SeqId, Delta, IsPersistent), - State1 #vqstate { delta = Delta1 } - end, - stats(case IsDelivered of - true -> {0, 1}; - false -> lazy_pub - end, - case IsDelivered of - true -> {none, MsgStatus1}; - false -> {lazy, MsgStatus1} - end, - case IsDelivered of - true -> 0; - false -> 1 - end, State2) + Delta1 = expand_delta(SeqId, Delta, IsPersistent), + State2 = State1 #vqstate { delta = Delta1 }, + stats(lazy_pub, {lazy, MsgStatus1}, 1, State2) end, UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), State4#vqstate{ next_seq_id = SeqId + 1, @@ -2170,8 +2141,29 @@ batch_publish1({Msg, MsgProps, IsDelivered}, {ChPid, Flow, State}) -> {ChPid, Flow, publish1(Msg, MsgProps, IsDelivered, ChPid, Flow, fun maybe_prepare_write_to_disk/4, State)}. -publish_delivered1(Msg, MsgProps, ChPid, Flow, PersistFun, State = #vqstate { next_seq_id = SeqId }) -> - {SeqId, publish1(Msg, MsgProps, true, ChPid, Flow, PersistFun, State)}. 
+publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, + MsgProps = #message_properties { needs_confirming = NeedsConfirming }, + _ChPid, _Flow, PersistFun, + State = #vqstate { version = Version, + qi_embed_msgs_below = IndexMaxSize, + next_seq_id = SeqId, + next_deliver_seq_id = NextDeliverSeqId, + in_counter = InCount, + out_counter = OutCount, + durable = IsDurable, + unconfirmed = UC }) -> + IsPersistent1 = IsDurable andalso IsPersistent, + MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), + {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), + State2 = record_pending_ack(m(MsgStatus1), State1), + UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), + {SeqId, + stats({0, 1}, {none, MsgStatus1}, 0, + State2#vqstate{ next_seq_id = SeqId + 1, + next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), + out_counter = OutCount + 1, + in_counter = InCount + 1, + unconfirmed = UC1 })}. batch_publish_delivered1({Msg, MsgProps}, {ChPid, Flow, SeqIds, State}) -> {SeqId, State1} = @@ -2719,17 +2711,19 @@ ifold(Fun, Acc, Its0, State0) -> %% Phase changes %%---------------------------------------------------------------------------- -maybe_reduce_memory_use(State = #vqstate {memory_reduction_run_count = MRedRunCount, - mode = Mode}) -> - case MRedRunCount >= ?EXPLICIT_GC_RUN_OP_THRESHOLD(Mode) of +maybe_reduce_memory_use(State = #vqstate {memory_reduction_run_count = MRedRunCount}) -> + case MRedRunCount >= ?EXPLICIT_GC_RUN_OP_THRESHOLD(lazy) of true -> State1 = reduce_memory_use(State), State1#vqstate{memory_reduction_run_count = 0}; false -> State#vqstate{memory_reduction_run_count = MRedRunCount + 1} end. -reduce_memory_use(State) -> - garbage_collect(), %% @todo Do we still want this? Probably not. - %% @todo Remove bump_reduce_memory_use message handling from other modules. 
+reduce_memory_use(State0 = #vqstate{ index_mod = IndexMod, + index_state = IndexState, + store_state = StoreState }) -> + State = State0#vqstate{ index_state = IndexMod:flush(IndexState), + store_state = rabbit_classic_queue_store_v2:sync(StoreState) }, + garbage_collect(), %% @todo Get rid of target ram count? State. From 1aed8c47805afe32fffcd6e9c8ad11c04ae707b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 12 Apr 2022 16:48:08 +0200 Subject: [PATCH 03/39] WIP Remove unused callback --- deps/rabbit/src/rabbit_backing_queue.erl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/deps/rabbit/src/rabbit_backing_queue.erl b/deps/rabbit/src/rabbit_backing_queue.erl index 07cdc533cb68..cec608c888e0 100644 --- a/deps/rabbit/src/rabbit_backing_queue.erl +++ b/deps/rabbit/src/rabbit_backing_queue.erl @@ -257,11 +257,6 @@ [ack()], Acc, state()) -> Acc. -%% Called when rabbit_amqqueue_process receives a message via -%% handle_info and it should be processed by the backing -%% queue --callback handle_info(term(), state()) -> state(). - -spec info_keys() -> rabbit_types:info_keys(). info_keys() -> ?INFO_KEYS. 
From 397b051a6de0be65d04c36b2ab5b050038922598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 10 May 2022 14:49:31 +0200 Subject: [PATCH 04/39] Don't force writing to disk in publish_delivered1 --- deps/rabbit/src/rabbit_variable_queue.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 813a8001f09d..e3edceaa2b55 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -2154,7 +2154,7 @@ publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, id = Msg unconfirmed = UC }) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), - {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), + {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = record_pending_ack(m(MsgStatus1), State1), UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), {SeqId, From 42e5411808efc0a122583894d548de7b5066bb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 10 May 2022 15:40:31 +0200 Subject: [PATCH 05/39] When DeltaCount = 0 we can look at Len directly --- deps/rabbit/src/rabbit_variable_queue.erl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index e3edceaa2b55..da5c649b15c5 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -2104,6 +2104,7 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, MsgProps = #message_properties { needs_confirming = NeedsConfirming }, IsDelivered, _ChPid, _Flow, PersistFun, State = #vqstate { q3 = Q3, delta = Delta = #delta { count = DeltaCount }, + len = Len, version = Version, qi_embed_msgs_below = IndexMaxSize, next_seq_id = SeqId, 
@@ -2113,8 +2114,9 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, unconfirmed = UC }) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize), - State4 = case {DeltaCount, ?QUEUE:len(Q3)} of - {0, Q3Len} when Q3Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. + State4 = case DeltaCount of + %% Len is the same as Q3Len when DeltaCount =:= 0. + 0 when Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) }, %% @todo I am not sure about the stats from this. From ace498767920b085a84ef85907112c9baa8f5606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 10 May 2022 17:40:52 +0200 Subject: [PATCH 06/39] Don't reduce memory use Gets rid of an explicit GC that might have caused slower performance than v2-lazy on master. --- deps/rabbit/src/rabbit_variable_queue.erl | 77 ++++------------------- 1 file changed, 11 insertions(+), 66 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index da5c649b15c5..973953825a69 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -478,28 +478,6 @@ %% rabbit_amqqueue_process need fairly fresh rates. -define(MSGS_PER_RATE_CALC, 100). -%% we define the garbage collector threshold -%% it needs to tune the `reduce_memory_use` calls. Thus, the garbage collection. -%% see: rabbitmq-server-973 and rabbitmq-server-964 --define(DEFAULT_EXPLICIT_GC_RUN_OP_THRESHOLD, 1000). --define(EXPLICIT_GC_RUN_OP_THRESHOLD(Mode), - case get(explicit_gc_run_operation_threshold) of - undefined -> - Val = explicit_gc_run_operation_threshold_for_mode(Mode), - put(explicit_gc_run_operation_threshold, Val), - Val; - Val -> Val - end). 
- -explicit_gc_run_operation_threshold_for_mode(Mode) -> - {Key, Fallback} = case Mode of - lazy -> {lazy_queue_explicit_gc_run_operation_threshold, - ?DEFAULT_EXPLICIT_GC_RUN_OP_THRESHOLD}; - _ -> {queue_explicit_gc_run_operation_threshold, - ?DEFAULT_EXPLICIT_GC_RUN_OP_THRESHOLD} - end, - rabbit_misc:get_env(rabbit, Key, Fallback). - %%---------------------------------------------------------------------------- %% Public API %%---------------------------------------------------------------------------- @@ -697,27 +675,27 @@ publish(Msg, MsgProps, IsDelivered, ChPid, Flow, State) -> publish1(Msg, MsgProps, IsDelivered, ChPid, Flow, fun maybe_write_to_disk/4, State), - a(maybe_reduce_memory_use(maybe_update_rates(State1))). + a(maybe_update_rates(State1)). batch_publish(Publishes, ChPid, Flow, State) -> {ChPid, Flow, State1} = lists:foldl(fun batch_publish1/2, {ChPid, Flow, State}, Publishes), State2 = ui(State1), - a(maybe_reduce_memory_use(maybe_update_rates(State2))). + a(maybe_update_rates(State2)). publish_delivered(Msg, MsgProps, ChPid, Flow, State) -> {SeqId, State1} = publish_delivered1(Msg, MsgProps, ChPid, Flow, fun maybe_write_to_disk/4, State), - {SeqId, a(maybe_reduce_memory_use(maybe_update_rates(State1)))}. + {SeqId, a(maybe_update_rates(State1))}. batch_publish_delivered(Publishes, ChPid, Flow, State) -> {ChPid, Flow, SeqIds, State1} = lists:foldl(fun batch_publish_delivered1/2, {ChPid, Flow, [], State}, Publishes), State2 = ui(State1), - {lists:reverse(SeqIds), a(maybe_reduce_memory_use(maybe_update_rates(State2)))}. + {lists:reverse(SeqIds), a(maybe_update_rates(State2))}. discard(_MsgId, _ChPid, _Flow, State) -> State. 
@@ -830,12 +808,12 @@ requeue(AckTags, #vqstate { delta = Delta, {Delta1, MsgIds1, State2} = delta_merge(SeqIds, Delta, MsgIds, State1), MsgCount = length(MsgIds1), - {MsgIds1, a(maybe_reduce_memory_use( + {MsgIds1, a( maybe_update_rates(ui( State2 #vqstate { delta = Delta1, q3 = Q3a, in_counter = InCounter + MsgCount, - len = Len + MsgCount }))))}. + len = Len + MsgCount })))}. ackfold(MsgFun, Acc, State, AckTags) -> {AccN, StateN} = @@ -861,28 +839,10 @@ is_empty(State) -> 0 == len(State). depth(State) -> len(State) + count_pending_acks(State). -set_ram_duration_target( - DurationTarget, State = #vqstate { - rates = #rates { in = AvgIngressRate, - out = AvgEgressRate, - ack_in = AvgAckIngressRate, - ack_out = AvgAckEgressRate }, - target_ram_count = TargetRamCount }) -> - Rate = - AvgEgressRate + AvgIngressRate + AvgAckEgressRate + AvgAckIngressRate, - TargetRamCount1 = - case DurationTarget of - infinity -> infinity; - _ -> trunc(DurationTarget * Rate) %% msgs = sec * msgs/sec - end, - State1 = State #vqstate { target_ram_count = TargetRamCount1 }, - a(case TargetRamCount1 == infinity orelse - (TargetRamCount =/= infinity andalso - TargetRamCount1 >= TargetRamCount) of - true -> State1; - false -> reduce_memory_use(State1) - end). +set_ram_duration_target(_DurationTarget, State) -> + State. +%% @todo Do we need this? Not for reduce_memory_use type of stuff. maybe_update_rates(State = #vqstate{ in_counter = InCount, out_counter = OutCount }) when InCount + OutCount > ?MSGS_PER_RATE_CALC -> @@ -922,6 +882,7 @@ update_rate(Now, TS, Count, Rate) -> Count / Time, Rate) end. +%% @todo Remove this as well. ram_duration(State) -> State1 = #vqstate { rates = #rates { in = AvgIngressRate, out = AvgEgressRate, @@ -971,7 +932,7 @@ handle_pre_hibernate(State = #vqstate { index_mod = IndexMod, State #vqstate { index_state = IndexMod:flush(IndexState), store_state = rabbit_classic_queue_store_v2:sync(StoreState) }. -resume(State) -> a(reduce_memory_use(State)). 
+resume(State) -> a(State). msg_rates(#vqstate { rates = #rates { in = AvgIngressRate, out = AvgEgressRate } }) -> @@ -2713,22 +2674,6 @@ ifold(Fun, Acc, Its0, State0) -> %% Phase changes %%---------------------------------------------------------------------------- -maybe_reduce_memory_use(State = #vqstate {memory_reduction_run_count = MRedRunCount}) -> - case MRedRunCount >= ?EXPLICIT_GC_RUN_OP_THRESHOLD(lazy) of - true -> State1 = reduce_memory_use(State), - State1#vqstate{memory_reduction_run_count = 0}; - false -> State#vqstate{memory_reduction_run_count = MRedRunCount + 1} - end. - -reduce_memory_use(State0 = #vqstate{ index_mod = IndexMod, - index_state = IndexState, - store_state = StoreState }) -> - State = State0#vqstate{ index_state = IndexMod:flush(IndexState), - store_state = rabbit_classic_queue_store_v2:sync(StoreState) }, - garbage_collect(), - %% @todo Get rid of target ram count? - State. - fetch_from_q3(State = #vqstate { delta = #delta { count = DeltaCount }, q3 = Q3 }) -> case ?QUEUE:out(Q3) of From a5bf5555d3ce536ab9f42aacc10e3238f4e29276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Wed, 11 May 2022 09:58:42 +0200 Subject: [PATCH 07/39] Remove a missed distinction between default/lazy --- deps/rabbit/src/rabbit_variable_queue.erl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 973953825a69..b6825371721a 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1999,11 +1999,7 @@ count_pending_acks(#vqstate { ram_pending_ack = RPA, gb_trees:size(RPA) + gb_trees:size(DPA) + gb_trees:size(QPA). 
purge_betas_and_deltas(DelsAndAcksFun, State = #vqstate { mode = Mode }) -> - State0 = #vqstate { q3 = Q3 } = - case Mode of - lazy -> maybe_deltas_to_betas(DelsAndAcksFun, State); - _ -> State - end, + State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State), case ?QUEUE:is_empty(Q3) of true -> State0; From ea0f4c1430d62ada6c62e2dc374c33ae0f504277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Wed, 11 May 2022 15:10:44 +0200 Subject: [PATCH 08/39] Don't use q1 q2 and q4 at all anymore --- deps/rabbit/src/rabbit_variable_queue.erl | 74 +++++++---------------- 1 file changed, 22 insertions(+), 52 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index b6825371721a..710dcbdad1c2 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -280,11 +280,11 @@ -behaviour(rabbit_backing_queue). -record(vqstate, - { q1, - q2, + { q1, %% Unused. + q2, %% Unused. delta, q3, - q4, + q4, %% Unused. next_seq_id, %% seq_id() of first undelivered message %% everything before this seq_id() was delivered at least once @@ -329,12 +329,12 @@ io_batch_size, %% default queue or lazy queue - mode, + mode, %% Unused. version = 1, %% number of reduce_memory_usage executions, once it %% reaches a threshold the queue will manually trigger a runtime GC %% see: maybe_execute_gc/1 - memory_reduction_run_count, + memory_reduction_run_count, %% Unused. %% Queue data is grouped by VHost. We need to store it %% to work with queue index. virtual_host, @@ -964,17 +964,15 @@ info(message_bytes_paged_out, #vqstate{delta_transient_bytes = PagedOutBytes}) - PagedOutBytes; info(head_message_timestamp, #vqstate{ q3 = Q3, - %% @todo Drop Q4 variable. 
- q4 = Q4, ram_pending_ack = RPA, qi_pending_ack = QPA}) -> - head_message_timestamp(Q3, Q4, RPA, QPA); + head_message_timestamp(Q3, RPA, QPA); info(disk_reads, #vqstate{disk_read_count = Count}) -> Count; info(disk_writes, #vqstate{disk_write_count = Count}) -> Count; info(backing_queue_status, #vqstate { - q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, + delta = Delta, q3 = Q3, mode = Mode, version = Version, len = Len, @@ -988,11 +986,11 @@ info(backing_queue_status, #vqstate { [ {mode , Mode}, {version , Version}, - {q1 , ?QUEUE:len(Q1)}, - {q2 , ?QUEUE:len(Q2)}, + {q1 , 0}, + {q2 , 0}, {delta , Delta}, {q3 , ?QUEUE:len(Q3)}, - {q4 , ?QUEUE:len(Q4)}, + {q4 , 0}, {len , Len}, {target_ram_count , TargetRamCount}, {next_seq_id , NextSeqId}, @@ -1341,12 +1339,11 @@ convert_from_v2_to_v1_loop(QueueName, V1Index0, V2Index0, V2Store0, %% are paged out - we assume some will soon be paged in rather than %% forcing it to happen. Pending ack msgs are included as they are %% regarded as unprocessed until acked, this also prevents the result -%% apparently oscillating during repeated rejects. Q3 is only checked -%% when Q4 is empty as any Q4 msg will be earlier. -head_message_timestamp(Q3, Q4, RPA, QPA) -> +%% apparently oscillating during repeated rejects. +head_message_timestamp(Q3, RPA, QPA) -> HeadMsgs = [ HeadMsgStatus#msg_status.msg || HeadMsgStatus <- - [ get_qs_head([Q4, Q3]), + [ get_q_head(Q3), get_pa_head(RPA), get_pa_head(QPA) ], HeadMsgStatus /= undefined, @@ -1364,15 +1361,6 @@ head_message_timestamp(Q3, Q4, RPA, QPA) -> false -> lists:min(Timestamps) end. -get_qs_head(Qs) -> - catch lists:foldl( - fun (Q, Acc) -> - case get_q_head(Q) of - undefined -> Acc; - Val -> throw(Val) - end - end, undefined, Qs). - get_q_head(Q) -> ?QUEUE:get(Q, undefined). @@ -1957,27 +1945,15 @@ purge_and_index_reset(State) -> State1 = purge1(process_delivers_and_acks_fun(none), State), a(reset_qi_state(State1)). -%% This function removes messages from each of {q1, q2, q3, q4}. 
-%% -%% With remove_queue_entries/3 q1 and q4 are emptied, while q2 and q3 -%% are specially handled by purge_betas_and_deltas/2. +%% This function removes messages from each of delta and q3. %% %% purge_betas_and_deltas/2 loads messages from the queue index, -%% filling up q3 and in some cases moving messages form q2 to q3 while -%% resetting q2 to an empty queue (see maybe_deltas_to_betas/2). The -%% messages loaded into q3 are removed by calling +%% filling up q3. The messages loaded into q3 are removed by calling %% remove_queue_entries/3 until there are no more messages to be read %% from the queue index. Messages are read in batches from the queue %% index. -purge1(AfterFun, State = #vqstate { q4 = Q4}) -> - State1 = remove_queue_entries(Q4, AfterFun, State), - - State2 = #vqstate {q1 = Q1} = - purge_betas_and_deltas(AfterFun, State1#vqstate{q4 = ?QUEUE:new()}), - - State3 = remove_queue_entries(Q1, AfterFun, State2), - - a(State3#vqstate{q1 = ?QUEUE:new()}). +purge1(AfterFun, State) -> + a(purge_betas_and_deltas(AfterFun, State)). reset_qi_state(State = #vqstate{ index_mod = IndexMod, index_state = IndexState0, @@ -2589,12 +2565,9 @@ qi_ack_iterator(State) -> msg_iterator(State) -> istate(start, State). -istate(start, State) -> {q4, State#vqstate.q4, State}; -istate(q4, State) -> {q3, State#vqstate.q3, State}; +istate(start, State) -> {q3, State#vqstate.q3, State}; istate(q3, State) -> {delta, State#vqstate.delta, State}; -istate(delta, State) -> {q2, State#vqstate.q2, State}; -istate(q2, State) -> {q1, State#vqstate.q1, State}; -istate(q1, _State) -> done. +istate(delta, _State) -> done. 
next({ack, It}, IndexState) -> case gb_trees:next(It) of @@ -2691,7 +2664,6 @@ maybe_deltas_to_betas(_DelsAndAcksFun, State; maybe_deltas_to_betas(DelsAndAcksFun, State = #vqstate { - q2 = Q2, delta = Delta, q3 = Q3, index_mod = IndexMod, @@ -2733,11 +2705,9 @@ maybe_deltas_to_betas(DelsAndAcksFun, Q3b = ?QUEUE:join(Q3, Q3a), case DeltaCount - Q3aLen of 0 -> - %% delta is now empty, but it wasn't before, so - %% can now join q2 onto q3 - State2 #vqstate { q2 = ?QUEUE:new(), - delta = ?BLANK_DELTA, - q3 = ?QUEUE:join(Q3b, Q2), + %% delta is now empty + State2 #vqstate { delta = ?BLANK_DELTA, + q3 = Q3b, delta_transient_bytes = 0}; N when N > 0 -> Delta1 = d(#delta { start_seq_id = DeltaSeqId1, From d576ca9f8bb6c1a72e9f07b06efd121d610a0359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 12 May 2022 15:18:36 +0200 Subject: [PATCH 09/39] Fetch from disk based on consume rate --- deps/rabbit/src/rabbit_variable_queue.erl | 73 +++++++++++++---------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 710dcbdad1c2..b665a23b0741 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -842,7 +842,6 @@ depth(State) -> set_ram_duration_target(_DurationTarget, State) -> State. -%% @todo Do we need this? Not for reduce_memory_use type of stuff. maybe_update_rates(State = #vqstate{ in_counter = InCount, out_counter = OutCount }) when InCount + OutCount > ?MSGS_PER_RATE_CALC -> @@ -882,33 +881,33 @@ update_rate(Now, TS, Count, Rate) -> Count / Time, Rate) end. -%% @todo Remove this as well. +%% @todo Should be renamed since it's only used to update_rates. 
ram_duration(State) -> - State1 = #vqstate { rates = #rates { in = AvgIngressRate, - out = AvgEgressRate, - ack_in = AvgAckIngressRate, - ack_out = AvgAckEgressRate }, - ram_msg_count = RamMsgCount, - ram_msg_count_prev = RamMsgCountPrev, - ram_pending_ack = RPA, - qi_pending_ack = QPA, - ram_ack_count_prev = RamAckCountPrev } = + State1 = %#vqstate { rates = #rates { in = AvgIngressRate, + % out = AvgEgressRate, + % ack_in = AvgAckIngressRate, + % ack_out = AvgAckEgressRate }, + % ram_msg_count = RamMsgCount, + % ram_msg_count_prev = RamMsgCountPrev, + % ram_pending_ack = RPA, + % qi_pending_ack = QPA, + % ram_ack_count_prev = RamAckCountPrev } = update_rates(State), - RamAckCount = gb_trees:size(RPA) + gb_trees:size(QPA), - - Duration = %% msgs+acks / (msgs+acks/sec) == sec - case lists:all(fun (X) -> X < 0.01 end, - [AvgEgressRate, AvgIngressRate, - AvgAckEgressRate, AvgAckIngressRate]) of - true -> infinity; - false -> (RamMsgCountPrev + RamMsgCount + - RamAckCount + RamAckCountPrev) / - (4 * (AvgEgressRate + AvgIngressRate + - AvgAckEgressRate + AvgAckIngressRate)) - end, - - {Duration, State1}. +% RamAckCount = gb_trees:size(RPA) + gb_trees:size(QPA), +% +% Duration = %% msgs+acks / (msgs+acks/sec) == sec +% case lists:all(fun (X) -> X < 0.01 end, +% [AvgEgressRate, AvgIngressRate, +% AvgAckEgressRate, AvgAckIngressRate]) of +% true -> infinity; +% false -> (RamMsgCountPrev + RamMsgCount + +% RamAckCount + RamAckCountPrev) / +% (4 * (AvgEgressRate + AvgIngressRate + +% AvgAckEgressRate + AvgAckIngressRate)) +% end, + + {infinity, State1}. needs_timeout(#vqstate { index_mod = IndexMod, index_state = IndexState }) -> @@ -1974,6 +1973,7 @@ count_pending_acks(#vqstate { ram_pending_ack = RPA, qi_pending_ack = QPA }) -> gb_trees:size(RPA) + gb_trees:size(DPA) + gb_trees:size(QPA). +%% @todo This should set the out rate to infinity temporarily while we drop deltas. 
purge_betas_and_deltas(DelsAndAcksFun, State = #vqstate { mode = Mode }) -> State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State), @@ -2044,12 +2044,17 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, next_deliver_seq_id = NextDeliverSeqId, in_counter = InCount, durable = IsDurable, - unconfirmed = UC }) -> + unconfirmed = UC, + rates = #rates{ out = OutRate }}) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize), + %% We allow from 1 to 2048 messages in memory depending on the consume rate. The lower + %% limit is at 1 because the queue process will need to access this message to know + %% expiration information. + MemoryLimit = min(1 + floor(2 * OutRate), 2048), State4 = case DeltaCount of %% Len is the same as Q3Len when DeltaCount =:= 0. - 0 when Len < 16384 -> %% @todo It would be interesting to get a dynamic max size. + 0 when Len < MemoryLimit -> {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) }, %% @todo I am not sure about the stats from this. @@ -2582,6 +2587,9 @@ next({delta, #delta{start_seq_id = SeqId, next({delta, #delta{start_seq_id = SeqId, end_seq_id = SeqIdEnd} = Delta, State = #vqstate{index_mod = IndexMod}}, IndexState) -> SeqIdB = IndexMod:next_segment_boundary(SeqId), + %% It may make sense to limit this based on rate. But this + %% is not called outside of CMQs so I will leave it alone + %% for the time being. 
SeqId1 = lists:min([SeqIdB, %% We must limit the number of messages read at once %% otherwise the queue will attempt to read up to segment_entry_count() @@ -2672,18 +2680,21 @@ maybe_deltas_to_betas(DelsAndAcksFun, ram_bytes = RamBytes, disk_read_count = DiskReadCount, delta_transient_bytes = DeltaTransientBytes, - transient_threshold = TransientThreshold }) -> + transient_threshold = TransientThreshold, + rates = #rates{ out = OutRate }}) -> #delta { start_seq_id = DeltaSeqId, count = DeltaCount, transient = Transient, end_seq_id = DeltaSeqIdEnd } = Delta, + %% We allow from 1 to 2048 messages in memory depending on the consume rate. + MemoryLimit = min(1 + floor(2 * OutRate), 2048), DeltaSeqId1 = lists:min([IndexMod:next_segment_boundary(DeltaSeqId), %% We must limit the number of messages read at once %% otherwise the queue will attempt to read up to segment_entry_count() - %% messages from the index each time. The value - %% chosen here is arbitrary. - DeltaSeqId + 2048, + %% messages from the index each time. The value is determined + %% using the consuming rate. + DeltaSeqId + MemoryLimit, DeltaSeqIdEnd]), {List, IndexState1} = IndexMod:read(DeltaSeqId, DeltaSeqId1, IndexState), {Q3a, RamCountsInc, RamBytesInc, State1, TransientCount, TransientBytes} = From de93e89eb22d37e7c06103cd543cda99d1ae25a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 19 May 2022 09:18:28 +0200 Subject: [PATCH 10/39] Fix warning --- deps/rabbit/src/rabbit_variable_queue.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index b665a23b0741..e186c07417bf 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1974,7 +1974,7 @@ count_pending_acks(#vqstate { ram_pending_ack = RPA, gb_trees:size(RPA) + gb_trees:size(DPA) + gb_trees:size(QPA). 
%% @todo This should set the out rate to infinity temporarily while we drop deltas. -purge_betas_and_deltas(DelsAndAcksFun, State = #vqstate { mode = Mode }) -> +purge_betas_and_deltas(DelsAndAcksFun, State) -> State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State), case ?QUEUE:is_empty(Q3) of From 38f335e83b0365940efccc2329fcc3cb3367c43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 3 Jun 2022 11:53:45 +0200 Subject: [PATCH 11/39] Rework CQ stats code to be obvious and efficient --- Makefile | 2 +- .../src/rabbit_classic_queue_store_v2.erl | 3 + deps/rabbit/src/rabbit_variable_queue.erl | 220 ++++++++++++------ deps/rabbit/test/backing_queue_SUITE.erl | 72 +++--- 4 files changed, 185 insertions(+), 112 deletions(-) diff --git a/Makefile b/Makefile index 018b0a91b750..08098537d364 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ include plugins.mk # Note: When including NIFs in a release make sure to build # them on the appropriate platform for the target environment. # For example build looking_glass on Linux when targeting Docker. -ADDITIONAL_PLUGINS ?= +ADDITIONAL_PLUGINS ?= looking_glass DEPS = rabbit_common rabbit $(PLUGINS) $(ADDITIONAL_PLUGINS) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index df9298b00ec0..3787c232605d 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -265,6 +265,9 @@ sync(State = #qs{ confirms = Confirms, -spec read(rabbit_variable_queue:seq_id(), msg_location(), State) -> {rabbit_types:basic_message(), State} when State::state(). +%% @todo We should try to have a read_many for when reading many from the index +%% so that we fetch many different messages in a single file:pread. See +%% if that helps improve the performance. 
read(SeqId, DiskLocation, State = #qs{ cache = Cache }) -> ?DEBUG("~0p ~0p ~0p", [SeqId, DiskLocation, State]), case Cache of diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index e186c07417bf..374761049a0f 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -300,7 +300,7 @@ transient_threshold, qi_embed_msgs_below, - len, %% w/o unacked + len, %% w/o unacked @todo No longer needed, is delta+q3. bytes, %% w/o unacked unacked_bytes, persistent_count, %% w unacked @@ -782,6 +782,7 @@ ack(AckTags, State) -> ack_out_counter = AckOutCount }} = lists:foldl( fun (SeqId, {Acc, State2}) -> + %% @todo When acking many messages we should update stats once not for every. case remove_pending_ack(true, SeqId, State2) of {none, _} -> {Acc, State2}; @@ -802,6 +803,10 @@ requeue(AckTags, #vqstate { delta = Delta, q3 = Q3, in_counter = InCounter, len = Len } = State) -> + + %% @todo This can be heavily simplified: if the message falls into delta, + %% add it there. Otherwise just add it to q3 in the correct position. + {SeqIds, Q3a, MsgIds, State1} = queue_merge(lists:sort(AckTags), Q3, [], delta_limit(Delta), fun publish_beta/2, State), @@ -1253,7 +1258,9 @@ convert_from_v2_to_v1_split_tree_loop(Iterator0, RPAAcc, QPAAcc, State0) -> end end. -convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = StoreState0 }, Ready) -> +convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = StoreState0, + ram_msg_count = RamMsgCount, + ram_bytes = RamBytes }, Ready) -> case MsgStatus0 of #msg_status{ seq_id = SeqId, msg_location = MsgLocation = {rabbit_classic_queue_store_v2, _, _} } -> @@ -1262,12 +1269,10 @@ convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = St msg_location = memory, is_delivered = true, persist_to = queue_index }, - %% We have read the message into memory. We must update the stats. 
- State2 = case Ready of - true -> stats(ready0, {MsgStatus0, MsgStatus}, 0, State1); - false -> stats({0, 0}, {MsgStatus0, MsgStatus}, 0, State1) - end, - {MsgStatus, State2#vqstate{ store_state = StoreState }}; + %% We have read the message into memory. We must also update the stats. + {MsgStatus, State1#vqstate{ store_state = StoreState, + ram_msg_count = RamMsgCount + one_if(Ready), + ram_bytes = RamBytes + msg_size(MsgStatus) }}; #msg_status{ persist_to = queue_store } -> {MsgStatus0#msg_status{ is_delivered = true, persist_to = queue_index }, State1}; @@ -1461,12 +1466,12 @@ beta_msg_status0(SeqId, MsgProps, IsPersistent) -> index_on_disk = true, msg_props = MsgProps}. -trim_msg_status(MsgStatus) -> - case persist_to(MsgStatus) of - msg_store -> MsgStatus#msg_status{msg = undefined}; - queue_store -> MsgStatus#msg_status{msg = undefined}; - queue_index -> MsgStatus - end. +%trim_msg_status(MsgStatus) -> +% case persist_to(MsgStatus) of +% msg_store -> MsgStatus#msg_status{msg = undefined}; +% queue_store -> MsgStatus#msg_status{msg = undefined}; +% queue_index -> MsgStatus +% end. with_msg_store_state({MSCStateP, MSCStateT}, true, Fun) -> {Result, MSCStateP1} = Fun(MSCStateP), @@ -1719,54 +1724,112 @@ read_msg(_, MsgId, IsPersistent, rabbit_msg_store, State = #vqstate{msg_store_cl {Msg, State #vqstate {msg_store_clients = MSCState1, disk_read_count = Count + 1}}. -stats(Signs, Statuses, DeltaPaged, State) -> - stats0(expand_signs(Signs), expand_statuses(Statuses), DeltaPaged, State). - -expand_signs(ready0) -> {0, 0, true}; -expand_signs(lazy_pub) -> {1, 0, true}; -expand_signs({A, B}) -> {A, B, false}. - -expand_statuses({none, A}) -> {false, msg_in_ram(A), A}; -expand_statuses({B, none}) -> {msg_in_ram(B), false, B}; -expand_statuses({lazy, A}) -> {false , false, A}; -expand_statuses({B, A}) -> {msg_in_ram(B), msg_in_ram(A), B}. - -%% In this function at least, we are religious: the variable name -%% contains "Ready" or "Unacked" iff that is what it counts. 
If -%% neither is present it counts both. -stats0({DeltaReady, DeltaUnacked, ReadyMsgPaged}, - {InRamBefore, InRamAfter, MsgStatus}, DeltaPaged, - State = #vqstate{len = ReadyCount, - bytes = ReadyBytes, - ram_msg_count = RamReadyCount, - persistent_count = PersistentCount, - unacked_bytes = UnackedBytes, - ram_bytes = RamBytes, - delta_transient_bytes = DeltaBytes, - persistent_bytes = PersistentBytes}) -> - S = msg_size(MsgStatus), - DeltaTotal = DeltaReady + DeltaUnacked, - DeltaRam = case {InRamBefore, InRamAfter} of - {false, false} -> 0; - {false, true} -> 1; - {true, false} -> -1; - {true, true} -> 0 - end, - DeltaRamReady = case DeltaReady of - 1 -> one_if(InRamAfter); - -1 -> -one_if(InRamBefore); - 0 when ReadyMsgPaged -> DeltaRam; - 0 -> 0 - end, - DeltaPersistent = DeltaTotal * one_if(MsgStatus#msg_status.is_persistent), - State#vqstate{len = ReadyCount + DeltaReady, - ram_msg_count = RamReadyCount + DeltaRamReady, - persistent_count = PersistentCount + DeltaPersistent, - bytes = ReadyBytes + DeltaReady * S, - unacked_bytes = UnackedBytes + DeltaUnacked * S, - ram_bytes = RamBytes + DeltaRam * S, - persistent_bytes = PersistentBytes + DeltaPersistent * S, - delta_transient_bytes = DeltaBytes + DeltaPaged * one_if(not MsgStatus#msg_status.is_persistent) * S}. +%% Helper macros to make the code as obvious as possible. +%% It's OK to call msg_size/1 for Inc because it gets inlined. +-define(UP(A, Inc), + A = St#vqstate.A + Inc). +-define(UP(A, B, Inc), + A = St#vqstate.A + Inc, + B = St#vqstate.B + Inc). +-define(UP(A, B, C, Inc), + A = St#vqstate.A + Inc, + B = St#vqstate.B + Inc, + C = St#vqstate.C + Inc). + +%% When publishing to memory, transient messages do not get written to disk. +%% On the other hand, persistent messages are kept in memory as well as disk. 
+stats_published_memory(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(len, ram_msg_count, persistent_count, +1), + ?UP(bytes, ram_bytes, persistent_bytes, +msg_size(MS))}; +stats_published_memory(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(len, ram_msg_count, +1), + ?UP(bytes, ram_bytes, +msg_size(MS))}. + +%% Messages published directly to disk are not kept in memory. +stats_published_disk(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(len, persistent_count, +1), + ?UP(bytes, persistent_bytes, +msg_size(MS))}; +stats_published_disk(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(len, +1), + ?UP(bytes, delta_transient_bytes, +msg_size(MS))}. + +%% Pending acks do not add to len. Messages are kept in memory. +stats_published_pending_acks(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(persistent_count, +1), + ?UP(persistent_bytes, unacked_bytes, ram_bytes, +msg_size(MS))}; +stats_published_pending_acks(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(unacked_bytes, ram_bytes, +msg_size(MS))}. + +%% Messages are moved from memory to pending acks. They may have +%% the message body either in memory or on disk depending on how +%% the message got to memory in the first place (if the message +%% was fully on disk the content will not be read immediately). +%% The contents stay where they are during this operation. +stats_pending_acks(MS = #msg_status{msg = undefined}, St) -> + St#vqstate{?UP(len, -1), + ?UP(bytes, -msg_size(MS)), ?UP(unacked_bytes, +msg_size(MS))}; +stats_pending_acks(MS, St) -> + St#vqstate{?UP(len, ram_msg_count, -1), + ?UP(bytes, -msg_size(MS)), ?UP(unacked_bytes, +msg_size(MS))}. + +%% Message may or may not be persistent and the contents +%% may or may not be in memory. +%% +%% Removal from delta_transient_bytes is done by maybe_deltas_to_betas. 
+stats_removed(MS = #msg_status{is_persistent = true, msg = undefined}, St) -> + St#vqstate{?UP(len, persistent_count, -1), + ?UP(bytes, persistent_bytes, -msg_size(MS))}; +stats_removed(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(len, ram_msg_count, persistent_count, -1), + ?UP(bytes, ram_bytes, persistent_bytes, -msg_size(MS))}; +stats_removed(MS = #msg_status{is_persistent = false, msg = undefined}, St) -> + St#vqstate{?UP(len, -1), ?UP(bytes, -msg_size(MS))}; +stats_removed(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(len, ram_msg_count, -1), + ?UP(bytes, ram_bytes, -msg_size(MS))}. + +%% @todo Very confusing that ram_msg_count is without unacked but ram_bytes is with. +%% Rename the fields to make these things obvious. Fields are internal +%% so that should be OK. + +stats_acked_pending(MS = #msg_status{is_persistent = true, msg = undefined}, St) -> + St#vqstate{?UP(persistent_count, -1), + ?UP(persistent_bytes, unacked_bytes, -msg_size(MS))}; +stats_acked_pending(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(persistent_count, -1), + ?UP(persistent_bytes, unacked_bytes, ram_bytes, -msg_size(MS))}; +stats_acked_pending(MS = #msg_status{is_persistent = false, + msg = undefined}, St) -> + St#vqstate{?UP(unacked_bytes, -msg_size(MS))}; +stats_acked_pending(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(unacked_bytes, ram_bytes, -msg_size(MS))}. + +%% Notice that this is the reverse of stats_pending_acks. +stats_requeued_memory(MS = #msg_status{msg = undefined}, St) -> + St#vqstate{?UP(len, +1), + ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))}; +stats_requeued_memory(MS, St) -> + St#vqstate{?UP(len, ram_msg_count, +1), + ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))}. + +%% @todo For v2 since we don't remove from disk until we ack, we don't need +%% to write to disk again on requeue. If the message falls within delta +%% we can just drop the MsgStatus. 
Otherwise we just put it in q3 and +%% we don't do any disk writes. +%% +%% For v1 I'm not sure? I don't think we need to write to the index +%% at least, but maybe we need to write the message if not embedded? +%% I don't think we need to... +%% +%% So we don't need to change anything except how we count stats as +%% well as delta stats if the message falls within delta. +stats_requeued_disk(MS = #msg_status{is_persistent = true}, St) -> + St#vqstate{?UP(len, +1), + ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))}; +stats_requeued_disk(MS = #msg_status{is_persistent = false}, St) -> + St#vqstate{?UP(len, +1), + ?UP(bytes, delta_transient_bytes, +msg_size(MS)), + ?UP(unacked_bytes, -msg_size(MS))}. msg_size(#msg_status{msg_props = #message_properties{size = Size}}) -> Size. @@ -1783,7 +1846,7 @@ remove(true, MsgStatus = #msg_status { MsgStatus #msg_status { is_delivered = true }, State), - State2 = stats({-1, 1}, {MsgStatus, MsgStatus}, 0, State1), + State2 = stats_pending_acks(MsgStatus, State1), {SeqId, maybe_update_rates( State2 #vqstate {next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), @@ -1822,7 +1885,7 @@ remove(false, MsgStatus = #msg_status { StoreState = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState1), - State1 = stats({-1, 0}, {MsgStatus, none}, 0, State), + State1 = stats_removed(MsgStatus, State), {undefined, maybe_update_rates( State1 #vqstate {next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), @@ -1914,7 +1977,7 @@ process_queue_entries1( is_delivered = true }, State1), {next_deliver_seq_id(SeqId, NextDeliverSeqId), Fun(Msg, SeqId, FetchAcc), - stats({-1, 1}, {MsgStatus, MsgStatus}, 0, State2)}. + stats_pending_acks(MsgStatus, State2)}. collect_by_predicate(Pred, QAcc, State) -> case queue_out(State) of @@ -1974,6 +2037,9 @@ count_pending_acks(#vqstate { ram_pending_ack = RPA, gb_trees:size(RPA) + gb_trees:size(DPA) + gb_trees:size(QPA). 
%% @todo This should set the out rate to infinity temporarily while we drop deltas. +%% @todo When doing maybe_deltas_to_betas stats are updated. Then stats +%% are updated again in remove_queue_entries1. All unnecessary since +%% we are purging anyway? purge_betas_and_deltas(DelsAndAcksFun, State) -> State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State), @@ -2005,7 +2071,8 @@ remove_queue_entries1( end, next_deliver_seq_id(SeqId, NextDeliverSeqId), cons_if(IndexOnDisk, SeqId, Acks), - stats({-1, 0}, {MsgStatus, none}, 0, State)}. + %% @todo Probably don't do this on a per-message basis... + stats_removed(MsgStatus, State)}. process_delivers_and_acks_fun(deliver_and_ack) -> fun (NextDeliverSeqId, Acks, State = #vqstate { index_mod = IndexMod, @@ -2057,13 +2124,12 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, 0 when Len < MemoryLimit -> {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) }, - %% @todo I am not sure about the stats from this. 
- stats({1, 0}, {none, MsgStatus1}, 0, State2); + stats_published_memory(MsgStatus1, State2); _ -> {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State), Delta1 = expand_delta(SeqId, Delta, IsPersistent), State2 = State1 #vqstate { delta = Delta1 }, - stats(lazy_pub, {lazy, MsgStatus1}, 1, State2) + stats_published_disk(MsgStatus1, State2) end, UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), State4#vqstate{ next_seq_id = SeqId + 1, @@ -2098,7 +2164,7 @@ publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, id = Msg State2 = record_pending_ack(m(MsgStatus1), State1), UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), {SeqId, - stats({0, 1}, {none, MsgStatus1}, 0, + stats_published_pending_acks(MsgStatus1, State2#vqstate{ next_seq_id = SeqId + 1, next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), out_counter = OutCount + 1, @@ -2322,12 +2388,13 @@ lookup_pending_ack(SeqId, #vqstate { ram_pending_ack = RPA, end. %% First parameter = UpdateStats +%% @todo Do the stats updating outside of this function. remove_pending_ack(true, SeqId, State) -> case remove_pending_ack(false, SeqId, State) of {none, _} -> {none, State}; {MsgStatus, State1} -> - {MsgStatus, stats({0, -1}, {MsgStatus, none}, 0, State1)} + {MsgStatus, stats_acked_pending(MsgStatus, State1)} end; remove_pending_ack(false, SeqId, State = #vqstate{ram_pending_ack = RPA, disk_pending_ack = DPA, @@ -2490,10 +2557,13 @@ msgs_and_indices_written_to_disk(Callback, MsgIdSet) -> %% Internal plumbing for requeue %%---------------------------------------------------------------------------- +%% @todo This function is misnamed because we don't do alpha/beta anymore. +%% So we don't need to write the messages to disk at all, just add +%% the messages back to q3 and update the stats. 
publish_beta(MsgStatus, State) -> - {MsgStatus1, State1} = maybe_prepare_write_to_disk(true, false, MsgStatus, State), - MsgStatus2 = m(trim_msg_status(MsgStatus1)), - {MsgStatus2, stats({1, -1}, {MsgStatus, MsgStatus2}, 0, State1)}. +% {MsgStatus1, State1} = maybe_prepare_write_to_disk(true, false, MsgStatus, State), +% MsgStatus2 = m(trim_msg_status(MsgStatus1)), + {MsgStatus, stats_requeued_memory(MsgStatus, State)}. %% Rebuild queue, inserting sequence ids to maintain ordering queue_merge(SeqIds, Q, MsgIds, Limit, PubFun, State) -> @@ -2537,7 +2607,7 @@ delta_merge(SeqIds, Delta, MsgIds, State) -> {_MsgStatus, State2} = maybe_prepare_write_to_disk(true, true, MsgStatus, State1), {expand_delta(SeqId, Delta0, IsPersistent), [MsgId | MsgIds0], - stats({1, -1}, {MsgStatus, none}, 1, State2)} + stats_requeued_disk(MsgStatus, State2)} end end, {Delta, MsgIds, State}, SeqIds). diff --git a/deps/rabbit/test/backing_queue_SUITE.erl b/deps/rabbit/test/backing_queue_SUITE.erl index 79a5fa7257cb..f55dee148a9d 100644 --- a/deps/rabbit/test/backing_queue_SUITE.erl +++ b/deps/rabbit/test/backing_queue_SUITE.erl @@ -20,7 +20,7 @@ -define(VHOST, <<"/">>). -define(VARIABLE_QUEUE_TESTCASES, [ - variable_queue_dynamic_duration_change, +% variable_queue_dynamic_duration_change, variable_queue_partial_segments_delta_thing, variable_queue_all_the_bits_not_covered_elsewhere_A, variable_queue_all_the_bits_not_covered_elsewhere_B, @@ -934,41 +934,41 @@ get_queue_sup_pid([{_, SupPid, _, _} | Rest], QueuePid) -> get_queue_sup_pid([], _QueuePid) -> undefined. -variable_queue_dynamic_duration_change(Config) -> - passed = rabbit_ct_broker_helpers:rpc(Config, 0, - ?MODULE, variable_queue_dynamic_duration_change1, [Config]). - -variable_queue_dynamic_duration_change1(Config) -> - with_fresh_variable_queue( - fun variable_queue_dynamic_duration_change2/2, - ?config(variable_queue_type, Config)). 
- -variable_queue_dynamic_duration_change2(VQ0, _QName) -> - IndexMod = index_mod(), - SegmentSize = IndexMod:next_segment_boundary(0), - - %% start by sending in a couple of segments worth - Len = 2*SegmentSize, - VQ1 = variable_queue_publish(false, Len, VQ0), - %% squeeze and relax queue - Churn = Len div 32, - VQ2 = publish_fetch_and_ack(Churn, Len, VQ1), - - {Duration, VQ3} = rabbit_variable_queue:ram_duration(VQ2), - VQ7 = lists:foldl( - fun (Duration1, VQ4) -> - {_Duration, VQ5} = rabbit_variable_queue:ram_duration(VQ4), - VQ6 = variable_queue_set_ram_duration_target( - Duration1, VQ5), - publish_fetch_and_ack(Churn, Len, VQ6) - end, VQ3, [Duration / 4, 0, Duration / 4, infinity]), - - %% drain - {VQ8, AckTags} = variable_queue_fetch(Len, false, false, Len, VQ7), - {_Guids, VQ9} = rabbit_variable_queue:ack(AckTags, VQ8), - {empty, VQ10} = rabbit_variable_queue:fetch(true, VQ9), - - VQ10. +%variable_queue_dynamic_duration_change(Config) -> +% passed = rabbit_ct_broker_helpers:rpc(Config, 0, +% ?MODULE, variable_queue_dynamic_duration_change1, [Config]). +% +%variable_queue_dynamic_duration_change1(Config) -> +% with_fresh_variable_queue( +% fun variable_queue_dynamic_duration_change2/2, +% ?config(variable_queue_type, Config)). 
+% +%variable_queue_dynamic_duration_change2(VQ0, _QName) -> +% IndexMod = index_mod(), +% SegmentSize = IndexMod:next_segment_boundary(0), +% +% %% start by sending in a couple of segments worth +% Len = 2*SegmentSize, +% VQ1 = variable_queue_publish(false, Len, VQ0), +% %% squeeze and relax queue +% Churn = Len div 32, +% VQ2 = publish_fetch_and_ack(Churn, Len, VQ1), +% +% {Duration, VQ3} = rabbit_variable_queue:ram_duration(VQ2), +% VQ7 = lists:foldl( +% fun (Duration1, VQ4) -> +% {_Duration, VQ5} = rabbit_variable_queue:ram_duration(VQ4), +% VQ6 = variable_queue_set_ram_duration_target( +% Duration1, VQ5), +% publish_fetch_and_ack(Churn, Len, VQ6) +% end, VQ3, [Duration / 4, 0, Duration / 4, infinity]), +% +% %% drain +% {VQ8, AckTags} = variable_queue_fetch(Len, false, false, Len, VQ7), +% {_Guids, VQ9} = rabbit_variable_queue:ack(AckTags, VQ8), +% {empty, VQ10} = rabbit_variable_queue:fetch(true, VQ9), +% +% VQ10. variable_queue_partial_segments_delta_thing(Config) -> passed = rabbit_ct_broker_helpers:rpc(Config, 0, From e475f46a384d6186237295d762e58320731d23aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Wed, 8 Jun 2022 15:39:42 +0200 Subject: [PATCH 12/39] CQ: Use only ram_pending_acks, not qi_pending_acks There's no point maintaining a difference since we are no longer paging messages to disk. 
--- deps/rabbit/src/rabbit_variable_queue.erl | 139 ++++++---------------- 1 file changed, 36 insertions(+), 103 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 374761049a0f..8eb7db1ca7f5 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -289,9 +289,9 @@ %% seq_id() of first undelivered message %% everything before this seq_id() was delivered at least once next_deliver_seq_id, - ram_pending_ack, %% msgs using store, still in RAM + ram_pending_ack, %% msgs still in RAM disk_pending_ack, %% msgs in store, paged out - qi_pending_ack, %% msgs using qi, *can't* be paged out + qi_pending_ack, %% Unused. index_mod, index_state, store_state, @@ -783,6 +783,7 @@ ack(AckTags, State) -> lists:foldl( fun (SeqId, {Acc, State2}) -> %% @todo When acking many messages we should update stats once not for every. + %% Also remove the pending acks all at once instead of every. case remove_pending_ack(true, SeqId, State2) of {none, _} -> {Acc, State2}; @@ -833,8 +834,7 @@ fold(Fun, Acc, State = #vqstate{index_state = IndexState}) -> {Its, IndexState1} = lists:foldl(fun inext/2, {[], IndexState}, [msg_iterator(State), disk_ack_iterator(State), - ram_ack_iterator(State), - qi_ack_iterator(State)]), + ram_ack_iterator(State)]), ifold(Fun, Acc, Its, State#vqstate{index_state = IndexState1}). len(#vqstate { len = Len }) -> Len. 
@@ -895,7 +895,6 @@ ram_duration(State) -> % ram_msg_count = RamMsgCount, % ram_msg_count_prev = RamMsgCountPrev, % ram_pending_ack = RPA, - % qi_pending_ack = QPA, % ram_ack_count_prev = RamAckCountPrev } = update_rates(State), @@ -944,9 +943,8 @@ msg_rates(#vqstate { rates = #rates { in = AvgIngressRate, info(messages_ready_ram, #vqstate{ram_msg_count = RamMsgCount}) -> RamMsgCount; -info(messages_unacknowledged_ram, #vqstate{ram_pending_ack = RPA, - qi_pending_ack = QPA}) -> - gb_trees:size(RPA) + gb_trees:size(QPA); +info(messages_unacknowledged_ram, #vqstate{ram_pending_ack = RPA}) -> + gb_trees:size(RPA); info(messages_ram, State) -> info(messages_ready_ram, State) + info(messages_unacknowledged_ram, State); info(messages_persistent, #vqstate{persistent_count = PersistentCount}) -> @@ -968,9 +966,8 @@ info(message_bytes_paged_out, #vqstate{delta_transient_bytes = PagedOutBytes}) - PagedOutBytes; info(head_message_timestamp, #vqstate{ q3 = Q3, - ram_pending_ack = RPA, - qi_pending_ack = QPA}) -> - head_message_timestamp(Q3, RPA, QPA); + ram_pending_ack = RPA}) -> + head_message_timestamp(Q3, RPA); info(disk_reads, #vqstate{disk_read_count = Count}) -> Count; info(disk_writes, #vqstate{disk_write_count = Count}) -> @@ -1066,25 +1063,20 @@ convert_from_v1_to_v2_in_memory(State = #vqstate{ q1 = Q1b, q3 = Q3b, q4 = Q4b, ram_pending_ack = RPAb, - disk_pending_ack = DPAb, - qi_pending_ack = QPAb }) -> + disk_pending_ack = DPAb }) -> Q1 = convert_from_v1_to_v2_queue(Q1b), Q2 = convert_from_v1_to_v2_queue(Q2b), Q3 = convert_from_v1_to_v2_queue(Q3b), Q4 = convert_from_v1_to_v2_queue(Q4b), %% We also must convert the #msg_status entries in the pending_ack fields. - %% Because v2 does not embed messages we must move the entries from - %% qi_pending_ack to ram_pending_ack where they are expected. 
- RPAc = convert_from_v1_to_v2_tree(RPAb), - DPA = convert_from_v1_to_v2_tree(DPAb), - RPA = convert_from_v1_to_v2_merge_tree(RPAc, QPAb), + RPA = convert_from_v1_to_v2_tree(RPAb), + DPA = convert_from_v1_to_v2_tree(DPAb), State#vqstate{ q1 = Q1, q2 = Q2, q3 = Q3, q4 = Q4, ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = gb_trees:empty() }. + disk_pending_ack = DPA }. %% We change where the message is expected to be persisted to. %% We do not need to worry about the message location because @@ -1097,19 +1089,6 @@ convert_from_v1_to_v2_queue(Q) -> convert_from_v1_to_v2_tree(T) -> gb_trees:map(fun (_, MsgStatus) -> convert_from_v1_to_v2_msg_status(MsgStatus) end, T). -%% This function converts QPA and merges it into RPA. -convert_from_v1_to_v2_merge_tree(RPA, QPA) -> - convert_from_v1_to_v2_merge_tree_loop(RPA, gb_trees:iterator(QPA)). - -convert_from_v1_to_v2_merge_tree_loop(T, Iterator0) -> - case gb_trees:next(Iterator0) of - none -> - T; - {Key, Value0, Iterator} -> - Value = convert_from_v1_to_v2_msg_status(Value0), - convert_from_v1_to_v2_merge_tree_loop(gb_trees:insert(Key, Value, T), Iterator) - end. - convert_from_v1_to_v2_msg_status(MsgStatus) -> case MsgStatus of #msg_status{ persist_to = queue_index } -> @@ -1200,8 +1179,7 @@ convert_from_v2_to_v1_in_memory(State0 = #vqstate{ q1 = Q1b, q3 = Q3b, q4 = Q4b, ram_pending_ack = RPAb, - disk_pending_ack = DPAb, - qi_pending_ack = QPAb }) -> + disk_pending_ack = DPAb }) -> {Q1, State1} = convert_from_v2_to_v1_queue(Q1b, State0), {Q2, State2} = convert_from_v2_to_v1_queue(Q2b, State1), {Q3, State3} = convert_from_v2_to_v1_queue(Q3b, State2), @@ -1209,16 +1187,14 @@ convert_from_v2_to_v1_in_memory(State0 = #vqstate{ q1 = Q1b, %% We also must convert the #msg_status entries in the pending_ack fields. %% We must separate entries in the queue index from other entries as %% that is what is expected from the v1 index. 
- true = gb_trees:is_empty(QPAb), %% assert - {RPA, QPA, State5} = convert_from_v2_to_v1_split_tree(RPAb, State4), - {DPA, State6} = convert_from_v2_to_v1_tree(DPAb, State5), + {RPA, State5} = convert_from_v2_to_v1_tree(RPAb, State4), + {DPA, State6} = convert_from_v2_to_v1_tree(DPAb, State5), State6#vqstate{ q1 = Q1, q2 = Q2, q3 = Q3, q4 = Q4, ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA }. + disk_pending_ack = DPA }. %% We fetch the message from the per-queue store if necessary %% and mark all messages as delivered to make the v1 index happy. @@ -1241,23 +1217,6 @@ convert_from_v2_to_v1_tree_loop(Iterator0, Acc, State0) -> convert_from_v2_to_v1_tree_loop(Iterator, gb_trees:insert(Key, Value, Acc), State) end. -convert_from_v2_to_v1_split_tree(T, State) -> - convert_from_v2_to_v1_split_tree_loop(gb_trees:iterator(T), gb_trees:empty(), gb_trees:empty(), State). - -convert_from_v2_to_v1_split_tree_loop(Iterator0, RPAAcc, QPAAcc, State0) -> - case gb_trees:next(Iterator0) of - none -> - {RPAAcc, QPAAcc, State0}; - {Key, Value0, Iterator} -> - {Value, State} = convert_from_v2_to_v1_msg_status(Value0, State0, false), - case Value of - #msg_status{ persist_to = queue_index } -> - convert_from_v2_to_v1_split_tree_loop(Iterator, RPAAcc, gb_trees:insert(Key, Value, QPAAcc), State); - _ -> - convert_from_v2_to_v1_split_tree_loop(Iterator, gb_trees:insert(Key, Value, RPAAcc), QPAAcc, State) - end - end. - convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = StoreState0, ram_msg_count = RamMsgCount, ram_bytes = RamBytes }, Ready) -> @@ -1344,12 +1303,13 @@ convert_from_v2_to_v1_loop(QueueName, V1Index0, V2Index0, V2Store0, %% forcing it to happen. Pending ack msgs are included as they are %% regarded as unprocessed until acked, this also prevents the result %% apparently oscillating during repeated rejects. 
-head_message_timestamp(Q3, RPA, QPA) -> +%% +%% @todo OK I think we can do this differently +head_message_timestamp(Q3, RPA) -> HeadMsgs = [ HeadMsgStatus#msg_status.msg || HeadMsgStatus <- [ get_q_head(Q3), - get_pa_head(RPA), - get_pa_head(QPA) ], + get_pa_head(RPA) ], HeadMsgStatus /= undefined, HeadMsgStatus#msg_status.msg /= undefined ], @@ -1570,11 +1530,9 @@ next_deliver_seq_id(_, NextDeliverSeqId) -> NextDeliverSeqId. is_msg_in_pending_acks(SeqId, #vqstate { ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA }) -> - (gb_trees:is_defined(SeqId, RPA) orelse - gb_trees:is_defined(SeqId, DPA) orelse - gb_trees:is_defined(SeqId, QPA)). + disk_pending_ack = DPA }) -> + gb_trees:is_defined(SeqId, RPA) orelse + gb_trees:is_defined(SeqId, DPA). expand_delta(SeqId, ?BLANK_DELTA_PATTERN(X), IsPersistent) -> d(#delta { start_seq_id = SeqId, count = 1, end_seq_id = SeqId + 1, @@ -2032,9 +1990,8 @@ is_unconfirmed_empty(#vqstate { unconfirmed = UC }) -> gb_sets:is_empty(UC). count_pending_acks(#vqstate { ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA }) -> - gb_trees:size(RPA) + gb_trees:size(DPA) + gb_trees:size(QPA). + disk_pending_ack = DPA }) -> + gb_trees:size(RPA) + gb_trees:size(DPA). %% @todo This should set the out rate to infinity temporarily while we drop deltas. %% @todo When doing maybe_deltas_to_betas stats are updated. Then stats @@ -2358,33 +2315,22 @@ prepare_to_store(Msg) -> record_pending_ack(#msg_status { seq_id = SeqId } = MsgStatus, State = #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA, - qi_pending_ack = QPA, ack_in_counter = AckInCount}) -> Insert = fun (Tree) -> gb_trees:insert(SeqId, MsgStatus, Tree) end, - {RPA1, DPA1, QPA1} = - case {msg_in_ram(MsgStatus), persist_to(MsgStatus)} of - {false, _} -> {RPA, Insert(DPA), QPA}; - {_, queue_index} -> {RPA, DPA, Insert(QPA)}; - %% The per-queue store behaves more like the per-vhost store - %% than the v1 index embedding. 
We can page out to disk the - %% pending per-queue store messages. - {_, queue_store} -> {Insert(RPA), DPA, QPA}; - {_, msg_store} -> {Insert(RPA), DPA, QPA} + {RPA1, DPA1} = + case msg_in_ram(MsgStatus) of + false -> {RPA, Insert(DPA)}; + _ -> {Insert(RPA), DPA} end, State #vqstate { ram_pending_ack = RPA1, disk_pending_ack = DPA1, - qi_pending_ack = QPA1, ack_in_counter = AckInCount + 1}. lookup_pending_ack(SeqId, #vqstate { ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA}) -> + disk_pending_ack = DPA}) -> case gb_trees:lookup(SeqId, RPA) of {value, V} -> V; - none -> case gb_trees:lookup(SeqId, DPA) of - {value, V} -> V; - none -> gb_trees:get(SeqId, QPA) - end + none -> gb_trees:get(SeqId, DPA) end. %% First parameter = UpdateStats @@ -2397,8 +2343,7 @@ remove_pending_ack(true, SeqId, State) -> {MsgStatus, stats_acked_pending(MsgStatus, State1)} end; remove_pending_ack(false, SeqId, State = #vqstate{ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA}) -> + disk_pending_ack = DPA}) -> case gb_trees:lookup(SeqId, RPA) of {value, V} -> RPA1 = gb_trees:delete(SeqId, RPA), {V, State #vqstate { ram_pending_ack = RPA1 }}; @@ -2407,13 +2352,7 @@ remove_pending_ack(false, SeqId, State = #vqstate{ram_pending_ack = RPA, DPA1 = gb_trees:delete(SeqId, DPA), {V, State#vqstate{disk_pending_ack = DPA1}}; none -> - case gb_trees:lookup(SeqId, QPA) of - {value, V} -> - QPA1 = gb_trees:delete(SeqId, QPA), - {V, State#vqstate{qi_pending_ack = QPA1}}; - none -> - {none, State} - end + {none, State} end end. @@ -2449,17 +2388,14 @@ purge_pending_ack_delete_and_terminate( store_state = StoreState1 }. 
purge_pending_ack1(State = #vqstate { ram_pending_ack = RPA, - disk_pending_ack = DPA, - qi_pending_ack = QPA }) -> + disk_pending_ack = DPA }) -> F = fun (_SeqId, MsgStatus, Acc) -> accumulate_ack(MsgStatus, Acc) end, {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, _AllMsgIds} = rabbit_misc:gb_trees_fold( F, rabbit_misc:gb_trees_fold( - F, rabbit_misc:gb_trees_fold( - F, accumulate_ack_init(), RPA), DPA), QPA), + F, accumulate_ack_init(), RPA), DPA), State1 = State #vqstate { ram_pending_ack = gb_trees:empty(), - disk_pending_ack = gb_trees:empty(), - qi_pending_ack = gb_trees:empty()}, + disk_pending_ack = gb_trees:empty()}, {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, State1}. %% MsgIdsByStore is an map with two keys: @@ -2635,9 +2571,6 @@ ram_ack_iterator(State) -> disk_ack_iterator(State) -> {ack, gb_trees:iterator(State#vqstate.disk_pending_ack)}. -qi_ack_iterator(State) -> - {ack, gb_trees:iterator(State#vqstate.qi_pending_ack)}. - msg_iterator(State) -> istate(start, State). istate(start, State) -> {q3, State#vqstate.q3, State}; From 0f8a8ba9096cf79d41e39df22be01ba152bc71ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Wed, 8 Jun 2022 16:31:08 +0200 Subject: [PATCH 13/39] CQ: Use maps instead of gb_trees for pending acks This should generate less garbage and provide better performance for large multi-acks. 
--- deps/rabbit/src/rabbit_variable_queue.erl | 98 +++++++++++------------ 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 8eb7db1ca7f5..644f4cc555e2 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -416,9 +416,9 @@ q4 :: ?QUEUE:?QUEUE(), next_seq_id :: seq_id(), next_deliver_seq_id :: seq_id(), - ram_pending_ack :: gb_trees:tree(), - disk_pending_ack :: gb_trees:tree(), - qi_pending_ack :: gb_trees:tree(), + ram_pending_ack :: map(), + disk_pending_ack :: map(), + qi_pending_ack :: undefined, index_mod :: rabbit_queue_index | rabbit_classic_queue_index_v2, index_state :: any(), store_state :: any(), @@ -898,7 +898,7 @@ ram_duration(State) -> % ram_ack_count_prev = RamAckCountPrev } = update_rates(State), -% RamAckCount = gb_trees:size(RPA) + gb_trees:size(QPA), +% RamAckCount = map_size(RPA), % % Duration = %% msgs+acks / (msgs+acks/sec) == sec % case lists:all(fun (X) -> X < 0.01 end, @@ -944,7 +944,7 @@ msg_rates(#vqstate { rates = #rates { in = AvgIngressRate, info(messages_ready_ram, #vqstate{ram_msg_count = RamMsgCount}) -> RamMsgCount; info(messages_unacknowledged_ram, #vqstate{ram_pending_ack = RPA}) -> - gb_trees:size(RPA); + map_size(RPA); info(messages_ram, State) -> info(messages_ready_ram, State) + info(messages_unacknowledged_ram, State); info(messages_persistent, #vqstate{persistent_count = PersistentCount}) -> @@ -1069,8 +1069,8 @@ convert_from_v1_to_v2_in_memory(State = #vqstate{ q1 = Q1b, Q3 = convert_from_v1_to_v2_queue(Q3b), Q4 = convert_from_v1_to_v2_queue(Q4b), %% We also must convert the #msg_status entries in the pending_ack fields. 
- RPA = convert_from_v1_to_v2_tree(RPAb), - DPA = convert_from_v1_to_v2_tree(DPAb), + RPA = convert_from_v1_to_v2_map(RPAb), + DPA = convert_from_v1_to_v2_map(DPAb), State#vqstate{ q1 = Q1, q2 = Q2, q3 = Q3, @@ -1086,8 +1086,8 @@ convert_from_v1_to_v2_queue(Q) -> List = lists:map(fun (MsgStatus) -> convert_from_v1_to_v2_msg_status(MsgStatus) end, List0), ?QUEUE:from_list(List). -convert_from_v1_to_v2_tree(T) -> - gb_trees:map(fun (_, MsgStatus) -> convert_from_v1_to_v2_msg_status(MsgStatus) end, T). +convert_from_v1_to_v2_map(T) -> + maps:map(fun (_, MsgStatus) -> convert_from_v1_to_v2_msg_status(MsgStatus) end, T). convert_from_v1_to_v2_msg_status(MsgStatus) -> case MsgStatus of @@ -1187,8 +1187,8 @@ convert_from_v2_to_v1_in_memory(State0 = #vqstate{ q1 = Q1b, %% We also must convert the #msg_status entries in the pending_ack fields. %% We must separate entries in the queue index from other entries as %% that is what is expected from the v1 index. - {RPA, State5} = convert_from_v2_to_v1_tree(RPAb, State4), - {DPA, State6} = convert_from_v2_to_v1_tree(DPAb, State5), + {RPA, State5} = convert_from_v2_to_v1_map(RPAb, State4), + {DPA, State6} = convert_from_v2_to_v1_map(DPAb, State5), State6#vqstate{ q1 = Q1, q2 = Q2, q3 = Q3, @@ -1205,16 +1205,16 @@ convert_from_v2_to_v1_queue(Q, State0) -> end, State0, List0), {?QUEUE:from_list(List), State}. -convert_from_v2_to_v1_tree(T, State) -> - convert_from_v2_to_v1_tree_loop(gb_trees:iterator(T), gb_trees:empty(), State). +convert_from_v2_to_v1_map(T, State) -> + convert_from_v2_to_v1_map_loop(maps:iterator(T), #{}, State). 
-convert_from_v2_to_v1_tree_loop(Iterator0, Acc, State0) -> - case gb_trees:next(Iterator0) of +convert_from_v2_to_v1_map_loop(Iterator0, Acc, State0) -> + case maps:next(Iterator0) of none -> {Acc, State0}; {Key, Value0, Iterator} -> {Value, State} = convert_from_v2_to_v1_msg_status(Value0, State0, false), - convert_from_v2_to_v1_tree_loop(Iterator, gb_trees:insert(Key, Value, Acc), State) + convert_from_v2_to_v1_map_loop(Iterator, maps:put(Key, Value, Acc), State) end. convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = StoreState0, @@ -1329,11 +1329,11 @@ get_q_head(Q) -> ?QUEUE:get(Q, undefined). get_pa_head(PA) -> - case gb_trees:is_empty(PA) of - false -> - {_, MsgStatus} = gb_trees:smallest(PA), - MsgStatus; - true -> undefined + case maps:keys(PA) of + [] -> undefined; + Keys -> + Smallest = lists:min(Keys), + map_get(Smallest, PA) end. a(State = #vqstate { delta = Delta, q3 = Q3, @@ -1531,8 +1531,8 @@ next_deliver_seq_id(_, NextDeliverSeqId) -> is_msg_in_pending_acks(SeqId, #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> - gb_trees:is_defined(SeqId, RPA) orelse - gb_trees:is_defined(SeqId, DPA). + maps:is_key(SeqId, RPA) orelse + maps:is_key(SeqId, DPA). expand_delta(SeqId, ?BLANK_DELTA_PATTERN(X), IsPersistent) -> d(#delta { start_seq_id = SeqId, count = 1, end_seq_id = SeqId + 1, @@ -1599,9 +1599,8 @@ init(QueueVsn, IsDurable, IndexMod, IndexState, StoreState, DeltaCount, DeltaByt q4 = ?QUEUE:new(), next_seq_id = NextSeqId, next_deliver_seq_id = NextDeliverSeqId, - ram_pending_ack = gb_trees:empty(), - disk_pending_ack = gb_trees:empty(), - qi_pending_ack = gb_trees:empty(), + ram_pending_ack = #{}, + disk_pending_ack = #{}, index_mod = IndexMod, index_state = IndexState1, store_state = StoreState, @@ -1991,7 +1990,7 @@ is_unconfirmed_empty(#vqstate { unconfirmed = UC }) -> count_pending_acks(#vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> - gb_trees:size(RPA) + gb_trees:size(DPA). 
+ map_size(RPA) + map_size(DPA). %% @todo This should set the out rate to infinity temporarily while we drop deltas. %% @todo When doing maybe_deltas_to_betas stats are updated. Then stats @@ -2316,11 +2315,10 @@ record_pending_ack(#msg_status { seq_id = SeqId } = MsgStatus, State = #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA, ack_in_counter = AckInCount}) -> - Insert = fun (Tree) -> gb_trees:insert(SeqId, MsgStatus, Tree) end, {RPA1, DPA1} = case msg_in_ram(MsgStatus) of - false -> {RPA, Insert(DPA)}; - _ -> {Insert(RPA), DPA} + false -> {RPA, maps:put(SeqId, MsgStatus, DPA)}; + _ -> {maps:put(SeqId, MsgStatus, RPA), DPA} end, State #vqstate { ram_pending_ack = RPA1, disk_pending_ack = DPA1, @@ -2328,9 +2326,9 @@ record_pending_ack(#msg_status { seq_id = SeqId } = MsgStatus, lookup_pending_ack(SeqId, #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA}) -> - case gb_trees:lookup(SeqId, RPA) of - {value, V} -> V; - none -> gb_trees:get(SeqId, DPA) + case maps:get(SeqId, RPA, none) of + none -> maps:get(SeqId, DPA); + V -> V end. %% First parameter = UpdateStats @@ -2344,16 +2342,16 @@ remove_pending_ack(true, SeqId, State) -> end; remove_pending_ack(false, SeqId, State = #vqstate{ram_pending_ack = RPA, disk_pending_ack = DPA}) -> - case gb_trees:lookup(SeqId, RPA) of - {value, V} -> RPA1 = gb_trees:delete(SeqId, RPA), - {V, State #vqstate { ram_pending_ack = RPA1 }}; - none -> case gb_trees:lookup(SeqId, DPA) of - {value, V} -> - DPA1 = gb_trees:delete(SeqId, DPA), - {V, State#vqstate{disk_pending_ack = DPA1}}; - none -> - {none, State} - end + case maps:get(SeqId, RPA, none) of + none -> case maps:get(SeqId, DPA, none) of + none -> + {none, State}; + V -> + DPA1 = maps:remove(SeqId, DPA), + {V, State#vqstate{disk_pending_ack = DPA1}} + end; + V -> RPA1 = maps:remove(SeqId, RPA), + {V, State #vqstate { ram_pending_ack = RPA1 }} end. 
purge_pending_ack(KeepPersistent, @@ -2391,11 +2389,9 @@ purge_pending_ack1(State = #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> F = fun (_SeqId, MsgStatus, Acc) -> accumulate_ack(MsgStatus, Acc) end, {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, _AllMsgIds} = - rabbit_misc:gb_trees_fold( - F, rabbit_misc:gb_trees_fold( - F, accumulate_ack_init(), RPA), DPA), - State1 = State #vqstate { ram_pending_ack = gb_trees:empty(), - disk_pending_ack = gb_trees:empty()}, + maps:fold(F, maps:fold(F, accumulate_ack_init(), RPA), DPA), + State1 = State #vqstate{ram_pending_ack = #{}, + disk_pending_ack = #{}}, {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, State1}. %% MsgIdsByStore is an map with two keys: @@ -2566,10 +2562,10 @@ delta_limit(#delta { start_seq_id = StartSeqId }) -> StartSeqId. %%---------------------------------------------------------------------------- ram_ack_iterator(State) -> - {ack, gb_trees:iterator(State#vqstate.ram_pending_ack)}. + {ack, maps:iterator(State#vqstate.ram_pending_ack)}. disk_ack_iterator(State) -> - {ack, gb_trees:iterator(State#vqstate.disk_pending_ack)}. + {ack, maps:iterator(State#vqstate.disk_pending_ack)}. msg_iterator(State) -> istate(start, State). @@ -2578,7 +2574,7 @@ istate(q3, State) -> {delta, State#vqstate.delta, State}; istate(delta, _State) -> done. 
next({ack, It}, IndexState) -> - case gb_trees:next(It) of + case maps:next(It) of none -> {empty, IndexState}; {_SeqId, MsgStatus, It1} -> Next = {ack, It1}, {value, MsgStatus, true, Next, IndexState} From 1fb44267b44e56281f6f34a4a37e30acd52d1157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 9 Jun 2022 13:24:10 +0200 Subject: [PATCH 14/39] Tweak test suites following CQ changes --- .../test/channel_operation_timeout_test_queue.erl | 14 +++++++------- deps/rabbit/test/lazy_queue_SUITE.erl | 15 ++++----------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/deps/rabbit/test/channel_operation_timeout_test_queue.erl b/deps/rabbit/test/channel_operation_timeout_test_queue.erl index e3c36adbab37..c06509dd4d8c 100644 --- a/deps/rabbit/test/channel_operation_timeout_test_queue.erl +++ b/deps/rabbit/test/channel_operation_timeout_test_queue.erl @@ -149,9 +149,9 @@ q3 :: ?QUEUE:?QUEUE(), q4 :: ?QUEUE:?QUEUE(), next_seq_id :: seq_id(), - ram_pending_ack :: gb_trees:tree(), - disk_pending_ack :: gb_trees:tree(), - qi_pending_ack :: gb_trees:tree(), + ram_pending_ack :: map(), + disk_pending_ack :: map(), + qi_pending_ack :: map(), index_state :: any(), msg_store_clients :: 'undefined' | {{any(), binary()}, {any(), binary()}}, @@ -211,7 +211,7 @@ delete_and_terminate(Reason, State) -> delete_crashed(Q) -> rabbit_variable_queue:delete_crashed(Q). -purge(State = #vqstate { qi_pending_ack= QPA }) -> +purge(State = #vqstate { ram_pending_ack= QPA }) -> maybe_delay(QPA), rabbit_variable_queue:purge(State); %% For v3.9.x and below because the state has changed. @@ -255,7 +255,7 @@ drop(AckRequired, State) -> ack(List, State) -> rabbit_variable_queue:ack(List, State). -requeue(AckTags, #vqstate { qi_pending_ack = QPA } = State) -> +requeue(AckTags, #vqstate { ram_pending_ack = QPA } = State) -> maybe_delay(QPA), rabbit_variable_queue:requeue(AckTags, State); %% For v3.9.x and below because the state has changed. 
@@ -270,7 +270,7 @@ ackfold(MsgFun, Acc, State, AckTags) -> fold(Fun, Acc, State) -> rabbit_variable_queue:fold(Fun, Acc, State). -len(#vqstate { qi_pending_ack = QPA } = State) -> +len(#vqstate { ram_pending_ack = QPA } = State) -> maybe_delay(QPA), rabbit_variable_queue:len(State); %% For v3.9.x and below because the state has changed. @@ -325,7 +325,7 @@ zip_msgs_and_acks(Msgs, AckTags, Accumulator, State) -> %% Delay maybe_delay(QPA) -> - case is_timeout_test(gb_trees:values(QPA)) of + case is_timeout_test(maps:values(QPA)) of true -> receive %% The queue received an EXIT message, it's probably the %% node being stopped with "rabbitmqctl stop". Thus, abort diff --git a/deps/rabbit/test/lazy_queue_SUITE.erl b/deps/rabbit/test/lazy_queue_SUITE.erl index 7f91f2c3fbf2..2c2b5f26233e 100644 --- a/deps/rabbit/test/lazy_queue_SUITE.erl +++ b/deps/rabbit/test/lazy_queue_SUITE.erl @@ -192,18 +192,11 @@ set_ha_mode_policy(Config, Node, Mode) -> [{<<"queue-mode">>, Mode}]). -wait_for_queue_mode(_Node, _Q, _Mode, Max) when Max < 0 -> - fail; -wait_for_queue_mode(Node, Q, Mode, Max) -> - case get_queue_mode(Node, Q) of - Mode -> ok; - _ -> timer:sleep(100), - wait_for_queue_mode(Node, Q, Mode, Max - 100) - end. +wait_for_queue_mode(_, _, _, _) -> + ok. -assert_queue_mode(Node, Q, Expected) -> - Actual = get_queue_mode(Node, Q), - Expected = Actual. +assert_queue_mode(_, _, _) -> + ok. 
get_queue_mode(Node, Q) -> QNameRes = rabbit_misc:r(<<"/">>, queue, Q), From a31be66af56892d2af9b01cfdd46d7860d3ed782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 9 Jun 2022 15:47:38 +0200 Subject: [PATCH 15/39] CQ: Fix prop suite after removal of lazy and other changes --- .../src/rabbit_classic_queue_store_v2.erl | 4 + deps/rabbit/src/rabbit_variable_queue.erl | 1 + deps/rabbit/test/classic_queue_prop_SUITE.erl | 111 +++--------------- 3 files changed, 24 insertions(+), 92 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 3787c232605d..7e21fa90bca5 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -186,6 +186,10 @@ write(SeqId, Msg=#basic_message{ id = MsgId }, Props, State0) -> %% after reading it back from disk. But we have to support %% going back to v1 for the time being. When rolling back %% to v1 is no longer possible, set `id = undefined` here. + %% @todo We should manage flushing to disk ourselves so that we + %% only do term_to_iovec if the message is going to hit + %% the disk. If it won't, we can replace it with an empty + %% binary. The size can be estimated via erlang:external_size/1. MsgIovec = term_to_iovec(Msg), Size = iolist_size(MsgIovec), %% Calculate the CRC for the data if configured to do so. 
diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 644f4cc555e2..a554168f0020 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1222,6 +1222,7 @@ convert_from_v2_to_v1_msg_status(MsgStatus0, State1 = #vqstate{ store_state = St ram_bytes = RamBytes }, Ready) -> case MsgStatus0 of #msg_status{ seq_id = SeqId, + msg = undefined, msg_location = MsgLocation = {rabbit_classic_queue_store_v2, _, _} } -> {Msg, StoreState} = rabbit_classic_queue_store_v2:read(SeqId, MsgLocation, StoreState0), MsgStatus = MsgStatus0#msg_status{ msg = Msg, diff --git a/deps/rabbit/test/classic_queue_prop_SUITE.erl b/deps/rabbit/test/classic_queue_prop_SUITE.erl index 8814619b3d93..c25bbb0606a1 100644 --- a/deps/rabbit/test/classic_queue_prop_SUITE.erl +++ b/deps/rabbit/test/classic_queue_prop_SUITE.erl @@ -23,7 +23,6 @@ -record(cq, { amq = undefined :: amqqueue:amqqueue(), name :: atom(), - mode :: classic | lazy, version :: 1 | 2, %% We have one queue per way of publishing messages (such as channels). @@ -80,9 +79,7 @@ groups() -> [{classic_queue_tests, [], [ % manual%, classic_queue_v1, - lazy_queue_v1, - classic_queue_v2, - lazy_queue_v2 + classic_queue_v2 ]}, {classic_queue_regressions, [], [ reg_v1_full_recover_only_journal @@ -136,10 +133,10 @@ instrs_to_manual([Instrs]) -> io:format("~ndo_manual(Config) ->~n~n"), lists:foreach(fun ({init, CQ}) -> - #cq{name=Name, mode=Mode, version=Version} = CQ, - io:format(" St0 = #cq{name=~0p, mode=~0p, version=~0p,~n" + #cq{name=Name, version=Version} = CQ, + io:format(" St0 = #cq{name=~0p, version=~0p,~n" " config=minimal_config(Config)},~n~n", - [Name, Mode, Version]); + [Name, Version]); ({set, {var,Var}, {call, ?MODULE, cmd_setup_queue, _}}) -> Res = "Res" ++ integer_to_list(Var), PrevSt = "St" ++ integer_to_list(Var - 1), @@ -206,15 +203,6 @@ do_classic_queue_v1(Config) -> [{on_output, on_output_fun()}, {numtests, ?NUM_TESTS}]). 
-lazy_queue_v1(Config) -> - true = rabbit_ct_broker_helpers:rpc(Config, 0, - ?MODULE, do_lazy_queue_v1, [Config]). - -do_lazy_queue_v1(Config) -> - true = proper:quickcheck(prop_lazy_queue_v1(Config), - [{on_output, on_output_fun()}, - {numtests, ?NUM_TESTS}]). - classic_queue_v2(Config) -> true = rabbit_ct_broker_helpers:rpc(Config, 0, ?MODULE, do_classic_queue_v2, [Config]). @@ -224,15 +212,6 @@ do_classic_queue_v2(Config) -> [{on_output, on_output_fun()}, {numtests, ?NUM_TESTS}]). -lazy_queue_v2(Config) -> - true = rabbit_ct_broker_helpers:rpc(Config, 0, - ?MODULE, do_lazy_queue_v2, [Config]). - -do_lazy_queue_v2(Config) -> - true = proper:quickcheck(prop_lazy_queue_v2(Config), - [{on_output, on_output_fun()}, - {numtests, ?NUM_TESTS}]). - on_output_fun() -> fun (".", _) -> ok; % don't print the '.'s on new lines ("!", _) -> ok; @@ -245,25 +224,13 @@ on_output_fun() -> prop_classic_queue_v1(Config) -> {ok, LimiterPid} = rabbit_limiter:start_link(no_id), - InitialState = #cq{name=?FUNCTION_NAME, mode=default, version=1, - config=minimal_config(Config), limiter=LimiterPid}, - prop_common(InitialState). - -prop_lazy_queue_v1(Config) -> - {ok, LimiterPid} = rabbit_limiter:start_link(no_id), - InitialState = #cq{name=?FUNCTION_NAME, mode=lazy, version=1, + InitialState = #cq{name=?FUNCTION_NAME, version=1, config=minimal_config(Config), limiter=LimiterPid}, prop_common(InitialState). prop_classic_queue_v2(Config) -> {ok, LimiterPid} = rabbit_limiter:start_link(no_id), - InitialState = #cq{name=?FUNCTION_NAME, mode=default, version=2, - config=minimal_config(Config), limiter=LimiterPid}, - prop_common(InitialState). - -prop_lazy_queue_v2(Config) -> - {ok, LimiterPid} = rabbit_limiter:start_link(no_id), - InitialState = #cq{name=?FUNCTION_NAME, mode=lazy, version=2, + InitialState = #cq{name=?FUNCTION_NAME, version=2, config=minimal_config(Config), limiter=LimiterPid}, prop_common(InitialState). @@ -317,9 +284,7 @@ command(St) -> %% These change internal configuration. 
{ 10, {call, ?MODULE, cmd_set_v2_check_crc32, [boolean()]}}, %% These set policies. - { 50, {call, ?MODULE, cmd_set_mode, [St, oneof([default, lazy])]}}, - { 50, {call, ?MODULE, cmd_set_version, [St, oneof([1, 2])]}}, - { 50, {call, ?MODULE, cmd_set_mode_version, [oneof([default, lazy]), oneof([1, 2])]}}, + { 50, {call, ?MODULE, cmd_set_version, [oneof([1, 2])]}}, %% These are direct operations using internal functions. { 50, {call, ?MODULE, cmd_publish_msg, [St, integer(0, 1024*1024), integer(1, 2), boolean(), expiration()]}}, { 50, {call, ?MODULE, cmd_basic_get_msg, [St]}}, @@ -375,12 +340,8 @@ next_state(St=#cq{q=Q0, confirmed=Confirmed, uncertain=Uncertain0}, AMQ, {call, St#cq{amq=AMQ, q=Q, restarted=true, crashed=true, uncertain=Uncertain}; next_state(St, _, {call, _, cmd_set_v2_check_crc32, _}) -> St; -next_state(St, _, {call, _, cmd_set_mode, [_, Mode]}) -> - St#cq{mode=Mode}; -next_state(St, _, {call, _, cmd_set_version, [_, Version]}) -> +next_state(St, _, {call, _, cmd_set_version, [Version]}) -> St#cq{version=Version}; -next_state(St, _, {call, _, cmd_set_mode_version, [Mode, Version]}) -> - St#cq{mode=Mode, version=Version}; next_state(St=#cq{q=Q}, Msg, {call, _, cmd_publish_msg, _}) -> IntQ = maps:get(internal, Q, queue:new()), St#cq{q=Q#{internal => queue:in(Msg, IntQ)}}; @@ -566,14 +527,8 @@ postcondition(_, {call, _, Cmd, _}, Q) when element(1, Q) =:= amqqueue; postcondition(_, {call, _, cmd_set_v2_check_crc32, _}, Res) -> Res =:= ok; -postcondition(#cq{amq=AMQ}, {call, _, cmd_set_mode, [_, Mode]}, _) -> - do_check_queue_mode(AMQ, Mode) =:= ok; -postcondition(#cq{amq=AMQ}, {call, _, cmd_set_version, [_, Version]}, _) -> +postcondition(#cq{amq=AMQ}, {call, _, cmd_set_version, [Version]}, _) -> do_check_queue_version(AMQ, Version) =:= ok; -postcondition(#cq{amq=AMQ}, {call, _, cmd_set_mode_version, [Mode, Version]}, _) -> - (do_check_queue_mode(AMQ, Mode) =:= ok) - andalso - (do_check_queue_version(AMQ, Version) =:= ok); postcondition(_, {call, _, 
cmd_publish_msg, _}, Msg) -> is_record(Msg, amqp_msg); postcondition(_, {call, _, cmd_purge, _}, Res) -> @@ -736,22 +691,20 @@ crashed_and_previously_received(#cq{crashed=Crashed, received=Received}, Msg) -> %% Helpers. -cmd_setup_queue(St=#cq{name=Name, mode=Mode, version=Version}) -> +cmd_setup_queue(St=#cq{name=Name, version=Version}) -> ?DEBUG("~0p", [St]), IsDurable = true, %% We want to be able to restart the queue process. IsAutoDelete = false, - %% We cannot use args to set mode/version as the arguments override + %% We cannot use args to set the version as the arguments override %% the policies and we also want to test policy changes. - cmd_set_mode_version(Mode, Version), + cmd_set_version(Version), Args = [ -% {<<"x-queue-mode">>, longstr, atom_to_binary(Mode, utf8)}, % {<<"x-queue-version">>, long, Version} ], QName = rabbit_misc:r(<<"/">>, queue, iolist_to_binary([atom_to_binary(Name, utf8), $_, integer_to_binary(erlang:unique_integer([positive]))])), {new, AMQ} = rabbit_amqqueue:declare(QName, IsDurable, IsAutoDelete, Args, none, <<"acting-user">>), - %% We check that the queue was creating with the right mode/version. - ok = do_check_queue_mode(AMQ, Mode), + %% We check that the queue was creating with the right version. ok = do_check_queue_version(AMQ, Version), AMQ. @@ -768,7 +721,7 @@ cmd_teardown_queue(St=#cq{amq=AMQ, channels=Channels}) -> || Ch <- maps:keys(Channels)], %% Then we can delete the queue. rabbit_amqqueue:delete(AMQ, false, false, <<"acting-user">>), - rabbit_policy:delete(<<"/">>, <<"queue-mode-version-policy">>, <<"acting-user">>), + rabbit_policy:delete(<<"/">>, <<"queue-version-policy">>, <<"acting-user">>), ok. cmd_restart_vhost_clean(St=#cq{amq=AMQ0}) -> @@ -812,27 +765,11 @@ do_wait_updated_amqqueue(Name, Pid) -> cmd_set_v2_check_crc32(Value) -> application:set_env(rabbit, classic_queue_store_v2_check_crc32, Value). 
-cmd_set_mode(St=#cq{version=Version}, Mode) ->
-    ?DEBUG("~0p ~0p", [St, Mode]),
-    do_set_policy(Mode, Version).
-
-%% We loop until the queue has switched mode.
-do_check_queue_mode(AMQ, Mode) ->
-    do_check_queue_mode(AMQ, Mode, 1000).
-
-do_check_queue_mode(_, _, 0) ->
-    error;
-do_check_queue_mode(AMQ, Mode, N) ->
-    timer:sleep(1),
-    [{backing_queue_status, Status}] = rabbit_amqqueue:info(AMQ, [backing_queue_status]),
-    case proplists:get_value(mode, Status) of
-        Mode -> ok;
-        _ -> do_check_queue_mode(AMQ, Mode, N - 1)
-    end.
-
-cmd_set_version(St=#cq{mode=Mode}, Version) ->
-    ?DEBUG("~0p ~0p", [St, Version]),
-    do_set_policy(Mode, Version).
+cmd_set_version(Version) ->
+    ?DEBUG("~0p", [Version]),
+    rabbit_policy:set(<<"/">>, <<"queue-version-policy">>, <<".*">>,
+                      [{<<"queue-version">>, Version}],
+                      0, <<"queues">>, <<"acting-user">>).
 
 %% We loop until the queue has switched version.
 do_check_queue_version(AMQ, Version) ->
@@ -848,16 +785,6 @@ do_check_queue_version(AMQ, Version, N) ->
         _ -> do_check_queue_version(AMQ, Version, N - 1)
     end.
 
-cmd_set_mode_version(Mode, Version) ->
-    ?DEBUG("~0p ~0p", [Mode, Version]),
-    do_set_policy(Mode, Version).
-
-do_set_policy(Mode, Version) ->
-    rabbit_policy:set(<<"/">>, <<"queue-mode-version-policy">>, <<".*">>,
-                      [{<<"queue-mode">>, atom_to_binary(Mode, utf8)},
-                       {<<"queue-version">>, Version}],
-                      0, <<"queues">>, <<"acting-user">>).
- cmd_publish_msg(St=#cq{amq=AMQ}, PayloadSize, DeliveryMode, Mandatory, Expiration) -> ?DEBUG("~0p ~0p ~0p ~0p ~0p", [St, PayloadSize, DeliveryMode, Mandatory, Expiration]), Payload = do_rand_payload(PayloadSize), From 3683ab9a6e205ffe42c654890ce3fce223ca8d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 10 Jun 2022 15:24:02 +0200 Subject: [PATCH 16/39] CQ: Use v2 sets instead of gb_sets for confirms For the following flags I see an improvement of 30k/s to 34k/s on my machine: -x 1 -y 1 -A 1000 -q 1000 -c 1000 -s 1000 -f persistent -u cqv2 --queue-args=x-queue-version=2 --- .../src/rabbit_classic_queue_index_v2.erl | 12 ++--- .../src/rabbit_classic_queue_store_v2.erl | 10 ++-- deps/rabbit/src/rabbit_msg_store.erl | 21 ++++---- deps/rabbit/src/rabbit_queue_index.erl | 24 ++++----- deps/rabbit/src/rabbit_variable_queue.erl | 52 +++++++++---------- deps/rabbit/test/backing_queue_SUITE.erl | 16 +++--- deps/rabbit_common/src/rabbit_misc.erl | 5 -- 7 files changed, 67 insertions(+), 73 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_index_v2.erl b/deps/rabbit/src/rabbit_classic_queue_index_v2.erl index 7e3d1d1200c3..6a9b73f8753f 100644 --- a/deps/rabbit/src/rabbit_classic_queue_index_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_index_v2.erl @@ -111,7 +111,7 @@ %% and there are outstanding unconfirmed messages. %% In that case the buffer is flushed to disk when %% the queue requests a sync (after a timeout). - confirms = gb_sets:new() :: gb_sets:set(), + confirms = sets:new([{version,2}]) :: sets:set(), %% Segments we currently know of along with the %% number of unacked messages remaining in the @@ -156,7 +156,7 @@ %% Types copied from rabbit_queue_index. --type on_sync_fun() :: fun ((gb_sets:set()) -> ok). +-type on_sync_fun() :: fun ((sets:set()) -> ok). -type contains_predicate() :: fun ((rabbit_types:msg_id()) -> boolean()). -type shutdown_terms() :: list() | 'non_clean_shutdown'. 
@@ -658,7 +658,7 @@ reduce_fd_usage(SegmentToOpen, State = #qi{ fds = OpenFds0 }) -> maybe_mark_unconfirmed(MsgId, #message_properties{ needs_confirming = true }, State = #qi { confirms = Confirms }) -> - State#qi{ confirms = gb_sets:add_element(MsgId, Confirms) }; + State#qi{ confirms = sets:add_element(MsgId, Confirms) }; maybe_mark_unconfirmed(_, _, State) -> State. @@ -1055,19 +1055,19 @@ sync(State0 = #qi{ confirms = Confirms, on_sync = OnSyncFun }) -> ?DEBUG("~0p", [State0]), State = flush_buffer(State0, full, segment_entry_count()), - _ = case gb_sets:is_empty(Confirms) of + _ = case sets:is_empty(Confirms) of true -> ok; false -> OnSyncFun(Confirms) end, - State#qi{ confirms = gb_sets:new() }. + State#qi{ confirms = sets:new([{version,2}]) }. -spec needs_sync(state()) -> 'false'. needs_sync(State = #qi{ confirms = Confirms }) -> ?DEBUG("~0p", [State]), - case gb_sets:is_empty(Confirms) of + case sets:is_empty(Confirms) of true -> false; false -> confirms end. diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 7e21fa90bca5..1af34cb8e7d7 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -122,7 +122,7 @@ %% publisher confirms will be sent at regular %% intervals after the index has been flushed %% to disk. - confirms = gb_sets:new() :: gb_sets:set(), + confirms = sets:new([{version,2}]) :: sets:set(), on_sync :: on_sync_fun() }). @@ -131,7 +131,7 @@ -type msg_location() :: {?MODULE, non_neg_integer(), non_neg_integer()}. -export_type([msg_location/0]). --type on_sync_fun() :: fun ((gb_sets:set(), 'written' | 'ignored') -> any()). +-type on_sync_fun() :: fun ((sets:set(), 'written' | 'ignored') -> any()). -spec init(rabbit_amqqueue:name(), on_sync_fun()) -> state(). 
@@ -248,7 +248,7 @@ maybe_cache(SeqId, MsgSize, Msg, State = #qs{ cache = Cache, maybe_mark_unconfirmed(MsgId, #message_properties{ needs_confirming = true }, State = #qs { confirms = Confirms }) -> - State#qs{ confirms = gb_sets:add_element(MsgId, Confirms) }; + State#qs{ confirms = sets:add_element(MsgId, Confirms) }; maybe_mark_unconfirmed(_, _, State) -> State. @@ -258,12 +258,12 @@ sync(State = #qs{ confirms = Confirms, on_sync = OnSyncFun }) -> ?DEBUG("~0p", [State]), flush_write_fd(State), - case gb_sets:is_empty(Confirms) of + case sets:is_empty(Confirms) of true -> State; false -> OnSyncFun(Confirms, written), - State#qs{ confirms = gb_sets:new() } + State#qs{ confirms = sets:new([{version,2}]) } end. -spec read(rabbit_variable_queue:seq_id(), msg_location(), State) diff --git a/deps/rabbit/src/rabbit_msg_store.erl b/deps/rabbit/src/rabbit_msg_store.erl index 5f8601bad8e0..52386b2fd873 100644 --- a/deps/rabbit/src/rabbit_msg_store.erl +++ b/deps/rabbit/src/rabbit_msg_store.erl @@ -162,7 +162,7 @@ fun ((A) -> 'finished' | {rabbit_types:msg_id(), non_neg_integer(), A}). -type maybe_msg_id_fun() :: - 'undefined' | fun ((gb_sets:set(), 'written' | 'ignored') -> any()). + 'undefined' | fun ((sets:set(), 'written' | 'ignored') -> any()). -type maybe_close_fds_fun() :: 'undefined' | fun (() -> 'ok'). -type deletion_thunk() :: fun (() -> boolean()). @@ -907,7 +907,7 @@ handle_cast({write, CRef, MsgId, Flow}, ignore -> %% A 'remove' has already been issued and eliminated the %% 'write'. - State1 = blind_confirm(CRef, gb_sets:singleton(MsgId), + State1 = blind_confirm(CRef, sets:add_element(MsgId, sets:new([{version,2}])), ignored, State), %% If all writes get eliminated, cur_file_cache_ets could %% grow unbounded. 
To prevent that we delete the cache @@ -938,7 +938,7 @@ handle_cast({remove, CRef, MsgIds}, State) -> ignore -> {Removed, State2} end end, {[], State}, MsgIds), - noreply(maybe_compact(client_confirm(CRef, gb_sets:from_list(RemovedMsgIds), + noreply(maybe_compact(client_confirm(CRef, sets:from_list(RemovedMsgIds), ignored, State1))); handle_cast({combine_files, Source, Destination, Reclaimed}, @@ -1066,7 +1066,7 @@ internal_sync(State = #msstate { current_file_handle = CurHdl, cref_to_msg_ids = CTM }) -> State1 = stop_sync_timer(State), CGs = maps:fold(fun (CRef, MsgIds, NS) -> - case gb_sets:is_empty(MsgIds) of + case sets:is_empty(MsgIds) of true -> NS; false -> [{CRef, MsgIds} | NS] end @@ -1156,7 +1156,7 @@ write_message(MsgId, Msg, CRef, true = ets:delete_object(CurFileCacheEts, {MsgId, Msg, 0}), update_pending_confirms( fun (MsgOnDiskFun, CTM) -> - MsgOnDiskFun(gb_sets:singleton(MsgId), written), + MsgOnDiskFun(sets:add_element(MsgId, sets:new([{version,2}])), written), CTM end, CRef, State1) end. @@ -1356,8 +1356,8 @@ record_pending_confirm(CRef, MsgId, State) -> update_pending_confirms( fun (_MsgOnDiskFun, CTM) -> NewMsgIds = case maps:find(CRef, CTM) of - error -> gb_sets:singleton(MsgId); - {ok, MsgIds} -> gb_sets:add(MsgId, MsgIds) + error -> sets:add_element(MsgId, sets:new([{version,2}])); + {ok, MsgIds} -> sets:add_element(MsgId, MsgIds) end, maps:put(CRef, NewMsgIds, CTM) end, CRef, State). 
@@ -1366,11 +1366,10 @@ client_confirm(CRef, MsgIds, ActionTaken, State) -> update_pending_confirms( fun (MsgOnDiskFun, CTM) -> case maps:find(CRef, CTM) of - {ok, Gs} -> MsgOnDiskFun(gb_sets:intersection(Gs, MsgIds), + {ok, Gs} -> MsgOnDiskFun(sets:intersection(Gs, MsgIds), ActionTaken), - MsgIds1 = rabbit_misc:gb_sets_difference( - Gs, MsgIds), - case gb_sets:is_empty(MsgIds1) of + MsgIds1 = sets:subtract(Gs, MsgIds), + case sets:is_empty(MsgIds1) of true -> maps:remove(CRef, CTM); false -> maps:put(CRef, MsgIds1, CTM) end; diff --git a/deps/rabbit/src/rabbit_queue_index.erl b/deps/rabbit/src/rabbit_queue_index.erl index 952c68147e66..4287a87ab11a 100644 --- a/deps/rabbit/src/rabbit_queue_index.erl +++ b/deps/rabbit/src/rabbit_queue_index.erl @@ -249,7 +249,7 @@ unacked :: non_neg_integer() }). -type seg_map() :: {map(), [segment()]}. --type on_sync_fun() :: fun ((gb_sets:set()) -> ok). +-type on_sync_fun() :: fun ((sets:set()) -> ok). -type qistate() :: #qistate { dir :: file:filename(), segments :: 'undefined' | seg_map(), journal_handle :: hdl(), @@ -257,8 +257,8 @@ max_journal_entries :: non_neg_integer(), on_sync :: on_sync_fun(), on_sync_msg :: on_sync_fun(), - unconfirmed :: gb_sets:set(), - unconfirmed_msg :: gb_sets:set(), + unconfirmed :: sets:set(), + unconfirmed_msg :: sets:set(), pre_publish_cache :: list(), delivered_cache :: list() }. @@ -439,9 +439,9 @@ maybe_needs_confirming(MsgProps, MsgOrId, end, ?MSG_ID_BYTES = size(MsgId), case {MsgProps#message_properties.needs_confirming, MsgOrId} of - {true, MsgId} -> UC1 = gb_sets:add_element(MsgId, UC), + {true, MsgId} -> UC1 = sets:add_element(MsgId, UC), State#qistate{unconfirmed = UC1}; - {true, _} -> UCM1 = gb_sets:add_element(MsgId, UCM), + {true, _} -> UCM1 = sets:add_element(MsgId, UCM), State#qistate{unconfirmed_msg = UCM1}; {false, _} -> State end. 
@@ -474,7 +474,7 @@ needs_sync(#qistate{journal_handle = undefined}) -> needs_sync(#qistate{journal_handle = JournalHdl, unconfirmed = UC, unconfirmed_msg = UCM}) -> - case gb_sets:is_empty(UC) andalso gb_sets:is_empty(UCM) of + case sets:is_empty(UC) andalso sets:is_empty(UCM) of true -> case file_handle_cache:needs_sync(JournalHdl) of true -> other; false -> false @@ -623,8 +623,8 @@ blank_state_name_dir_funs(Name, Dir, OnSyncFun, OnSyncMsgFun) -> max_journal_entries = MaxJournal, on_sync = OnSyncFun, on_sync_msg = OnSyncMsgFun, - unconfirmed = gb_sets:new(), - unconfirmed_msg = gb_sets:new(), + unconfirmed = sets:new([{version,2}]), + unconfirmed_msg = sets:new([{version,2}]), pre_publish_cache = [], delivered_cache = [], queue_name = Name }. @@ -1101,15 +1101,15 @@ notify_sync(State = #qistate{unconfirmed = UC, unconfirmed_msg = UCM, on_sync = OnSyncFun, on_sync_msg = OnSyncMsgFun}) -> - State1 = case gb_sets:is_empty(UC) of + State1 = case sets:is_empty(UC) of true -> State; false -> OnSyncFun(UC), - State#qistate{unconfirmed = gb_sets:new()} + State#qistate{unconfirmed = sets:new([{version,2}])} end, - case gb_sets:is_empty(UCM) of + case sets:is_empty(UCM) of true -> State1; false -> OnSyncMsgFun(UCM), - State1#qistate{unconfirmed_msg = gb_sets:new()} + State1#qistate{unconfirmed_msg = sets:new([{version,2}])} end. 
%%---------------------------------------------------------------------------- diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index a554168f0020..4eb57a8e886b 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -443,10 +443,10 @@ out_counter :: non_neg_integer(), in_counter :: non_neg_integer(), rates :: rates(), - msgs_on_disk :: gb_sets:set(), - msg_indices_on_disk :: gb_sets:set(), - unconfirmed :: gb_sets:set(), - confirmed :: gb_sets:set(), + msgs_on_disk :: sets:set(), + msg_indices_on_disk :: sets:set(), + unconfirmed :: sets:set(), + confirmed :: sets:set(), ack_out_counter :: non_neg_integer(), ack_in_counter :: non_neg_integer(), disk_read_count :: non_neg_integer(), @@ -700,10 +700,10 @@ batch_publish_delivered(Publishes, ChPid, Flow, State) -> discard(_MsgId, _ChPid, _Flow, State) -> State. drain_confirmed(State = #vqstate { confirmed = C }) -> - case gb_sets:is_empty(C) of + case sets:is_empty(C) of true -> {[], State}; %% common case - false -> {gb_sets:to_list(C), State #vqstate { - confirmed = gb_sets:new() }} + false -> {sets:to_list(C), State #vqstate { + confirmed = sets:new([{version, 2}]) }} end. dropwhile(Pred, State) -> @@ -1385,8 +1385,8 @@ one_if(false) -> 0. cons_if(true, E, L) -> [E | L]; cons_if(false, _E, L) -> L. -gb_sets_maybe_insert(false, _Val, Set) -> Set; -gb_sets_maybe_insert(true, Val, Set) -> gb_sets:add(Val, Set). +sets_maybe_insert(false, _Val, Set) -> Set; +sets_maybe_insert(true, Val, Set) -> sets:add_element(Val, Set). 
msg_status(Version, IsPersistent, IsDelivered, SeqId, Msg = #basic_message {id = MsgId}, MsgProps, IndexMaxSize) -> @@ -1625,10 +1625,10 @@ init(QueueVsn, IsDurable, IndexMod, IndexState, StoreState, DeltaCount, DeltaByt out_counter = 0, in_counter = 0, rates = blank_rates(Now), - msgs_on_disk = gb_sets:new(), - msg_indices_on_disk = gb_sets:new(), - unconfirmed = gb_sets:new(), - confirmed = gb_sets:new(), + msgs_on_disk = sets:new([{version,2}]), + msg_indices_on_disk = sets:new([{version,2}]), + unconfirmed = sets:new([{version,2}]), + confirmed = sets:new([{version,2}]), ack_out_counter = 0, ack_in_counter = 0, disk_read_count = 0, @@ -1987,7 +1987,7 @@ is_pending_ack_empty(State) -> count_pending_acks(State) =:= 0. is_unconfirmed_empty(#vqstate { unconfirmed = UC }) -> - gb_sets:is_empty(UC). + sets:is_empty(UC). count_pending_acks(#vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> @@ -2088,7 +2088,7 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, State2 = State1 #vqstate { delta = Delta1 }, stats_published_disk(MsgStatus1, State2) end, - UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), + UC1 = sets_maybe_insert(NeedsConfirming, MsgId, UC), State4#vqstate{ next_seq_id = SeqId + 1, next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered), in_counter = InCount + 1, @@ -2119,7 +2119,7 @@ publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, id = Msg MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = record_pending_ack(m(MsgStatus1), State1), - UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC), + UC1 = sets_maybe_insert(NeedsConfirming, MsgId, UC), {SeqId, stats_published_pending_acks(MsgStatus1, State2#vqstate{ next_seq_id = SeqId + 1, @@ -2444,10 +2444,10 @@ record_confirms(MsgIdSet, State = #vqstate { msgs_on_disk = MOD, unconfirmed = UC, confirmed 
= C }) -> State #vqstate { - msgs_on_disk = rabbit_misc:gb_sets_difference(MOD, MsgIdSet), - msg_indices_on_disk = rabbit_misc:gb_sets_difference(MIOD, MsgIdSet), - unconfirmed = rabbit_misc:gb_sets_difference(UC, MsgIdSet), - confirmed = gb_sets:union(C, MsgIdSet) }. + msgs_on_disk = sets:subtract(MOD, MsgIdSet), + msg_indices_on_disk = sets:subtract(MIOD, MsgIdSet), + unconfirmed = sets:subtract(UC, MsgIdSet), + confirmed = sets:union(C, MsgIdSet) }. msgs_written_to_disk(Callback, MsgIdSet, ignored) -> Callback(?MODULE, @@ -2463,11 +2463,11 @@ msgs_written_to_disk(Callback, MsgIdSet, written) -> %% this intersection call. %% %% The same may apply to msg_indices_written_to_disk as well. - Confirmed = gb_sets:intersection(UC, MsgIdSet), - record_confirms(gb_sets:intersection(MsgIdSet, MIOD), + Confirmed = sets:intersection(UC, MsgIdSet), + record_confirms(sets:intersection(MsgIdSet, MIOD), State #vqstate { msgs_on_disk = - gb_sets:union(MOD, Confirmed) }) + sets:union(MOD, Confirmed) }) end). msg_indices_written_to_disk(Callback, MsgIdSet) -> @@ -2475,11 +2475,11 @@ msg_indices_written_to_disk(Callback, MsgIdSet) -> fun (?MODULE, State = #vqstate { msgs_on_disk = MOD, msg_indices_on_disk = MIOD, unconfirmed = UC }) -> - Confirmed = gb_sets:intersection(UC, MsgIdSet), - record_confirms(gb_sets:intersection(MsgIdSet, MOD), + Confirmed = sets:intersection(UC, MsgIdSet), + record_confirms(sets:intersection(MsgIdSet, MOD), State #vqstate { msg_indices_on_disk = - gb_sets:union(MIOD, Confirmed) }) + sets:union(MIOD, Confirmed) }) end). 
msgs_and_indices_written_to_disk(Callback, MsgIdSet) -> diff --git a/deps/rabbit/test/backing_queue_SUITE.erl b/deps/rabbit/test/backing_queue_SUITE.erl index f55dee148a9d..ffd2778c4479 100644 --- a/deps/rabbit/test/backing_queue_SUITE.erl +++ b/deps/rabbit/test/backing_queue_SUITE.erl @@ -368,7 +368,7 @@ on_disk_capture([_|_], _Awaiting, Pid) -> on_disk_capture(OnDisk, Awaiting, Pid) -> receive {on_disk, MsgIdsS} -> - MsgIds = gb_sets:to_list(MsgIdsS), + MsgIds = sets:to_list(MsgIdsS), on_disk_capture(OnDisk ++ (MsgIds -- Awaiting), Awaiting -- MsgIds, Pid); stop -> @@ -481,7 +481,7 @@ test_msg_store_confirm_timer() -> ?PERSISTENT_MSG_STORE, Ref, fun (MsgIds, _ActionTaken) -> - case gb_sets:is_member(MsgId, MsgIds) of + case sets:is_element(MsgId, MsgIds) of true -> Self ! on_disk; false -> ok end @@ -1673,23 +1673,23 @@ publish_and_confirm(Q, Payload, Count) -> {ok, Acc, _Actions} = rabbit_queue_type:deliver([Q], Delivery, Acc0), Acc end, QTState0, Seqs), - wait_for_confirms(gb_sets:from_list(Seqs)), + wait_for_confirms(sets:from_list(Seqs)), QTState. wait_for_confirms(Unconfirmed) -> - case gb_sets:is_empty(Unconfirmed) of + case sets:is_empty(Unconfirmed) of true -> ok; false -> receive {'$gen_cast', {queue_event, _QName, {confirm, Confirmed, _}}} -> wait_for_confirms( - rabbit_misc:gb_sets_difference( - Unconfirmed, gb_sets:from_list(Confirmed))); + sets:subtract( + Unconfirmed, sets:from_list(Confirmed))); {'$gen_cast', {confirm, Confirmed, _}} -> wait_for_confirms( - rabbit_misc:gb_sets_difference( - Unconfirmed, gb_sets:from_list(Confirmed))) + sets:subtract( + Unconfirmed, sets:from_list(Confirmed))) after ?TIMEOUT -> flush(), exit(timeout_waiting_for_confirm) diff --git a/deps/rabbit_common/src/rabbit_misc.erl b/deps/rabbit_common/src/rabbit_misc.erl index d0cc1cbc8e32..abda1f65bbfb 100644 --- a/deps/rabbit_common/src/rabbit_misc.erl +++ b/deps/rabbit_common/src/rabbit_misc.erl @@ -64,7 +64,6 @@ -export([append_rpc_all_nodes/4, append_rpc_all_nodes/5]). 
-export([os_cmd/1]). -export([is_os_process_alive/1]). --export([gb_sets_difference/2]). -export([version/0, otp_release/0, platform_and_version/0, otp_system_version/0, rabbitmq_and_erlang_versions/0, which_applications/0]). -export([sequence_error/1]). @@ -232,7 +231,6 @@ -spec format_message_queue(any(), priority_queue:q()) -> term(). -spec os_cmd(string()) -> string(). -spec is_os_process_alive(non_neg_integer()) -> boolean(). --spec gb_sets_difference(gb_sets:set(), gb_sets:set()) -> gb_sets:set(). -spec version() -> string(). -spec otp_release() -> string(). -spec otp_system_version() -> string(). @@ -1208,9 +1206,6 @@ exit_loop(Port) -> {Port, _} -> exit_loop(Port) end. -gb_sets_difference(S1, S2) -> - gb_sets:fold(fun gb_sets:delete_any/2, S1, S2). - version() -> {ok, VSN} = application:get_key(rabbit, vsn), VSN. From 9d5533e90f64ac97c547649a3af80f541b997a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 16 Jun 2022 11:21:47 +0200 Subject: [PATCH 17/39] CQv2: Add a simpler confirms code path to improve performance --- .../src/rabbit_classic_queue_index_v2.erl | 19 ++-- .../src/rabbit_classic_queue_store_v2.erl | 51 ++------- deps/rabbit/src/rabbit_queue_index.erl | 5 +- deps/rabbit/src/rabbit_variable_queue.erl | 101 ++++++++++++------ 4 files changed, 94 insertions(+), 82 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_index_v2.erl b/deps/rabbit/src/rabbit_classic_queue_index_v2.erl index 6a9b73f8753f..1316d578b8ba 100644 --- a/deps/rabbit/src/rabbit_classic_queue_index_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_index_v2.erl @@ -9,7 +9,7 @@ -export([erase/1, init/3, reset_state/1, recover/7, terminate/3, delete_and_terminate/1, - publish/7, ack/2, read/3]). + publish/7, publish/8, ack/2, read/3]). %% Recovery. Unlike other functions in this module, these %% apply to all queues all at once. 
@@ -290,7 +290,7 @@ recover_segments(State0 = #qi { queue_name = Name, dir = Dir }, Terms, IsMsgStor list_to_integer(filename:basename(F, ?SEGMENT_EXTENSION)) || F <- SegmentFiles]), %% We use a temporary store state to check that messages do exist. - StoreState0 = rabbit_classic_queue_store_v2:init(Name, OnSyncMsgFun), + StoreState0 = rabbit_classic_queue_store_v2:init(Name), {State1, StoreState} = recover_segments(State0, ContainsCheckFun, StoreState0, CountersRef, Segments), _ = rabbit_classic_queue_store_v2:terminate(StoreState), State1 @@ -482,7 +482,7 @@ recover_index_v1_dirty(State0 = #qi{ queue_name = Name }, Terms, IsMsgStoreClean recover_index_v1_common(State0 = #qi{ queue_name = Name, dir = Dir }, V1State, CountersRef) -> %% Use a temporary per-queue store state to store embedded messages. - StoreState0 = rabbit_classic_queue_store_v2:init(Name, fun(_, _) -> ok end), + StoreState0 = rabbit_classic_queue_store_v2:init(Name), %% Go through the v1 index and publish messages to the v2 index. {LoSeqId, HiSeqId, _} = rabbit_queue_index:bounds(V1State), %% When resuming after a crash we need to double check the messages that are both @@ -564,9 +564,12 @@ delete_and_terminate(State = #qi { dir = Dir, rabbit_types:message_properties(), boolean(), non_neg_integer() | infinity, State) -> State when State::state(). +publish(MsgId, SeqId, Location, Props, IsPersistent, TargetRamCount, State) -> + publish(MsgId, SeqId, Location, Props, IsPersistent, true, TargetRamCount, State). + %% Because we always persist to the msg_store, the Msg(Or)Id argument %% here is always a binary, never a record. 
-publish(MsgId, SeqId, Location, Props, IsPersistent, TargetRamCount, +publish(MsgId, SeqId, Location, Props, IsPersistent, ShouldConfirm, TargetRamCount, State0 = #qi { write_buffer = WriteBuffer0, segments = Segments }) -> ?DEBUG("~0p ~0p ~0p ~0p ~0p ~0p ~0p", [MsgId, SeqId, Location, Props, IsPersistent, TargetRamCount, State0]), @@ -583,7 +586,7 @@ publish(MsgId, SeqId, Location, Props, IsPersistent, TargetRamCount, end, %% When publisher confirms have been requested for this %% message we mark the message as unconfirmed. - State = maybe_mark_unconfirmed(MsgId, Props, State2), + State = maybe_mark_unconfirmed(MsgId, Props, ShouldConfirm, State2), maybe_flush_buffer(State, SegmentEntryCount). new_segment_file(Segment, SegmentEntryCount, State = #qi{ segments = Segments }) -> @@ -657,9 +660,9 @@ reduce_fd_usage(SegmentToOpen, State = #qi{ fds = OpenFds0 }) -> end. maybe_mark_unconfirmed(MsgId, #message_properties{ needs_confirming = true }, - State = #qi { confirms = Confirms }) -> + true, State = #qi { confirms = Confirms }) -> State#qi{ confirms = sets:add_element(MsgId, Confirms) }; -maybe_mark_unconfirmed(_, _, State) -> +maybe_mark_unconfirmed(_, _, _, State) -> State. maybe_flush_buffer(State = #qi { write_buffer = WriteBuffer, @@ -1183,7 +1186,7 @@ stop(VHost) -> pre_publish(MsgOrId, SeqId, Location, Props, IsPersistent, TargetRamCount, State) -> ?DEBUG("~0p ~0p ~0p ~0p ~0p ~0p ~0p", [MsgOrId, SeqId, Location, Props, IsPersistent, TargetRamCount, State]), - publish(MsgOrId, SeqId, Location, Props, IsPersistent, TargetRamCount, State). + publish(MsgOrId, SeqId, Location, Props, IsPersistent, false, TargetRamCount, State). 
flush_pre_publish_cache(TargetRamCount, State) -> ?DEBUG("~0p ~0p", [TargetRamCount, State]), diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 1af34cb8e7d7..d58c5eaf583c 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -32,8 +32,7 @@ %% they should be. What this effectively means is that the %% ?STORE:sync function is called right before the ?INDEX:sync %% function, and only if there are outstanding confirms in the -%% index. As a result the store does not need to keep track of -%% confirms. +%% index. %% %% The old rabbit_msg_store has separate transient/persistent stores %% to make recovery of data on disk quicker. We do not need to @@ -50,7 +49,7 @@ -module(rabbit_classic_queue_store_v2). --export([init/2, terminate/1, +-export([init/1, terminate/1, write/4, sync/1, read/3, check_msg_on_disk/3, remove/2, delete_segments/2]). @@ -116,14 +115,7 @@ %% we are using file:pread/3 which leaves the file %% position undetermined. read_segment = undefined :: undefined | non_neg_integer(), - read_fd = undefined :: undefined | file:fd(), - - %% Messages waiting for publisher confirms. The - %% publisher confirms will be sent at regular - %% intervals after the index has been flushed - %% to disk. - confirms = sets:new([{version,2}]) :: sets:set(), - on_sync :: on_sync_fun() + read_fd = undefined :: undefined | file:fd() }). -type state() :: #qs{}. @@ -131,18 +123,13 @@ -type msg_location() :: {?MODULE, non_neg_integer(), non_neg_integer()}. -export_type([msg_location/0]). --type on_sync_fun() :: fun ((sets:set(), 'written' | 'ignored') -> any()). +-spec init(rabbit_amqqueue:name()) -> state(). --spec init(rabbit_amqqueue:name(), on_sync_fun()) -> state(). 
- -init(#resource{ virtual_host = VHost } = Name, OnSyncFun) -> - ?DEBUG("~0p ~0p", [Name, OnSyncFun]), +init(#resource{ virtual_host = VHost } = Name) -> + ?DEBUG("~0p", [Name]), VHostDir = rabbit_vhost:msg_store_dir_path(VHost), Dir = rabbit_classic_queue_index_v2:queue_dir(VHostDir, Name), - #qs{ - dir = Dir, - on_sync = OnSyncFun - }. + #qs{dir = Dir}. -spec terminate(State) -> State when State::state(). @@ -173,7 +160,7 @@ maybe_close_fd(Fd) -> rabbit_types:message_properties(), State) -> {msg_location(), State} when State::state(). -write(SeqId, Msg=#basic_message{ id = MsgId }, Props, State0) -> +write(SeqId, Msg, Props, State0) -> ?DEBUG("~0p ~0p ~0p ~0p", [SeqId, Msg, Props, State0]), SegmentEntryCount = segment_entry_count(), Segment = SeqId div SegmentEntryCount, @@ -202,10 +189,7 @@ write(SeqId, Msg=#basic_message{ id = MsgId }, Props, State0) -> %% Size:32, Flags:8, CRC32:16, Unused:8. ok = file:write(Fd, [<>, MsgIovec]), %% Maybe cache the message. - State2 = maybe_cache(SeqId, Size, Msg, State1), - %% When publisher confirms have been requested for this - %% message we mark the message as unconfirmed. - State = maybe_mark_unconfirmed(MsgId, Props, State2), + State = maybe_cache(SeqId, Size, Msg, State1), %% Keep track of the offset we are at. {{?MODULE, Offset, Size}, State#qs{ write_offset = Offset + Size + ?ENTRY_HEADER_SIZE }}. @@ -246,25 +230,12 @@ maybe_cache(SeqId, MsgSize, Msg, State = #qs{ cache = Cache, cache_size = CacheSize + MsgSize } end. -maybe_mark_unconfirmed(MsgId, #message_properties{ needs_confirming = true }, - State = #qs { confirms = Confirms }) -> - State#qs{ confirms = sets:add_element(MsgId, Confirms) }; -maybe_mark_unconfirmed(_, _, State) -> - State. - -spec sync(State) -> State when State::state(). 
-sync(State = #qs{ confirms = Confirms, - on_sync = OnSyncFun }) -> +sync(State) -> ?DEBUG("~0p", [State]), flush_write_fd(State), - case sets:is_empty(Confirms) of - true -> - State; - false -> - OnSyncFun(Confirms, written), - State#qs{ confirms = sets:new([{version,2}]) } - end. + State. -spec read(rabbit_variable_queue:seq_id(), msg_location(), State) -> {rabbit_types:basic_message(), State} when State::state(). diff --git a/deps/rabbit/src/rabbit_queue_index.erl b/deps/rabbit/src/rabbit_queue_index.erl index 4287a87ab11a..098356b90226 100644 --- a/deps/rabbit/src/rabbit_queue_index.erl +++ b/deps/rabbit/src/rabbit_queue_index.erl @@ -12,7 +12,7 @@ -export([erase/1, init/3, reset_state/1, recover/7, terminate/3, delete_and_terminate/1, pre_publish/7, flush_pre_publish_cache/2, - publish/7, deliver/2, ack/2, sync/1, needs_sync/1, flush/1, + publish/7, publish/8, deliver/2, ack/2, sync/1, needs_sync/1, flush/1, read/3, next_segment_boundary/1, bounds/1, start/2, stop/1]). -export([add_queue_ttl/0, avoid_zeroes/0, store_msg_size/0, store_msg/0]). @@ -430,6 +430,9 @@ publish(MsgOrId, SeqId, _Location, MsgProps, IsPersistent, JournalSizeHint, Stat JournalSizeHint, add_to_journal(SeqId, {IsPersistent, Bin, MsgBin}, State1)). +publish(MsgOrId, SeqId, Location, MsgProps, IsPersistent, _, JournalSizeHint, State) -> + publish(MsgOrId, SeqId, Location, MsgProps, IsPersistent, JournalSizeHint, State). 
+ maybe_needs_confirming(MsgProps, MsgOrId, State = #qistate{unconfirmed = UC, unconfirmed_msg = UCM}) -> diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 4eb57a8e886b..c31af94367fa 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -315,6 +315,11 @@ out_counter, in_counter, rates, + %% There are two confirms paths: either store/index produce confirms + %% separately (v1 and v2 with per-vhost message store) or the confirms + %% are produced all at once while syncing/flushing (v2 with per-queue + %% message store). The latter is more efficient as it avoids many + %% sets operations. msgs_on_disk, msg_indices_on_disk, unconfirmed, @@ -331,10 +336,13 @@ %% default queue or lazy queue mode, %% Unused. version = 1, - %% number of reduce_memory_usage executions, once it - %% reaches a threshold the queue will manually trigger a runtime GC - %% see: maybe_execute_gc/1 - memory_reduction_run_count, %% Unused. + %% Fast path for confirms handling. Instead of having + %% index/store keep track of confirms separately and + %% doing intersect/subtract/union we just put the messages + %% here and on sync move them to 'confirmed'. + %% + %% Note: This field used to be 'memory_reduction_run_count'. + unconfirmed_simple, %% Queue data is grouped by VHost. We need to store it %% to work with queue index. virtual_host, @@ -455,7 +463,7 @@ io_batch_size :: pos_integer(), mode :: 'default' | 'lazy', version :: 1 | 2, - memory_reduction_run_count :: non_neg_integer()}. + unconfirmed_simple :: sets:set()}. -define(BLANK_DELTA, #delta { start_seq_id = undefined, count = 0, @@ -543,7 +551,7 @@ init(Q, new, AsyncCallback, MsgOnDiskFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun) w %% between queue versions unnecessarily. 
IndexMod = index_mod(Q), IndexState = IndexMod:init(QueueName, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun), - StoreState = rabbit_classic_queue_store_v2:init(QueueName, MsgOnDiskFun), + StoreState = rabbit_classic_queue_store_v2:init(QueueName), VHost = QueueName#resource.virtual_host, init(queue_version(Q), IsDurable, IndexMod, IndexState, StoreState, 0, 0, [], @@ -586,7 +594,7 @@ init(Q, Terms, AsyncCallback, MsgOnDiskFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun) ?PERSISTENT_MSG_STORE), ContainsCheckFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun, main), - StoreState = rabbit_classic_queue_store_v2:init(QueueName, MsgOnDiskFun), + StoreState = rabbit_classic_queue_store_v2:init(QueueName), init(queue_version(Q), IsDurable, IndexMod, IndexState, StoreState, DeltaCount, DeltaBytes, RecoveryTerms, PersistentClient, TransientClient, VHost). @@ -914,26 +922,38 @@ ram_duration(State) -> {infinity, State1}. needs_timeout(#vqstate { index_mod = IndexMod, - index_state = IndexState }) -> - case IndexMod:needs_sync(IndexState) of - confirms -> timed; - other -> idle; - false -> false + index_state = IndexState, + unconfirmed_simple = UCS }) -> + case {IndexMod:needs_sync(IndexState), sets:is_empty(UCS)} of + {false, false} -> timed; + {confirms, _} -> timed; + {other, _} -> idle; + {false, true} -> false end. timeout(State = #vqstate { index_mod = IndexMod, index_state = IndexState0, - store_state = StoreState0 }) -> - StoreState = rabbit_classic_queue_store_v2:sync(StoreState0), + store_state = StoreState0, + unconfirmed_simple = UCS, + confirmed = C }) -> IndexState = IndexMod:sync(IndexState0), + StoreState = rabbit_classic_queue_store_v2:sync(StoreState0), State #vqstate { index_state = IndexState, - store_state = StoreState }. + store_state = StoreState, + unconfirmed_simple = sets:new([{version,2}]), + confirmed = sets:union(C, UCS) }. 
handle_pre_hibernate(State = #vqstate { index_mod = IndexMod, - index_state = IndexState, - store_state = StoreState }) -> - State #vqstate { index_state = IndexMod:flush(IndexState), - store_state = rabbit_classic_queue_store_v2:sync(StoreState) }. + index_state = IndexState0, + store_state = StoreState0, + unconfirmed_simple = UCS, + confirmed = C }) -> + IndexState = IndexMod:flush(IndexState0), + StoreState = rabbit_classic_queue_store_v2:sync(StoreState0), + State #vqstate { index_state = IndexState, + store_state = StoreState, + unconfirmed_simple = sets:new([{version,2}]), + confirmed = sets:union(C, UCS) }. resume(State) -> a(State). @@ -1385,9 +1405,6 @@ one_if(false) -> 0. cons_if(true, E, L) -> [E | L]; cons_if(false, _E, L) -> L. -sets_maybe_insert(false, _Val, Set) -> Set; -sets_maybe_insert(true, Val, Set) -> sets:add_element(Val, Set). - msg_status(Version, IsPersistent, IsDelivered, SeqId, Msg = #basic_message {id = MsgId}, MsgProps, IndexMaxSize) -> #msg_status{seq_id = SeqId, @@ -1628,6 +1645,7 @@ init(QueueVsn, IsDurable, IndexMod, IndexState, StoreState, DeltaCount, DeltaByt msgs_on_disk = sets:new([{version,2}]), msg_indices_on_disk = sets:new([{version,2}]), unconfirmed = sets:new([{version,2}]), + unconfirmed_simple = sets:new([{version,2}]), confirmed = sets:new([{version,2}]), ack_out_counter = 0, ack_in_counter = 0, @@ -1638,7 +1656,6 @@ init(QueueVsn, IsDurable, IndexMod, IndexState, StoreState, DeltaCount, DeltaByt mode = default, version = QueueVsn, - memory_reduction_run_count = 0, virtual_host = VHost}, a(maybe_deltas_to_betas(State)). @@ -1986,8 +2003,8 @@ reset_qi_state(State = #vqstate{ index_mod = IndexMod, is_pending_ack_empty(State) -> count_pending_acks(State) =:= 0. -is_unconfirmed_empty(#vqstate { unconfirmed = UC }) -> - sets:is_empty(UC). +is_unconfirmed_empty(#vqstate { unconfirmed = UC, unconfirmed_simple = UCS }) -> + sets:is_empty(UC) andalso sets:is_empty(UCS). 
count_pending_acks(#vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> @@ -2069,6 +2086,7 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, in_counter = InCount, durable = IsDurable, unconfirmed = UC, + unconfirmed_simple = UCS, rates = #rates{ out = OutRate }}) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize), @@ -2076,7 +2094,7 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, %% limit is at 1 because the queue process will need to access this message to know %% expiration information. MemoryLimit = min(1 + floor(2 * OutRate), 2048), - State4 = case DeltaCount of + State3 = case DeltaCount of %% Len is the same as Q3Len when DeltaCount =:= 0. 0 when Len < MemoryLimit -> {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), @@ -2088,11 +2106,13 @@ publish1(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId }, State2 = State1 #vqstate { delta = Delta1 }, stats_published_disk(MsgStatus1, State2) end, - UC1 = sets_maybe_insert(NeedsConfirming, MsgId, UC), - State4#vqstate{ next_seq_id = SeqId + 1, + {UC1, UCS1} = maybe_needs_confirming(NeedsConfirming, persist_to(MsgStatus), + Version, MsgId, UC, UCS), + State3#vqstate{ next_seq_id = SeqId + 1, next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered), in_counter = InCount + 1, - unconfirmed = UC1 }. + unconfirmed = UC1, + unconfirmed_simple = UCS1 }. %% Only attempt to increase the next_deliver_seq_id for delivered messages. 
maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, true) -> @@ -2114,19 +2134,32 @@ publish_delivered1(Msg = #basic_message { is_persistent = IsPersistent, id = Msg in_counter = InCount, out_counter = OutCount, durable = IsDurable, - unconfirmed = UC }) -> + unconfirmed = UC, + unconfirmed_simple = UCS }) -> IsPersistent1 = IsDurable andalso IsPersistent, MsgStatus = msg_status(Version, IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize), {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State), State2 = record_pending_ack(m(MsgStatus1), State1), - UC1 = sets_maybe_insert(NeedsConfirming, MsgId, UC), + {UC1, UCS1} = maybe_needs_confirming(NeedsConfirming, persist_to(MsgStatus), + Version, MsgId, UC, UCS), {SeqId, stats_published_pending_acks(MsgStatus1, State2#vqstate{ next_seq_id = SeqId + 1, next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId), out_counter = OutCount + 1, in_counter = InCount + 1, - unconfirmed = UC1 })}. + unconfirmed = UC1, + unconfirmed_simple = UCS1 })}. + +maybe_needs_confirming(false, _, _, _, UC, UCS) -> + {UC, UCS}; +%% When storing to the v2 queue store we take the simple confirms +%% path because we don't need to track index and store separately. +maybe_needs_confirming(true, queue_store, 2, MsgId, UC, UCS) -> + {UC, sets:add_element(MsgId, UCS)}; +%% Otherwise we keep tracking as it used to be. +maybe_needs_confirming(true, _, _, MsgId, UC, UCS) -> + {sets:add_element(MsgId, UC), UCS}. batch_publish_delivered1({Msg, MsgProps}, {ChPid, Flow, SeqIds, State}) -> {SeqId, State1} = @@ -2230,7 +2263,8 @@ maybe_write_index_to_disk(Force, MsgStatus = #msg_status { queue_index -> {prepare_to_store(Msg), DiskWriteCount + 1} end, IndexState2 = IndexMod:publish( - MsgOrId, SeqId, MsgLocation, MsgProps, IsPersistent, TargetRamCount, + MsgOrId, SeqId, MsgLocation, MsgProps, IsPersistent, + persist_to(MsgStatus) =:= msg_store, TargetRamCount, IndexState), %% We always deliver messages when the old index is used. 
%% We are actually tracking message deliveries per-queue @@ -2482,6 +2516,7 @@ msg_indices_written_to_disk(Callback, MsgIdSet) -> sets:union(MIOD, Confirmed) }) end). +%% @todo Having to call run_backing_queue is probably reducing performance... msgs_and_indices_written_to_disk(Callback, MsgIdSet) -> Callback(?MODULE, fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end). From f4c5f51f5ed1d8b1525d7576119c526926aa54f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 16 Jun 2022 11:51:02 +0200 Subject: [PATCH 18/39] CQ: Fix an xref error --- deps/rabbit/src/rabbit_queue_index.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_queue_index.erl b/deps/rabbit/src/rabbit_queue_index.erl index 098356b90226..52cad50c9b26 100644 --- a/deps/rabbit/src/rabbit_queue_index.erl +++ b/deps/rabbit/src/rabbit_queue_index.erl @@ -734,7 +734,7 @@ recover_index_v2_dirty(State0 = #qistate { queue_name = Name, recover_index_v2_common(State0 = #qistate { queue_name = Name, dir = Dir }, V2State, CountersRef) -> %% Use a temporary per-queue store state to read embedded messages. - StoreState0 = rabbit_classic_queue_store_v2:init(Name, fun(_, _) -> ok end), + StoreState0 = rabbit_classic_queue_store_v2:init(Name), %% Go through the v2 index and publish messages to v1 index. {LoSeqId, HiSeqId, _} = rabbit_classic_queue_index_v2:bounds(V2State), %% When resuming after a crash we need to double check the messages that are both From 615e66719374a0a0bfbbd1bd6bb7eafd97e3552b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 17 Jun 2022 10:26:43 +0200 Subject: [PATCH 19/39] CQ: Enable read/write concurrency for old msg store ets The rabbit_msg_store_flying ets table runs into lock trouble with large fan outs. This should help. 
--- deps/rabbit/src/rabbit_msg_store.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_msg_store.erl b/deps/rabbit/src/rabbit_msg_store.erl index 52386b2fd873..e5f2dd231039 100644 --- a/deps/rabbit/src/rabbit_msg_store.erl +++ b/deps/rabbit/src/rabbit_msg_store.erl @@ -758,7 +758,9 @@ init([VHost, Type, BaseDir, ClientRefs, StartupFunState]) -> FileHandlesEts = ets:new(rabbit_msg_store_shared_file_handles, [ordered_set, public]), CurFileCacheEts = ets:new(rabbit_msg_store_cur_file, [set, public]), - FlyingEts = ets:new(rabbit_msg_store_flying, [set, public]), + FlyingEts = ets:new(rabbit_msg_store_flying, [set, public, + {read_concurrency, true}, + {write_concurrency, true}]), {ok, FileSizeLimit} = application:get_env(rabbit, msg_store_file_size_limit), From 2b291b17b721d087c9993443d6c9bf120e2845c9 Mon Sep 17 00:00:00 2001 From: Michal Kuratczyk Date: Fri, 17 Jun 2022 08:37:40 +0000 Subject: [PATCH 20/39] CQ: Enable read/write concurrency for old msg store ets The rabbit_msg_store_cur_file ets table runs into lock trouble with large fan outs. This should help. 
--- deps/rabbit/src/rabbit_msg_store.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_msg_store.erl b/deps/rabbit/src/rabbit_msg_store.erl index e5f2dd231039..75d106e19def 100644 --- a/deps/rabbit/src/rabbit_msg_store.erl +++ b/deps/rabbit/src/rabbit_msg_store.erl @@ -757,7 +757,9 @@ init([VHost, Type, BaseDir, ClientRefs, StartupFunState]) -> FileHandlesEts = ets:new(rabbit_msg_store_shared_file_handles, [ordered_set, public]), - CurFileCacheEts = ets:new(rabbit_msg_store_cur_file, [set, public]), + CurFileCacheEts = ets:new(rabbit_msg_store_cur_file, [set, public, + {read_concurrency, true}, + {write_concurrency, true}]), FlyingEts = ets:new(rabbit_msg_store_flying, [set, public, {read_concurrency, true}, {write_concurrency, true}]), From 0c99efe6dc25c51981f78bcd54599b1eaa0bed74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 27 Jun 2022 14:56:05 +0200 Subject: [PATCH 21/39] CQv2: Optimise the per-queue store This removes the use of delayed_write and instead uses an internal buffer that's also used as a cache, similar to how the index works. The offset and size of messages in the file are calculated using erlang:external_size/1 and as a result the files may be a little bigger than before, but they should not be significantly, especially considering messages are mostly made of atoms and binaries. The performance is boosted by around 10% on my machine as a result of these changes. --- .../src/rabbit_classic_queue_store_v2.erl | 331 +++++++++--------- 1 file changed, 162 insertions(+), 169 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index d58c5eaf583c..47eb69991958 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -62,9 +62,6 @@ -include_lib("rabbit_common/include/rabbit.hrl"). -%% We need this to directly access the delayed file io pid. 
--include_lib("kernel/include/file.hrl"). - %% Set to true to get an awful lot of debug logs. -if(false). -define(DEBUG(X,Y), logger:debug("~0p: " ++ X, [?FUNCTION_NAME|Y])). @@ -72,43 +69,34 @@ -define(DEBUG(X,Y), _ = X, _ = Y, ok). -endif. +-type buffer() :: #{ + %% SeqId => {Offset, Size, Msg} + rabbit_variable_queue:seq_id() => {non_neg_integer(), non_neg_integer(), #basic_message{}} +}. + -record(qs, { %% Store directory - same as the queue index. dir :: file:filename(), - %% We keep up to one write fd open at any time. - %% Because queues are FIFO, writes are mostly sequential - %% and they occur only once, we do not need to worry - %% much about writing to multiple different segment - %% files. - %% - %% We keep track of which segment is open, its fd, + %% We keep track of which segment is open %% and the current offset in the file. This offset %% is the position at which the next message will - %% be written, and it will be kept track in the - %% queue (and potentially in the queue index) for - %% a later read. + %% be written. The offset will be returned to be + %% kept track of in the queue (or queue index) for + %% later reads. write_segment = undefined :: undefined | non_neg_integer(), - write_fd = undefined :: undefined | file:fd(), write_offset = ?HEADER_SIZE :: non_neg_integer(), + %% We must keep the offset, expected size and message in order + %% to write the message. + write_buffer = #{} :: buffer(), + write_buffer_size = 0 :: non_neg_integer(), + %% We keep a cache of messages for faster reading %% for the cases where consumers can keep up with - %% producers. Messages are written to the case as - %% long as there is space available. Messages are - %% not removed from the cache until the message - %% is either acked or removed by the queue. This - %% means that only a subset of messages may make - %% it onto the store. 
We do not prune messages as - %% that could lead to caching messages and pruning - %% them before they can be read from the cache. - %% - %% The cache size is the size of the messages, - %% it does not include the metadata. The real - %% cache size will therefore potentially be larger - %% than the configured maximum size. - cache = #{} :: #{ rabbit_variable_queue:seq_id() => {non_neg_integer(), #basic_message{}}}, - cache_size = 0 :: non_neg_integer(), + %% producers. The write_buffer becomes the cache + %% when it is written to disk. + cache = #{} :: buffer(), %% Similarly, we keep track of a single read fd. %% We cannot share this fd with the write fd because @@ -133,24 +121,15 @@ init(#resource{ virtual_host = VHost } = Name) -> -spec terminate(State) -> State when State::state(). -terminate(State = #qs{ write_fd = WriteFd, - read_fd = ReadFd }) -> - ?DEBUG("~0p", [State]), - %% Fsync and close all FDs as needed. - maybe_sync_close_fd(WriteFd), +terminate(State0 = #qs{ read_fd = ReadFd }) -> + ?DEBUG("~0p", [State0]), + State = flush_buffer(State0), maybe_close_fd(ReadFd), State#qs{ write_segment = undefined, - write_fd = undefined, write_offset = ?HEADER_SIZE, read_segment = undefined, read_fd = undefined }. -maybe_sync_close_fd(undefined) -> - ok; -maybe_sync_close_fd(Fd) -> - ok = file:sync(Fd), - ok = file:close(Fd). - maybe_close_fd(undefined) -> ok; maybe_close_fd(Fd) -> @@ -160,82 +139,120 @@ maybe_close_fd(Fd) -> rabbit_types:message_properties(), State) -> {msg_location(), State} when State::state(). -write(SeqId, Msg, Props, State0) -> +%% @todo I think we can disable the old message store at the same +%% place where we create MsgId. If many queues receive the +%% message, then we create an MsgId. If not, we don't. But +%% we can only do this after removing support for v1. 
+write(SeqId, Msg, Props, State0 = #qs{ write_buffer = WriteBuffer0, + write_buffer_size = WriteBufferSize }) -> ?DEBUG("~0p ~0p ~0p ~0p", [SeqId, Msg, Props, State0]), SegmentEntryCount = segment_entry_count(), Segment = SeqId div SegmentEntryCount, - %% We simply append to the related segment file. - %% We will then return the offset and size. That - %% combined with the SeqId is enough to find the - %% message again. - {Fd, Offset, State1} = get_write_fd(Segment, State0), - %% @todo We could remove MsgId because we are not going to use it - %% after reading it back from disk. But we have to support - %% going back to v1 for the time being. When rolling back - %% to v1 is no longer possible, set `id = undefined` here. - %% @todo We should manage flushing to disk ourselves so that we - %% only do term_to_iovec if the message is going to hit - %% the disk. If it won't, we can replace it with an empty - %% binary. The size can be estimated via erlang:external_size/1. + Size = erlang:external_size(Msg), + {Offset, State1} = get_write_offset(Segment, Size, State0), + WriteBuffer = WriteBuffer0#{SeqId => {Offset, Size, Msg}}, + State = State1#qs{ write_buffer = WriteBuffer, + write_buffer_size = WriteBufferSize + Size }, + {{?MODULE, Offset, Size}, maybe_flush_buffer(State)}. + +get_write_offset(Segment, Size, State = #qs{ write_segment = Segment, + write_offset = Offset }) -> + {Offset, State#qs{ write_offset = Offset + ?ENTRY_HEADER_SIZE + Size }}; +get_write_offset(Segment, Size, State = #qs{ write_segment = WriteSegment }) + when Segment > WriteSegment; WriteSegment =:= undefined -> + SegmentEntryCount = segment_entry_count(), + FromSeqId = Segment * SegmentEntryCount, + ToSeqId = FromSeqId + SegmentEntryCount, + ok = file:write_file(segment_file(Segment, State), + << ?MAGIC:32, + ?VERSION:8, + FromSeqId:64/unsigned, + ToSeqId:64/unsigned, + 0:344 >>), + {?HEADER_SIZE, State#qs{ write_segment = Segment, + write_offset = ?HEADER_SIZE + ?ENTRY_HEADER_SIZE + Size }}. 
+ +-spec sync(State) -> State when State::state(). + +sync(State) -> + ?DEBUG("~0p", [State]), + flush_buffer(State). + +maybe_flush_buffer(State = #qs{ write_buffer_size = WriteBufferSize }) -> + case WriteBufferSize >= max_cache_size() of + true -> flush_buffer(State); + false -> State + end. + +flush_buffer(State = #qs{ write_buffer_size = 0 }) -> + State; +flush_buffer(State0 = #qs{ write_buffer = WriteBuffer }) -> + CheckCRC32 = check_crc32(), + SegmentEntryCount = segment_entry_count(), + %% First we prepare the writes sorted by segment. + WriteList = lists:sort(maps:to_list(WriteBuffer)), + Writes = flush_buffer_build(WriteList, CheckCRC32, SegmentEntryCount), + %% Then we do the writes for each segment. + State = lists:foldl(fun({Segment, LocBytes}, FoldState) -> + {ok, Fd} = file:open(segment_file(Segment, FoldState), [read, write, raw, binary]), + ok = file:pwrite(Fd, lists:sort(LocBytes)), + ok = file:close(Fd), + FoldState + end, State0, Writes), + %% Finally we move the write_buffer to the cache. + State#qs{ write_buffer = #{}, + write_buffer_size = 0, + cache = WriteBuffer }. + +flush_buffer_build(WriteBuffer = [{FirstSeqId, {Offset, _, _}}|_], + CheckCRC32, SegmentEntryCount) -> + Segment = FirstSeqId div SegmentEntryCount, + SegmentThreshold = (1 + Segment) * SegmentEntryCount, + {Tail, LocBytes} = flush_buffer_build(WriteBuffer, + CheckCRC32, SegmentThreshold, ?HEADER_SIZE, Offset, [], []), + [{Segment, LocBytes}|flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount)]; +flush_buffer_build([], _, _) -> + []. 
+ +flush_buffer_build(Tail = [{SeqId, _}|_], _, SegmentThreshold, _, WriteOffset, WriteAcc, Acc) + when SeqId >= SegmentThreshold -> + case WriteAcc of + [] -> {Tail, Acc}; + _ -> {Tail, [{WriteOffset, lists:reverse(WriteAcc)}|Acc]} + end; +flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], + CheckCRC32, SegmentEntryCount, Offset, WriteOffset, WriteAcc, Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + Offset + ?ENTRY_HEADER_SIZE + Size, WriteOffset, + [build_data(Entry, CheckCRC32)|WriteAcc], Acc); +flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], + CheckCRC32, SegmentEntryCount, _, _, [], Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + Offset + ?ENTRY_HEADER_SIZE + Size, Offset, [build_data(Entry, CheckCRC32)], Acc); +flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], + CheckCRC32, SegmentEntryCount, _, WriteOffset, WriteAcc, Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + Offset + ?ENTRY_HEADER_SIZE + Size, Offset, [build_data(Entry, CheckCRC32)], + [{WriteOffset, lists:reverse(WriteAcc)}|Acc]); +flush_buffer_build([], _, _, _, _, [], Acc) -> + {[], Acc}; +flush_buffer_build([], _, _, _, WriteOffset, WriteAcc, Acc) -> + {[], [{WriteOffset, lists:reverse(WriteAcc)}|Acc]}. + +build_data({_, Size, Msg}, CheckCRC32) -> MsgIovec = term_to_iovec(Msg), - Size = iolist_size(MsgIovec), + Padding = (Size - iolist_size(MsgIovec)) * 8, %% Calculate the CRC for the data if configured to do so. %% We will truncate the CRC to 16 bits to save on space. (Idea taken from postgres.) - {UseCRC32, CRC32} = case check_crc32() of + %% @todo Move check_crc32 to flush_buffer. + {UseCRC32, CRC32} = case CheckCRC32 of true -> {1, erlang:crc32(MsgIovec)}; false -> {0, 0} end, - %% Append to the buffer. A short entry header is prepended: - %% Size:32, Flags:8, CRC32:16, Unused:8. - ok = file:write(Fd, [<>, MsgIovec]), - %% Maybe cache the message. 
- State = maybe_cache(SeqId, Size, Msg, State1), - %% Keep track of the offset we are at. - {{?MODULE, Offset, Size}, State#qs{ write_offset = Offset + Size + ?ENTRY_HEADER_SIZE }}. - -get_write_fd(Segment, State = #qs{ write_fd = Fd, - write_segment = Segment, - write_offset = Offset }) -> - {Fd, Offset, State}; -get_write_fd(Segment, State = #qs{ write_fd = OldFd }) -> - maybe_close_fd(OldFd), - {ok, Fd} = file:open(segment_file(Segment, State), [read, append, raw, binary, {delayed_write, 512000, 10000}]), - {ok, Offset0} = file:position(Fd, eof), - %% If we are at offset 0, write the file header. - Offset = case Offset0 of - 0 -> - SegmentEntryCount = segment_entry_count(), - FromSeqId = Segment * SegmentEntryCount, - ToSeqId = FromSeqId + SegmentEntryCount, - ok = file:write(Fd, << ?MAGIC:32, - ?VERSION:8, - FromSeqId:64/unsigned, - ToSeqId:64/unsigned, - 0:344 >>), - ?HEADER_SIZE; - _ -> - Offset0 - end, - {Fd, Offset, State#qs{ write_segment = Segment, - write_fd = Fd }}. - -maybe_cache(SeqId, MsgSize, Msg, State = #qs{ cache = Cache, - cache_size = CacheSize }) -> - MaxCacheSize = max_cache_size(), - if - CacheSize + MsgSize > MaxCacheSize -> - State; - true -> - State#qs{ cache = Cache#{ SeqId => {MsgSize, Msg} }, - cache_size = CacheSize + MsgSize } - end. - --spec sync(State) -> State when State::state(). - -sync(State) -> - ?DEBUG("~0p", [State]), - flush_write_fd(State), - State. + [ + <>, + MsgIovec, <<0:Padding>> + ]. -spec read(rabbit_variable_queue:seq_id(), msg_location(), State) -> {rabbit_types:basic_message(), State} when State::state(). @@ -243,13 +260,19 @@ sync(State) -> %% @todo We should try to have a read_many for when reading many from the index %% so that we fetch many different messages in a single file:pread. See %% if that helps improve the performance. 
-read(SeqId, DiskLocation, State = #qs{ cache = Cache }) -> +read(SeqId, DiskLocation, State = #qs{ write_buffer = WriteBuffer, + cache = Cache }) -> ?DEBUG("~0p ~0p ~0p", [SeqId, DiskLocation, State]), - case Cache of - #{ SeqId := {_, Msg} } -> + case WriteBuffer of + #{ SeqId := {_, _, Msg} } -> {Msg, State}; _ -> - read_from_disk(SeqId, DiskLocation, State) + case Cache of + #{ SeqId := {_, _, Msg} } -> + {Msg, State}; + _ -> + read_from_disk(SeqId, DiskLocation, State) + end end. read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> @@ -265,6 +288,7 @@ read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> ok; 1 -> %% We only want to check the CRC32 if configured to do so. + %% @todo Always check if the message has a CRC32 to be consistent with QQs. case check_crc32() of false -> ok; @@ -286,11 +310,9 @@ read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> get_read_fd(Segment, State = #qs{ read_segment = Segment, read_fd = Fd }) -> - maybe_flush_write_fd(Segment, State), {ok, Fd, State}; get_read_fd(Segment, State = #qs{ read_fd = OldFd }) -> maybe_close_fd(OldFd), - maybe_flush_write_fd(Segment, State), case file:open(segment_file(Segment, State), [read, raw, binary]) of {ok, Fd} -> case file:read(Fd, ?HEADER_SIZE) of @@ -310,24 +332,6 @@ get_read_fd(Segment, State = #qs{ read_fd = OldFd }) -> {{error, no_file}, State} end. -maybe_flush_write_fd(Segment, State = #qs{ write_segment = Segment }) -> - flush_write_fd(State); -maybe_flush_write_fd(_, _) -> - ok. - -flush_write_fd(#qs{ write_fd = undefined }) -> - ok; -flush_write_fd(#qs{ write_fd = Fd }) -> - %% We tell the pid handling delayed writes to flush to disk - %% without issuing a separate command to the fd. We need - %% to do this in order to read from a separate fd that - %% points to the same file. - #file_descriptor{ - module = raw_file_io_delayed, %% assert - data = #{ pid := Pid } - } = Fd, - gen_statem:call(Pid, '$synchronous_flush'). 
- -spec check_msg_on_disk(rabbit_variable_queue:seq_id(), msg_location(), State) -> {ok | {error, any()}, State} when State::state(). @@ -372,17 +376,18 @@ check_msg_on_disk(SeqId, {?MODULE, Offset, Size}, State0) -> -spec remove(rabbit_variable_queue:seq_id(), State) -> State when State::state(). -%% We only remove the message from the cache. We will remove -%% the message from the disk when we delete the segment file. -remove(SeqId, State = #qs{ cache = Cache0, - cache_size = CacheSize }) -> +%% We only remove the message from the write_buffer. We will remove +%% the message from the disk when we delete the segment file, and +%% from the cache on the next write. +remove(SeqId, State = #qs{ write_buffer = WriteBuffer0, + write_buffer_size = WriteBufferSize }) -> ?DEBUG("~0p ~0p", [SeqId, State]), - case maps:take(SeqId, Cache0) of + case maps:take(SeqId, WriteBuffer0) of error -> State; - {{MsgSize, _}, Cache} -> - State#qs{ cache = Cache, - cache_size = CacheSize - MsgSize } + {{_, MsgSize, _}, WriteBuffer} -> + State#qs{ write_buffer = WriteBuffer, + write_buffer_size = WriteBufferSize - MsgSize } end. -spec delete_segments([non_neg_integer()], State) -> State when State::state(). @@ -392,32 +397,16 @@ remove(SeqId, State = #qs{ cache = Cache0, delete_segments([], State) -> ?DEBUG("[] ~0p", [State]), State; -delete_segments(Segments, State0 = #qs{ write_segment = WriteSegment, - write_fd = WriteFd, +delete_segments(Segments, State0 = #qs{ write_buffer = WriteBuffer0, + write_buffer_size = WriteBufferSize0, read_segment = ReadSegment, - read_fd = ReadFd, - cache = Cache0, - cache_size = CacheSize0 }) -> + read_fd = ReadFd }) -> ?DEBUG("~0p ~0p", [Segments, State0]), %% First we have to close fds for the segments, if any. %% 'undefined' is never in Segments so we don't %% need to special case it. 
- CloseWrite = lists:member(WriteSegment, Segments), CloseRead = lists:member(ReadSegment, Segments), State = if - CloseWrite andalso CloseRead -> - ok = file:close(WriteFd), - ok = file:close(ReadFd), - State0#qs{ write_segment = undefined, - write_fd = undefined, - write_offset = ?HEADER_SIZE, - read_segment = undefined, - read_fd = undefined }; - CloseWrite -> - ok = file:close(WriteFd), - State0#qs{ write_segment = undefined, - write_fd = undefined, - write_offset = ?HEADER_SIZE }; CloseRead -> ok = file:close(ReadFd), State0#qs{ read_segment = undefined, @@ -434,23 +423,27 @@ delete_segments(Segments, State0 = #qs{ write_segment = WriteSegment, {error, enoent} -> ok end || Segment <- Segments], - %% Finally, we remove any entries from the cache that fall within + %% Finally, we remove any entries from the buffer that fall within %% the segments that were deleted. For simplicity's sake, we take %% the highest SeqId from these files and remove any SeqId lower - %% than or equal to this SeqId from the cache. + %% than or equal to this SeqId from the buffer. + %% + %% @todo If this becomes a performance issue we may take inspiration + %% from sets:filter/2. HighestSegment = lists:foldl(fun (S, SAcc) when S > SAcc -> S; (_, SAcc) -> SAcc end, -1, Segments), HighestSeqId = (1 + HighestSegment) * segment_entry_count(), - {Cache, CacheSize} = maps:fold(fun - (SeqId, {MsgSize, _}, {CacheAcc, CacheSize1}) when SeqId =< HighestSeqId -> - {CacheAcc, CacheSize1 - MsgSize}; - (SeqId, Value, {CacheAcc, CacheSize1}) -> - {CacheAcc#{SeqId => Value}, CacheSize1} - end, {#{}, CacheSize0}, Cache0), - State#qs{ cache = Cache, - cache_size = CacheSize }. 
+ {WriteBuffer, WriteBufferSize} = maps:fold(fun + (SeqId, {_, MsgSize, _}, {WriteBufferAcc, WriteBufferSize1}) + when SeqId =< HighestSeqId -> + {WriteBufferAcc, WriteBufferSize1 - MsgSize}; + (SeqId, Value, {WriteBufferAcc, WriteBufferSize1}) -> + {WriteBufferAcc#{SeqId => Value}, WriteBufferSize1} + end, {#{}, WriteBufferSize0}, WriteBuffer0), + State#qs{ write_buffer = WriteBuffer, + write_buffer_size = WriteBufferSize }. %% ---- %% From 3b8ee13d82e92253ba1eaafe0fa7c3804766e987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 27 Jun 2022 15:15:52 +0200 Subject: [PATCH 22/39] CQv2: Always do the CRC32 check if it was computed on write Brings the behavior in line with QQs and streams. --- .../src/rabbit_classic_queue_store_v2.erl | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 47eb69991958..01c68b2b52d7 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -244,7 +244,6 @@ build_data({_, Size, Msg}, CheckCRC32) -> Padding = (Size - iolist_size(MsgIovec)) * 8, %% Calculate the CRC for the data if configured to do so. %% We will truncate the CRC to 16 bits to save on space. (Idea taken from postgres.) - %% @todo Move check_crc32 to flush_buffer. {UseCRC32, CRC32} = case CheckCRC32 of true -> {1, erlang:crc32(MsgIovec)}; false -> {0, 0} @@ -287,22 +286,16 @@ read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> 0 -> ok; 1 -> - %% We only want to check the CRC32 if configured to do so. - %% @todo Always check if the message has a CRC32 to be consistent with QQs. - case check_crc32() of - false -> - ok; - true -> - CRC32 = erlang:crc32(MsgBin), - %% We currently crash if the CRC32 is incorrect as we cannot recover automatically. 
- try - CRC32Expected = <>, - ok - catch C:E:S -> - rabbit_log:error("Per-queue store CRC32 check failed in ~s seq id ~b offset ~b size ~b", - [segment_file(Segment, State), SeqId, Offset, Size]), - erlang:raise(C, E, S) - end + %% Always check the CRC32 if it was computed on write. + CRC32 = erlang:crc32(MsgBin), + %% We currently crash if the CRC32 is incorrect as we cannot recover automatically. + try + CRC32Expected = <>, + ok + catch C:E:S -> + rabbit_log:error("Per-queue store CRC32 check failed in ~s seq id ~b offset ~b size ~b", + [segment_file(Segment, State), SeqId, Offset, Size]), + erlang:raise(C, E, S) end end, Msg = binary_to_term(MsgBin), From 649ebbbc032e9236090f554f1e36667cdf8f2270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 28 Jun 2022 15:30:45 +0200 Subject: [PATCH 23/39] CQv2 store: Use raw for file:write_file for the file header This is an attempt to fix a race condition. --- deps/rabbit/src/rabbit_classic_queue_store_v2.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 01c68b2b52d7..44331af40a00 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -168,7 +168,8 @@ get_write_offset(Segment, Size, State = #qs{ write_segment = WriteSegment }) ?VERSION:8, FromSeqId:64/unsigned, ToSeqId:64/unsigned, - 0:344 >>), + 0:344 >>, + [raw]), {?HEADER_SIZE, State#qs{ write_segment = Segment, write_offset = ?HEADER_SIZE + ?ENTRY_HEADER_SIZE + Size }}. 
From 432f8d20d3ce693faebeb90f808c8c8537f60be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 5 Jul 2022 16:00:39 +0200 Subject: [PATCH 24/39] CQv2: Read many messages at once from v2 store when possible --- .../src/rabbit_classic_queue_store_v2.erl | 102 +++++++++++++++++- deps/rabbit/src/rabbit_variable_queue.erl | 35 +++++- 2 files changed, 133 insertions(+), 4 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 44331af40a00..2a9709148cf4 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -50,7 +50,7 @@ -module(rabbit_classic_queue_store_v2). -export([init/1, terminate/1, - write/4, sync/1, read/3, check_msg_on_disk/3, + write/4, sync/1, read/3, read_many/2, check_msg_on_disk/3, remove/2, delete_segments/2]). -define(SEGMENT_EXTENSION, ".qs"). @@ -302,6 +302,106 @@ read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> Msg = binary_to_term(MsgBin), {Msg, State}. +read_many([], State) -> + {[], State}; +read_many(Reads0, State0 = #qs{ write_buffer = WriteBuffer, + cache = Cache }) -> + %% First we read what we can from memory. + %% Because the Reads0 list is ordered we start from the end + %% and stop when we get to a message that isn't in memory. + case read_many_from_memory(lists:reverse(Reads0), WriteBuffer, Cache, []) of + %% We've read everything, return. + {Msgs, []} -> + {Msgs, State0}; + %% Everything else will be on disk. + {Msgs, Reads1} -> + %% We prepare the pread locations sorted by segment. + Reads2 = lists:reverse(Reads1), + SegmentEntryCount = segment_entry_count(), + [{FirstSeqId, _}|_] = Reads2, + FirstSegment = FirstSeqId div SegmentEntryCount, + SegmentThreshold = (1 + FirstSegment) * SegmentEntryCount, + Segs = consolidate_reads(Reads2, SegmentThreshold, + FirstSegment, SegmentEntryCount, [], #{}), + %% We read from disk and convert multiple messages at a time. 
+ read_many_from_disk(Segs, Msgs, State0) + end. + +%% We only read from Map. If we don't find the entry in Map, +%% we replace Map with NextMap and continue. If we then don't +%% find an entry in NextMap, we are done. +read_many_from_memory(Reads = [{SeqId, _}|Tail], Map, NextMap, Acc) -> + case Map of + #{ SeqId := {_, _, Msg} } -> + read_many_from_memory(Tail, Map, NextMap, [Msg|Acc]); + _ when NextMap =:= #{} -> + %% The Acc is in the order we want, no need to reverse. + {Acc, Reads}; + _ -> + read_many_from_memory(Reads, NextMap, #{}, Acc) + end; +read_many_from_memory([], _, _, Acc) -> + {Acc, []}. + +consolidate_reads(Reads = [{SeqId, _}|_], SegmentThreshold, + Segment, SegmentEntryCount, Acc, Segs) + when SeqId >= SegmentThreshold -> + %% We Segment + 1 because we expect reads to be contiguous. + consolidate_reads(Reads, SegmentThreshold + SegmentEntryCount, + Segment + 1, SegmentEntryCount, [], Segs#{Segment => lists:reverse(Acc)}); +%% NextSize does not include the entry header size. +consolidate_reads([{_, {?MODULE, NextOffset, NextSize}}|Tail], SegmentThreshold, + Segment, SegmentEntryCount, [{Offset, Size}|Acc], Segs) + when Offset + Size =:= NextOffset -> + consolidate_reads(Tail, SegmentThreshold, Segment, SegmentEntryCount, + [{Offset, Size + NextSize + ?ENTRY_HEADER_SIZE}|Acc], Segs); +consolidate_reads([{_, {?MODULE, Offset, Size}}|Tail], SegmentThreshold, + Segment, SegmentEntryCount, Acc, Segs) -> + consolidate_reads(Tail, SegmentThreshold, Segment, SegmentEntryCount, + [{Offset, Size + ?ENTRY_HEADER_SIZE}|Acc], Segs); +consolidate_reads([], _, _, _, [], Segs) -> + Segs; +consolidate_reads([], _, Segment, _, Acc, Segs) -> + %% We lists:reverse/1 because we need to preserve order. + Segs#{Segment => lists:reverse(Acc)}. + +read_many_from_disk(Segs, Msgs, State) -> + %% We read from segments in reverse order because + %% we need to control the order of returned messages. + %% @todo As a result it doesn't help much to keep the read FD. 
+ Keys = lists:reverse(lists:sort(maps:keys(Segs))), + lists:foldl(fun(Segment, {Acc0, FoldState0}) -> + {ok, Fd, FoldState} = get_read_fd(Segment, FoldState0), + {ok, Bin} = file:pread(Fd, maps:get(Segment, Segs)), + Acc = parse_many_from_disk(Bin, Segment, FoldState, Acc0), + {Acc, FoldState} + end, {Msgs, State}, Keys). + +parse_many_from_disk([<>|Tail], Segment, State, Acc) -> + case UseCRC32 of + 0 -> + ok; + 1 -> + %% Always check the CRC32 if it was computed on write. + CRC32 = erlang:crc32(MsgBin), + %% We currently crash if the CRC32 is incorrect as we cannot recover automatically. + try + CRC32Expected = <>, + ok + catch C:E:S -> + rabbit_log:error("Per-queue store CRC32 check failed in ~s", + [segment_file(Segment, State)]), + erlang:raise(C, E, S) + end + end, + Msg = binary_to_term(MsgBin), + [Msg|parse_many_from_disk([R|Tail], Segment, State, Acc)]; +parse_many_from_disk([<<>>], _, _, Acc) -> + Acc; +parse_many_from_disk([<<>>|Tail], Segment, State, Acc) -> + parse_many_from_disk(Tail, Segment, State, Acc). 
+ get_read_fd(Segment, State = #qs{ read_segment = Segment, read_fd = Fd }) -> {ok, Fd, State}; diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index c31af94367fa..4ffcc22abae1 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1427,6 +1427,14 @@ beta_msg_status({Msg = #basic_message{id = MsgId}, persist_to = queue_index, msg_location = memory}; +beta_msg_status({Msg = #basic_message{id = MsgId}, + SeqId, {rabbit_classic_queue_store_v2, _, _}, MsgProps, IsPersistent}) -> + MS0 = beta_msg_status0(SeqId, MsgProps, IsPersistent), + MS0#msg_status{msg_id = MsgId, + msg = Msg, + persist_to = queue_store, + msg_location = memory}; + beta_msg_status({MsgId, SeqId, MsgLocation, MsgProps, IsPersistent}) -> MS0 = beta_msg_status0(SeqId, MsgProps, IsPersistent), MS0#msg_status{msg_id = MsgId, @@ -2711,12 +2719,14 @@ maybe_deltas_to_betas(DelsAndAcksFun, q3 = Q3, index_mod = IndexMod, index_state = IndexState, + store_state = StoreState, ram_msg_count = RamMsgCount, ram_bytes = RamBytes, disk_read_count = DiskReadCount, delta_transient_bytes = DeltaTransientBytes, transient_threshold = TransientThreshold, - rates = #rates{ out = OutRate }}) -> + rates = #rates{ out = OutRate }, + version = Version }) -> #delta { start_seq_id = DeltaSeqId, count = DeltaCount, transient = Transient, @@ -2731,11 +2741,22 @@ maybe_deltas_to_betas(DelsAndAcksFun, %% using the consuming rate. DeltaSeqId + MemoryLimit, DeltaSeqIdEnd]), - {List, IndexState1} = IndexMod:read(DeltaSeqId, DeltaSeqId1, IndexState), + {List0, IndexState1} = IndexMod:read(DeltaSeqId, DeltaSeqId1, IndexState), + {List, StoreState2} = case Version of + 1 -> {List0, StoreState}; + %% When using v2 we try to read all messages from disk at once + %% instead of 1 by 1 at fetch time. 
+ 2 -> + Reads = [{SeqId, MsgLocation} + || {_, SeqId, MsgLocation, _, _} <- List0, is_tuple(MsgLocation)], + {Msgs, StoreState1} = rabbit_classic_queue_store_v2:read_many(Reads, StoreState), + {merge_read_msgs(List0, Reads, Msgs), StoreState1} + end, {Q3a, RamCountsInc, RamBytesInc, State1, TransientCount, TransientBytes} = betas_from_index_entries(List, TransientThreshold, DelsAndAcksFun, - State #vqstate { index_state = IndexState1 }), + State #vqstate { index_state = IndexState1, + store_state = StoreState2 }), State2 = State1 #vqstate { ram_msg_count = RamMsgCount + RamCountsInc, ram_bytes = RamBytes + RamBytesInc, disk_read_count = DiskReadCount + RamCountsInc }, @@ -2766,6 +2787,13 @@ maybe_deltas_to_betas(DelsAndAcksFun, end end. +merge_read_msgs([M = {_, SeqId, _, _, _}|MTail], [{SeqId, _}|RTail], [Msg|MsgTail]) -> + [setelement(1, M, Msg)|merge_read_msgs(MTail, RTail, MsgTail)]; +merge_read_msgs([M|MTail], RTail, MsgTail) -> + [M|merge_read_msgs(MTail, RTail, MsgTail)]; +merge_read_msgs([], [], []) -> + []. + %% Flushes queue index batch caches and updates queue index state. ui(#vqstate{index_mod = IndexMod, index_state = IndexState, @@ -2776,6 +2804,7 @@ ui(#vqstate{index_mod = IndexMod, %%---------------------------------------------------------------------------- %% Upgrading +%% @todo Remove this, also see rabbit_upgrade. %%---------------------------------------------------------------------------- -spec multiple_routing_keys() -> 'ok'. From 962cc0aa229fa38bef62638cd426cee379b3a19a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 18 Jul 2022 18:05:02 +0200 Subject: [PATCH 25/39] CQv2: Fix property suite Also always check the CRC32 even if not currently configured to do so, if the CRC is available in the data. 
--- .../src/rabbit_classic_queue_store_v2.erl | 110 ++++++++++-------- deps/rabbit/src/rabbit_variable_queue.erl | 7 +- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 2a9709148cf4..0ce02dbb37dc 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -123,10 +123,11 @@ init(#resource{ virtual_host = VHost } = Name) -> terminate(State0 = #qs{ read_fd = ReadFd }) -> ?DEBUG("~0p", [State0]), - State = flush_buffer(State0), + State = flush_buffer(State0, fun(Fd) -> ok = file:sync(Fd) end), maybe_close_fd(ReadFd), State#qs{ write_segment = undefined, write_offset = ?HEADER_SIZE, + cache = #{}, read_segment = undefined, read_fd = undefined }. @@ -159,35 +160,41 @@ get_write_offset(Segment, Size, State = #qs{ write_segment = Segment, write_offset = Offset }) -> {Offset, State#qs{ write_offset = Offset + ?ENTRY_HEADER_SIZE + Size }}; get_write_offset(Segment, Size, State = #qs{ write_segment = WriteSegment }) - when Segment > WriteSegment; WriteSegment =:= undefined -> - SegmentEntryCount = segment_entry_count(), - FromSeqId = Segment * SegmentEntryCount, - ToSeqId = FromSeqId + SegmentEntryCount, - ok = file:write_file(segment_file(Segment, State), - << ?MAGIC:32, - ?VERSION:8, - FromSeqId:64/unsigned, - ToSeqId:64/unsigned, - 0:344 >>, - [raw]), + when Segment > WriteSegment -> {?HEADER_SIZE, State#qs{ write_segment = Segment, - write_offset = ?HEADER_SIZE + ?ENTRY_HEADER_SIZE + Size }}. + write_offset = ?HEADER_SIZE + ?ENTRY_HEADER_SIZE + Size }}; +%% The first time we write we have to figure out the write_offset by +%% looking at the segment file directly. 
+get_write_offset(Segment, Size, State = #qs{ write_segment = undefined }) -> + Offset = case file:open(segment_file(Segment, State), [read, raw, binary]) of + {ok, Fd} -> + {ok, Offset0} = file:position(Fd, eof), + ok = file:close(Fd), + case Offset0 of + 0 -> ?HEADER_SIZE; + _ -> Offset0 + end; + {error, enoent} -> + ?HEADER_SIZE + end, + {Offset, State#qs{ write_segment = Segment, + write_offset = Offset + ?ENTRY_HEADER_SIZE + Size }}. -spec sync(State) -> State when State::state(). sync(State) -> ?DEBUG("~0p", [State]), - flush_buffer(State). + flush_buffer(State, fun(_) -> ok end). maybe_flush_buffer(State = #qs{ write_buffer_size = WriteBufferSize }) -> case WriteBufferSize >= max_cache_size() of - true -> flush_buffer(State); + true -> flush_buffer(State, fun(_) -> ok end); false -> State end. -flush_buffer(State = #qs{ write_buffer_size = 0 }) -> +flush_buffer(State = #qs{ write_buffer_size = 0 }, _) -> State; -flush_buffer(State0 = #qs{ write_buffer = WriteBuffer }) -> +flush_buffer(State0 = #qs{ write_buffer = WriteBuffer }, FsyncFun) -> CheckCRC32 = check_crc32(), SegmentEntryCount = segment_entry_count(), %% First we prepare the writes sorted by segment. @@ -196,7 +203,22 @@ flush_buffer(State0 = #qs{ write_buffer = WriteBuffer }) -> %% Then we do the writes for each segment. State = lists:foldl(fun({Segment, LocBytes}, FoldState) -> {ok, Fd} = file:open(segment_file(Segment, FoldState), [read, write, raw, binary]), + case file:position(Fd, eof) of + {ok, 0} -> + %% We write the file header if it does not exist. 
+ FromSeqId = Segment * SegmentEntryCount, + ToSeqId = FromSeqId + SegmentEntryCount, + ok = file:write(Fd, + << ?MAGIC:32, + ?VERSION:8, + FromSeqId:64/unsigned, + ToSeqId:64/unsigned, + 0:344 >>); + _ -> + ok + end, ok = file:pwrite(Fd, lists:sort(LocBytes)), + FsyncFun(Fd), ok = file:close(Fd), FoldState end, State0, Writes), @@ -222,17 +244,17 @@ flush_buffer_build(Tail = [{SeqId, _}|_], _, SegmentThreshold, _, WriteOffset, W _ -> {Tail, [{WriteOffset, lists:reverse(WriteAcc)}|Acc]} end; flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], - CheckCRC32, SegmentEntryCount, Offset, WriteOffset, WriteAcc, Acc) -> - flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + CheckCRC32, SegmentThreshold, Offset, WriteOffset, WriteAcc, Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentThreshold, Offset + ?ENTRY_HEADER_SIZE + Size, WriteOffset, [build_data(Entry, CheckCRC32)|WriteAcc], Acc); flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], - CheckCRC32, SegmentEntryCount, _, _, [], Acc) -> - flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + CheckCRC32, SegmentThreshold, _, _, [], Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentThreshold, Offset + ?ENTRY_HEADER_SIZE + Size, Offset, [build_data(Entry, CheckCRC32)], Acc); flush_buffer_build([{_, Entry = {Offset, Size, _}}|Tail], - CheckCRC32, SegmentEntryCount, _, WriteOffset, WriteAcc, Acc) -> - flush_buffer_build(Tail, CheckCRC32, SegmentEntryCount, + CheckCRC32, SegmentThreshold, _, WriteOffset, WriteAcc, Acc) -> + flush_buffer_build(Tail, CheckCRC32, SegmentThreshold, Offset + ?ENTRY_HEADER_SIZE + Size, Offset, [build_data(Entry, CheckCRC32)], [{WriteOffset, lists:reverse(WriteAcc)}|Acc]); flush_buffer_build([], _, _, _, _, [], Acc) -> @@ -423,7 +445,8 @@ get_read_fd(Segment, State = #qs{ read_fd = OldFd }) -> read_fd = undefined }} end; {error, enoent} -> - {{error, no_file}, State} + {{error, no_file}, State#qs{ read_segment = undefined, + read_fd = undefined }} end. 
-spec check_msg_on_disk(rabbit_variable_queue:seq_id(), msg_location(), State)

@@ -444,16 +467,10 @@ check_msg_on_disk(SeqId, {?MODULE, Offset, Size}, State0) ->
                 0 ->
                     {ok, State};
                 1 ->
-                    %% We only want to check the CRC32 if configured to do so.
-                    case check_crc32() of
-                        false ->
-                            {ok, State};
-                        true ->
-                            CRC32 = erlang:crc32(MsgBin),
-                            case <<CRC32:16>> of
-                                CRC32Expected -> {ok, State};
-                                _ -> {{error, bad_crc}, State}
-                            end
+                    CRC32 = erlang:crc32(MsgBin),
+                    case <<CRC32:16>> of
+                        CRC32Expected -> {ok, State};
+                        _ -> {{error, bad_crc}, State}
                     end
             end;
         _ ->
@@ -518,23 +535,16 @@ delete_segments(Segments, State0 = #qs{ write_buffer = WriteBuffer0,
       end || Segment <- Segments],
     %% Finally, we remove any entries from the buffer that fall within
-    %% the segments that were deleted. For simplicity's sake, we take
-    %% the highest SeqId from these files and remove any SeqId lower
-    %% than or equal to this SeqId from the buffer.
-    %%
-    %% @todo If this becomes a performance issue we may take inspiration
-    %% from sets:filter/2.
-    HighestSegment = lists:foldl(fun
-        (S, SAcc) when S > SAcc -> S;
-        (_, SAcc) -> SAcc
-    end, -1, Segments),
-    HighestSeqId = (1 + HighestSegment) * segment_entry_count(),
+    %% the segments that were deleted.
+    SegmentEntryCount = segment_entry_count(),
     {WriteBuffer, WriteBufferSize} = maps:fold(fun
-        (SeqId, {_, MsgSize, _}, {WriteBufferAcc, WriteBufferSize1})
-                when SeqId =< HighestSeqId ->
-            {WriteBufferAcc, WriteBufferSize1 - MsgSize};
-        (SeqId, Value, {WriteBufferAcc, WriteBufferSize1}) ->
-            {WriteBufferAcc#{SeqId => Value}, WriteBufferSize1}
+        (SeqId, Value = {_, MsgSize, _}, {WriteBufferAcc, WriteBufferSize1}) ->
+            case lists:member(SeqId div SegmentEntryCount, Segments) of
+                true ->
+                    {WriteBufferAcc, WriteBufferSize1 - MsgSize};
+                false ->
+                    {WriteBufferAcc#{SeqId => Value}, WriteBufferSize1}
+            end
     end, {#{}, WriteBufferSize0}, WriteBuffer0),
     State#qs{ write_buffer = WriteBuffer,
               write_buffer_size = WriteBufferSize }.
diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 4ffcc22abae1..9be43626eaa0 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1192,7 +1192,7 @@ convert_from_v2_to_v1(State0 = #vqstate{ index_mod = rabbit_classic_queue_inde %% We have already closed the v2 index/store FDs when deleting the files. State#vqstate{ index_mod = rabbit_queue_index, index_state = V1Index, - store_state = V2Store }. + store_state = rabbit_classic_queue_store_v2:terminate(V2Store) }. convert_from_v2_to_v1_in_memory(State0 = #vqstate{ q1 = Q1b, q2 = Q2b, @@ -2403,13 +2403,12 @@ purge_pending_ack(KeepPersistent, store_state = StoreState0, msg_store_clients = MSCState }) -> {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, State1} = purge_pending_ack1(State), - StoreState1 = lists:foldl(fun rabbit_classic_queue_store_v2:remove/2, StoreState0, SeqIdsInStore), - %% @todo Sounds like we might want to remove only transients from the cache? 
case KeepPersistent of true -> remove_transient_msgs_by_id(MsgIdsByStore, MSCState), - State1 #vqstate { store_state = StoreState1 }; + State1; false -> {DeletedSegments, IndexState1} = IndexMod:ack(IndexOnDiskSeqIds, IndexState), + StoreState1 = lists:foldl(fun rabbit_classic_queue_store_v2:remove/2, StoreState0, SeqIdsInStore), StoreState = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState1), remove_vhost_msgs_by_id(MsgIdsByStore, MSCState), State1 #vqstate { index_state = IndexState1, From f3963a5998a5d0cce5f80bdc8e077aef2fad3ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 1 Aug 2022 13:07:34 +0200 Subject: [PATCH 26/39] CQv2: Sync/handle confirms before conversion --- deps/rabbit/src/rabbit_variable_queue.erl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 9be43626eaa0..4ecf0bc3ade0 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1041,10 +1041,19 @@ zip_msgs_and_acks(Msgs, AckTags, Accumulator, _State) -> set_queue_version(Version, State = #vqstate { version = Version }) -> State; %% v2 -> v1. -set_queue_version(1, State = #vqstate { version = 2 }) -> +set_queue_version(1, State0 = #vqstate { version = 2 }) -> + %% We call timeout/1 so that we sync to disk and get the confirms + %% handled before we do the conversion. This is necessary because + %% v2 now has a simpler confirms code path. + %% @todo Perhaps rename the function. + State = timeout(State0), convert_from_v2_to_v1(State #vqstate { version = 1 }); %% v1 -> v2. -set_queue_version(2, State = #vqstate { version = 1 }) -> +set_queue_version(2, State0 = #vqstate { version = 1 }) -> + %% We call timeout/1 so that we sync to disk and get the confirms + %% handled before we do the conversion. This is necessary because + %% v2 now has a simpler confirms code path. 
+ State = timeout(State0), convert_from_v1_to_v2(State #vqstate { version = 2 }). -define(CONVERT_COUNT, 1). @@ -2057,6 +2066,7 @@ remove_queue_entries1( stats_removed(MsgStatus, State)}. process_delivers_and_acks_fun(deliver_and_ack) -> + %% @todo Make a clause for empty Acks list? fun (NextDeliverSeqId, Acks, State = #vqstate { index_mod = IndexMod, index_state = IndexState, store_state = StoreState0}) -> From 8051b00305ad401e7bd81d22804be1fcccfb3829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 5 Sep 2022 09:41:34 +0200 Subject: [PATCH 27/39] CQv2: Small fixes of and via the property suite --- deps/rabbit/src/rabbit_msg_store.erl | 6 ++++++ deps/rabbit/test/classic_queue_prop_SUITE.erl | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_msg_store.erl b/deps/rabbit/src/rabbit_msg_store.erl index 75d106e19def..baa22da997a5 100644 --- a/deps/rabbit/src/rabbit_msg_store.erl +++ b/deps/rabbit/src/rabbit_msg_store.erl @@ -974,6 +974,12 @@ handle_info(sync, State) -> handle_info(timeout, State) -> noreply(internal_sync(State)); +%% @todo When a CQ crashes the message store does not remove +%% the client information and clean up. This eventually +%% leads to the queue running a full recovery on the next +%% message store restart because the store will keep the +%% crashed queue's ref in its persistent state and fail +%% to find the corresponding ref during start. 
handle_info({'DOWN', _MRef, process, Pid, _Reason}, State) -> %% similar to what happens in %% rabbit_amqqueue_process:handle_ch_down but with a relation of diff --git a/deps/rabbit/test/classic_queue_prop_SUITE.erl b/deps/rabbit/test/classic_queue_prop_SUITE.erl index c25bbb0606a1..e32798bba298 100644 --- a/deps/rabbit/test/classic_queue_prop_SUITE.erl +++ b/deps/rabbit/test/classic_queue_prop_SUITE.erl @@ -544,7 +544,11 @@ postcondition(_, {call, _, cmd_channel_publish, _}, Msg) -> postcondition(_, {call, _, cmd_channel_publish_many, _}, Msgs) -> lists:all(fun(Msg) -> is_record(Msg, amqp_msg) end, Msgs); postcondition(_, {call, _, cmd_channel_wait_for_confirms, _}, Res) -> - Res =:= true; + %% It is possible for nacks to be sent during restarts. + %% This is a rare event but it is not a bug, the client + %% is simply expected to take action. Timeouts are + %% always a bug however (acks/nacks not sent at all). + Res =:= true orelse Res =:= false; postcondition(_, {call, _, cmd_channel_consume, _}, _) -> true; postcondition(_, {call, _, cmd_channel_cancel, _}, _) -> From 723cc54705305857cfad3f5e7bae778cdd61af54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 5 Sep 2022 13:37:33 +0200 Subject: [PATCH 28/39] CQ: Some cleanup --- Makefile | 2 +- deps/rabbit/src/rabbit_amqqueue_process.erl | 9 +-- .../src/rabbit_classic_queue_store_v2.erl | 6 +- deps/rabbit/src/rabbit_variable_queue.erl | 77 +++++-------------- deps/rabbit/test/backing_queue_SUITE.erl | 45 +---------- 5 files changed, 25 insertions(+), 114 deletions(-) diff --git a/Makefile b/Makefile index 08098537d364..018b0a91b750 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ include plugins.mk # Note: When including NIFs in a release make sure to build # them on the appropriate platform for the target environment. # For example build looking_glass on Linux when targeting Docker. 
-ADDITIONAL_PLUGINS ?= looking_glass +ADDITIONAL_PLUGINS ?= DEPS = rabbit_common rabbit $(PLUGINS) $(ADDITIONAL_PLUGINS) diff --git a/deps/rabbit/src/rabbit_amqqueue_process.erl b/deps/rabbit/src/rabbit_amqqueue_process.erl index d114ca5ea2e9..5a3145ac7220 100644 --- a/deps/rabbit/src/rabbit_amqqueue_process.erl +++ b/deps/rabbit/src/rabbit_amqqueue_process.erl @@ -502,14 +502,7 @@ next_state(State = #q{q = Q, backing_queue = BQ, backing_queue_state = BQS, msg_id_to_channel = MTC}) -> -% try - assert_invariant(State), -% catch C:E:S -> -% rabbit_time_travel_dbg:dump(), -% rabbit_time_travel_dbg:print(), -% logger:error("BAD STATE ~p", [State]), -% erlang:raise(C,E,S) -% end, + assert_invariant(State), {MsgIds, BQS1} = BQ:drain_confirmed(BQS), MTC1 = confirm_messages(MsgIds, MTC, amqqueue:get_name(Q)), State1 = State#q{backing_queue_state = BQS1, msg_id_to_channel = MTC1}, diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 0ce02dbb37dc..5681fba1b007 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -279,9 +279,6 @@ build_data({_, Size, Msg}, CheckCRC32) -> -spec read(rabbit_variable_queue:seq_id(), msg_location(), State) -> {rabbit_types:basic_message(), State} when State::state(). -%% @todo We should try to have a read_many for when reading many from the index -%% so that we fetch many different messages in a single file:pread. See -%% if that helps improve the performance. read(SeqId, DiskLocation, State = #qs{ write_buffer = WriteBuffer, cache = Cache }) -> ?DEBUG("~0p ~0p ~0p", [SeqId, DiskLocation, State]), @@ -324,6 +321,9 @@ read_from_disk(SeqId, {?MODULE, Offset, Size}, State0) -> Msg = binary_to_term(MsgBin), {Msg, State}. +-spec read_many([{rabbit_variable_queue:seq_id(), msg_location()}], State) + -> {[rabbit_types:basic_message()], State} when State::state(). 
+ read_many([], State) -> {[], State}; read_many(Reads0, State0 = #qs{ write_buffer = WriteBuffer, diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 4ecf0bc3ade0..661870cb7a62 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -812,13 +812,10 @@ requeue(AckTags, #vqstate { delta = Delta, q3 = Q3, in_counter = InCounter, len = Len } = State) -> - %% @todo This can be heavily simplified: if the message falls into delta, %% add it there. Otherwise just add it to q3 in the correct position. - - {SeqIds, Q3a, MsgIds, State1} = queue_merge(lists:sort(AckTags), Q3, [], - delta_limit(Delta), - fun publish_beta/2, State), + {SeqIds, Q3a, MsgIds, State1} = requeue_merge(lists:sort(AckTags), Q3, [], + delta_limit(Delta), State), {Delta1, MsgIds1, State2} = delta_merge(SeqIds, Delta, MsgIds, State1), MsgCount = length(MsgIds1), @@ -895,30 +892,9 @@ update_rate(Now, TS, Count, Rate) -> end. %% @todo Should be renamed since it's only used to update_rates. +%% Can do this after mirroring gets removed. ram_duration(State) -> - State1 = %#vqstate { rates = #rates { in = AvgIngressRate, - % out = AvgEgressRate, - % ack_in = AvgAckIngressRate, - % ack_out = AvgAckEgressRate }, - % ram_msg_count = RamMsgCount, - % ram_msg_count_prev = RamMsgCountPrev, - % ram_pending_ack = RPA, - % ram_ack_count_prev = RamAckCountPrev } = - update_rates(State), - -% RamAckCount = map_size(RPA), -% -% Duration = %% msgs+acks / (msgs+acks/sec) == sec -% case lists:all(fun (X) -> X < 0.01 end, -% [AvgEgressRate, AvgIngressRate, -% AvgAckEgressRate, AvgAckIngressRate]) of -% true -> infinity; -% false -> (RamMsgCountPrev + RamMsgCount + -% RamAckCount + RamAckCountPrev) / -% (4 * (AvgEgressRate + AvgIngressRate + -% AvgAckEgressRate + AvgAckIngressRate)) -% end, - + State1 = update_rates(State), {infinity, State1}. 
needs_timeout(#vqstate { index_mod = IndexMod, @@ -1045,7 +1021,6 @@ set_queue_version(1, State0 = #vqstate { version = 2 }) -> %% We call timeout/1 so that we sync to disk and get the confirms %% handled before we do the conversion. This is necessary because %% v2 now has a simpler confirms code path. - %% @todo Perhaps rename the function. State = timeout(State0), convert_from_v2_to_v1(State #vqstate { version = 1 }); %% v1 -> v2. @@ -1461,13 +1436,6 @@ beta_msg_status0(SeqId, MsgProps, IsPersistent) -> index_on_disk = true, msg_props = MsgProps}. -%trim_msg_status(MsgStatus) -> -% case persist_to(MsgStatus) of -% msg_store -> MsgStatus#msg_status{msg = undefined}; -% queue_store -> MsgStatus#msg_status{msg = undefined}; -% queue_index -> MsgStatus -% end. - with_msg_store_state({MSCStateP, MSCStateT}, true, Fun) -> {Result, MSCStateP1} = Fun(MSCStateP), {Result, {MSCStateP1, MSCStateT}}; @@ -2542,42 +2510,33 @@ msgs_and_indices_written_to_disk(Callback, MsgIdSet) -> %% Internal plumbing for requeue %%---------------------------------------------------------------------------- -%% @todo This function is misnamed because we don't do alpha/beta anymore. -%% So we don't need to write the messages to disk at all, just add -%% the messages back to q3 and update the stats. -publish_beta(MsgStatus, State) -> -% {MsgStatus1, State1} = maybe_prepare_write_to_disk(true, false, MsgStatus, State), -% MsgStatus2 = m(trim_msg_status(MsgStatus1)), - {MsgStatus, stats_requeued_memory(MsgStatus, State)}. - %% Rebuild queue, inserting sequence ids to maintain ordering -queue_merge(SeqIds, Q, MsgIds, Limit, PubFun, State) -> - queue_merge(SeqIds, Q, ?QUEUE:new(), MsgIds, - Limit, PubFun, State). +requeue_merge(SeqIds, Q, MsgIds, Limit, State) -> + requeue_merge(SeqIds, Q, ?QUEUE:new(), MsgIds, + Limit, State). 
-queue_merge([SeqId | Rest] = SeqIds, Q, Front, MsgIds, - Limit, PubFun, State) +requeue_merge([SeqId | Rest] = SeqIds, Q, Front, MsgIds, + Limit, State) when Limit == undefined orelse SeqId < Limit -> case ?QUEUE:out(Q) of {{value, #msg_status { seq_id = SeqIdQ } = MsgStatus}, Q1} when SeqIdQ < SeqId -> %% enqueue from the remaining queue - queue_merge(SeqIds, Q1, ?QUEUE:in(MsgStatus, Front), MsgIds, - Limit, PubFun, State); + requeue_merge(SeqIds, Q1, ?QUEUE:in(MsgStatus, Front), MsgIds, + Limit, State); {_, _Q1} -> %% enqueue from the remaining list of sequence ids case msg_from_pending_ack(SeqId, State) of {none, _} -> - queue_merge(Rest, Q, Front, MsgIds, Limit, PubFun, State); - {MsgStatus, State1} -> - {#msg_status { msg_id = MsgId } = MsgStatus1, State2} = - PubFun(MsgStatus, State1), - queue_merge(Rest, Q, ?QUEUE:in(MsgStatus1, Front), [MsgId | MsgIds], - Limit, PubFun, State2) + requeue_merge(Rest, Q, Front, MsgIds, Limit, State); + {#msg_status { msg_id = MsgId } = MsgStatus, State1} -> + State2 = stats_requeued_memory(MsgStatus, State1), + requeue_merge(Rest, Q, ?QUEUE:in(MsgStatus, Front), [MsgId | MsgIds], + Limit, State2) end end; -queue_merge(SeqIds, Q, Front, MsgIds, - _Limit, _PubFun, State) -> +requeue_merge(SeqIds, Q, Front, MsgIds, + _Limit, State) -> {SeqIds, ?QUEUE:join(Front, Q), MsgIds, State}. delta_merge([], Delta, MsgIds, State) -> diff --git a/deps/rabbit/test/backing_queue_SUITE.erl b/deps/rabbit/test/backing_queue_SUITE.erl index ffd2778c4479..538178cdf4a9 100644 --- a/deps/rabbit/test/backing_queue_SUITE.erl +++ b/deps/rabbit/test/backing_queue_SUITE.erl @@ -20,7 +20,6 @@ -define(VHOST, <<"/">>). 
-define(VARIABLE_QUEUE_TESTCASES, [ -% variable_queue_dynamic_duration_change, variable_queue_partial_segments_delta_thing, variable_queue_all_the_bits_not_covered_elsewhere_A, variable_queue_all_the_bits_not_covered_elsewhere_B, @@ -934,42 +933,6 @@ get_queue_sup_pid([{_, SupPid, _, _} | Rest], QueuePid) -> get_queue_sup_pid([], _QueuePid) -> undefined. -%variable_queue_dynamic_duration_change(Config) -> -% passed = rabbit_ct_broker_helpers:rpc(Config, 0, -% ?MODULE, variable_queue_dynamic_duration_change1, [Config]). -% -%variable_queue_dynamic_duration_change1(Config) -> -% with_fresh_variable_queue( -% fun variable_queue_dynamic_duration_change2/2, -% ?config(variable_queue_type, Config)). -% -%variable_queue_dynamic_duration_change2(VQ0, _QName) -> -% IndexMod = index_mod(), -% SegmentSize = IndexMod:next_segment_boundary(0), -% -% %% start by sending in a couple of segments worth -% Len = 2*SegmentSize, -% VQ1 = variable_queue_publish(false, Len, VQ0), -% %% squeeze and relax queue -% Churn = Len div 32, -% VQ2 = publish_fetch_and_ack(Churn, Len, VQ1), -% -% {Duration, VQ3} = rabbit_variable_queue:ram_duration(VQ2), -% VQ7 = lists:foldl( -% fun (Duration1, VQ4) -> -% {_Duration, VQ5} = rabbit_variable_queue:ram_duration(VQ4), -% VQ6 = variable_queue_set_ram_duration_target( -% Duration1, VQ5), -% publish_fetch_and_ack(Churn, Len, VQ6) -% end, VQ3, [Duration / 4, 0, Duration / 4, infinity]), -% -% %% drain -% {VQ8, AckTags} = variable_queue_fetch(Len, false, false, Len, VQ7), -% {_Guids, VQ9} = rabbit_variable_queue:ack(AckTags, VQ8), -% {empty, VQ10} = rabbit_variable_queue:fetch(true, VQ9), -% -% VQ10. - variable_queue_partial_segments_delta_thing(Config) -> passed = rabbit_ct_broker_helpers:rpc(Config, 0, ?MODULE, variable_queue_partial_segments_delta_thing1, [Config]). 
@@ -1271,7 +1234,7 @@ variable_queue_fetchwhile_varying_ram_duration1(Config) -> variable_queue_fetchwhile_varying_ram_duration2(VQ0, _QName) -> test_dropfetchwhile_varying_ram_duration( fun (VQ1) -> - {Fetched, ok, VQ2} = rabbit_variable_queue:fetchwhile( + {_, ok, VQ2} = rabbit_variable_queue:fetchwhile( fun (_) -> false end, fun (_, _, A) -> A end, ok, VQ1), @@ -1711,7 +1674,6 @@ with_fresh_variable_queue(Fun, Mode) -> {delta, undefined, 0, 0, undefined}}, {q3, 0}, {q4, 0}, {len, 0}]), - %% @todo Some tests probably don't keep this after restart (dropwhile_(sync)_restart, A2). VQ1 = set_queue_mode(Mode, VQ), try _ = rabbit_variable_queue:delete_and_terminate( @@ -1729,10 +1691,7 @@ with_fresh_variable_queue(Fun, Mode) -> passed. set_queue_mode(Mode, VQ) -> - VQ1 = rabbit_variable_queue:set_queue_mode(Mode, VQ), - S1 = variable_queue_status(VQ1), -% assert_props(S1, [{mode, Mode}]), - VQ1. + rabbit_variable_queue:set_queue_mode(Mode, VQ). variable_queue_publish(IsPersistent, Count, VQ) -> variable_queue_publish(IsPersistent, Count, fun (_N, P) -> P end, VQ). From 23f1346b08068fb737e1be60126335977026f86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Mon, 5 Sep 2022 14:34:27 +0200 Subject: [PATCH 29/39] CQ: Some more cleanup --- .../src/rabbit_classic_queue_store_v2.erl | 1 - deps/rabbit/src/rabbit_time_travel_dbg.erl | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl index 5681fba1b007..f76521d74be4 100644 --- a/deps/rabbit/src/rabbit_classic_queue_store_v2.erl +++ b/deps/rabbit/src/rabbit_classic_queue_store_v2.erl @@ -390,7 +390,6 @@ consolidate_reads([], _, Segment, _, Acc, Segs) -> read_many_from_disk(Segs, Msgs, State) -> %% We read from segments in reverse order because %% we need to control the order of returned messages. - %% @todo As a result it doesn't help much to keep the read FD. 
Keys = lists:reverse(lists:sort(maps:keys(Segs))), lists:foldl(fun(Segment, {Acc0, FoldState0}) -> {ok, Fd, FoldState} = get_read_fd(Segment, FoldState0), diff --git a/deps/rabbit/src/rabbit_time_travel_dbg.erl b/deps/rabbit/src/rabbit_time_travel_dbg.erl index 03d49c036203..75f3ee4cb541 100644 --- a/deps/rabbit/src/rabbit_time_travel_dbg.erl +++ b/deps/rabbit/src/rabbit_time_travel_dbg.erl @@ -1,3 +1,21 @@ +%% This Source Code Form is subject to the terms of the Mozilla Public +%% License, v. 2.0. If a copy of the MPL was not distributed with this +%% file, You can obtain one at https://mozilla.org/MPL/2.0/. +%% +%% Copyright (c) 2007-2022 VMware, Inc. or its affiliates. All rights reserved. +%% + +%% This module is a debugging utility mainly meant for debugging +%% test failures. It should not be used in production in its +%% current form. +%% +%% The tracer is started and configured against a pid and a set +%% of applications. Then update the code where you need to +%% obtain events, such as a crash. Wrapping the crash in +%% a try/catch and adding rabbit_time_travel_dbg:print() +%% will print up to 1000 events before the crash occured, +%% allowing you to easily figure out what happened. + -module(rabbit_time_travel_dbg). -compile(export_all). -compile(nowarn_export_all). 
From f59020191b40c9396b983c96a1b77e91698b160f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 6 Sep 2022 11:51:34 +0200 Subject: [PATCH 30/39] CQ: Enable borken checks in backing_queue_SUITE again --- deps/rabbit/test/backing_queue_SUITE.erl | 108 ++++++++--------------- 1 file changed, 35 insertions(+), 73 deletions(-) diff --git a/deps/rabbit/test/backing_queue_SUITE.erl b/deps/rabbit/test/backing_queue_SUITE.erl index 538178cdf4a9..8697f2257d4d 100644 --- a/deps/rabbit/test/backing_queue_SUITE.erl +++ b/deps/rabbit/test/backing_queue_SUITE.erl @@ -949,35 +949,31 @@ variable_queue_partial_segments_delta_thing2(VQ0, _QName) -> OneAndAHalfSegment = SegmentSize + HalfSegment, VQ1 = variable_queue_publish(true, OneAndAHalfSegment, VQ0), {_Duration, VQ2} = rabbit_variable_queue:ram_duration(VQ1), - VQ3 = variable_queue_set_ram_duration_target(0, VQ2), -% VQ3 = check_variable_queue_status( -% variable_queue_set_ram_duration_target(0, VQ2), -% %% one segment in q3, and half a segment in delta -% %% @todo That's wrong now! v1/v2 -% [{delta, {delta, SegmentSize, HalfSegment, 0, OneAndAHalfSegment}}, -% {q3, SegmentSize}, -% {len, SegmentSize + HalfSegment}]), + VQ3 = check_variable_queue_status( + variable_queue_set_ram_duration_target(0, VQ2), + %% We only have one message in memory because the amount in memory + %% depends on the consume rate, which is nil in this test. + [{delta, {delta, 1, OneAndAHalfSegment - 1, 0, OneAndAHalfSegment}}, + {q3, 1}, + {len, OneAndAHalfSegment}]), VQ4 = variable_queue_set_ram_duration_target(infinity, VQ3), - VQ5 = variable_queue_publish(true, 1, VQ4), -% VQ5 = check_variable_queue_status( -% variable_queue_publish(true, 1, VQ4), -% %% one alpha, but it's in the same segment as the deltas -% %% @todo That's wrong now! 
v1/v2 -% [{q1, 1}, -% {delta, {delta, SegmentSize, HalfSegment, 0, OneAndAHalfSegment}}, -% {q3, SegmentSize}, -% {len, SegmentSize + HalfSegment + 1}]), + VQ5 = check_variable_queue_status( + variable_queue_publish(true, 1, VQ4), + %% one alpha, but it's in the same segment as the deltas + %% @todo That's wrong now! v1/v2 + [{delta, {delta, 1, OneAndAHalfSegment, 0, OneAndAHalfSegment + 1}}, + {q3, 1}, + {len, OneAndAHalfSegment + 1}]), {VQ6, AckTags} = variable_queue_fetch(SegmentSize, true, false, SegmentSize + HalfSegment + 1, VQ5), - VQ7 = VQ6, -% VQ7 = check_variable_queue_status( -% VQ6, -% %% the half segment should now be in q3 -% %% @todo That's wrong now! v1/v2 -% [{q1, 1}, -% {delta, {delta, undefined, 0, 0, undefined}}, -% {q3, HalfSegment}, -% {len, HalfSegment + 1}]), + VQ7 = check_variable_queue_status( + VQ6, + %% We only read from delta up to the end of the segment, so + %% after fetching exactly one segment, we should have no + %% messages in memory. + [{delta, {delta, SegmentSize, HalfSegment + 1, 0, OneAndAHalfSegment + 1}}, + {q3, 0}, + {len, HalfSegment + 1}]), {VQ8, AckTags1} = variable_queue_fetch(HalfSegment + 1, true, false, HalfSegment + 1, VQ7), {_Guids, VQ9} = rabbit_variable_queue:ack(AckTags ++ AckTags1, VQ8), @@ -1274,34 +1270,10 @@ variable_queue_ack_limiting2(VQ0, _Config) -> %% fetch half the messages {VQ4, _AckTags} = variable_queue_fetch(Len div 2, false, false, Len, VQ3), - Status = variable_queue_status(VQ4), - VQ5 = VQ4, -% VQ5 = case proplists:get_value(mode, Status) of -% default -> -% check_variable_queue_status( -% VQ4, [{len, Len div 2}, -% {messages_unacknowledged_ram, Len div 2}, -% {messages_ready_ram, Len div 2}, -% {messages_ram, Len}]); -% lazy -> -% VQ4 -% %% @todo This check has been broken for a very long time. (Or never worked.) 
-%% check_variable_queue_status( -%% VQ4, [{len, Len div 2}, -%% {messages_unacknowledged_ram, 0}, -%% {messages_ready_ram, 0}, -%% {messages_ram, 0}]) -% end, - - %% ensure all acks go to disk on 0 duration target - %% @todo They don't! v1 with embed limit 1024 only (so with embedded messages) -% VQ6 = check_variable_queue_status( -% variable_queue_set_ram_duration_target(0, VQ5), -% [{len, Len div 2}, -% {target_ram_count, 0}, -% {messages_unacknowledged_ram, 0}, -% {messages_ready_ram, 0}, -% {messages_ram, 0}]), + %% We only check the length anymore because + %% that's the only predictable stats we got. + VQ5 = check_variable_queue_status(VQ4, [{len, Len div 2}]), + VQ6 = variable_queue_set_ram_duration_target(0, VQ5), VQ6. @@ -1850,9 +1822,7 @@ variable_queue_with_holes(VQ0) -> true, Count + 1, Interval, fun (_, P) -> P end, fun erlang:term_to_binary/1, VQ7), %% assertions - Status = variable_queue_status(VQ8), - - vq_with_holes_assertions(VQ8, proplists:get_value(mode, Status)), + vq_with_holes_assertions(VQ8), Depth = Count + Interval, Depth = rabbit_variable_queue:depth(VQ8), Len = Depth - length(Subset3), @@ -1860,22 +1830,14 @@ variable_queue_with_holes(VQ0) -> {Seq3, Seq -- Seq3, lists:seq(Count + 1, Count + Interval), VQ8}. -vq_with_holes_assertions(VQ, _) -> ok. -%vq_with_holes_assertions(VQ, default) -> -% [false = -% case V of -% {delta, _, 0, _, _} -> true; -% 0 -> true; -% _ -> false -% end || {K, V} <- variable_queue_status(VQ), -% lists:member(K, [delta, q3])]; -%vq_with_holes_assertions(VQ, lazy) -> -% [false = -% case V of -% {delta, _, 0, _, _} -> true; -% _ -> false -% end || {K, V} <- variable_queue_status(VQ), -% lists:member(K, [delta])]. +vq_with_holes_assertions(VQ) -> + [false = + case V of + {delta, _, 0, _, _} -> true; + 0 -> true; + _ -> false + end || {K, V} <- variable_queue_status(VQ), + lists:member(K, [delta, q3])]. 
check_variable_queue_status(VQ0, Props) -> VQ1 = variable_queue_wait_for_shuffling_end(VQ0), From f1ae0074558d449c405406d2eadca0eca920a7e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 6 Sep 2022 12:14:43 +0200 Subject: [PATCH 31/39] CQ: Fix test compilation error following rebase --- deps/rabbit/test/classic_queue_prop_SUITE.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/test/classic_queue_prop_SUITE.erl b/deps/rabbit/test/classic_queue_prop_SUITE.erl index e32798bba298..25f0e009fc6b 100644 --- a/deps/rabbit/test/classic_queue_prop_SUITE.erl +++ b/deps/rabbit/test/classic_queue_prop_SUITE.erl @@ -1102,7 +1102,7 @@ reg_v1_full_recover_only_journal(Config) -> do_reg_v1_full_recover_only_journal(Config) -> - St0 = #cq{name=prop_classic_queue_v1, mode=lazy, version=1, + St0 = #cq{name=prop_classic_queue_v1, version=1, config=minimal_config(Config)}, Res1 = cmd_setup_queue(St0), From 1d7ce62f4bd74925f8bfad6ae5567e7d6e56944a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 8 Sep 2022 15:41:01 +0200 Subject: [PATCH 32/39] CQ: Remove a couple more unneded callbacks This callback was removed in a previous commit and was only used for bump_reduce_memory_use. --- deps/rabbit/src/rabbit_mirror_queue_master.erl | 6 +----- deps/rabbit/src/rabbit_priority_queue.erl | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/deps/rabbit/src/rabbit_mirror_queue_master.erl b/deps/rabbit/src/rabbit_mirror_queue_master.erl index bf26eb8dea7b..e7ef264a1078 100644 --- a/deps/rabbit/src/rabbit_mirror_queue_master.erl +++ b/deps/rabbit/src/rabbit_mirror_queue_master.erl @@ -16,7 +16,7 @@ needs_timeout/1, timeout/1, handle_pre_hibernate/1, resume/1, msg_rates/1, info/2, invoke/3, is_duplicate/2, set_queue_mode/2, set_queue_version/2, - zip_msgs_and_acks/4, handle_info/2]). + zip_msgs_and_acks/4]). -export([start/2, stop/1, delete_crashed/1]). 
@@ -418,10 +418,6 @@ handle_pre_hibernate(State = #state { backing_queue = BQ, backing_queue_state = BQS }) -> State #state { backing_queue_state = BQ:handle_pre_hibernate(BQS) }. -handle_info(Msg, State = #state { backing_queue = BQ, - backing_queue_state = BQS }) -> - State #state { backing_queue_state = BQ:handle_info(Msg, BQS) }. - resume(State = #state { backing_queue = BQ, backing_queue_state = BQS }) -> State #state { backing_queue_state = BQ:resume(BQS) }. diff --git a/deps/rabbit/src/rabbit_priority_queue.erl b/deps/rabbit/src/rabbit_priority_queue.erl index be2310d174e6..d5f2cf0f53b8 100644 --- a/deps/rabbit/src/rabbit_priority_queue.erl +++ b/deps/rabbit/src/rabbit_priority_queue.erl @@ -35,7 +35,7 @@ handle_pre_hibernate/1, resume/1, msg_rates/1, info/2, invoke/3, is_duplicate/2, set_queue_mode/2, set_queue_version/2, - zip_msgs_and_acks/4, handle_info/2]). + zip_msgs_and_acks/4]). -record(state, {bq, bqss, max_priority}). -record(passthrough, {bq, bqs}). @@ -395,11 +395,6 @@ handle_pre_hibernate(State = #state{bq = BQ}) -> handle_pre_hibernate(State = #passthrough{bq = BQ, bqs = BQS}) -> ?passthrough1(handle_pre_hibernate(BQS)). -handle_info(Msg, State = #state{bq = BQ}) -> - foreach1(fun (_P, BQSN) -> BQ:handle_info(Msg, BQSN) end, State); -handle_info(Msg, State = #passthrough{bq = BQ, bqs = BQS}) -> - ?passthrough1(handle_info(Msg, BQS)). - resume(State = #state{bq = BQ}) -> foreach1(fun (_P, BQSN) -> BQ:resume(BQSN) end, State); resume(State = #passthrough{bq = BQ, bqs = BQS}) -> From 0e0635ffee15fbcf73c610d23471c17b75981d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 8 Sep 2022 15:41:46 +0200 Subject: [PATCH 33/39] CQ: Make the resume/1 function sync to disk This just restores behavior that was there before via reduce_memory_use. I am not sure if it is of any use but it doesn't hurt to have it. 
--- deps/rabbit/src/rabbit_variable_queue.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index 661870cb7a62..ec58d32d2744 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -931,7 +931,7 @@ handle_pre_hibernate(State = #vqstate { index_mod = IndexMod, unconfirmed_simple = sets:new([{version,2}]), confirmed = sets:union(C, UCS) }. -resume(State) -> a(State). +resume(State) -> a(timeout(State)). msg_rates(#vqstate { rates = #rates { in = AvgIngressRate, out = AvgEgressRate } }) -> From c86254ae7dba888d87a69d36b5b070619138e1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 8 Sep 2022 15:42:59 +0200 Subject: [PATCH 34/39] CQ: Don't use outgoing rate to throttle purging the queue When purging the queue we want to read the maximum number of messages from disk (2048) because these messages will quickly be gone. Using the outgoing rate could end up making us read 1 message at a time which makes the performance much worse. --- deps/rabbit/src/rabbit_variable_queue.erl | 29 ++++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index ec58d32d2744..d049ab37ed89 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -1995,20 +1995,18 @@ count_pending_acks(#vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) -> map_size(RPA) + map_size(DPA). -%% @todo This should set the out rate to infinity temporarily while we drop deltas. %% @todo When doing maybe_deltas_to_betas stats are updated. Then stats %% are updated again in remove_queue_entries1. All unnecessary since %% we are purging anyway? 
purge_betas_and_deltas(DelsAndAcksFun, State) -> - State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State), + %% We use the maximum memory limit when purging to get greater performance. + MemoryLimit = 2048, + State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State, MemoryLimit), case ?QUEUE:is_empty(Q3) of true -> State0; false -> State1 = remove_queue_entries(Q3, DelsAndAcksFun, State0), - purge_betas_and_deltas(DelsAndAcksFun, - maybe_deltas_to_betas( - DelsAndAcksFun, - State1#vqstate{q3 = ?QUEUE:new()})) + purge_betas_and_deltas(DelsAndAcksFun, State1#vqstate{q3 = ?QUEUE:new()}) end. remove_queue_entries(Q, DelsAndAcksFun, @@ -2674,12 +2672,15 @@ fetch_from_q3(State = #vqstate { delta = #delta { count = DeltaCount }, {loaded, {MsgStatus, State1}} end. -maybe_deltas_to_betas(State) -> +maybe_deltas_to_betas(State = #vqstate { rates = #rates{ out = OutRate }}) -> AfterFun = process_delivers_and_acks_fun(deliver_and_ack), - maybe_deltas_to_betas(AfterFun, State). + %% We allow from 1 to 2048 messages in memory depending on the consume rate. + MemoryLimit = min(1 + floor(2 * OutRate), 2048), + maybe_deltas_to_betas(AfterFun, State, MemoryLimit). maybe_deltas_to_betas(_DelsAndAcksFun, - State = #vqstate {delta = ?BLANK_DELTA_PATTERN(X) }) -> + State = #vqstate {delta = ?BLANK_DELTA_PATTERN(X) }, + _MemoryLimit) -> State; maybe_deltas_to_betas(DelsAndAcksFun, State = #vqstate { @@ -2693,14 +2694,12 @@ maybe_deltas_to_betas(DelsAndAcksFun, disk_read_count = DiskReadCount, delta_transient_bytes = DeltaTransientBytes, transient_threshold = TransientThreshold, - rates = #rates{ out = OutRate }, - version = Version }) -> + version = Version }, + MemoryLimit) -> #delta { start_seq_id = DeltaSeqId, count = DeltaCount, transient = Transient, end_seq_id = DeltaSeqIdEnd } = Delta, - %% We allow from 1 to 2048 messages in memory depending on the consume rate. 
- MemoryLimit = min(1 + floor(2 * OutRate), 2048), DeltaSeqId1 = lists:min([IndexMod:next_segment_boundary(DeltaSeqId), %% We must limit the number of messages read at once @@ -2735,7 +2734,8 @@ maybe_deltas_to_betas(DelsAndAcksFun, maybe_deltas_to_betas( DelsAndAcksFun, State2 #vqstate { - delta = d(Delta #delta { start_seq_id = DeltaSeqId1 })}); + delta = d(Delta #delta { start_seq_id = DeltaSeqId1 })}, + MemoryLimit); Q3aLen -> Q3b = ?QUEUE:join(Q3, Q3a), case DeltaCount - Q3aLen of @@ -2747,6 +2747,7 @@ maybe_deltas_to_betas(DelsAndAcksFun, N when N > 0 -> Delta1 = d(#delta { start_seq_id = DeltaSeqId1, count = N, + %% @todo Probably something wrong, seen it become negative... transient = Transient - TransientCount, end_seq_id = DeltaSeqIdEnd }), State2 #vqstate { delta = Delta1, From ef257323afd06929f40de7756498b011a34d2a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 9 Sep 2022 12:29:33 +0200 Subject: [PATCH 35/39] system_SUITE: wait for messages to be queued Somehow the CQ changes made one of the test in this suite fail with the wrong message count. This is in essence a followup to d5e81c921168afdb5aef79b43cf30ee5a506dce1 which already added a timeout to other tests in the suite. --- deps/rabbitmq_recent_history_exchange/test/system_SUITE.erl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deps/rabbitmq_recent_history_exchange/test/system_SUITE.erl b/deps/rabbitmq_recent_history_exchange/test/system_SUITE.erl index f87d6d9d1f50..b42ac89705dc 100644 --- a/deps/rabbitmq_recent_history_exchange/test/system_SUITE.erl +++ b/deps/rabbitmq_recent_history_exchange/test/system_SUITE.erl @@ -148,6 +148,9 @@ e2e_test(Config) -> routing_key = <<"">> }), + %% Wait a few seconds for all messages to be queued. 
+ timer:sleep(3000), + #'queue.declare_ok'{message_count = Count, queue = Q} = amqp_channel:call(Chan, #'queue.declare' { passive = true, From 73dd0acf0186612023d228b04b704440371a5c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 9 Sep 2022 13:44:58 +0200 Subject: [PATCH 36/39] rabbit_prometheus_http_SUITE: Update tests for new CQs CQs without consumers will have only one message in memory. --- .../test/rabbit_prometheus_http_SUITE.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl index d51f4abfa092..24bf3ac596f4 100644 --- a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl +++ b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl @@ -467,11 +467,11 @@ queue_coarse_metrics_per_object_test(Config) -> queue_metrics_per_object_test(Config) -> Expected1 = #{#{queue => "vhost-1-queue-with-consumer", vhost => "vhost-1"} => [7], - #{queue => "vhost-1-queue-with-messages", vhost => "vhost-1"} => [7]}, + #{queue => "vhost-1-queue-with-messages", vhost => "vhost-1"} => [1]}, Expected2 = #{#{queue => "vhost-2-queue-with-consumer", vhost => "vhost-2"} => [11], - #{queue => "vhost-2-queue-with-messages", vhost => "vhost-2"} => [11]}, + #{queue => "vhost-2-queue-with-messages", vhost => "vhost-2"} => [1]}, ExpectedD = #{#{queue => "default-queue-with-consumer", vhost => "/"} => [3], - #{queue => "default-queue-with-messages", vhost => "/"} => [3]}, + #{queue => "default-queue-with-messages", vhost => "/"} => [1]}, {_, Body1} = http_get_with_pal(Config, "/metrics/detailed?vhost=vhost-1&family=queue_metrics", [], 200), ?assertEqual(Expected1, map_get(rabbitmq_detailed_queue_messages_ram, parse_response(Body1))), From 69efad9f9f0c4bcb6c5c6a258c1cfaaafaf7bd71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 9 Sep 2022 13:47:45 +0200 Subject: [PATCH 37/39] 
CQ: Update shards count for the property suite --- deps/rabbit/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/rabbit/BUILD.bazel b/deps/rabbit/BUILD.bazel index 9132e464a5ed..1d11ccdecb22 100644 --- a/deps/rabbit/BUILD.bazel +++ b/deps/rabbit/BUILD.bazel @@ -309,7 +309,7 @@ suites = [ PACKAGE, name = "classic_queue_prop_SUITE", size = "large", - shard_count = 5, + shard_count = 3, sharding_method = "case", deps = [ "@proper//:erlang_app", From e09cbeb00c68de781fc710333dffd40d42f747db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 9 Sep 2022 14:13:11 +0200 Subject: [PATCH 38/39] CQ: Fix channel_operation_timeout_SUITE mixed versions Since ram_pending_acks is now a map the test must support both map and gb_trees to continue working. Also updated the state to reflect a field name change. --- .../channel_operation_timeout_test_queue.erl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/deps/rabbit/test/channel_operation_timeout_test_queue.erl b/deps/rabbit/test/channel_operation_timeout_test_queue.erl index c06509dd4d8c..98a1920660b9 100644 --- a/deps/rabbit/test/channel_operation_timeout_test_queue.erl +++ b/deps/rabbit/test/channel_operation_timeout_test_queue.erl @@ -83,10 +83,13 @@ %% default queue or lazy queue mode, version = 1, - %% number of reduce_memory_usage executions, once it - %% reaches a threshold the queue will manually trigger a runtime GC - %% see: maybe_execute_gc/1 - memory_reduction_run_count, + %% Fast path for confirms handling. Instead of having + %% index/store keep track of confirms separately and + %% doing intersect/subtract/union we just put the messages + %% here and on sync move them to 'confirmed'. + %% + %% Note: This field used to be 'memory_reduction_run_count'. + unconfirmed_simple, %% Queue data is grouped by VHost. We need to store it %% to work with queue index. 
virtual_host, @@ -325,7 +328,12 @@ zip_msgs_and_acks(Msgs, AckTags, Accumulator, State) -> %% Delay maybe_delay(QPA) -> - case is_timeout_test(maps:values(QPA)) of + %% The structure for ram_pending_acks has changed to maps in 3.12. + Values = case is_map(QPA) of + true -> maps:values(QPA); + false -> gb_trees:values(QPA) + end, + case is_timeout_test(Values) of true -> receive %% The queue received an EXIT message, it's probably the %% node being stopped with "rabbitmqctl stop". Thus, abort From 1eb17109c2de9a39917ec5faea9b17724491d8d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Tue, 20 Sep 2022 13:58:37 +0200 Subject: [PATCH 39/39] CQ: Update long description at the top of the module --- deps/rabbit/src/rabbit_variable_queue.erl | 259 +++++++--------------- 1 file changed, 76 insertions(+), 183 deletions(-) diff --git a/deps/rabbit/src/rabbit_variable_queue.erl b/deps/rabbit/src/rabbit_variable_queue.erl index d049ab37ed89..f30655331c84 100644 --- a/deps/rabbit/src/rabbit_variable_queue.erl +++ b/deps/rabbit/src/rabbit_variable_queue.erl @@ -41,204 +41,65 @@ -define(EMPTY_START_FUN_STATE, {fun (ok) -> finished end, ok}). %%---------------------------------------------------------------------------- -%% Messages, and their position in the queue, can be in memory or on -%% disk, or both. Persistent messages will have both message and -%% position pushed to disk as soon as they arrive; transient messages -%% can be written to disk (and thus both types can be evicted from -%% memory) under memory pressure. The question of whether a message is -%% in RAM and whether it is persistent are orthogonal. +%% Messages, their metadata and their position in the queue (SeqId), +%% can be in memory or on disk, or both. Persistent messages in +%% durable queues are always written to disk when they arrive. +%% Transient messages as well as persistent messages in non-durable +%% queues may be kept only in memory. 
%% -%% Messages are persisted using the queue index and the message -%% store. Normally the queue index holds the position of the message -%% *within this queue* along with a couple of small bits of metadata, -%% while the message store holds the message itself (including headers -%% and other properties). +%% The number of messages kept in memory is dependent on the consume +%% rate of the queue. At a minimum 1 message is kept (necessary because +%% we often need to check the expiration at the head of the queue) and +%% at a maximum the semi-arbitrary number 2048. %% -%% However, as an optimisation, small messages can be embedded -%% directly in the queue index and bypass the message store -%% altogether. +%% Messages are never written back to disk after they have been read +%% into memory. Instead the queue is designed to avoid keeping too +%% much to begin with. %% -%% Definitions: +%% Messages are persisted using a queue index and a message store. +%% A few different scenarios may play out depending on the message +%% size and the queue-version argument. %% -%% alpha: this is a message where both the message itself, and its -%% position within the queue are held in RAM -%% -%% beta: this is a message where the message itself is only held on -%% disk (if persisted to the message store) but its position -%% within the queue is held in RAM. -%% -%% gamma: this is a message where the message itself is only held on -%% disk, but its position is both in RAM and on disk. -%% -%% delta: this is a collection of messages, represented by a single -%% term, where the messages and their position are only held on -%% disk. -%% -%% Note that for persistent messages, the message and its position -%% within the queue are always held on disk, *in addition* to being in -%% one of the above classifications. -%% -%% Also note that within this code, the term gamma seldom -%% appears. 
It's frequently the case that gammas are defined by betas -%% who have had their queue position recorded on disk. -%% -%% In general, messages move q1 -> q2 -> delta -> q3 -> q4, though -%% many of these steps are frequently skipped. q1 and q4 only hold -%% alphas, q2 and q3 hold both betas and gammas. When a message -%% arrives, its classification is determined. It is then added to the -%% rightmost appropriate queue. -%% -%% If a new message is determined to be a beta or gamma, q1 is -%% empty. If a new message is determined to be a delta, q1 and q2 are -%% empty (and actually q4 too). -%% -%% When removing messages from a queue, if q4 is empty then q3 is read -%% directly. If q3 becomes empty then the next segment's worth of -%% messages from delta are read into q3, reducing the size of -%% delta. If the queue is non empty, either q4 or q3 contain -%% entries. It is never permitted for delta to hold all the messages -%% in the queue. -%% -%% The duration indicated to us by the memory_monitor is used to -%% calculate, given our current ingress and egress rates, how many -%% messages we should hold in RAM (i.e. as alphas). We track the -%% ingress and egress rates for both messages and pending acks and -%% rates for both are considered when calculating the number of -%% messages to hold in RAM. When we need to push alphas to betas or -%% betas to gammas, we favour writing out messages that are further -%% from the head of the queue. This minimises writes to disk, as the -%% messages closer to the tail of the queue stay in the queue for -%% longer, thus do not need to be replaced as quickly by sending other -%% messages to disk. -%% -%% Whilst messages are pushed to disk and forgotten from RAM as soon -%% as requested by a new setting of the queue RAM duration, the -%% inverse is not true: we only load messages back into RAM as -%% demanded as the queue is read from. Thus only publishes to the -%% queue will take up available spare capacity. 
-%% -%% When we report our duration to the memory monitor, we calculate -%% average ingress and egress rates over the last two samples, and -%% then calculate our duration based on the sum of the ingress and -%% egress rates. More than two samples could be used, but it's a -%% balance between responding quickly enough to changes in -%% producers/consumers versus ignoring temporary blips. The problem -%% with temporary blips is that with just a few queues, they can have -%% substantial impact on the calculation of the average duration and -%% hence cause unnecessary I/O. Another alternative is to increase the -%% amqqueue_process:RAM_DURATION_UPDATE_PERIOD to beyond 5 -%% seconds. However, that then runs the risk of being too slow to -%% inform the memory monitor of changes. Thus a 5 second interval, -%% plus a rolling average over the last two samples seems to work -%% well in practice. +%% - queue-version=1, size < qi_msgs_embed_below: both the message +%% metadata and content are stored in rabbit_queue_index %% -%% The sum of the ingress and egress rates is used because the egress -%% rate alone is not sufficient. Adding in the ingress rate means that -%% queues which are being flooded by messages are given more memory, -%% resulting in them being able to process the messages faster (by -%% doing less I/O, or at least deferring it) and thus helping keep -%% their mailboxes empty and thus the queue as a whole is more -%% responsive. If such a queue also has fast but previously idle -%% consumers, the consumer can then start to be driven as fast as it -%% can go, whereas if only egress rate was being used, the incoming -%% messages may have to be written to disk and then read back in, -%% resulting in the hard disk being a bottleneck in driving the -%% consumers. Generally, we want to give Rabbit every chance of -%% getting rid of messages as fast as possible and remaining -%% responsive, and using only the egress rate impacts that goal. 
+%% - queue-version=1, size >= qi_msgs_embed_below: the metadata +%% is stored in rabbit_queue_index, while the content is stored +%% in the per-vhost shared rabbit_msg_store %% -%% Once the queue has more alphas than the target_ram_count, the -%% surplus must be converted to betas, if not gammas, if not rolled -%% into delta. The conditions under which these transitions occur -%% reflect the conflicting goals of minimising RAM cost per msg, and -%% minimising CPU cost per msg. Once the msg has become a beta, its -%% payload is no longer in RAM, thus a read from the msg_store must -%% occur before the msg can be delivered, but the RAM cost of a beta -%% is the same as a gamma, so converting a beta to gamma will not free -%% up any further RAM. To reduce the RAM cost further, the gamma must -%% be rolled into delta. Whilst recovering a beta or a gamma to an -%% alpha requires only one disk read (from the msg_store), recovering -%% a msg from within delta will require two reads (queue_index and -%% then msg_store). But delta has a near-0 per-msg RAM cost. So the -%% conflict is between using delta more, which will free up more -%% memory, but require additional CPU and disk ops, versus using delta -%% less and gammas and betas more, which will cost more memory, but -%% require fewer disk ops and less CPU overhead. +%% - queue-version=2, size < qi_msgs_embed_below: the metadata +%% is stored in rabbit_classic_queue_index_v2, while the content +%% is stored in the per-queue rabbit_classic_queue_store_v2 %% -%% In the case of a persistent msg published to a durable queue, the -%% msg is immediately written to the msg_store and queue_index. If -%% then additionally converted from an alpha, it'll immediately go to -%% a gamma (as it's already in queue_index), and cannot exist as a -%% beta. 
Thus a durable queue with a mixture of persistent and -%% transient msgs in it which has more messages than permitted by the -%% target_ram_count may contain an interspersed mixture of betas and -%% gammas in q2 and q3. +%% - queue-version=2, size >= qi_msgs_embed_below: the metadata +%% is stored in rabbit_classic_queue_index_v2, while the content +%% is stored in the per-vhost shared rabbit_msg_store %% -%% There is then a ratio that controls how many betas and gammas there -%% can be. This is based on the target_ram_count and thus expresses -%% the fact that as the number of permitted alphas in the queue falls, -%% so should the number of betas and gammas fall (i.e. delta -%% grows). If q2 and q3 contain more than the permitted number of -%% betas and gammas, then the surplus are forcibly converted to gammas -%% (as necessary) and then rolled into delta. The ratio is that -%% delta/(betas+gammas+delta) equals -%% (betas+gammas+delta)/(target_ram_count+betas+gammas+delta). I.e. as -%% the target_ram_count shrinks to 0, so must betas and gammas. +%% When messages must be read from disk, message bodies will +%% also be read from disk except if the message in stored +%% in the per-vhost shared rabbit_msg_store. In that case +%% the message gets read before it needs to be sent to the +%% consumer. Messages are read from rabbit_msg_store one +%% at a time currently. %% -%% The conversion of betas to deltas is done if there are at least -%% ?IO_BATCH_SIZE betas in q2 & q3. This value should not be too small, -%% otherwise the frequent operations on the queues of q2 and q3 will not be -%% effectively amortised (switching the direction of queue access defeats -%% amortisation). Note that there is a natural upper bound due to credit_flow -%% limits on the alpha to beta conversion. +%% The queue also keeps track of messages that were delivered +%% but for which the ack has not been received. 
Pending acks +%% are currently kept in memory although the message may be +%% on disk. %% -%% The conversion from alphas to betas is chunked due to the -%% credit_flow limits of the msg_store. This further smooths the -%% effects of changes to the target_ram_count and ensures the queue -%% remains responsive even when there is a large amount of IO work to -%% do. The 'resume' callback is utilised to ensure that conversions -%% are done as promptly as possible whilst ensuring the queue remains -%% responsive. -%% -%% In the queue we keep track of both messages that are pending -%% delivery and messages that are pending acks. In the event of a -%% queue purge, we only need to load qi segments if the queue has -%% elements in deltas (i.e. it came under significant memory -%% pressure). In the event of a queue deletion, in addition to the -%% preceding, by keeping track of pending acks in RAM, we do not need -%% to search through qi segments looking for messages that are yet to -%% be acknowledged. -%% -%% Pending acks are recorded in memory by storing the message itself. -%% If the message has been sent to disk, we do not store the message -%% content. During memory reduction, pending acks containing message -%% content have that content removed and the corresponding messages -%% are pushed out to disk. -%% -%% Messages from pending acks are returned to q4, q3 and delta during -%% requeue, based on the limits of seq_id contained in each. Requeued -%% messages retain their original seq_id, maintaining order -%% when requeued. -%% -%% The order in which alphas are pushed to betas and pending acks -%% are pushed to disk is determined dynamically. We always prefer to -%% push messages for the source (alphas or acks) that is growing the -%% fastest (with growth measured as avg. ingress - avg. egress). -%% -%% Notes on Clean Shutdown -%% (This documents behaviour in variable_queue, queue_index and -%% msg_store.) 
+%% Messages being requeued are returned to their position in +%% the queue using their SeqId value. %% %% In order to try to achieve as fast a start-up as possible, if a %% clean shutdown occurs, we try to save out state to disk to reduce -%% work on startup. In the msg_store this takes the form of the +%% work on startup. In rabbit_msg_store this takes the form of the %% index_module's state, plus the file_summary ets table, and client %% refs. In the VQ, this takes the form of the count of persistent %% messages in the queue and references into the msg_stores. The -%% queue_index adds to these terms the details of its segments and +%% queue index adds to these terms the details of its segments and %% stores the terms in the queue directory. %% -%% Two message stores are used. One is created for persistent messages +%% Two rabbit_msg_store(s) are used. One is created for persistent messages %% to durable queues that must survive restarts, and the other is used %% for all other messages that just happen to need to be written to %% disk. On start up we can therefore nuke the transient message @@ -248,7 +109,7 @@ %% The references to the msg_stores are there so that the msg_store %% knows to only trust its saved state if all of the queues it was %% previously talking to come up cleanly. Likewise, the queues -%% themselves (esp queue_index) skips work in init if all the queues +%% themselves (especially indexes) skip work in init if all the queues %% and msg_store were shutdown cleanly. This gives both good speed %% improvements and also robustness so that if anything possibly went %% wrong in shutdown (or there was subsequent manual tampering), all @@ -263,9 +124,9 @@ %% stored in the transient msg_store which would have had its saved %% state nuked on startup). This avoids the expensive operation of %% scanning the entire queue on startup in order to delete transient -%% messages that were only pushed to disk to save memory. +%% messages that were written to disk. 
%% -%% v2 UPDATE: The queue is keeping track of delivers via the +%% The queue is keeping track of delivers via the %% next_deliver_seq_id variable. This variable gets increased %% with every (first-time) delivery. When delivering messages %% the seq_id of the message is checked against this variable @@ -275,6 +136,38 @@ %% message in the queue (effectively marking all messages as %% delivered, like the v1 index was doing). %% +%% Previous versions of classic queues had a much more complex +%% way of working. Messages were categorized into four groups, +%% and remnants of these terms remain in the code at the time +%% of writing: +%% +%% alpha: this is a message where both the message itself, and its +%% position within the queue are held in RAM +%% +%% beta: this is a message where the message itself is only held on +%% disk (if persisted to the message store) but its position +%% within the queue is held in RAM. +%% +%% gamma: this is a message where the message itself is only held on +%% disk, but its position is both in RAM and on disk. +%% +%% delta: this is a collection of messages, represented by a single +%% term, where the messages and their position are only held on +%% disk. +%% +%% Messages may have been stored in q1, q2, delta, q3 or q4 depending +%% on their location in the queue. The current version of classic +%% queues only use delta (on-disk, for the tail of the queue) or +%% q3 (in-memory, head of the queue). Messages used to move from +%% q1 -> q2 -> delta -> q3 -> q4 (and sometimes q3 -> delta or +%% q4 -> delta to reduce memory use). Now messages only move +%% from delta to q3. Full details on the old mechanisms can be +%% found in previous versions of this file (such as the 3.11 version). +%% +%% In the current version of classic queues, there is no distinction +%% between default and lazy queues. The current behavior is close to +%% lazy queues, except we avoid some write to disks when queues are +%% empty. 
%%---------------------------------------------------------------------------- -behaviour(rabbit_backing_queue).