Skip to content

Commit

Permalink
mirrored_supervisor: Catch timeout from Khepri in handle_info/2
Browse files Browse the repository at this point in the history
[Why]
The code assumed that the transaction would always succeed. It was kind
of the case with Mnesia because it would throw an exception if it
failed.

Khepri returns an error instead. The code has to handle it. In
particular, we see timeouts in CI and before this patch, they caused a
crash because the list comprehension was asked to work on a tuple.

[How]
We now retry the update when it times out, pausing 200 ms between attempts, for up to 10 seconds.
  • Loading branch information
dumbbell committed Nov 29, 2024
1 parent 913bd9f commit 4621fe7
Showing 1 changed file with 17 additions and 1 deletion.
18 changes: 17 additions & 1 deletion deps/rabbit/src/mirrored_supervisor.erl
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
child_order = ChildOrder}) ->
%% No guarantee pg will have received the DOWN before us.
R = case lists:sort(pg:get_members(Group)) -- [Pid] of
[O | _] -> ChildSpecs = update_all(O, Pid),
[O | _] -> ChildSpecs = retry_update_all(O, Pid),
[start(Delegate, ChildSpec)
|| ChildSpec <- restore_child_order(ChildSpecs,
ChildOrder)];
Expand Down Expand Up @@ -428,6 +428,22 @@ check_stop(Group, Delegate, Id) ->

%% Returns the first element of a 6-tuple; any other argument fails with
%% function_clause.
%% NOTE(review): the tuple is presumably a legacy OTP child spec
%% ({Id, StartFunc, Restart, Shutdown, Type, Modules}) -- confirm against
%% the callers.
id({ChildId, _StartFunc, _Restart, _Shutdown, _Type, _Modules}) -> ChildId.

%% Runs update_all/2 through the retrying variant with a sleep budget of
%% 10000 ms. The budget is only decremented by the pauses between attempts;
%% the time spent inside update_all/2 itself is not counted.
retry_update_all(Overall, OldOverall) ->
    SleepBudgetMs = 10000,
    retry_update_all(Overall, OldOverall, SleepBudgetMs).

%% Calls update_all/2, retrying while it returns {error, timeout}.
%%
%% TimeLeft is the remaining sleep budget in milliseconds.
%%   * A list result is returned unchanged.
%%   * On {error, timeout} the process sleeps 200 ms, the budget is reduced
%%     by the same amount, and the call is attempted again.
%%   * Once the budget is no longer positive, update_all/2 is called one
%%     last time and its result is returned as is, whatever it is (so the
%%     caller may still receive {error, timeout}).
%% Any other result while budget remains is not matched and raises
%% case_clause.
retry_update_all(Overall, OldOverall, TimeLeft) when TimeLeft =< 0 ->
    %% Budget exhausted: final attempt, no further retries.
    update_all(Overall, OldOverall);
retry_update_all(Overall, OldOverall, TimeLeft) ->
    case update_all(Overall, OldOverall) of
        {error, timeout} ->
            RetryDelayMs = 200,
            Remaining = TimeLeft - RetryDelayMs,
            timer:sleep(RetryDelayMs),
            retry_update_all(Overall, OldOverall, Remaining);
        ChildSpecs when is_list(ChildSpecs) ->
            ChildSpecs
    end.

%% Thin wrapper: delegates to rabbit_db_msup:update_all/2 with the same
%% arguments, in the same order, and returns its result unchanged.
update_all(NewOverall, PrevOverall) ->
    rabbit_db_msup:update_all(NewOverall, PrevOverall).

Expand Down

0 comments on commit 4621fe7

Please sign in to comment.