Skip to content

Commit e7de95c

Browse files
committed
Remove feature flag code handling potential code server deadlocks
These test cases and rpc calls were concerned with deadlocks possible from modifying the code server process from multiple callers. With the switch to persistent_term in the parent commits these kinds of deadlocks are no longer possible.
1 parent e6b5c36 commit e7de95c

File tree

2 files changed

+3
-254
lines changed

2 files changed

+3
-254
lines changed

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,8 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
11201120
Rets = inventory_rpcs(Nodes, Timeout),
11211121
maps:fold(
11221122
fun
1123+
(_Node, init_required, {ok, Inventory}) ->
1124+
{ok, Inventory};
11231125
(Node,
11241126
#{feature_flags := FeatureFlags1,
11251127
applications := ScannedApps,
@@ -1144,44 +1146,7 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
11441146
end, {ok, Inventory0}, Rets).
11451147

11461148
inventory_rpcs(Nodes, Timeout) ->
1147-
%% We must use `rabbit_ff_registry_wrapper' if it is available to avoid
1148-
%% any deadlock with the Code server. If it is unavailable, we fall back
1149-
%% to `rabbit_ff_registry'.
1150-
%%
1151-
%% See commit aacfa1978e24bcacd8de7d06a7c3c5d9d8bd098e and pull request
1152-
%% #8155.
1153-
Rets0 = rpc_calls(
1154-
Nodes,
1155-
rabbit_ff_registry_wrapper, inventory, [], Timeout),
1156-
OlderNodes = maps:fold(
1157-
fun
1158-
(Node,
1159-
{error,
1160-
{exception, undef,
1161-
[{rabbit_ff_registry_wrapper,_ , _, _} | _]}},
1162-
Acc) ->
1163-
[Node | Acc];
1164-
(_Node, _Ret, Acc) ->
1165-
Acc
1166-
end, [], Rets0),
1167-
case OlderNodes of
1168-
[] ->
1169-
Rets0;
1170-
_ ->
1171-
?LOG_INFO(
1172-
"Feature flags: the following nodes run an older version of "
1173-
"RabbitMQ causing the "
1174-
"\"rabbit_ff_registry_wrapper:inventory[] undefined\" error "
1175-
"above: ~2p~n"
1176-
"Feature flags: falling back to another method for those "
1177-
"nodes; this may trigger a bug causing them to hang",
1178-
[lists:sort(OlderNodes)],
1179-
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
1180-
Rets1 = rpc_calls(
1181-
OlderNodes,
1182-
rabbit_ff_registry, inventory, [], Timeout),
1183-
maps:merge(Rets0, Rets1)
1184-
end.
1149+
rpc_calls(Nodes, rabbit_ff_registry, inventory, [], Timeout).
11851150

11861151
merge_feature_flags(FeatureFlagsA, FeatureFlagsB) ->
11871152
FeatureFlags = maps:merge(FeatureFlagsA, FeatureFlagsB),

deps/rabbit/test/feature_flags_SUITE.erl

Lines changed: 0 additions & 216 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525

2626
registry_general_usage/1,
2727
registry_concurrent_reloads/1,
28-
try_to_deadlock_in_registry_reload_1/1,
29-
try_to_deadlock_in_registry_reload_2/1,
3028
registry_reset/1,
3129
enable_feature_flag_in_a_healthy_situation/1,
3230
enable_unsupported_feature_flag_in_a_healthy_situation/1,
@@ -104,8 +102,6 @@ groups() ->
104102
[
105103
registry_general_usage,
106104
registry_concurrent_reloads,
107-
try_to_deadlock_in_registry_reload_1,
108-
try_to_deadlock_in_registry_reload_2,
109105
registry_reset
110106
]},
111107
{feature_flags_v2, [], Groups}
@@ -573,218 +569,6 @@ registry_spammer1(FeatureNames) ->
573569
?assertEqual(FeatureNames, ?list_ff(all)),
574570
registry_spammer1(FeatureNames).
575571

576-
try_to_deadlock_in_registry_reload_1(_Config) ->
577-
rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
578-
_ = code:delete(rabbit_ff_registry),
579-
?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
580-
581-
FeatureName = ?FUNCTION_NAME,
582-
FeatureProps = #{provided_by => rabbit,
583-
stability => stable},
584-
585-
Parent = self(),
586-
587-
%% Deadlock steps:
588-
%% * Process A acquires the lock first.
589-
%% * Process B loads the registry stub and waits for the lock.
590-
%% * Process A deletes the registry stub and loads the initialized
591-
%% registry.
592-
%% * Process B wants to purge the deleted registry stub by sending a
593-
%% request to the Code server.
594-
%%
595-
%% => Process B waits forever the return from the Code server because the
596-
%% Code server waits for process B to be runnable again to handle the
597-
%% signal.
598-
599-
ProcessA = spawn_link(
600-
fun() ->
601-
%% Process A acquires the lock manually first to
602-
%% ensure the ordering of events. It can be acquired
603-
%% recursively, so the feature flag injection can
604-
%% "acquire" it again and proceed.
605-
ct:pal("Process A: Acquire registry loading lock"),
606-
Lock =
607-
rabbit_ff_registry_factory:registry_loading_lock(),
608-
global:set_lock(Lock, [node()]),
609-
receive proceed -> ok end,
610-
611-
ct:pal(
612-
"Process A: "
613-
"Inject arbitrary feature flag to reload "
614-
"registry"),
615-
rabbit_feature_flags:inject_test_feature_flags(
616-
#{FeatureName => FeatureProps}),
617-
618-
ct:pal("Process A: Release registry loading lock"),
619-
global:del_lock(Lock, [node()]),
620-
621-
ct:pal("Process A: Exiting..."),
622-
erlang:unlink(Parent)
623-
end),
624-
timer:sleep(500),
625-
626-
ProcessB = spawn_link(
627-
fun() ->
628-
%% Process B is the one loading the registry stub and
629-
%% wants to initialize the real registry.
630-
ct:pal(
631-
"Process B: "
632-
"Trigger automatic initial registry load"),
633-
FF = rabbit_ff_registry_wrapper:get(FeatureName),
634-
635-
ct:pal("Process B: Exiting..."),
636-
erlang:unlink(Parent),
637-
Parent ! {self(), FF}
638-
end),
639-
timer:sleep(500),
640-
641-
begin
642-
{_, StacktraceA} = erlang:process_info(ProcessA, current_stacktrace),
643-
{_, StacktraceB} = erlang:process_info(ProcessB, current_stacktrace),
644-
ct:pal(
645-
"Process stacktraces:~n"
646-
" Process A: ~p~n"
647-
" Process B: ~p",
648-
[StacktraceA, StacktraceB])
649-
end,
650-
651-
%% Process A is resumed. Without a proper check, process B would try to
652-
%% purge the copy of the registry it is currently using itself, causing a
653-
%% deadlock because the Code server wants process A to handle a signal, but
654-
%% process A is not runnable.
655-
ProcessA ! proceed,
656-
657-
ct:pal("Waiting for process B to exit"),
658-
receive
659-
{ProcessB, FF} ->
660-
?assertEqual(FeatureProps#{name => FeatureName}, FF),
661-
ok
662-
after 10000 ->
663-
{_, StacktraceB} = erlang:process_info(
664-
ProcessB, current_stacktrace),
665-
ct:pal("Process B stuck; stacktrace: ~p", [StacktraceB]),
666-
error(registry_reload_deadlock)
667-
end.
668-
669-
try_to_deadlock_in_registry_reload_2(_Config) ->
670-
rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
671-
_ = code:delete(rabbit_ff_registry),
672-
?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
673-
674-
FeatureName = ?FUNCTION_NAME,
675-
FeatureProps = #{provided_by => rabbit,
676-
stability => stable},
677-
678-
ct:pal("Inject arbitrary feature flag to reload registry"),
679-
rabbit_feature_flags:inject_test_feature_flags(
680-
#{FeatureName => FeatureProps},
681-
false),
682-
683-
_ = erlang:process_flag(trap_exit, true),
684-
685-
ct:pal("Parent ~p: Acquire registry loading lock", [self()]),
686-
Lock = rabbit_ff_registry_factory:registry_loading_lock(),
687-
global:set_lock(Lock, [node()]),
688-
689-
Parent = self(),
690-
691-
%% Deadlock steps:
692-
%% * Processes A, B1 and B2 wait for the lock. The registry stub is loaded.
693-
%% * Process B1 acquires the lock.
694-
%% * Process B1 deletes the registry stub and loads the initialized
695-
%% registry.
696-
%% * Process A acquires the lock.
697-
%% * Process A wants to purge the deleted registry stub by sending a
698-
%% request to the Code server.
699-
%%
700-
%% => Process A waits forever the return from the Code server because the
701-
%% Code server waits for process B2 to stop lingering on the deleted
702-
%% registry stub, but process B2 waits for the lock.
703-
704-
ProcessA = spawn_link(
705-
fun() ->
706-
%% Process A acquires the lock automatically as part
707-
%% of requesting an explicit initialization of the
708-
%% registry. Process A doesn't linger on the registry
709-
%% stub.
710-
ct:pal(
711-
"Process A ~p: "
712-
"Trigger manual initial registry load",
713-
[self()]),
714-
rabbit_ff_registry_factory:initialize_registry(),
715-
716-
ct:pal("Process A ~p: Exiting...", [self()]),
717-
erlang:unlink(Parent),
718-
Parent ! {self(), done}
719-
end),
720-
721-
FunB = fun() ->
722-
%% Processes B1 and B2 acquire the lock automatically as
723-
%% part of trying to load the registry as part of they
724-
%% querying a feature flag.
725-
ct:pal(
726-
"Process B ~p: "
727-
"Trigger automatic initial registry load",
728-
[self()]),
729-
_ = rabbit_ff_registry_wrapper:get(FeatureName),
730-
731-
ct:pal(
732-
"Process B ~p: Exiting...",
733-
[self()]),
734-
erlang:unlink(Parent),
735-
Parent ! {self(), done}
736-
end,
737-
ProcessB1 = spawn_link(FunB),
738-
ProcessB2 = spawn_link(FunB),
739-
timer:sleep(500),
740-
741-
%% We use `erlang:suspend_process/1' and `erlang:resume_process/1' to
742-
%% ensure the order in which processes acquire the lock.
743-
erlang:suspend_process(ProcessA),
744-
erlang:suspend_process(ProcessB1),
745-
erlang:suspend_process(ProcessB2),
746-
timer:sleep(500),
747-
748-
ct:pal("Parent ~p: Release registry loading lock", [self()]),
749-
global:del_lock(Lock, [node()]),
750-
751-
erlang:resume_process(ProcessB1),
752-
timer:sleep(500),
753-
erlang:resume_process(ProcessA),
754-
timer:sleep(500),
755-
erlang:resume_process(ProcessB2),
756-
757-
ct:pal("Waiting for processes to exit"),
758-
Procs = [ProcessA, ProcessB1, ProcessB2],
759-
lists:foreach(
760-
fun(Pid) ->
761-
receive
762-
{Pid, done} ->
763-
ok;
764-
{'EXIT', Pid, Reason} ->
765-
ct:pal("Process ~p exited; reason: ~p", [Pid, Reason]),
766-
error(test_process_killed)
767-
after 10000 ->
768-
lists:foreach(
769-
fun(Pid1) ->
770-
PI = erlang:process_info(
771-
Pid1, current_stacktrace),
772-
case PI of
773-
undefined ->
774-
ok;
775-
{_, Stacktrace} ->
776-
ct:pal(
777-
"Process ~p stuck; "
778-
"stacktrace: ~p",
779-
[Pid1, Stacktrace])
780-
end
781-
end, Procs),
782-
error(registry_reload_deadlock)
783-
end
784-
end, Procs),
785-
786-
ok.
787-
788572
registry_reset(_Config) ->
789573
%% At first, the registry must be uninitialized.
790574
?assertNot(rabbit_ff_registry:is_registry_initialized()),

0 commit comments

Comments
 (0)