135 lines
4.5 KiB
Diff
135 lines
4.5 KiB
Diff
|
From: Peter Lemenkov <lemenkov@gmail.com>
|
||
|
Date: Wed, 24 Oct 2018 14:58:41 +0200
|
||
|
Subject: [PATCH] Improve nodes querying
|
||
|
|
||
|
We've got a few similar stacktraces once. See the following one for
|
||
|
example:
|
||
|
|
||
|
** Reason for termination ==
|
||
|
** {badarg,
|
||
|
[{ets,next,[sys_dist,'rabbitmq-cli-42@host.example.com'],[]},
|
||
|
{net_kernel,get_nodes,2,[{file,"net_kernel.erl"},{line,1025}]},
|
||
|
{net_kernel,get_nodes,2,[{file,"net_kernel.erl"},{line,1019}]},
|
||
|
{net_kernel,get_nodes_info,0,[{file,"net_kernel.erl"},{line,1439}]},
|
||
|
{rabbit_mgmt_external_stats,cluster_links,0,
|
||
|
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,252}]},
|
||
|
{rabbit_mgmt_external_stats,emit_node_node_stats,1,
|
||
|
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,366}]},
|
||
|
{rabbit_mgmt_external_stats,handle_info,2,
|
||
|
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,347}]},
|
||
|
{gen_server,try_dispatch,4,[{file,"gen_server.erl"},{line,615}]}]}
|
||
|
|
||
|
The problem is that when we're trying to query a list of connected
|
||
|
nodes, we're doing it in the following way:
|
||
|
|
||
|
Call for the first record in ETS
|
||
|
While not EOF:
|
||
|
Call for the next record in ETS
|
||
|
|
||
|
What happens, when some Node disconnects during the "not EOF" loop?
|
||
|
We'll get an exception.
|
||
|
|
||
|
Let's do it differently - query a list of nodes in one shot, and then
|
||
|
get info from each of the nodes in list (w/o extra calls to ets). These
|
||
|
individual calls care of disconnected nodes so everything will be fine
|
||
|
even if a node disconnects.
|
||
|
|
||
|
Signed-off-by: Peter Lemenkov <lemenkov@gmail.com>
|
||
|
|
||
|
diff --git a/lib/kernel/src/net_kernel.erl b/lib/kernel/src/net_kernel.erl
|
||
|
index 7da89dd7cb..f57324b67f 100644
|
||
|
--- a/lib/kernel/src/net_kernel.erl
|
||
|
+++ b/lib/kernel/src/net_kernel.erl
|
||
|
@@ -609,24 +609,16 @@ code_change(_OldVsn, State, _Extra) ->
|
||
|
|
||
|
terminate(no_network, State) ->
|
||
|
lists:foreach(
|
||
|
- fun({Node, Type}) ->
|
||
|
- case Type of
|
||
|
- normal -> ?nodedown(Node, State);
|
||
|
- _ -> ok
|
||
|
- end
|
||
|
- end, get_up_nodes() ++ [{node(), normal}]);
|
||
|
+ fun(Node) -> ?nodedown(Node, State)
|
||
|
+ end, get_nodes_up_normal() ++ [node()]);
|
||
|
terminate(_Reason, State) ->
|
||
|
lists:foreach(
|
||
|
fun(#listen {listen = Listen,module = Mod}) ->
|
||
|
Mod:close(Listen)
|
||
|
end, State#state.listen),
|
||
|
lists:foreach(
|
||
|
- fun({Node, Type}) ->
|
||
|
- case Type of
|
||
|
- normal -> ?nodedown(Node, State);
|
||
|
- _ -> ok
|
||
|
- end
|
||
|
- end, get_up_nodes() ++ [{node(), normal}]).
|
||
|
+ fun(Node) -> ?nodedown(Node, State)
|
||
|
+ end, get_nodes_up_normal() ++ [node()]).
|
||
|
|
||
|
|
||
|
%% ------------------------------------------------------------
|
||
|
@@ -1045,35 +1037,10 @@ disconnect_pid(Pid, State) ->
|
||
|
%%
|
||
|
%%
|
||
|
%%
|
||
|
-get_nodes(Which) ->
|
||
|
- get_nodes(ets:first(sys_dist), Which).
|
||
|
|
||
|
-get_nodes('$end_of_table', _) ->
|
||
|
- [];
|
||
|
-get_nodes(Key, Which) ->
|
||
|
- case ets:lookup(sys_dist, Key) of
|
||
|
- [Conn = #connection{state = up}] ->
|
||
|
- [Conn#connection.node | get_nodes(ets:next(sys_dist, Key),
|
||
|
- Which)];
|
||
|
- [Conn = #connection{}] when Which =:= all ->
|
||
|
- [Conn#connection.node | get_nodes(ets:next(sys_dist, Key),
|
||
|
- Which)];
|
||
|
- _ ->
|
||
|
- get_nodes(ets:next(sys_dist, Key), Which)
|
||
|
- end.
|
||
|
-
|
||
|
-%% Return a list of all nodes that are 'up'.
|
||
|
-get_up_nodes() ->
|
||
|
- get_up_nodes(ets:first(sys_dist)).
|
||
|
-
|
||
|
-get_up_nodes('$end_of_table') -> [];
|
||
|
-get_up_nodes(Key) ->
|
||
|
- case ets:lookup(sys_dist, Key) of
|
||
|
- [#connection{state=up,node=Node,type=Type}] ->
|
||
|
- [{Node,Type}|get_up_nodes(ets:next(sys_dist, Key))];
|
||
|
- _ ->
|
||
|
- get_up_nodes(ets:next(sys_dist, Key))
|
||
|
- end.
|
||
|
+%% Return a list of all nodes that are 'up' and not hidden.
|
||
|
+get_nodes_up_normal() ->
|
||
|
+ ets:select(sys_dist, [{#connection{node = '$1', state = up, type = normal, _ = '_'}, [], ['$1']}]).
|
||
|
|
||
|
ticker(Kernel, Tick) when is_integer(Tick) ->
|
||
|
process_flag(priority, max),
|
||
|
@@ -1525,15 +1492,14 @@ get_node_info(Node, Key) ->
|
||
|
end.
|
||
|
|
||
|
get_nodes_info() ->
|
||
|
- get_nodes_info(get_nodes(all), []).
|
||
|
-
|
||
|
-get_nodes_info([Node|Nodes], InfoList) ->
|
||
|
- case get_node_info(Node) of
|
||
|
- {ok, Info} -> get_nodes_info(Nodes, [{Node, Info}|InfoList]);
|
||
|
- _ -> get_nodes_info(Nodes, InfoList)
|
||
|
- end;
|
||
|
-get_nodes_info([], InfoList) ->
|
||
|
- {ok, InfoList}.
|
||
|
+ Nodes = ets:select(sys_dist, [{#connection{node = '$1', _ = '_'}, [], ['$1']}]),
|
||
|
+ {ok, lists:filtermap(
|
||
|
+ fun(Node) ->
|
||
|
+ case get_node_info(Node) of
|
||
|
+ {ok, Info} -> {true, {Node, Info}};
|
||
|
+ _ -> false
|
||
|
+ end
|
||
|
+ end, Nodes)}.
|
||
|
|
||
|
%% ------------------------------------------------------------
|
||
|
%% Misc. functions
|