Skip to content

Commit

Permalink
Dev: utils: Check node is reachable by using both ping and ssh (Clust…
Browse files Browse the repository at this point in the history
…erLabs#1563)

If both ping and ssh fail, the node is considered unreachable.

To fix issue ClusterLabs#1551
  • Loading branch information
liangxin1300 authored Sep 27, 2024
2 parents 3727fc0 + 3988820 commit 113b211
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 65 deletions.
4 changes: 2 additions & 2 deletions crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def _validate_nodes_option(self):
utils.fatal(f"Overriding current user '{self.current_user}' by '{user}'. Ouch, don't do it.")
self.user_at_node_list = [value for (user, node), value in zip(li, self.user_at_node_list) if node != me]
for user, node in (utils.parse_user_at_host(x) for x in self.user_at_node_list):
utils.ping_node(node)
utils.node_reachable_check(node)

def _validate_cluster_node(self):
"""
Expand Down Expand Up @@ -2307,7 +2307,7 @@ def bootstrap_join(context):
_context.initialize_user()

remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
utils.ping_node(cluster_node)
utils.node_reachable_check(cluster_node)
join_ssh(cluster_node, remote_user)
remote_user = utils.user_of(cluster_node)

Expand Down
2 changes: 1 addition & 1 deletion crmsh/qdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def check_qnetd_addr(qnetd_addr):
except socket.error:
raise ValueError("host \"{}\" is unreachable".format(qnetd_addr))

utils.ping_node(qnetd_addr)
utils.node_reachable_check(qnetd_addr)

if utils.InterfacesInfo.ip_in_local(qnetd_ip):
raise ValueError("host for qnetd must be a remote one")
Expand Down
2 changes: 1 addition & 1 deletion crmsh/report/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ def process_node_list(context: Context) -> None:
if node == context.me:
continue
try:
crmutils.ping_node(node)
crmutils.node_reachable_check(node)
except Exception as err:
logger.error(str(err))
context.node_list.remove(node)
Expand Down
2 changes: 1 addition & 1 deletion crmsh/ui_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def parse_option_for_nodes(context, *args):
node_list = member_list if options.all else args
for node in node_list:
try:
utils.ping_node(node)
utils.node_reachable_check(node)
except ValueError as err:
logger.warning(str(err))
node_list.remove(node)
Expand Down
42 changes: 27 additions & 15 deletions crmsh/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import bz2
import lzma
import json
import socket
from pathlib import Path
from contextlib import contextmanager, closing
from stat import S_ISBLK
Expand Down Expand Up @@ -2108,15 +2109,21 @@ def check_ssh_passwd_need(local_user, remote_user, host, shell: sh.LocalShell =
return rc != 0


def check_port_open(ip, port):
import socket

family = socket.AF_INET6 if IP.is_ipv6(ip) else socket.AF_INET
with closing(socket.socket(family, socket.SOCK_STREAM)) as sock:
if sock.connect_ex((ip, port)) == 0:
return True
else:
return False
def check_port_open(host, port, timeout=3) -> bool:
"""
Check whether the port is open on the host
Use getaddrinfo to support both IPv4 and IPv6
"""
try:
addrinfo = socket.getaddrinfo(host, port, socket.AF_UNSPEC, socket.SOCK_STREAM)
af, socktype, proto, canonname, sa = addrinfo[0]
with closing(socket.socket(af, socktype, proto)) as sock:
sock.settimeout(timeout)
if sock.connect_ex(sa) == 0:
return True
return False
except socket.error:
return False


def valid_port(port):
Expand Down Expand Up @@ -2460,13 +2467,18 @@ def package_is_installed(pkg, remote_addr=None):
return rc == 0


def ping_node(node):
def node_reachable_check(node, ping_count=1, port=22, timeout=3):
"""
Check if the remote node is reachable
Check if node is reachable by using ping and socket to ssh port
"""
rc, _, err = ShellUtils().get_stdout_stderr("ping -c 1 {}".format(node))
if rc != 0:
raise ValueError("host \"{}\" is unreachable: {}".format(node, err))
rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}")
if rc == 0:
return True
# ping failed, try to connect to ssh port by socket
if check_port_open(node, port, timeout):
return True
# both ping and socket failed
raise ValueError(f"host \"{node}\" is unreachable")


def calculate_quorate_status(expected_votes, actual_votes):
Expand All @@ -2490,7 +2502,7 @@ def check_all_nodes_reachable():
"""
out = sh.cluster_shell().get_stdout_or_raise_error("crm_node -l")
for node in re.findall("\d+ (.*) \w+", out):
ping_node(node)
node_reachable_check(node)


def re_split_string(reg, string):
Expand Down
2 changes: 1 addition & 1 deletion test/features/crm_report_normal.feature
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Feature: crm report functional test for common cases
Then Expected "Directory /fsf does not exist" in stderr

When Try "crm report -n fs" on "hanode1"
Then Expected "host "fs" is unreachable:" in stderr
Then Expected "host "fs" is unreachable" in stderr

When Try "crm report -f xxxx" on "hanode1"
Then Expected "Invalid time string 'xxxx'" in stderr
Expand Down
17 changes: 2 additions & 15 deletions test/unittests/test_qdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,23 +202,10 @@ def test_qdevice_p12_on_cluster(self):
res = self.qdevice_with_ip_cluster_node.qdevice_p12_on_cluster
self.assertEqual(res, "/etc/corosync/qdevice/net/node1.com/qdevice-net-node.p12")

@mock.patch('crmsh.utils.check_port_open')
@mock.patch('crmsh.utils.InterfacesInfo.ip_in_local')
@mock.patch('crmsh.utils.ping_node')
@mock.patch('crmsh.utils.node_reachable_check')
@mock.patch('socket.getaddrinfo')
def test_check_qnetd_addr_port_error(self, mock_getaddrinfo, mock_ping, mock_in_local, mock_check):
mock_getaddrinfo.return_value = [(None, ("10.10.10.123",)),]
mock_in_local.return_value = False
mock_check.return_value = False
with self.assertRaises(ValueError) as err:
qdevice.QDevice.check_qnetd_addr("qnetd-node")
excepted_err_string = "ssh service on \"qnetd-node\" not available"
self.assertEqual(excepted_err_string, str(err.exception))

@mock.patch('crmsh.utils.InterfacesInfo.ip_in_local')
@mock.patch('crmsh.utils.ping_node')
@mock.patch('socket.getaddrinfo')
def test_check_qnetd_addr_local(self, mock_getaddrinfo, mock_ping, mock_in_local):
def test_check_qnetd_addr_local(self, mock_getaddrinfo, mock_reachable, mock_in_local):
mock_getaddrinfo.return_value = [(None, ("10.10.10.123",)),]
mock_in_local.return_value = True
with self.assertRaises(ValueError) as err:
Expand Down
6 changes: 3 additions & 3 deletions test/unittests/test_report_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,12 +475,12 @@ def test_process_node_list_single(self, mock_list_nodes, mock_local_mode):

@mock.patch('crmsh.report.core.local_mode')
@mock.patch('logging.Logger.error')
@mock.patch('crmsh.utils.ping_node')
@mock.patch('crmsh.utils.node_reachable_check')
@mock.patch('crmsh.utils.list_cluster_nodes')
def test_process_node_list(self, mock_list_nodes, mock_ping, mock_error, mock_local_mode):
def test_process_node_list(self, mock_list_nodes, mock_reachable, mock_error, mock_local_mode):
mock_local_mode.return_value = False
mock_ctx_inst = mock.Mock(node_list=["node1", "node2"], single=False, me="node1")
mock_ping.side_effect = ValueError("error")
mock_reachable.side_effect = ValueError("error")
core.process_node_list(mock_ctx_inst)
self.assertEqual(mock_ctx_inst.node_list, ["node1"])

Expand Down
41 changes: 15 additions & 26 deletions test/unittests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,39 +265,37 @@ def test_sysconfig_set_bsc1145823():
sc = utils.parse_sysconfig(fname)
assert (sc.get("age") == "100")

@mock.patch("crmsh.utils.IP.is_ipv6")
@mock.patch("socket.getaddrinfo")
@mock.patch("socket.socket")
@mock.patch("crmsh.utils.closing")
def test_check_port_open_false(mock_closing, mock_socket, mock_is_ipv6):
mock_is_ipv6.return_value = False
def test_check_port_open_false(mock_closing, mock_socket, mock_getaddrinfo):
sock_inst = mock.Mock()
mock_socket.return_value = sock_inst
mock_closing.return_value.__enter__.return_value = sock_inst
sock_inst.connect_ex.return_value = 1
mock_getaddrinfo.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, "", ("127.0.0.1", 22))]

assert utils.check_port_open("10.10.10.1", 22) is False
assert utils.check_port_open("localhost", 22) is False

mock_is_ipv6.assert_called_once_with("10.10.10.1")
mock_socket.assert_called_once_with(socket.AF_INET, socket.SOCK_STREAM)
mock_socket.assert_called_once_with(socket.AF_INET, socket.SOCK_STREAM, 6)
mock_closing.assert_called_once_with(sock_inst)
sock_inst.connect_ex.assert_called_once_with(("10.10.10.1", 22))
sock_inst.connect_ex.assert_called_once_with(("127.0.0.1", 22))

@mock.patch("crmsh.utils.IP.is_ipv6")
@mock.patch("socket.getaddrinfo")
@mock.patch("socket.socket")
@mock.patch("crmsh.utils.closing")
def test_check_port_open_true(mock_closing, mock_socket, mock_is_ipv6):
mock_is_ipv6.return_value = True
def test_check_port_open_true(mock_closing, mock_socket, mock_getaddrinfo):
sock_inst = mock.Mock()
mock_socket.return_value = sock_inst
mock_closing.return_value.__enter__.return_value = sock_inst
sock_inst.connect_ex.return_value = 0
mock_getaddrinfo.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, "", ("127.0.0.1", 22))]

assert utils.check_port_open("2001:db8:10::7", 22) is True
assert utils.check_port_open("localhost", 22) is True

mock_is_ipv6.assert_called_once_with("2001:db8:10::7")
mock_socket.assert_called_once_with(socket.AF_INET6, socket.SOCK_STREAM)
mock_socket.assert_called_once_with(socket.AF_INET, socket.SOCK_STREAM, 6)
mock_closing.assert_called_once_with(sock_inst)
sock_inst.connect_ex.assert_called_once_with(("2001:db8:10::7", 22))
sock_inst.connect_ex.assert_called_once_with(("127.0.0.1", 22))

def test_valid_port():
assert utils.valid_port(1) is False
Expand Down Expand Up @@ -741,15 +739,6 @@ def test_get_iplist_from_name(mock_get_nodeid, mock_get_nodeinfo):
mock_get_nodeinfo.assert_called_once_with()


@mock.patch("crmsh.sh.ShellUtils.get_stdout_stderr")
def test_ping_node(mock_run):
mock_run.return_value = (1, None, "error data")
with pytest.raises(ValueError) as err:
utils.ping_node("node_unreachable")
assert str(err.value) == 'host "node_unreachable" is unreachable: error data'
mock_run.assert_called_once_with("ping -c 1 node_unreachable")


def test_calculate_quorate_status():
assert utils.calculate_quorate_status(3, 2) is True
assert utils.calculate_quorate_status(3, 1) is False
Expand Down Expand Up @@ -983,13 +972,13 @@ def test_is_block_device(mock_stat, mock_isblk):
mock_isblk.assert_called_once_with(12345)


@mock.patch('crmsh.utils.ping_node')
@mock.patch('crmsh.utils.node_reachable_check')
@mock.patch('crmsh.sh.ClusterShell.get_stdout_or_raise_error')
def test_check_all_nodes_reachable(mock_run, mock_ping):
def test_check_all_nodes_reachable(mock_run, mock_reachable):
mock_run.return_value = "1084783297 15sp2-1 member"
utils.check_all_nodes_reachable()
mock_run.assert_called_once_with("crm_node -l")
mock_ping.assert_called_once_with("15sp2-1")
mock_reachable.assert_called_once_with("15sp2-1")


@mock.patch('crmsh.sh.ShellUtils.get_stdout_stderr')
Expand Down

0 comments on commit 113b211

Please sign in to comment.