From ddf0fea587cc485605d7198a39e213eb07da7ddd Mon Sep 17 00:00:00 2001 From: Tomas Jelinek Date: Tue, 5 Dec 2023 10:05:32 +0100 Subject: [PATCH] fix: set sbd.service timeout based on SBD_DELAY_START Timeout for starting the sbd.service needs to be longer than SBD_DELAY_START, otherwise the start of the sbd.service times out and prevents a cluster from starting. --- .gitlab-ci.yml | 1 + README.md | 21 +++++-- tasks/shell_pcs/sbd.yml | 37 ++++++++++++ templates/override-timeout.conf | 4 ++ tests/tests_sbd_all_options.yml | 4 +- tests/tests_sbd_delay_start.yml | 99 +++++++++++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 templates/override-timeout.conf create mode 100644 tests/tests_sbd_delay_start.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 11493c0b..aecf6845 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -60,6 +60,7 @@ variables: - sbd_check_devices_count - sbd_defaults_disabled - sbd_defaults + - sbd_delay_start - sbd_needs_atb_while_atb_disabled - sbd_needs_atb_while_atb_enabled diff --git a/README.md b/README.md index 537d12de..badd3307 100644 --- a/README.md +++ b/README.md @@ -449,11 +449,22 @@ You may take a look at [an example](#configuring-cluster-to-use-sbd). list, default: `[]` -List of name-value dictionaries specifying SBD options. Supported options are: -`delay-start` (defaults to `false`), `startmode` (defaults to `always`), -`timeout-action` (defaults to `flush,reboot`) and `watchdog-timeout` (defaults -to `5`). See `sbd(8)` man page, section 'Configuration via environment' for -their description. +List of name-value dictionaries specifying SBD options. See `sbd(8)` man page, +section 'Configuration via environment' for their description. 
Supported +options are: + +* `delay-start` + * `false` or `integer`, defaults to `false` + * documented as SBD\_DELAY\_START +* `startmode` + * `string`, defaults to `always` + * documented as SBD\_STARTMODE +* `timeout-action` + * `string`, defaults to `flush,reboot` + * documented as SBD\_TIMEOUT\_ACTION +* `watchdog-timeout` + * `integer`, defaults to `5` + * documented as SBD\_WATCHDOG\_TIMEOUT You may take a look at [an example](#configuring-cluster-to-use-sbd). diff --git a/tasks/shell_pcs/sbd.yml b/tasks/shell_pcs/sbd.yml index 5ea156dc..ee7a4273 100644 --- a/tasks/shell_pcs/sbd.yml +++ b/tasks/shell_pcs/sbd.yml @@ -99,6 +99,43 @@ node_devices: "{{ ha_cluster.sbd_devices | d([]) }}" register: __ha_cluster_distribute_sbd_config + - name: Configure systemd timeout for SBD + vars: + __sbd_delay_start: "{{ ha_cluster_sbd_options + | selectattr('name', 'match', '^delay-start$') + | map(attribute='value') | list }}" + when: + - __sbd_delay_start | length > 0 + - __sbd_delay_start | first | int > 0 + block: + - name: Ensure /etc/systemd/system/sbd.service.d directory exists + file: + path: /etc/systemd/system/sbd.service.d + state: directory + owner: root + group: root + mode: "0755" + + - name: Override start timeout for SBD + template: + src: templates/override-timeout.conf + dest: /etc/systemd/system/sbd.service.d/override-timeout.conf + owner: root + group: root + mode: "0644" + vars: + # Make sure the timeout is at least the default 90 seconds. + # The intent is to make the timeout longer if needed, not shorter. 
+ # yamllint disable rule:line-length + timeout_value: "{{ [90, + (__sbd_delay_start | first | float * 1.2) | round(0, 'ceil') | int] | + max }}" + # yamllint enable rule:line-length + + - name: Reload systemd service files + systemd: + daemon_reload: true + - name: Get services status - detect pacemaker service_facts: diff --git a/templates/override-timeout.conf b/templates/override-timeout.conf new file mode 100644 index 00000000..b050f24c --- /dev/null +++ b/templates/override-timeout.conf @@ -0,0 +1,4 @@ +{{ ansible_managed | comment }} +{{ "system_role:ha_cluster" | comment(prefix="", postfix="") }} +[Service] +TimeoutStartSec={{ timeout_value }} diff --git a/tests/tests_sbd_all_options.yml b/tests/tests_sbd_all_options.yml index 586b1e31..48f95463 100644 --- a/tests/tests_sbd_all_options.yml +++ b/tests/tests_sbd_all_options.yml @@ -8,7 +8,7 @@ ha_cluster_sbd_enabled: true ha_cluster_sbd_options: - name: delay-start - value: 2s + value: 2 - name: startmode value: clean - name: timeout-action @@ -132,7 +132,7 @@ - name: Check SBD config assert: that: - - "'SBD_DELAY_START=\"2s\"' in __test_sbd_config_lines" + - "'SBD_DELAY_START=\"2\"' in __test_sbd_config_lines" - "'SBD_DEVICE=\"{{ __test_sbd_mount.stdout }}\"' in __test_sbd_config_lines" - "'SBD_STARTMODE=\"clean\"' in __test_sbd_config_lines" diff --git a/tests/tests_sbd_delay_start.yml b/tests/tests_sbd_delay_start.yml new file mode 100644 index 00000000..fab6cad2 --- /dev/null +++ b/tests/tests_sbd_delay_start.yml @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: MIT +--- +- name: SBD with long delay-start + hosts: all + vars_files: vars/main.yml + vars: + ha_cluster_cluster_name: test-cluster + ha_cluster_sbd_enabled: true + ha_cluster_sbd_options: + - name: delay-start + value: 101 + + tasks: + - name: Run test + tags: tests::verify + block: + - name: Set up test environment + include_role: + name: linux-system-roles.ha_cluster + tasks_from: test_setup.yml + + - name: Set up test environment for SBD + 
include_role: + name: linux-system-roles.ha_cluster + tasks_from: test_setup_sbd.yml + + - name: Ensure SBD config file is not present + file: + path: /etc/sysconfig/sbd + state: absent + + - name: Ensure systemd overrides for SBD are not present + file: + path: /etc/systemd/system/sbd.service.d/override-timeout.conf + state: absent + + - name: Run HA Cluster role + include_role: + name: linux-system-roles.ha_cluster + public: true + + - name: Slurp SBD config file + slurp: + src: /etc/sysconfig/sbd + register: __test_sbd_config + + - name: Decode SBD config + set_fact: + __test_sbd_config_lines: "{{ + (__test_sbd_config.content | b64decode).splitlines() }}" + + - name: Print SBD config lines + debug: + var: __test_sbd_config_lines + + - name: Check SBD config + assert: + that: + - "'SBD_DELAY_START=\"101\"' in __test_sbd_config_lines" + - > + __test_sbd_config_lines[-1] + == 'SBD_OPTS="-n {{ __ha_cluster_node_name }}"' + + - name: Check header for ansible_managed, fingerprint + include_tasks: tasks/check_header.yml + vars: + __file_content: "{{ __test_sbd_config }}" + __fingerprint: "system_role:ha_cluster" + + - name: Slurp SBD overrides + slurp: + src: /etc/systemd/system/sbd.service.d/override-timeout.conf + register: __test_sbd_overrides_config + + - name: Decode SBD overrides + set_fact: + __test_sbd_overrides_config_lines: "{{ + (__test_sbd_overrides_config.content | b64decode).splitlines() + }}" + + - name: Check SBD overrides + assert: + that: + - "'TimeoutStartSec=122' in __test_sbd_overrides_config_lines" + + - name: Check header for ansible_managed, fingerprint + include_tasks: tasks/check_header.yml + vars: + __file_content: "{{ __test_sbd_overrides_config }}" + __fingerprint: "system_role:ha_cluster" + + - name: Check firewall and selinux state + include_tasks: tasks/check_firewall_selinux.yml + + always: + - name: Clean up test environment for SBD + include_role: + name: linux-system-roles.ha_cluster + tasks_from: test_cleanup_sbd.yml