Skip to content

Commit

Permalink
fix: set sbd.service timeout based on SBD_DELAY_START
Browse files Browse the repository at this point in the history
The start timeout of sbd.service needs to be longer than
SBD_DELAY_START; otherwise, starting sbd.service times out and
prevents the cluster from starting.
  • Loading branch information
tomjelinek authored and richm committed Dec 5, 2023
1 parent 523036d commit 99cca5c
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ variables:
- sbd_check_devices_count
- sbd_defaults_disabled
- sbd_defaults
- sbd_delay_start
- sbd_needs_atb_while_atb_disabled
- sbd_needs_atb_while_atb_enabled

Expand Down
21 changes: 16 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -449,11 +449,22 @@ You may take a look at [an example](#configuring-cluster-to-use-sbd).

list, default: `[]`

List of name-value dictionaries specifying SBD options. Supported options are:
`delay-start` (defaults to `false`), `startmode` (defaults to `always`),
`timeout-action` (defaults to `flush,reboot`) and `watchdog-timeout` (defaults
to `5`). See `sbd(8)` man page, section 'Configuration via environment' for
their description.
List of name-value dictionaries specifying SBD options. See `sbd(8)` man page,
section 'Configuration via environment' for their description. Supported
options are:

* `delay-start`
  * `false` or `integer`, defaults to `false`
  * documented as SBD\_DELAY\_START
* `startmode`
  * `string`, defaults to `always`
  * documented as SBD\_STARTMODE
* `timeout-action`
  * `string`, defaults to `flush,reboot`
  * documented as SBD\_TIMEOUT\_ACTION
* `watchdog-timeout`
  * `integer`, defaults to `5`
  * documented as SBD\_WATCHDOG\_TIMEOUT

You may take a look at [an example](#configuring-cluster-to-use-sbd).

Expand Down
37 changes: 37 additions & 0 deletions tasks/shell_pcs/sbd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,43 @@
node_devices: "{{ ha_cluster.sbd_devices | d([]) }}"
register: __ha_cluster_distribute_sbd_config

# Install a systemd drop-in that raises the start timeout of sbd.service when
# a positive numeric 'delay-start' SBD option is configured. Without it, the
# unit start may time out before the configured delay has elapsed.
- name: Configure systemd timeout for SBD
  vars:
    # Values of every 'delay-start' entry found in ha_cluster_sbd_options;
    # an empty list when the option is not set.
    __sbd_delay_start: "{{ ha_cluster_sbd_options
      | selectattr('name', 'match', '^delay-start$')
      | map(attribute='value') | list }}"
  when:
    # Only act when delay-start is present and converts to an integer > 0.
    - __sbd_delay_start | length > 0
    - __sbd_delay_start | first | int > 0
  block:
    - name: Ensure /etc/systemd/system/sbd.service.d directory exists
      file:
        path: /etc/systemd/system/sbd.service.d
        state: directory
        owner: root
        group: root
        # Quoted so the mode is passed as a string and does not depend on
        # the YAML parser's implicit octal typing.
        mode: "0755"

    - name: Override start timeout for SBD
      template:
        src: templates/override-timeout.conf
        dest: /etc/systemd/system/sbd.service.d/override-timeout.conf
        owner: root
        group: root
        mode: "0644"
      vars:
        # Make sure the timeout is at least the default 90 seconds.
        # The intent is to make the timeout longer if needed, not shorter.
        # The delay is multiplied by 1.2 and rounded up to a whole number,
        # so the timeout is always longer than the delay itself.
        # yamllint disable rule:line-length
        timeout_value: "{{ [90,
          (__sbd_delay_start | first | float * 1.2) | round(0, 'ceil') | int] |
          max }}"
        # yamllint enable rule:line-length

    # Make systemd pick up the new or changed drop-in file.
    - name: Reload systemd service files
      systemd:
        daemon_reload: true

- name: Get services status - detect pacemaker
service_facts:

Expand Down
4 changes: 4 additions & 0 deletions templates/override-timeout.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{{ ansible_managed | comment }}
{{ "system_role:ha_cluster" | comment(prefix="", postfix="") }}
{#
  systemd drop-in for sbd.service: sets the unit's start timeout to the
  timeout_value variable supplied by the task that renders this template.
  This comment closes directly in front of the section header so that it
  renders to nothing and adds no extra line to the generated file.
#}[Service]
TimeoutStartSec={{ timeout_value }}
4 changes: 2 additions & 2 deletions tests/tests_sbd_all_options.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
ha_cluster_sbd_enabled: true
ha_cluster_sbd_options:
- name: delay-start
value: 2s
value: 2
- name: startmode
value: clean
- name: timeout-action
Expand Down Expand Up @@ -132,7 +132,7 @@
- name: Check SBD config
assert:
that:
- "'SBD_DELAY_START=\"2s\"' in __test_sbd_config_lines"
- "'SBD_DELAY_START=\"2\"' in __test_sbd_config_lines"
- "'SBD_DEVICE=\"{{ __test_sbd_mount.stdout }}\"'
in __test_sbd_config_lines"
- "'SBD_STARTMODE=\"clean\"' in __test_sbd_config_lines"
Expand Down
99 changes: 99 additions & 0 deletions tests/tests_sbd_delay_start.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: MIT
---
# Test play: run the role with an SBD delay-start of 101 seconds and verify
# both the generated SBD config and the systemd start-timeout override.
- name: SBD with long delay-start
  hosts: all
  vars:
    ha_cluster_cluster_name: test-cluster
    ha_cluster_sbd_enabled: true
    ha_cluster_sbd_options:
      # 101 * 1.2 rounded up gives the 122 second timeout asserted below.
      - value: 101
        name: delay-start
  vars_files: vars/main.yml

  tasks:
    - name: Run test
      tags: tests::verify
      block:
        # --- Preparation -------------------------------------------------
        - name: Set up test environment
          include_role:
            tasks_from: test_setup.yml
            name: linux-system-roles.ha_cluster

        - name: Set up test environment for SBD
          include_role:
            tasks_from: test_setup_sbd.yml
            name: linux-system-roles.ha_cluster

        # Start from a clean slate so the files checked later are known to
        # have been created by this role run.
        - name: Ensure SBD config file is not present
          file:
            state: absent
            path: /etc/sysconfig/sbd

        - name: Ensure systemd overrides for SBD are not present
          file:
            state: absent
            path: /etc/systemd/system/sbd.service.d/override-timeout.conf

        # --- Exercise the role -------------------------------------------
        - name: Run HA Cluster role
          include_role:
            public: true
            name: linux-system-roles.ha_cluster

        # --- Verify /etc/sysconfig/sbd -----------------------------------
        - name: Slurp SBD config file
          register: __test_sbd_config
          slurp:
            src: /etc/sysconfig/sbd

        - name: Decode SBD config
          set_fact:
            __test_sbd_config_lines: >-
              {{ (__test_sbd_config.content | b64decode).splitlines() }}

        - name: Print SBD config lines
          debug:
            var: __test_sbd_config_lines

        - name: Check SBD config
          assert:
            that:
              - >-
                'SBD_DELAY_START="101"' in __test_sbd_config_lines
              - >-
                __test_sbd_config_lines[-1]
                == 'SBD_OPTS="-n {{ __ha_cluster_node_name }}"'

        - name: Check header for ansible_managed, fingerprint
          vars:
            __fingerprint: "system_role:ha_cluster"
            __file_content: "{{ __test_sbd_config }}"
          include_tasks: tasks/check_header.yml

        # --- Verify the systemd drop-in ----------------------------------
        - name: Slurp SBD overrides
          register: __test_sbd_overrides_config
          slurp:
            src: /etc/systemd/system/sbd.service.d/override-timeout.conf

        - name: Decode SBD overrides
          set_fact:
            __test_sbd_overrides_config_lines: >-
              {{ (__test_sbd_overrides_config.content
              | b64decode).splitlines() }}

        - name: Check SBD overrides
          assert:
            that:
              - >-
                'TimeoutStartSec=122' in __test_sbd_overrides_config_lines

        - name: Check header for ansible_managed, fingerprint
          vars:
            __fingerprint: "system_role:ha_cluster"
            __file_content: "{{ __test_sbd_overrides_config }}"
          include_tasks: tasks/check_header.yml

        - name: Check firewall and selinux state
          include_tasks: tasks/check_firewall_selinux.yml

      # Runs regardless of the outcome of the block above.
      always:
        - name: Clean up test environment for SBD
          include_role:
            tasks_from: test_cleanup_sbd.yml
            name: linux-system-roles.ha_cluster

0 comments on commit 99cca5c

Please sign in to comment.