Skip to content

Commit

Permalink
Fix: sbd watchdog rebooting upon restart of pacemaker-remote
Browse files Browse the repository at this point in the history
  • Loading branch information
wenningerk committed Jun 21, 2023
1 parent 5ec38cf commit 5ea57f5
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 6 deletions.
24 changes: 24 additions & 0 deletions src/sbd-cluster.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,16 @@ static crm_cluster_t cluster;
/* Forward declarations of file-local (static) functions. */
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
/*
 * Set to true by trigger_wait_for_pacemaker_remote_lost(). While it is set,
 * sbd_remote_check() calls signal_exitreq() and returns early instead of
 * reporting pcmk_health_unclean on its "connection lost" path.
 */
static bool wait_for_pacemaker_remote_lost = false;

static void signal_exitreq(void)
{
union sigval signal_value;
pid_t ppid = getppid();

memset(&signal_value, 0, sizeof(signal_value));
sigqueue(ppid, SIG_EXITREQ, signal_value);
}

#if SUPPORT_PLUGIN
static void
Expand Down Expand Up @@ -675,6 +684,10 @@ sbd_remote_check(gpointer user_data)
set_servant_health(pcmk_health_online, LOG_INFO,
"Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
} else {
if (wait_for_pacemaker_remote_lost) {
signal_exitreq();
return true;
}
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
}
Expand Down Expand Up @@ -742,6 +755,16 @@ cluster_shutdown(int nsig)
clean_up(0);
}

/*
 * Signal callback (registered for SIGUSR2 in servant_cluster()).
 *
 * Arms wait_for_pacemaker_remote_lost, which sbd_remote_check() consults.
 * When no pacemaker_remoted has been seen (remoted_pid is not positive or
 * remote_node is unset), the exit request is sent to the parent right away
 * via signal_exitreq().
 *
 * nsig: signal number passed by the caller; not used here.
 */
static void
trigger_wait_for_pacemaker_remote_lost(int nsig)
{
    bool remote_seen = (remoted_pid > 0) && remote_node;

    if (!remote_seen) {
        /* pacemaker_remoted was never seen: request exit immediately */
        signal_exitreq();
    }
    wait_for_pacemaker_remote_lost = true;
}

int
servant_cluster(const char *diskname, int mode, const void* argp)
{
Expand All @@ -761,6 +784,7 @@ servant_cluster(const char *diskname, int mode, const void* argp)

mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
mainloop_add_signal(SIGUSR2, trigger_wait_for_pacemaker_remote_lost);

g_main_loop_run(mainloop);
g_main_loop_unref(mainloop);
Expand Down
12 changes: 7 additions & 5 deletions src/sbd-inquisitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,14 @@ void servants_start(void)
}
}

/*
 * Walk the servants_leader list and queue a signal to every entry whose
 * pid is non-zero.
 *
 * NOTE(review): this span looks like a rendered diff with the +/- markers
 * stripped, so it presumably holds both the pre-change and the post-change
 * form of the signature and of the sigqueue() call (fixed SIGKILL vs. the
 * caller-supplied sig) -- confirm against the repository.
 */
void servants_kill(void)
void servants_kill(int sig)
{
struct servants_list_item *s;
/* NOTE(review): svalue is handed to sigqueue() below without being
 * initialized anywhere in this function; consider zeroing it first. */
union sigval svalue;

for (s = servants_leader; s; s = s->next) {
/* Entries with pid 0 are skipped. */
if (s->pid != 0)
sigqueue(s->pid, SIGKILL, svalue);
sigqueue(s->pid, sig, svalue);
}
}

Expand Down Expand Up @@ -536,7 +536,7 @@ void inquisitor_child(void)
clock_gettime(CLOCK_MONOTONIC, &t_now);

if (sig == SIG_EXITREQ || sig == SIGTERM) {
servants_kill();
servants_kill(SIGKILL);
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
Expand Down Expand Up @@ -610,6 +610,8 @@ void inquisitor_child(void)
if (exiting)
continue;
servants_start();
} else if (sig == SIGUSR2) {
servants_kill(SIGUSR2);
}

if (exiting) {
Expand Down Expand Up @@ -718,7 +720,7 @@ void inquisitor_child(void)
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
servants_kill(SIGKILL);
exiting = 1;
continue;
} else {
Expand All @@ -734,7 +736,7 @@ void inquisitor_child(void)
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
servants_kill(SIGKILL);
exiting = 1;
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion src/sbd_remote.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Type=forking
PIDFile=@runstatedir@/sbd.pid
EnvironmentFile=-@CONFIGDIR@/sbd
ExecStart=@sbindir@/sbd $SBD_OPTS -p @runstatedir@/sbd.pid watch
ExecStop=@bindir@/kill -TERM $MAINPID
ExecStop=@bindir@/kill -USR2 $MAINPID

# Could this benefit from exit codes for restart?
# Does this need to be set to msgwait * 1.2?
Expand Down

0 comments on commit 5ea57f5

Please sign in to comment.