Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: sbd watchdog rebooting upon restart of pacemaker-remote #33

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/sbd-cluster.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,16 @@ static crm_cluster_t cluster;
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
static bool wait_for_pacemaker_remote_lost = false;

static void signal_exitreq(void)
{
union sigval signal_value;
pid_t ppid = getppid();

memset(&signal_value, 0, sizeof(signal_value));
sigqueue(ppid, SIG_EXITREQ, signal_value);
}

#if SUPPORT_PLUGIN
static void
Expand Down Expand Up @@ -675,6 +684,10 @@ sbd_remote_check(gpointer user_data)
set_servant_health(pcmk_health_online, LOG_INFO,
"Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
} else {
if (wait_for_pacemaker_remote_lost) {
signal_exitreq();
return true;
}
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
}
Expand Down Expand Up @@ -742,6 +755,16 @@ cluster_shutdown(int nsig)
clean_up(0);
}

static void
trigger_wait_for_pacemaker_remote_lost(int nsig)
{
/* if we've never seen pacemaker_remoted request exit immeditely */
if ((remoted_pid <= 0) || !remote_node) {
signal_exitreq();
}
wait_for_pacemaker_remote_lost = true;
}

int
servant_cluster(const char *diskname, int mode, const void* argp)
{
Expand All @@ -761,6 +784,7 @@ servant_cluster(const char *diskname, int mode, const void* argp)

mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
mainloop_add_signal(SIGUSR2, trigger_wait_for_pacemaker_remote_lost);

g_main_loop_run(mainloop);
g_main_loop_unref(mainloop);
Expand Down
12 changes: 7 additions & 5 deletions src/sbd-inquisitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,14 @@ void servants_start(void)
}
}

/*
 * Walk the list starting at servants_leader and queue a signal to every
 * entry whose pid is non-zero.
 *
 * NOTE(review): this listing carries two signature lines and two
 * sigqueue() lines; it looks like the before/after lines of a diff shown
 * together (fixed SIGKILL vs. caller-supplied `sig`) - confirm which set
 * is current before building.
 */
void servants_kill(void)
void servants_kill(int sig)
{
struct servants_list_item *s;
/* NOTE(review): svalue is handed to sigqueue() below without ever being
 * assigned in this function; consider zeroing it first. */
union sigval svalue;

for (s = servants_leader; s; s = s->next) {
/* entries with pid 0 are skipped */
if (s->pid != 0)
sigqueue(s->pid, SIGKILL, svalue);
sigqueue(s->pid, sig, svalue);
}
}

Expand Down Expand Up @@ -536,7 +536,7 @@ void inquisitor_child(void)
clock_gettime(CLOCK_MONOTONIC, &t_now);

if (sig == SIG_EXITREQ || sig == SIGTERM) {
servants_kill();
servants_kill(SIGKILL);
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
Expand Down Expand Up @@ -610,6 +610,8 @@ void inquisitor_child(void)
if (exiting)
continue;
servants_start();
} else if (sig == SIGUSR2) {
servants_kill(SIGUSR2);
}

if (exiting) {
Expand Down Expand Up @@ -718,7 +720,7 @@ void inquisitor_child(void)
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
servants_kill(SIGKILL);
exiting = 1;
continue;
} else {
Expand All @@ -734,7 +736,7 @@ void inquisitor_child(void)
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
servants_kill(SIGKILL);
exiting = 1;
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion src/sbd_remote.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Type=forking
PIDFile=@runstatedir@/sbd.pid
EnvironmentFile=-@CONFIGDIR@/sbd
ExecStart=@sbindir@/sbd $SBD_OPTS -p @runstatedir@/sbd.pid watch
ExecStop=@bindir@/kill -TERM $MAINPID
ExecStop=@bindir@/kill -USR2 $MAINPID

# Could this benefit from exit codes for restart?
# Does this need to be set to msgwait * 1.2?
Expand Down