Skip to content

Commit

Permalink
Don't use port open check to determine if reboot completed. Fixes #856.
Browse files Browse the repository at this point in the history
The old approach, waiting for the machine to not having an open
port, and then waiting for it to be open again, was insufficient,
because of the race condition that the machine rebooted so quickly
that the port was immediately open again without nixops noticing
that it went down. I experienced this on a Hetzner cloud server.

The new approach checks the `last reboot` on the remote side
to change, which is not racy.
  • Loading branch information
nh2 committed Jun 28, 2018
1 parent d4b2fc5 commit f83e244
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 7 deletions.
48 changes: 43 additions & 5 deletions nixops/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import subprocess
import time

import nixops.util
import nixops.resources
Expand Down Expand Up @@ -182,16 +183,53 @@ def reboot(self, hard=False):
reboot_command = "systemctl reboot"
self.run_command(reboot_command, check=False)
self.state = self.STARTING
# Note we always reset the connection to ensure the SSH control master
# connection is stopped.
# While an orderly `reboot` should make the remote kernel send RST to
# all TCP connections, we don't want to rely on that here.
self.ssh.reset()

def reboot_sync(self, hard=False):
"""Reboot this machine and wait until it's up again."""

# To check when the machine has finished rebooting in a race-free
# manner, we compare the output of `last reboot` before and after
# the reboot. Once the output has changed, the reboot is done.
def get_last_reboot_output():
# Note `last reboot` does not exist on older OSs like
# the Hetzner rescue system, but that doesn't matter much
# because all we care about is when the output of the
# command invocation changes.
# We use timeout=10 so that the user gets some sense
# of progress, as reboots can take a long time.
return self.run_command('last reboot --time-format iso | head -n 1', capture_stdout=True, timeout=10).rstrip()

# The server might be off.
# In that case, we just set `pre_reboot_last_reboot_output` to some string
# so that it changes when it comes up.
try:
pre_reboot_last_reboot_output = get_last_reboot_output()
except nixops.ssh_util.SSHConnectionFailed:
self.log("server is unreachable, so reboot completion detection may not be accurate if it's a network problem")
pre_reboot_last_reboot_output = "server is off or couldn't be contacted"

self.reboot(hard=hard)
self.log_start("waiting for the machine to finish rebooting...")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
self.log_continue("[down]")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
self.log_end("[up]")

self.log_start("waiting for reboot to complete...")
while True:
last_reboot_output = None
try:
last_reboot_output = get_last_reboot_output()
except (nixops.ssh_util.SSHConnectionFailed, nixops.ssh_util.SSHCommandFailed):
# We accept this because the machine might be down,
# and show an 'x' as progress indicator in that case.
self.log_continue("x")
if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
break
self.log_continue(".")
time.sleep(1)
self.log_end("done.")

self.state = self.UP
self.ssh_pinged = True
self._ssh_pinged_this_time = True
Expand Down
3 changes: 2 additions & 1 deletion nixops/backends/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,8 @@ def reboot(self, hard=False):
self.log("sending hard reset to Azure machine...")
self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()
else:
MachineState.reboot(self, hard=hard)
self.ssh_pinged = False
Expand Down
6 changes: 5 additions & 1 deletion nixops/backends/hetzner.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,13 @@ def reboot(self, hard=False):
server.reboot('hard')
self.log_end("done.")
self.state = self.STARTING
# Note we always reset the connection because on a hard reboot, the
# other side may not have a chance to terminate the SSH connection,
# (send TCP RST), which could lead to the control master process
# still being alive and hanging until the TCP connection times out.
self.ssh.reset()
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)

def reboot_rescue(self, install=False, partitions=None, bootstrap=True,
hard=False):
Expand Down

0 comments on commit f83e244

Please sign in to comment.