Skip to content

Commit

Permalink
Don't use port open check to determine if reboot completed. Fixes #856.
Browse files Browse the repository at this point in the history
The old approach, waiting for the machine to not having an open
port, and then waiting for it to be open again, was insufficient,
because of the race condition that the machine rebooted so quickly
that the port was immediately open again without nixops noticing
that it went down. I experienced this on a Hetzner cloud server.

The new approach checks the `last reboot` on the remote side
to change, which is not racy.
  • Loading branch information
nh2 committed Feb 3, 2018
1 parent d4b2fc5 commit 37bc5af
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 17 deletions.
41 changes: 33 additions & 8 deletions nixops/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import subprocess
import time

import nixops.util
import nixops.resources
Expand Down Expand Up @@ -170,7 +171,7 @@ def backup(self, defn, backup_id):
"""Make backup of persistent disks, if possible."""
self.warn("don't know how to make backup of disks for machine ‘{0}’".format(self.name))

def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
"""Reboot this machine."""
self.log("rebooting...")
if self.state == self.RESCUE:
Expand All @@ -182,16 +183,40 @@ def reboot(self, hard=False):
reboot_command = "systemctl reboot"
self.run_command(reboot_command, check=False)
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()

def reboot_sync(self, hard=False):
"""Reboot this machine and wait until it's up again."""
self.reboot(hard=hard)
self.log_start("waiting for the machine to finish rebooting...")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
self.log_continue("[down]")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
self.log_end("[up]")

# To check when the machine has finished rebooting in a race-free
# manner, we compare the output of `last reboot` before and after
# the reboot. Once the output has changed, the reboot is done.
def get_last_reboot_output():
return self.run_command('last reboot --time-format iso | head -n1', capture_stdout=True).rstrip()

pre_reboot_last_reboot_output = get_last_reboot_output()

# We set reset=False so that we can continue running `last reboot`
# remotely; when the reboot happens, our SSH connection will be reset
# by the remote side instead.
self.reboot(hard=hard, reset=False)

self.log_start("waiting for reboot to complete...")
while True:
last_reboot_output = None
try:
last_reboot_output = get_last_reboot_output()
except nixops.ssh_util.SSHCommandFailed:
# We accept this because the machine might be down,
# and show an 'x' as progress indicator in that case.
self.log_continue("x")
if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
break
self.log_continue(".")
time.sleep(1)
self.log_end("done.")

self.state = self.UP
self.ssh_pinged = True
self._ssh_pinged_this_time = True
Expand Down
7 changes: 4 additions & 3 deletions nixops/backends/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,14 +948,15 @@ def after_activation(self, defn):
self._delete_encryption_key(d_id)


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
if hard:
self.log("sending hard reset to Azure machine...")
self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)
self.ssh_pinged = False

def start(self):
Expand Down
2 changes: 1 addition & 1 deletion nixops/backends/ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,7 +1407,7 @@ def _check(self, res):
res.messages.append(" * {0} - {1}".format(e.not_before, e.not_after))


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
self.log("rebooting EC2 machine...")
instance = self._get_instance()
instance.reboot()
Expand Down
4 changes: 2 additions & 2 deletions nixops/backends/gce.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,13 +530,13 @@ def create_node(self, defn):
self.on_host_maintenance = defn.on_host_maintenance


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
if hard:
self.log("sending hard reset to GCE machine...")
self.node().reboot()
self.state = self.STARTING
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)


def start(self):
Expand Down
7 changes: 4 additions & 3 deletions nixops/backends/hetzner.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,16 +311,17 @@ def _bootstrap_rescue(self, install, partitions):
self.run_command(cmd)
self.log_end("done.")

def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
if hard:
self.log_start("sending hard reset to robot... ")
server = self._get_server_by_ip(self.main_ipv4)
server.reboot('hard')
self.log_end("done.")
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)

def reboot_rescue(self, install=False, partitions=None, bootstrap=True,
hard=False):
Expand Down

0 comments on commit 37bc5af

Please sign in to comment.