From f83e2445d17f4b3a089a55183fe030e1f46458e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= <mail@nh2.me>
Date: Sat, 27 Jan 2018 19:49:06 +0100
Subject: [PATCH] Don't use port open check to determine if reboot completed.
 Fixes #856.

The old approach, waiting for the machine to not having an open
port, and then waiting for it to be open again, was insufficient,
because of the race condition that the machine rebooted so quickly
that the port was immediately open again without nixops noticing
that it went down. I experienced this on a Hetzner cloud server.

The new approach checks the `last reboot` on the remote side
to change, which is not racy.
---
 nixops/backends/__init__.py | 48 +++++++++++++++++++++++++++++++++----
 nixops/backends/azure_vm.py |  3 ++-
 nixops/backends/hetzner.py  |  6 ++++-
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/nixops/backends/__init__.py b/nixops/backends/__init__.py
index 8c30d63c4..2dd7f126b 100644
--- a/nixops/backends/__init__.py
+++ b/nixops/backends/__init__.py
@@ -3,6 +3,7 @@
 import os
 import re
 import subprocess
+import time
 
 import nixops.util
 import nixops.resources
@@ -182,16 +183,53 @@ def reboot(self, hard=False):
             reboot_command = "systemctl reboot"
         self.run_command(reboot_command, check=False)
         self.state = self.STARTING
+        # Note we always reset the connection to ensure the SSH control master
+        # connection is stopped.
+        # While an orderly `reboot` should make the remote kernel send RST to
+        # all TCP connections, we don't want to rely on that here.
         self.ssh.reset()
 
     def reboot_sync(self, hard=False):
         """Reboot this machine and wait until it's up again."""
+
+        # To check when the machine has finished rebooting in a race-free
+        # manner, we compare the output of `last reboot` before and after
+        # the reboot. Once the output has changed, the reboot is done.
+        def get_last_reboot_output():
+            # Note `last reboot` does not exist on older OSs like
+            # the Hetzner rescue system, but that doesn't matter much
+            # because all we care about is when the output of the
+            # command invocation changes.
+            # We use timeout=10 so that the user gets some sense
+            # of progress, as reboots can take a long time.
+            return self.run_command('last reboot --time-format iso | head -n 1', capture_stdout=True, timeout=10).rstrip()
+
+        # The server might be off.
+        # In that case, we just set `pre_reboot_last_reboot_output` to some string
+        # so that it changes when it comes up.
+        try:
+            pre_reboot_last_reboot_output = get_last_reboot_output()
+        except nixops.ssh_util.SSHConnectionFailed:
+            self.log("server is unreachable, so reboot completion detection may not be accurate if it's a network problem")
+            pre_reboot_last_reboot_output = "server is off or couldn't be contacted"
+
         self.reboot(hard=hard)
-        self.log_start("waiting for the machine to finish rebooting...")
-        nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
-        self.log_continue("[down]")
-        nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
-        self.log_end("[up]")
+
+        self.log_start("waiting for reboot to complete...")
+        while True:
+            last_reboot_output = None
+            try:
+                last_reboot_output = get_last_reboot_output()
+            except (nixops.ssh_util.SSHConnectionFailed, nixops.ssh_util.SSHCommandFailed):
+                # We accept this because the machine might be down,
+                # and show an 'x' as progress indicator in that case.
+                self.log_continue("x")
+            if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
+                break
+            self.log_continue(".")
+            time.sleep(1)
+        self.log_end("done.")
+
         self.state = self.UP
         self.ssh_pinged = True
         self._ssh_pinged_this_time = True
diff --git a/nixops/backends/azure_vm.py b/nixops/backends/azure_vm.py
index 349c3a68b..b21176fef 100644
--- a/nixops/backends/azure_vm.py
+++ b/nixops/backends/azure_vm.py
@@ -953,7 +953,8 @@ def reboot(self, hard=False):
             self.log("sending hard reset to Azure machine...")
             self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
             self.state = self.STARTING
-            self.ssh.reset()
+            if reset:
+                self.ssh.reset()
         else:
             MachineState.reboot(self, hard=hard)
         self.ssh_pinged = False
diff --git a/nixops/backends/hetzner.py b/nixops/backends/hetzner.py
index 320c1accd..ba5ba61d1 100644
--- a/nixops/backends/hetzner.py
+++ b/nixops/backends/hetzner.py
@@ -318,9 +318,13 @@ def reboot(self, hard=False):
             server.reboot('hard')
             self.log_end("done.")
             self.state = self.STARTING
+            # Note we always reset the connection because on a hard reboot, the
+            # other side may not have a chance to terminate the SSH connection,
+            # (send TCP RST), which could lead to the control master process
+            # still being alive and hanging until the TCP connection times out.
             self.ssh.reset()
         else:
-            MachineState.reboot(self, hard=hard)
+            MachineState.reboot(self, hard=hard, reset=reset)
 
     def reboot_rescue(self, install=False, partitions=None, bootstrap=True,
                       hard=False):