Skip to content

Commit

Permalink
Add the steps to reboot the computes after update.
Browse files Browse the repository at this point in the history
This sequence implements reboot of the compute nodes after the update.

If one or more instances have been created on the hypervisor being
rebooted, they are live-migrated to other hypervisors before the
reboot and migrated back to the original hypervisor after the reboot.

Some basic sanity checks are performed after the reboot and before the
migration back to ensure that the necessary services are up and
running.

During the reboot we start two scripts. One monitors and logs the
reboot of the hypervisors. The other logs where the instance is
currently running.

Closes: https://issues.redhat.com/browse/OSPRH-8937
  • Loading branch information
sathlan committed Dec 16, 2024
1 parent 7c36a5a commit a22e794
Show file tree
Hide file tree
Showing 7 changed files with 272 additions and 0 deletions.
3 changes: 3 additions & 0 deletions roles/update/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@
- not cifmw_update_run_dryrun | bool
ansible.builtin.shell: |
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
# Hand over to reboot_compute.yml, which drives the per-hypervisor reboot.
- name: Reboot the compute nodes
  ansible.builtin.include_tasks:
    file: reboot_compute.yml
48 changes: 48 additions & 0 deletions roles/update/tasks/reboot_compute.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# Command prefixes reused by the reboot tasks to reach the OpenStack API
# through the `openstackclient` pod.
- name: Define command for OpenStack client interactions
  ansible.builtin.set_fact:
    # Folded, stripped scalars (>-) are used on purpose: a literal scalar (|)
    # keeps a trailing newline, and since these values are used as prefixes
    # ("{{ cifmw_update_openstack_cmd }} server list ...") that newline would
    # split the resulting shell command in two.
    cifmw_update_openstack_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c
# Records the host of every cinder-volume service. Later tasks look for the
# substring 'ceph' in this output to decide whether to pass --block-migrate.
- name: Register storage backend type
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  # Read-only query: never report a change.
  changed_when: false

# Read-only query; the JSON output is parsed by the next task.
- name: Get list of OpenStack hypervisors
  changed_when: false
  register: hypervisor_list
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json

# Turns the JSON array into a plain list of the 'Hypervisor Hostname' values.
- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: >-
      {{
        hypervisor_list.stdout
        | from_json
        | map(attribute='Hypervisor Hostname')
        | list
      }}

# Render the reboot monitor next to the other update artifacts. The
# destination name matches the name used by the start task below.
- name: Create a reboot monitor script
  ansible.builtin.template:
    src: "monitor_servers.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    mode: "0775"

# The monitor loops forever, so it is detached from the task: nohup plus
# redirected output lets the shell module return immediately. The PID is
# echoed and registered so that a later task can stop the monitor.
- name: Start the reboot monitor script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh > /dev/null 2>&1 &
    echo $!
  register: cifmw_update_monitor_servers_job
  changed_when: true

# Render the instance placement monitor next to the other update artifacts.
- name: Create an instance placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

# The monitor loops forever, so it is detached from the task: nohup plus
# redirected output lets the shell module return immediately. The PID is
# echoed and registered so that a later task can stop the monitor.
- name: Start the monitor placement script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh > /dev/null 2>&1 &
    echo $!
  register: cifmw_update_monitor_vm_placement_job
  changed_when: true

# Runs reboot_hypervisor.yml once per hostname; the current hostname is
# exposed to the included tasks as `hypervisor`.
- name: Iterate over each hypervisor
  loop: "{{ hypervisor_hostnames }}"
  loop_control:
    loop_var: hypervisor
  ansible.builtin.include_tasks:
    file: reboot_hypervisor.yml
69 changes: 69 additions & 0 deletions roles/update/tasks/reboot_hypervisor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
---
# Keeps everything before the first dot of the hypervisor name.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: >-
      {{ hypervisor.split('.') | first }}

# Progress marker in the play output; named and fully qualified like the
# other tasks of this file.
- name: Announce the hypervisor being rebooted
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

# Collects the IDs (one per line) of the servers on this hypervisor whose
# status contains ACTIVE or PAUSED. Read-only, hence never "changed".
- name: Check active VMs on hypervisor
  changed_when: false
  register: active_vms
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list
    --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'

# Live-evacuates the hypervisor, but only when the previous query returned at
# least one server ID. --block-migrate is added unless 'ceph' appears in the
# cinder-volume hosts registered as storage_backend.
- name: Evacuate VMs if they are running
  when: active_vms.stdout != ''
  changed_when: true
  ansible.builtin.shell: >-
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova host-evacuate-live
    {{ '--block-migrate' if 'ceph' not in storage_backend.stdout else '' }}
    {{ hypervisor }}"

# Polls until no server on this hypervisor is ACTIVE, PAUSED or MIGRATING.
# jq prints the number of matching servers.
- name: Wait for compute node to get quiesced
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  # Compare the whole value: a substring search for "0" would also succeed
  # while 10, 20, 30... instances are still on the node.
  until: (compute_node_instances.stdout | trim) == "0"
  retries: 30
  delay: 5
  # Read-only query: never report a change.
  changed_when: false
  when:
    - active_vms.stdout != ''

# The reboot itself is delegated to reboot_hypervisor_using_cr.yml.
- name: Reboot the hypervisor
  ansible.builtin.include_tasks:
    file: reboot_hypervisor_using_cr.yml

# The checks read the host to verify from `current_hypervisor`.
- name: Perform sanity checks post-reboot
  vars:
    current_hypervisor: "{{ hypervisor }}"
  ansible.builtin.include_tasks:
    file: reboot_hypervisor_sanity_checks.yml

# One message per server ID recorded before the evacuation; prints nothing
# when the list is empty.
- name: Announce the instances to migrate back
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
  loop: "{{ active_vms.stdout_lines }}"

# For every server ID recorded before the evacuation: request a live
# migration to this hypervisor, then print the host the server currently runs
# on. The task is retried until that host contains the hypervisor name.
# NOTE(review): each retry runs the whole command again, so the live-migration
# request is re-issued on every attempt - confirm this is intended.
- name: Migrate back VMs post-reboot
  when:
    - active_vms.stdout != ''
  loop: "{{ active_vms.stdout_lines }}"
  register: instance_migration_result
  until: hypervisor in instance_migration_result.stdout
  retries: 30
  delay: 5
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova live-migration
    {{ '--block-migrate' if 'ceph' not in storage_backend.stdout else '' }}
    {{ item }} {{ hypervisor }}";
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
33 changes: 33 additions & 0 deletions roles/update/tasks/reboot_hypervisor_sanity_checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Progress marker in the play output; named like the other tasks of this file.
- name: Announce the hypervisor being checked
  ansible.builtin.debug:
    msg: "Here I'm testing the reboot for {{ current_hypervisor }}."

# Polls (up to 5 tries, 30 s apart) until the nova-compute service on the
# rebooted host reports state 'up'.
# Uses cifmw_update_openstack_cmd, the prefix defined by the set_fact task in
# reboot_compute.yml.
- name: Verify nova-compute service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 5
  delay: 30
  # Read-only query: never report a change.
  changed_when: false

# Polls (up to 5 tries, 30 s apart) until the ovn-controller agent on the
# rebooted host reports Alive == true.
# Uses cifmw_update_openstack_cmd, the prefix defined by the set_fact task in
# reboot_compute.yml.
- name: Verify ovn-controller service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 5
  delay: 30
  # Read-only query: never report a change.
  changed_when: false

# Polls (up to 5 tries, 30 s apart) until the neutron-ovn-metadata-agent on
# the rebooted host reports Alive == true.
# Uses cifmw_update_openstack_cmd, the prefix defined by the set_fact task in
# reboot_compute.yml.
- name: Verify networking-ovn-metadata-agent
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 5
  delay: 30
  # Read-only query: never report a change.
  changed_when: false
50 changes: 50 additions & 0 deletions roles/update/tasks/reboot_hypervisor_using_cr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# `oc` bound to the update namespace; reused by the tasks of this file.
- name: Define necessary command prefixes for kube operations
  ansible.builtin.set_fact:
    cifmw_update_oc_cmd_prefix: >-
      oc -n {{ cifmw_update_namespace }}

# Lists the node sets and reformats each "<kind>/<name>" line as " - <name>"
# so the output can be used as YAML list items. Read-only.
- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
  changed_when: false
  register: cifmw_update_node_sets
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name
    | awk -F'/' '{print " - " $2}'

# Current date as YYYYMMDD, obtained by running `date` through the pipe lookup.
- name: Construct date string for CR name
  ansible.builtin.set_fact:
    cifmw_update_cr_date: >-
      {{ lookup('ansible.builtin.pipe', 'date +%Y%m%d') }}

# Name of the deployment CR: reboot-<short hostname>-<date>.
# The short hostname comes from cifmw_update_hypervisor_short_name, the fact
# set at the top of reboot_hypervisor.yml.
- name: Construct CR name
  ansible.builtin.set_fact:
    cifmw_reboot_dep_name: "reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}"

# Writes the CR that runs the reboot-os service, limited to the current
# hypervisor, against every node set returned by the NodeSets query.
- name: Create OpenStackDataPlaneDeployment CR YAML file
  ansible.builtin.copy:
    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
    mode: "0644"
    # The node sets are emitted one per line with an explicit indentation:
    # pasting the multi-line stdout in one go would indent only its first
    # line. The Jinja block tags stay at the left margin of the content so
    # they add no stray whitespace.
    content: |
      apiVersion: dataplane.openstack.org/v1beta1
      kind: OpenStackDataPlaneDeployment
      metadata:
        name: {{ cifmw_reboot_dep_name }}
        namespace: {{ cifmw_update_namespace }}
      spec:
        nodeSets:
      {% for cifmw_update_node_set in cifmw_update_node_sets.stdout_lines %}
          {{ cifmw_update_node_set | trim }}
      {% endfor %}
        servicesOverride:
          - reboot-os
        ansibleExtraVars:
          edpm_reboot_strategy: force
        ansibleLimit: {{ cifmw_update_hypervisor_short_name }}
# Creates the CR from the file written by the previous task.
- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }} create
    -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml
# Polls the CR (up to 60 tries, 5 s apart) until its listing contains
# 'Setup complete'.
- name: Check OpenStackDataPlaneDeployment status
  register: deployment_status
  until: "'Setup complete' in deployment_status.stdout"
  retries: 60
  delay: 5
  ansible.builtin.command: >-
    {{ cifmw_update_oc_cmd_prefix }}
    get openstackdataplanedeployment {{ cifmw_reboot_dep_name }}
46 changes: 46 additions & 0 deletions roles/update/templates/monitor_servers.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Log the UP/DOWN transitions of the hypervisors while they are rebooted.
#
# Every second each server is probed with a non-interactive SSH connection.
# A line is appended to the log file when a server goes DOWN, and when a
# server that was DOWN comes back UP. A server that is reachable from the
# start stays in the "unknown" state and produces no log line.
# The script never exits on its own; it has to be killed by the caller.

# Short hostnames of the hypervisors to watch, filled in at template time.
servers=(
{% for server in hypervisor_hostnames %}
    "{{ server.split('.')[0] }}"
{% endfor %}
)

# Log file to store the status changes.
log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"

# Last state recorded for each server: "unknown", "up" or "down".
declare -A server_status
for server in "${servers[@]}"; do
    server_status[$server]="unknown"
done

# Probe every server once and log the state transitions.
check_servers() {
    local server
    for server in "${servers[@]}"; do
        # BatchMode makes ssh fail instead of prompting, and ConnectTimeout
        # bounds the wait on a host that is down. The key path is quoted so
        # that a path containing spaces still works.
        if ssh -i "{{ ansible_ssh_private_key_file }}" -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
            # Server is up: only report it when it was seen down before.
            if [ "${server_status[$server]}" == "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
                server_status[$server]="up"
            fi
        else
            # Server is down: report it once per outage.
            if [ "${server_status[$server]}" != "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
                server_status[$server]="down"
            fi
        fi
    done
}

# Main loop: probe all servers, then pause one second before the next round.
while true; do
    check_servers
    sleep 1
done
23 changes: 23 additions & 0 deletions roles/update/templates/monitor_vm_placement.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Log the instance hypervisor. Useful when tracking compute reboot.
#
# Every second the host of the workload instance is queried; a line is
# appended to the log file each time that host differs from the previous one.
# The script never exits on its own; it has to be killed by the caller.

export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
export PATH="{{ cifmw_path }}"

# Same namespace as the one used by the update tasks; falls back to the
# previously hard-coded "openstack" when the variable is not set.
namespace="{{ cifmw_update_namespace | default('openstack') }}"

log_file="{{ cifmw_update_artifacts_basedir }}/instance_placement.log"
source_file="{{ cifmw_update_artifacts_basedir }}/workload_suffix"
instance_prefix="instance_"

# The sourced file provides SUFFIX, which is appended to the prefix below.
. "$source_file"

instance_name="${instance_prefix}${SUFFIX}"
previous_hypervisor=""

while true; do
    current_hypervisor=$(oc rsh -n "$namespace" openstackclient openstack server show "${instance_name}" -f json | jq -r -c '.["OS-EXT-SRV-ATTR:host"]')
    # Only log when the instance is reported on a different host.
    if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
        echo "$(date) $instance_name $current_hypervisor" >> "$log_file"
        previous_hypervisor="$current_hypervisor"
    fi
    sleep 1
done

0 comments on commit a22e794

Please sign in to comment.