Skip to content

Commit

Permalink
Introduce control plane testing.
Browse files Browse the repository at this point in the history
This continuously
1. creates a vm;
2. attaches a volume (optional) and a floating IP (fip) to it
3. ssh to it
4. destroy it
5. restart from 1.

This gives a good level of confidence that the OpenStack API is still
reachable during the update.
  • Loading branch information
sathlan committed Jul 26, 2024
1 parent e16dcda commit 0229713
Show file tree
Hide file tree
Showing 8 changed files with 329 additions and 1 deletion.
2 changes: 1 addition & 1 deletion roles/update/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ Role to run update
* `cifmw_update_create_volume`: (Boolean) Attach a volume to the test OS instance when set to true. Default to `False`
* `cifmw_update_ping_loss_second` : (Integer) Number of seconds that the ping test is allowed to fail. Default to `0`. Note that 1 packet loss is always accepted to avoid false positive.
* `cifmw_update_ping_loss_percent` : (Integer) Maximum percentage of ping loss accepted. Default to `0`. Only relevant when `cifmw_update_ping_loss_second` is not 0.

* `cifmw_update_control_plane_check`: (Boolean) Activate continuous control plane testing. Default to `False`
## Examples
2 changes: 2 additions & 0 deletions roles/update/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,5 @@ cifmw_update_ping_test: false
cifmw_update_create_volume: false
cifmw_update_ping_loss_second: 0
cifmw_update_ping_loss_percent: 0

cifmw_update_control_plane_check: false
198 changes: 198 additions & 0 deletions roles/update/files/continuous-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/bin/bash
set -eu
## ---------------------------------------------------------------------
## NAME:
## continuous-test.sh - run a script in a loop and gather the results.
##
## SYNOPSIS
## continuous-test.sh [OPTION]... SCRIPT
##
## DESCRIPTION
## Run SCRIPT and collect date, time and exit status.
##
## The SCRIPT will be continuously run until we get a SIGUSR1 or
## SIGTERM signal. When the signal is caught, we will wait for the
## last run to end and then write the done file.
##
## The output of each command will be saved into "ct-<pid>/" under
## the output directory (see -o).
##
## A pid file (see -p) will register the pid of the
## running process.
##
## OPTIONS
## -d Enable debug mode.
## -l <PREFIX> Prefix used for:
## - Logfile: Default to ./continuous-test-<PID>.log
## - Done file: Default to ./continuous-test-<PID>.done
##
## The logfile will hold the result of each command run and the
## done file indicates that the last run has finished when we want
## to end the continuous test.
##
## Both those files will have the <PID> added to the prefix so that
## multiple commands can be run in parallel if needed.
##
## The pid can be found in the PIDFILE.
##
## -p <PIDFILE> save the PID to that file.
## Default to ./continuous-test.pid
##
## -o <DIR> Directory where to save all those files. Default to
## the directory where continuous-test.sh is.
##
## FILES
##
## <DIR>/continuous-test.pid will hold the pid of the process (see -p)
## <DIR>/continuous-test-<pid>.log has the result of each run (see -l)
## <DIR>/ct-<pid>/<files> will hold the output of each command.
##
## ENVIRONMENT
## CT_SCRIPT_ARGS A string holding any argument that should
## be passed to SCRIPT.
##
## AUTHOR
## Athlan-Guyot Sofer <[email protected]>
## ---------------------------------------------------------------------
# Name of this script. The later stages re-execute "${CT_DIR}/${FILE}",
# so only the base name is kept. Quoted so that a path containing
# spaces is handled correctly.
FILE=$(basename -- "$0")

# Stage selectors. The script re-executes itself and uses these two
# flags (inherited from the environment) to know which stage to run:
# parent (default on first invocation), child, or a single loop run
# when both are false.
CT_PARENT=${CT_PARENT:-true}
CT_CHILD=${CT_CHILD:-false}

# Switched to true by process_sig to end the child's main loop.
CT_STOP=false

## ---------------------------------------------------------------------
## Function definitions.
# Signal handler: record that a stop was requested so that the main
# loop ends once the current run is over.
# Globals: CT_STOP (written)
# Outputs: a one-line notice on stderr.
process_sig() {
  CT_STOP=true
  printf '%s\n' "$$: received term signal" >&2
}

# Signal handler for the parent stage: relay the stop request to the
# background child as SIGUSR1.
# Globals: CT_PID (read) - pid of the background child, may be unset.
# Outputs: progress notices on stderr.
process_sigterm_parent() {
  echo "$$: Parent received term signal" >&2
  # "${CT_PID:-}" keeps this test valid under `set -u`; a plain
  # "${CT_PID}" would abort the handler before the fallback branch.
  if [ -n "${CT_PID:-}" ]; then
    echo "$$: received term signal: killing $CT_PID" >&2
    kill -s USR1 "${CT_PID}"
  else
    # Should not happen.
    echo "$$: received term signal: killing group" >&2
    kill -s USR1 0
  fi
}

# Parent stage: daemonize the process. Parse the options, export the
# resulting environment, then re-execute this script as a detached
# child (new session via setsid, stdin from /dev/null).
#
# Options: -l PREFIX, -p PIDFILE, -o DIR, -d (debug).
# In debug mode the parent stays attached to the terminal, waits for
# the child and relays SIGTERM/SIGINT to it; otherwise it returns
# immediately.
if "${CT_PARENT}"; then
  export DEBUG=false
  while getopts :p:l:o:d OPT; do
    case "${OPT}" in
      l)
        CT_PREFIX="${OPTARG}"
        ;;
      p)
        CT_PIDFILE="${OPTARG}"
        ;;
      o)
        CT_DIR="${OPTARG}"
        ;;
      d)
        DEBUG=true
        ;;
      *)
        echo "usage: ${0##*/} [-l PREFIX] [-p PIDFILE] [-o DIR] [-d] SCRIPT" >&2
        exit 2
        ;;
    esac
  done
  shift $(( OPTIND - 1 ))
  OPTIND=1

  # Validate SCRIPT before stderr is redirected below, otherwise the
  # error would be sent to /dev/null in non-debug mode.
  if [ -z "$*" ]; then
    echo "${0##*/}: SCRIPT cannot be empty." >&2
    exit 1
  fi

  # ${CT_DIR:-} so that omitting -o does not trip `set -u`; the
  # default is the directory holding this script.
  if [ -z "${CT_DIR:-}" ]; then
    CT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  fi
  export CT_DIR

  # Where the stages write their diagnostics: the terminal in debug
  # mode, /dev/null otherwise (or when there is no terminal).
  CT_TTY=/dev/null
  if $DEBUG; then
    CT_TTY=$(tty) || CT_TTY=/dev/null
  fi
  export CT_TTY
  exec 2>"${CT_TTY}"

  echo "entering parent $$ $FILE" >&2
  export CT_SCRIPT_ARGS=${CT_SCRIPT_ARGS:-""}
  export CT_SCRIPT="$*"
  export CT_PREFIX="${CT_PREFIX:-}"
  export CT_PIDFILE="${CT_PIDFILE:-}"
  # Select the child stage for the re-executed script.
  export CT_CHILD=true
  export CT_PARENT=false
  # The script is re-executed from CT_DIR, so it must be present there.
  setsid "${CT_DIR}/${FILE}" "$@" </dev/null >"${CT_TTY}" 2>"${CT_TTY}" &
  CT_PID=$!
  if $DEBUG; then
    trap process_sigterm_parent SIGTERM SIGINT
    wait "${CT_PID}"
    echo "leaving parent $$ after waiting for $CT_PID/$FILE" >&2
  else
    echo "leaving parent $$ $FILE" >&2
  fi
  sync
  exit 0
fi

# Child stage: the detached process that owns the test loop.
# It records its pid in the pid file, then re-executes this script
# (loop stage) again and again until process_sig sets CT_STOP. Once
# the last run is over it writes the current date into the done file.
if "${CT_CHILD}"; then
  # ${VAR:-} guards keep the stage usable under `set -u` even when a
  # variable was not exported by the parent stage.
  if [ -n "${CT_TTY:-}" ]; then
    exec 2> "${CT_TTY}"
    exec 1> "${CT_TTY}"
  else
    CT_TTY=/dev/null
  fi
  echo "entering child $$ running $FILE" >&2
  if [ -z "${CT_PREFIX:-}" ]; then
    CT_LOGFILE="${CT_DIR}/continuous-test-$$.log"
  else
    CT_LOGFILE="${CT_DIR}/${CT_PREFIX}-$$.log"
  fi
  if [ -z "${CT_PIDFILE:-}" ]; then
    CT_PIDFILE="${CT_DIR}/continuous-test.pid"
  fi
  export CT_LOGFILE
  export CT_PIDFILE
  # Directory receiving the output of every run of the script.
  export CT_CMD_OUT_DIR="${CT_DIR}/ct-$$"
  trap process_sig SIGTERM SIGUSR1
  # Both flags false: the re-executed script runs the loop stage.
  export CT_CHILD=false
  export CT_PARENT=false
  echo $$ > "${CT_PIDFILE}"
  # Main loop where eventually run the script.
  while ! $CT_STOP; do
    setsid "${CT_DIR}/${FILE}" "$@" </dev/null 2>"${CT_TTY}"
  done
  echo "Leaving child $$ running $FILE" >&2
  if [ -z "${CT_PREFIX:-}" ]; then
    CT_ENDFILE="${CT_DIR}/continuous-test-$$.done"
  else
    CT_ENDFILE="${CT_DIR}/${CT_PREFIX}-$$.done"
  fi
  date > "${CT_ENDFILE}"
  sync
  exit 0
fi

# Loop stage: run CT_SCRIPT once and append one result line to
# CT_LOGFILE, formatted as:
#   <start date> (<start epoch>) <duration>s SUCCESS (0)
#   <start date> (<start epoch>) <duration>s FAILED (<exit status>)
# The output of the script goes to ${CT_CMD_OUT_DIR}/<start epoch>.log.
exec >>"${CT_LOGFILE}"
mkdir -p "${CT_CMD_OUT_DIR}"
echo "entering loop $$ $CT_SCRIPT" >&2
# We cannot have two jobs in the same second, or else we would
# overwrite the output file. sleep 1 prevents this.
sleep 1
start_time="$(date +%s)"
start_time_h="$(date -d "@${start_time}")"
echo -n "${start_time_h} (${start_time}) "
# The script is allowed to fail: its exit status is what we record.
set +e
# CT_SCRIPT_ARGS is deliberately unquoted so that it is split into
# separate arguments.
"${CT_SCRIPT}" ${CT_SCRIPT_ARGS} &>> "${CT_CMD_OUT_DIR}/${start_time}.log"
RC="${?}"
set -e
end_time="$(date +%s)"
duration=$((end_time - start_time))
echo -n "${duration}s "

if [ "${RC}" -eq 0 ]; then
  echo "SUCCESS (0)"
else
  echo "FAILED (${RC})"
fi
echo "leaving loop $$" >&2
27 changes: 27 additions & 0 deletions roles/update/tasks/create_test_files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,30 @@
src: "l3_agent_stop_ping.sh.j2"
dest: "{{ cifmw_update_ping_stop_script }}"
mode: "0775"

# Install the scripts used by the continuous control plane test into
# cifmw_update_artifacts_basedir. Skipped unless
# cifmw_update_control_plane_check is true.
- name: Control plane testing related files
  when: cifmw_update_control_plane_check | bool
  block:
    - name: Create control plane wrapper
      ansible.builtin.copy:
        src: "continuous-test.sh"
        dest: "{{ cifmw_update_artifacts_basedir }}/continuous-test.sh"
        mode: "0775"

    - name: Create control plane start script
      ansible.builtin.template:
        src: "control_plane_test_start.sh.j2"
        dest: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_start.sh"
        mode: "0775"

    - name: Create control plane stop script
      ansible.builtin.template:
        src: "control_plane_test_stop.sh.j2"
        dest: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh"
        mode: "0775"

    - name: Create control plane workload launch wrapper
      ansible.builtin.template:
        src: "workload_launch_k8s.sh.j2"
        dest: "{{ cifmw_update_artifacts_basedir }}/workload_launch_k8s.sh"
        mode: "0775"
15 changes: 15 additions & 0 deletions roles/update/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
- name: Start ping test
ansible.builtin.include_tasks: l3_agent_connectivity_check_start.yml

# Start the background control plane test. The script is a plain
# executable with no shell features, so the command module is enough.
- name: Trigger the continuous control plane test
  when:
    - cifmw_update_control_plane_check | bool
    - not cifmw_update_run_dryrun | bool
  ansible.builtin.command:
    cmd: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_start.sh"

- name: Set openstack_update_run Makefile environment variables
tags:
- always
Expand Down Expand Up @@ -60,3 +68,10 @@
when:
- cifmw_update_ping_test | bool
- not cifmw_update_run_dryrun | bool

# Stop the background control plane test and check its results; the
# script exits non-zero when too many failures were recorded.
- name: Stop the continuous control plane test
  when:
    - cifmw_update_control_plane_check | bool
    - not cifmw_update_run_dryrun | bool
  ansible.builtin.command:
    cmd: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh"
12 changes: 12 additions & 0 deletions roles/update/templates/control_plane_test_start.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Script to test control plane by creating a vm in a loop during the
# update. Start sequence.
#
# Arguments:
#   $1 - directory holding continuous-test.sh and
#        workload_launch_k8s.sh; also used as output directory.
#        Defaults to the artifacts base directory set at render time.
set -eu
BASE_DIR="${1:-{{ cifmw_update_artifacts_basedir }}}"

continuous_test_wrapper="${BASE_DIR}/continuous-test.sh"

if [ -e "${continuous_test_wrapper}" ]; then
  "${continuous_test_wrapper}" \
    -o "${BASE_DIR}" \
    -l control-plane-test \
    -p "${BASE_DIR}/control-plane-test.pid" \
    "${BASE_DIR}/workload_launch_k8s.sh"
else
  # The exit status stays 0 as before, but the reason why nothing was
  # started is now reported instead of being silently ignored.
  echo "Not starting control plane test: ${continuous_test_wrapper} not found." >&2
fi
70 changes: 70 additions & 0 deletions roles/update/templates/control_plane_test_stop.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash
#
# Script to stop a previously started control plane testing.
# Get the pid, kill it and wait for the end of the last run.
#
# Arguments (all optional):
#   $1 - maximum number of consecutive failures tolerated (default 2)
#   $2 - maximum total number of failures tolerated (default 3)
#   $3 - directory holding the pid, log and done files
#   $4 - number of 5 second waits for the last run to finish
#        (default 60)
#
# Exit status: 0 when the failures stay within the limits, 1 otherwise
# or when the test could not be stopped.
set -eu

MAX_CONS_FAIL=${1:-2}
MAX_FAIL=${2:-3}
BASE_DIR="${3:-{{ cifmw_update_artifacts_basedir }}}"
STOP_MAX_TRIES=${4:-60}     # 5 seconds x MAX_TRIES = 5 min by default

pid_file="${BASE_DIR}/control-plane-test.pid"

if [ ! -e "${pid_file}" ]; then
  echo "Not pid file: ${pid_file}"
  exit 1
fi

PID=$(cat "${pid_file}")

done_file="${BASE_DIR}/control-plane-test-${PID}.done"
result_file="${BASE_DIR}/control-plane-test-${PID}.log"

kill "${PID}"

# Wait for the done file, at most STOP_MAX_TRIES times 5 seconds.
current_try=0
until [ -e "${done_file}" ]; do
  if [ "${current_try}" -ge "${STOP_MAX_TRIES}" ]; then
    echo "Waited to long for ${PID} to finish. Aborting."
    exit 1
  fi
  sleep 5
  current_try=$((current_try + 1))
done

if [ ! -e "${result_file}" ]; then
  echo "No result file: ${result_file}"
  exit 1
fi

# Verify that we didn't get any workload issue.
FAILURE=""
# Number of successive failure: length of the longest run of FAILED
# lines. The status is the next to last field of each result line and
# any SUCCESS line resets the current run.
successive_failure=$(
  awk '
    NF > 1 && $(NF-1) == "FAILED"  { fail++; if (fail > max) { max = fail } }
    NF > 1 && $(NF-1) == "SUCCESS" { fail = 0 }
    END { print max + 0 }
  ' "${result_file}"
)
if [ "${successive_failure}" -gt "${MAX_CONS_FAIL}" ]; then
  echo "Max number of consecutive control plane failure (${MAX_CONS_FAIL}) reached."
  echo "Found ${successive_failure} consecutive failures during update."
  grep FAILED "${result_file}"
  FAILURE="true"
fi
# Total number of failure
failures=$(grep -Fc FAILED "${result_file}" ||:) # prevents exit 1 when no match
if [ "${failures}" -gt "${MAX_FAIL}" ]; then
  echo "Max number of control plan failure (${MAX_FAIL}) reached."
  echo "Found ${failures} failures during update."
  grep FAILED "${result_file}"
  FAILURE="true"
fi

if [ -n "${FAILURE}" ]; then
  echo "Concaneted files in ${BASE_DIR}/control-plane-testing-detailed.log"
  # The per-run outputs live under BASE_DIR, not the current directory.
  tail -n +1 "${BASE_DIR}/ct-${PID}"/*.log > "${BASE_DIR}/control-plane-testing-detailed.log"
  exit 1
else
  echo "$(date) No (or not enough) failure(s) during control plane testing"
  echo "Successive failure: ${successive_failure}/${MAX_CONS_FAIL}"
  echo "Total number of failures: ${failures}/${MAX_FAIL}"
fi
4 changes: 4 additions & 0 deletions roles/update/templates/workload_launch_k8s.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/bash
#
# Feed workload_launch.sh to a bash started through
# `oc rsh -n openstack openstackclient`, with WKL_MODE=sanityfast.
#
# The script is given by redirection rather than `cat ... |`: if the
# file is missing the command now fails, instead of running an empty
# script whose exit status would hide the problem.
oc rsh -n openstack openstackclient env WKL_MODE=sanityfast bash \
  < "{{ cifmw_update_artifacts_basedir }}/workload_launch.sh"

0 comments on commit 0229713

Please sign in to comment.