Add a systemd watchdog to the concourse worker service.

NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. The
indentation of the context lines in tasks/install-worker.yml and the position
of blank context lines are assumed -- verify against the target tree before
applying.

diff --git a/defaults/main.yml b/defaults/main.yml
index 2caa8d6..7a355df 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -48,6 +48,10 @@ concourse_worker: no
 concourse_worker_name: "{{ ansible_hostname }}"
 concourse_worker_launcher_path: "{{ concourse_install_dir }}/concourse-worker"
 concourse_retire_worker_path: "{{ concourse_install_dir }}/concourse-retire-worker"
+# Watchdog: value (in seconds) of WatchdogSec= in the worker unit; the
+# watchdog script sends a keep-alive roughly every half of this interval.
+concourse_worker_watchdog_sec: 5
+concourse_worker_watchdog_path: "{{ concourse_install_dir }}/concourse-worker-watchdog"
 concourse_work_dir: "{{ concourse_install_dir }}/work"
 concourse_tsa_public_key_path: "{{ concourse_install_dir }}/host_key.pub"
 concourse_tsa_worker_key_path: "{{ concourse_install_dir }}/worker_key"
diff --git a/tasks/install-worker.yml b/tasks/install-worker.yml
index 01e31c8..10bb86d 100644
--- a/tasks/install-worker.yml
+++ b/tasks/install-worker.yml
@@ -24,6 +24,8 @@
       dest: "{{ concourse_worker_launcher_path }}"
     - src: concourse-retire-worker.j2
       dest: "{{ concourse_retire_worker_path }}"
+    - src: concourse-worker-watchdog.j2
+      dest: "{{ concourse_worker_watchdog_path }}"

 - name: create worker service | concourse
   template:
diff --git a/templates/concourse-worker-watchdog.j2 b/templates/concourse-worker-watchdog.j2
new file mode 100644
index 0000000..8dd2e4d
--- /dev/null
+++ b/templates/concourse-worker-watchdog.j2
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# {{ ansible_managed }}
+#
+# Watchdog feeder for the concourse worker unit. Probes 127.0.0.1:8888 and
+# sends WATCHDOG=1 to systemd only while the probe succeeds; on failure no
+# keep-alive is sent, so the unit's WatchdogSec= timer is left to expire.
+
+# Keep-alive interval and probe timeout: half of WatchdogSec, at least 1s.
+# "| int" lets the value arrive as a string without breaking the
+# arithmetic; the 1s floor avoids a "sleep 0" busy loop for WatchdogSec=1.
+INTERVAL={{ [(concourse_worker_watchdog_sec | int) // 2, 1] | max }}
+
+while true; do
+  # --fail: an HTTP status >= 400 counts as a failed probe.
+  # --max-time: a hung endpoint cannot stall the loop indefinitely.
+  # --output /dev/null: keep the response body out of the script's stdout.
+  if curl --silent --fail --max-time "$INTERVAL" --output /dev/null \
+      127.0.0.1:8888; then
+    /bin/systemd-notify "WATCHDOG=1"
+    sleep "$INTERVAL"
+  else
+    echo "> $0: FAILURE"
+    sleep 1
+  fi
+done
diff --git a/templates/concourse-worker.service.j2 b/templates/concourse-worker.service.j2
index 3d049db..2cfc49c 100644
--- a/templates/concourse-worker.service.j2
+++ b/templates/concourse-worker.service.j2
@@ -17,5 +17,20 @@ TasksMax=infinity
 Delegate=yes
 KillMode=process

+## Watchdog
+# The worker counts as hung when no keep-alive arrives within WatchdogSec.
+WatchdogSec={{ concourse_worker_watchdog_sec }}
+# Keep-alives come from the watchdog script, not the main process.
+NotifyAccess=all
+# NOTE(review): the watchdog script loops forever, while systemd waits for
+# ExecStartPost= commands to exit (subject to TimeoutStartSec=) -- confirm
+# the unit tolerates a never-exiting start-post command.
+ExecStartPost={{ concourse_worker_watchdog_path }}
+# If the unit is (re)started more than `StartLimitBurst` times within
+# `StartLimitInterval`, force a poweroff of the host.
+StartLimitInterval=5min
+StartLimitBurst=4
+StartLimitAction=poweroff-force
+
 [Install]
 WantedBy=multi-user.target