# Adds a systemd watchdog to the concourse worker:
#   * defaults/main.yml           - watchdog timeout and poweroff-on-start-limit toggle
#   * tasks/install-worker.yml    - unit template installed through a with_items list
#   * concourse-retire-worker.j2  - ping probe made silent
#   * concourse-worker.j2         - background watchdog() loop sending WATCHDOG=1
#   * concourse-worker.service.j2 - WatchdogSec/NotifyAccess and optional StartLimit* action
#
# NOTE(review): the unit gains Wants=/Before= on concourse-worker-watchdog.service,
# but no hunk in this patch installs that unit - confirm it exists or drop both lines.
diff --git a/defaults/main.yml b/defaults/main.yml
index 2caa8d6..b03e677 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -48,6 +48,8 @@ concourse_worker: no
 concourse_worker_name: "{{ ansible_hostname }}"
 concourse_worker_launcher_path: "{{ concourse_install_dir }}/concourse-worker"
 concourse_retire_worker_path: "{{ concourse_install_dir }}/concourse-retire-worker"
+concourse_worker_watchdog_sec: 10
+concourse_worker_watchdog_terminate_on_repeating_start_failure: no
 concourse_work_dir: "{{ concourse_install_dir }}/work"
 concourse_tsa_public_key_path: "{{ concourse_install_dir }}/host_key.pub"
 concourse_tsa_worker_key_path: "{{ concourse_install_dir }}/worker_key"
diff --git a/tasks/install-worker.yml b/tasks/install-worker.yml
index 01e31c8..e093ba8 100644
--- a/tasks/install-worker.yml
+++ b/tasks/install-worker.yml
@@ -27,11 +27,14 @@
 
 - name: create worker service | concourse
   template:
-    src: concourse-worker.service.j2
-    dest: /etc/systemd/system/concourse-worker.service
+    src: "{{ item['src'] }}"
+    dest: "{{ item['dest'] }}"
     owner: root
     force: yes
   become: yes
   become_user: root
+  with_items:
+    - src: concourse-worker.service.j2
+      dest: /etc/systemd/system/concourse-worker.service
   notify:
     - restart concourse worker
diff --git a/templates/concourse-retire-worker.j2 b/templates/concourse-retire-worker.j2
index 59509d4..e4ede41 100644
--- a/templates/concourse-retire-worker.j2
+++ b/templates/concourse-retire-worker.j2
@@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
 # If $1 PID of concourse worker is provided, do a kill instead of an api call
 # Mostly used by systemd for concourse compatiility issues https://github.com/concourse/concourse/pull/3929
 
-until ! curl --fail 127.0.0.1:7777/ping; do
+until ! curl --silent --fail 127.0.0.1:7777/ping; do
 
   if [[ -z "$1" ]]; then
     {{ concourse_binary_path }} retire-worker \
diff --git a/templates/concourse-worker.j2 b/templates/concourse-worker.j2
index e72a5b3..927f697 100644
--- a/templates/concourse-worker.j2
+++ b/templates/concourse-worker.j2
@@ -2,6 +2,28 @@
 
 # {{ ansible_managed }}
 
+# Send systemd watchdog keep-alives for as long as the worker health check passes.
+#   $1 - PID of the concourse worker on whose behalf WATCHDOG=1 is sent
+watchdog() {
+  local worker_pid=$1
+  # Ping at half the watchdog timeout; floor at 1s so a timeout of 1 does
+  # not become `sleep 0` and a busy loop.
+  local interval=$(( {{ concourse_worker_watchdog_sec }} / 2 ))
+  (( interval > 0 )) || interval=1
+
+  # Stop once the worker PID is gone so the loop does not outlive the worker.
+  while kill -0 "$worker_pid" 2>/dev/null; do
+    # --fail makes HTTP error statuses count as unhealthy; the body is discarded.
+    if curl --silent --fail --max-time "$interval" --output /dev/null 127.0.0.1:8888; then
+      /bin/systemd-notify --pid="$worker_pid" "WATCHDOG=1"
+      sleep "$interval"
+    else
+      echo "watchdog: concourse-worker healthcheck failed"
+      sleep 1
+    fi
+  done
+}
+
 {% if concourse_worker_forward_force_accept|bool %}
 iptables -P FORWARD ACCEPT
 {% endif %}
@@ -10,6 +32,8 @@ iptables -P FORWARD ACCEPT
 export {{ key }}="{{ value }}"
 {% endfor %}
 
+watchdog $$ &
+
 exec {{ concourse_binary_path }} worker \
   --tsa-host "{{ concourse_tsa_host }}:{{ concourse_tsa_port }}" \
   --tsa-public-key {{ concourse_tsa_public_key_path }} \
diff --git a/templates/concourse-worker.service.j2 b/templates/concourse-worker.service.j2
index 3d049db..ed08272 100644
--- a/templates/concourse-worker.service.j2
+++ b/templates/concourse-worker.service.j2
@@ -4,6 +4,8 @@
 Description=concourse-worker
 Requires=network-online.target
 After=network-online.target
+Wants=concourse-worker-watchdog.service
+Before=concourse-worker-watchdog.service
 
 [Service]
 ExecStart={{ concourse_worker_launcher_path }}
@@ -17,5 +19,16 @@ TasksMax=infinity
 Delegate=yes
 KillMode=process
 
+## Watchdog
+WatchdogSec={{ concourse_worker_watchdog_sec }}
+NotifyAccess=main
+{% if concourse_worker_watchdog_terminate_on_repeating_start_failure|bool %}
+# If the unit is started more than `StartLimitBurst` times
+# within `StartLimitInterval`, force a poweroff
+StartLimitInterval=5min
+StartLimitBurst=4
+StartLimitAction=poweroff-force
+{% endif %}
+
 [Install]
 WantedBy=multi-user.target