Skip to content

Commit

Permalink
concourse-worker: add watchdog process
Browse files Browse the repository at this point in the history
  • Loading branch information
Steve Durrheimer committed Jun 10, 2020
1 parent fb0f13c commit 663e615
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 3 deletions.
2 changes: 2 additions & 0 deletions defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ concourse_worker: no
concourse_worker_name: "{{ ansible_hostname }}"
concourse_worker_launcher_path: "{{ concourse_install_dir }}/concourse-worker"
concourse_retire_worker_path: "{{ concourse_install_dir }}/concourse-retire-worker"
concourse_worker_watchdog_sec: 10
concourse_worker_watchdog_terminate_on_repeating_start_failure: no
concourse_work_dir: "{{ concourse_install_dir }}/work"
concourse_tsa_public_key_path: "{{ concourse_install_dir }}/host_key.pub"
concourse_tsa_worker_key_path: "{{ concourse_install_dir }}/worker_key"
Expand Down
7 changes: 5 additions & 2 deletions tasks/install-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@

- name: create worker service | concourse
template:
src: concourse-worker.service.j2
dest: /etc/systemd/system/concourse-worker.service
src: "{{ item['src'] }}"
dest: "{{ item['dest'] }}"
owner: root
force: yes
become: yes
become_user: root
with_items:
- src: concourse-worker.service.j2
dest: /etc/systemd/system/concourse-worker.service
notify:
- restart concourse worker
2 changes: 1 addition & 1 deletion templates/concourse-retire-worker.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
# If $1 PID of concourse worker is provided, do a kill instead of an api call
# Mostly used by systemd for concourse compatibility issues https://github.com/concourse/concourse/pull/3929

until ! curl --fail 127.0.0.1:7777/ping; do
until ! curl --silent --fail 127.0.0.1:7777/ping; do

if [[ -z "$1" ]]; then
{{ concourse_binary_path }} retire-worker \
Expand Down
20 changes: 20 additions & 0 deletions templates/concourse-worker.j2
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

# {{ ansible_managed }}

# watchdog WORKER_PID
#
# Polls the local healthcheck endpoint (127.0.0.1:8888) forever and, while it
# answers successfully, sends a WATCHDOG=1 keep-alive to systemd on behalf of
# WORKER_PID. When the check fails no keep-alive is sent; the failure is logged
# and the check is retried after one second.
#
# Arguments:
#   $1 - PID passed to `systemd-notify --pid`.
# Never returns; intended to be started as a background job.
watchdog() {
  local worker_pid=$1

  # Ping at half the configured watchdog timeout. Integer division yields 0
  # for a timeout of 1, which would turn the loop into a busy loop, so the
  # interval is floored at one second.
  local interval=$(({{ concourse_worker_watchdog_sec }} / 2))
  if (( interval < 1 )); then
    interval=1
  fi

  while true; do
    # --fail makes HTTP error statuses count as a failed healthcheck (without
    # it curl exits 0 for any HTTP response); the response body is discarded
    # so it does not end up in the service log on every poll.
    if curl --silent --fail --output /dev/null 127.0.0.1:8888; then
      /bin/systemd-notify --pid="$worker_pid" "WATCHDOG=1"
      sleep "$interval"
    else
      echo "watchdog: concourse-worker healthcheck failed"
      sleep 1
    fi
  done
}

{% if concourse_worker_forward_force_accept|bool %}
iptables -P FORWARD ACCEPT
{% endif %}
Expand All @@ -10,6 +28,8 @@ iptables -P FORWARD ACCEPT
export {{ key }}="{{ value }}"
{% endfor %}

watchdog $$ &

exec {{ concourse_binary_path }} worker \
--tsa-host "{{ concourse_tsa_host }}:{{ concourse_tsa_port }}" \
--tsa-public-key {{ concourse_tsa_public_key_path }} \
Expand Down
13 changes: 13 additions & 0 deletions templates/concourse-worker.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
Description=concourse-worker
Requires=network-online.target
After=network-online.target
Wants=concourse-worker-watchdog.service
Before=concourse-worker-watchdog.service

[Service]
ExecStart={{ concourse_worker_launcher_path }}
Expand All @@ -17,5 +19,16 @@ TasksMax=infinity
Delegate=yes
KillMode=process

## Watchdog
WatchdogSec={{ concourse_worker_watchdog_sec }}
NotifyAccess=main
{% if concourse_worker_watchdog_terminate_on_repeating_start_failure|bool %}
# If there are `StartLimitBurst` failed restart attempts
# within `StartLimitInterval`, force a poweroff
StartLimitInterval=5min
StartLimitBurst=4
StartLimitAction=poweroff-force
{% endif %}

[Install]
WantedBy=multi-user.target

0 comments on commit 663e615

Please sign in to comment.