Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

watchdog: restart worker if failing #7

Merged
merged 1 commit into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ concourse_group: "{{ concourse_user }}"
# Group id for concourse; defaults to the same value as concourse_uid.
concourse_gid: "{{ concourse_uid }}"
# NOTE(review): presumably forces a service restart when set; its usage is not
# visible in this hunk, confirm against the tasks/handlers that read it.
concourse_force_restart: no
# Used in the `when:` conditions of the "start ... service" tasks in tasks/start.yml.
concourse_service_enabled: yes

# Passed as `enabled:` for the concourse-worker-watchdog service in tasks/start.yml.
concourse_service_watchdog_enabled: yes

# Concourse source variables

Expand Down
11 changes: 9 additions & 2 deletions tasks/install-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,21 @@
dest: "{{ concourse_worker_launcher_path }}"
- src: concourse-retire-worker.j2
dest: "{{ concourse_retire_worker_path }}"
- src: concourse-worker-watchdog.j2
dest: "{{ concourse_install_dir }}/concourse-worker-watchdog"

# Render the systemd unit files for the worker and for its watchdog, as root.
# Each item supplies the template source and the destination unit path; a change
# to either file notifies the "restart concourse worker" handler.
- name: create worker service | concourse
  template:
    src: "{{ item['src'] }}"
    dest: "{{ item['dest'] }}"
    owner: root
    force: yes
  become: yes
  become_user: root
  with_items:
    - src: concourse-worker.service.j2
      dest: /etc/systemd/system/concourse-worker.service
    - src: concourse-worker-watchdog.service.j2
      dest: /etc/systemd/system/concourse-worker-watchdog.service
  notify:
    - restart concourse worker
14 changes: 14 additions & 0 deletions tasks/start.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
become: yes
when: concourse_worker

# On worker hosts, set whether the concourse-worker-watchdog unit is enabled,
# following concourse_service_watchdog_enabled.
- name: configure worker watchdog service | concourse
  when: concourse_worker
  become: yes
  service:
    name: concourse-worker-watchdog
    enabled: "{{ concourse_service_watchdog_enabled }}"

- name: start web service | concourse
service:
name: concourse-web
Expand All @@ -25,3 +32,10 @@
state: started
become: yes
when: concourse_worker and concourse_service_enabled

# Start the watchdog on worker hosts. The watchdog is only started when it is
# also enabled via concourse_service_watchdog_enabled; otherwise a watchdog that
# was switched off would still be launched on every run of this role.
- name: start worker watchdog service | concourse
  service:
    name: concourse-worker-watchdog
    state: started
  become: yes
  when: concourse_worker and concourse_service_enabled and concourse_service_watchdog_enabled
2 changes: 1 addition & 1 deletion templates/concourse-retire-worker.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
# If $1 PID of concourse worker is provided, do a kill instead of an api call
# Mostly used by systemd for concourse compatibility issues https://github.com/concourse/concourse/pull/3929

until ! curl --fail 127.0.0.1:7777/ping; do
until ! curl --silent --fail 127.0.0.1:7777/ping; do

if [[ -z "$1" ]]; then
{{ concourse_binary_path }} retire-worker \
Expand Down
32 changes: 32 additions & 0 deletions templates/concourse-worker-watchdog.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Watchdog for the concourse worker: probes 127.0.0.1:8888 at a fixed interval
# and restarts the concourse-worker systemd unit after repeated failed probes.

# Failed probes tolerated (and logged as retries) before a restart is triggered.
MAX_RETRIES=3
# Seconds to sleep between probes.
PROBE_INTERVAL=15
# Upper bound, in seconds, for a single probe.
PROBE_TIMEOUT=10

# Probe loop; never returns.
#   - successful probe: log "ok" and reset the retry budget
#   - failed probe with budget left: log the remaining budget and decrement it
#   - failed probe with no budget left: restart concourse-worker, reset the budget
watchdog() {
  local retries_left=$MAX_RETRIES

  while true; do
    # --fail makes curl exit non-zero on HTTP error statuses (>= 400), so an
    # error response from the endpoint is treated as a failed probe.
    # --max-time bounds the probe so a hung connection cannot stall the loop.
    if curl --silent --fail --max-time "$PROBE_TIMEOUT" 127.0.0.1:8888; then
      echo "watchdog: concourse-worker healthcheck ok"
      retries_left=$MAX_RETRIES
    elif (( retries_left > 0 )); then
      echo "retry $retries_left"
      (( retries_left -= 1 ))
    else
      echo "restart worker"
      /bin/systemctl restart concourse-worker
      retries_left=$MAX_RETRIES
    fi
    sleep "$PROBE_INTERVAL"
  done
}

watchdog
15 changes: 15 additions & 0 deletions templates/concourse-worker-watchdog.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# {{ ansible_managed }}

# systemd unit running the concourse worker watchdog script as a long-lived service.

[Unit]
Description=concourse-worker-watchdog
Requires=network-online.target
# Requires= does not imply ordering; After= makes the watchdog start only once
# the network is online and the worker unit has been started.
After=network-online.target concourse-worker.service

[Service]
ExecStart={{ concourse_install_dir }}/concourse-worker-watchdog
ExecStop=/bin/kill $MAINPID
# Deliberately no ExecReload=: the watchdog script installs no SIGHUP handler,
# so a HUP would terminate it, and systemd treats termination by SIGHUP as a
# clean exit, which Restart=on-failure does not restart.
Restart=on-failure

[Install]
WantedBy=multi-user.target
1 change: 1 addition & 0 deletions templates/concourse-worker.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Description=concourse-worker
Requires=network-online.target
After=network-online.target
Before=concourse-worker-watchdog.service

[Service]
ExecStart={{ concourse_worker_launcher_path }}
Expand Down