Skip to content

Commit

Permalink
Make startup of services more-robust in the face of PostgreSQL not be…
Browse files Browse the repository at this point in the history
…ing ready. #1393
  • Loading branch information
mfeit-internet2 committed Jan 24, 2024
1 parent b28f852 commit 271f8b7
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 72 deletions.
45 changes: 19 additions & 26 deletions pscheduler-server/pscheduler-server/daemons/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ DAEMONS=\
ticker \
scheduler \

DAEMON_UTILITIES=\
wait-for-database

DAEMON_INSTALL=$(DAEMONS) $(DAEMON_UTILITIES)

COMMANDS=\
debug \
Expand Down Expand Up @@ -123,20 +127,9 @@ endif
fi
TO_CLEAN += resume



# Systemd 240 and later supports the 'exec' service type, which is
# preferred because it catches more failures sooner. For anything
# earlier, fall back on 'simple'.
SYSTEMD_VERSION := $(shell systemctl --version | awk '$$1 == "systemd" { print $$2 }')
ifeq ($(shell expr '$(SYSTEMD_VERSION)' '>=' 240), 1)
SERVICE_TYPE := exec
else
SERVICE_TYPE := simple
endif


$(UNITS): service-template.raw
$(UNITS): service-template.m4
ifndef CONFIGDIR
@echo No CONFIGDIR specified for build
@false
Expand Down Expand Up @@ -169,17 +162,17 @@ ifndef VAR
@echo No VAR specified for build
@false
endif
sed \
-e 's|__CONFIGDIR__|$(CONFIGDIR)|g' \
-e 's|__DAEMONDIR__|$(DAEMONDIR)|g' \
-e 's|__DSN__|$(DSNFILE)|g' \
-e 's|__PROG__|$(@:service-%=%)|g' \
-e 's|__PSUSER__|$(PSUSER)|g' \
-e 's|__RUNDIR__|$(RUNDIR)|g' \
-e 's|__VAR__|$(VAR)|g' \
-e 's|__LOGDIR__|$(LOGDIR)|g' \
-e 's|__PGSERVICE__|$(PGSERVICE)|g' \
-e 's|__SERVICE_TYPE__|$(SERVICE_TYPE)|g' \
m4 \
-D '__CONFIGDIR__=$(CONFIGDIR)' \
-D '__DAEMONDIR__=$(DAEMONDIR)' \
-D '__DSN__=$(DSNFILE)' \
-D '__PROG__=$(@:service-%=%)' \
-D '__PSUSER__=$(PSUSER)' \
-D '__RUNDIR__=$(RUNDIR)' \
-D '__VAR__=$(VAR)' \
-D '__LOGDIR__=$(LOGDIR)' \
-D '__PGSERVICE__=$(PGSERVICE)' \
-D '__SYSTEMD_VERSION__=$(SYSTEMD_VERSION)' \
< $< > $@
@if egrep -e '__[A-Z_]+__' $@ ; then \
echo "Found un-substituted values in processed file $@" ; \
Expand All @@ -199,7 +192,7 @@ $(CONFIGS): config-template.raw
TO_CLEAN += $(CONFIGS)


build: $(DAEMONS) $(UNITS) $(CONFIGS) $(COMMANDS) $(INTERNALS)
build: $(DAEMON_INSTALL) $(UNITS) $(CONFIGS) $(COMMANDS) $(INTERNALS)
@true


Expand Down Expand Up @@ -235,8 +228,8 @@ endif
cp -f $(COMMANDS) $(COMMANDDIR)
chmod 555 $(COMMANDS:%=$(COMMANDDIR)/%)
mkdir -p $(DAEMONDIR)
cp -f $(DAEMONS) $(DAEMONDIR)
chmod 555 $(DAEMONS:%=$(DAEMONDIR)/%)
cp -f $(DAEMON_INSTALL) $(DAEMONDIR)
chmod 555 $(DAEMON_INSTALL:%=$(DAEMONDIR)/%)
mkdir -p $(INTERNALSDIR)
cp -f $(INTERNALS) $(INTERNALSDIR)
chmod 555 $(INTERNALS:%=$(INTERNALSDIR)/%)
Expand Down
77 changes: 77 additions & 0 deletions pscheduler-server/pscheduler-server/daemons/service-template.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
changequote(<<<,>>>)dnl
changecom()dnl
#
# Systemd unit for __PROG__
#
# Systemd version __SYSTEMD_VERSION__ was installed at build time.
#
#
# Versions of systemd installed by distros we support:
#
# EL7 229
# EL8 239
# EL9 252
# D10 241
# D11 247
# D12 252
# U20 245
# U22 249

[Unit]
Description=pScheduler Server - __PROG__
# This forces starting and stopping in concert
PartOf=__PGSERVICE__
After=__PGSERVICE__
Wants=__PGSERVICE__

[Service]
# Systemd 240 added exec, which is better.
Type=ifelse(eval(__SYSTEMD_VERSION__ < 240),1,simple,exec)

User=__PSUSER__
Group=__PSUSER__

PermissionsStartOnly=true
LimitNOFILE=32768
LimitNPROC=32768

Restart=always
RestartSec=15

# This is slightly longer than the database check below so failures
# will be more apparent than just a timeout.
TimeoutStartSec=130

# Wait for the database to become accessible. This is done because
# the PostgreSQL service can appear up when it isn't ready to take
# queries yet. That will cause this service to die.
ExecStartPre=__DAEMONDIR__/wait-for-database --dsn @__DSN__ --dwell 120 --retry 5

# Create the run directory
ExecStartPre=/bin/mkdir -p __RUNDIR__/__PROG__
ExecStartPre=/bin/chmod 755 __RUNDIR__/__PROG__

# Set up some temporary space and export its location
ExecStartPre=/bin/mkdir -p __RUNDIR__/__PROG__/tmp
ExecStartPre=/bin/chmod 700 __RUNDIR__/__PROG__/tmp
Environment=TMPDIR=__RUNDIR__/__PROG__/tmp

# Set ownership
ExecStartPre=/bin/chown -R __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__

# Generate options file
ExecStartPre=/bin/sh -c "if [ -r __CONFIGDIR__/__PROG__.conf ]; then opts=$(sed -e 's/#.*$//' __CONFIGDIR__/__PROG__.conf); echo OPTIONS=$opts > __RUNDIR__/__PROG__/options; chown __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__/options; fi"

# Redirections
StandardOutput=journal
StandardError=journal

# Start service
EnvironmentFile=-__RUNDIR__/__PROG__/options
ExecStart=__DAEMONDIR__/__PROG__ --dsn @__DSN__ $OPTIONS

# Stop service
ExecStopPost=/bin/rm -rf __RUNDIR__/__PROG__

[Install]
WantedBy=multi-user.target
44 changes: 0 additions & 44 deletions pscheduler-server/pscheduler-server/daemons/service-template.raw

This file was deleted.

86 changes: 86 additions & 0 deletions pscheduler-server/pscheduler-server/daemons/wait-for-database
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
#
# Check that the database is available for connections and working.
#

import datetime
import optparse
import pscheduler
import psycopg2
import sys
import time

pscheduler.set_graceful_exit()

#
# Gargle the arguments
#

opt_parser = optparse.OptionParser()

# Program options

opt_parser.add_option("--dsn",
help="Database connection string, prefix with @ to read from file",
action="store", type="string", dest="dsn",
default="")
opt_parser.add_option("--dwell",
help="How long to keep trying to connect (seconds)",
action="store", type="int", dest="dwell",
default=60)
opt_parser.add_option("--retry",
help="How often to try connecting (seconds)",
action="store", type="int", dest="retry",
default=2)

(options, args) = opt_parser.parse_args()


def try_database(dsn):
    """
    Attempt to connect to the database and run a trivial query.

    Arguments:
      dsn - Connection string, handed unchanged to
            pscheduler.PgConnection.

    Returns a tuple of (succeeded, message).  succeeded is True if
    the query returned exactly the expected row and False otherwise;
    message is 'OK' on success or a description of the failure.
    """
    db = None
    try:
        db = pscheduler.PgConnection(dsn, name="check-database")
        rows = list(db.query('SELECT 12345'))
    except Exception as ex:
        return (False, str(ex))
    finally:
        if db is not None:
            # An exception raised inside a finally clause replaces
            # whatever result was about to be returned and propagates
            # to the caller.  The outcome of the connect/query above
            # is what matters here, so a failure to close is
            # deliberately ignored.
            try:
                db.close()
            except Exception:
                pass

    if (len(rows) != 1) or (rows[0] != (12345,)):
        # One-element tuple so the value is always formatted as a
        # single argument.
        return (False, 'Got unexpected data: %s' % (rows,))

    return (True, 'OK')


attempts = 0
dwell_until = pscheduler.time_now() + datetime.timedelta(seconds=options.dwell)

# Reported as-is if the loop below never runs; otherwise replaced by
# the result of each attempt.
reason = 'Never tried to connect'

# Keep trying until the database answers or the dwell time runs out.
while pscheduler.time_now() < dwell_until:

    status, reason = try_database(options.dsn)

    if status:
        print('Successfully connected to the database.', file=sys.stderr)
        # sys.exit() rather than the bare exit(): the latter is a
        # convenience installed by the site module for interactive
        # use and is not guaranteed to exist in every interpreter
        # configuration.
        sys.exit(0)

    attempts += 1
    # Only the first failure is reported, to avoid repeating the same
    # message on every retry.
    if attempts == 1:
        print('Failed first attempt connecting to the database:', reason, file=sys.stderr)

    time.sleep(options.retry)


# No dice.

print('Unable to connect within the dwell time. Last error:\n',
      reason, file=sys.stderr)

sys.exit(1)
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Architecture: all
Depends: ${misc:Depends}, python3, curl,
python3-daemon, python3-flask, python3-psutil,
python3-tz, python3-jsontemplate, python3-radix, python3-crontab,
python3-pscheduler (>= 4.4.0~), pscheduler-core, pscheduler-account,
python3-pscheduler (>= 5.0.8~), pscheduler-core, pscheduler-account,
apache2, libapache2-mod-wsgi-py3, logrotate, psmisc,
dbconfig-common, postgresql (>= 9.5),
postgresql-12 | postgresql-11 | postgresql-10 | postgresql-9.6 | postgresql-9.5,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Requires: httpd-wsgi-socket
Requires: mod_ssl
Requires: mod_wsgi > 4.0
Requires: %{_pscheduler_python}-parse-crontab
Requires: %{_pscheduler_python}-pscheduler >= 5.0.0
Requires: %{_pscheduler_python}-pscheduler >= 5.0.8

%if 0%{?el7}
Requires: pytz
Expand Down
3 changes: 3 additions & 0 deletions python-pscheduler/pscheduler/pscheduler/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def __init__(self, dsn, autocommit=True, name=None):
self.pending_notifications = {}
self.lock = threading.Lock()

def close(self):
    """Close the underlying connection object held in self.pg."""
    self.pg.close()

#
# Notifications
#
Expand Down

0 comments on commit 271f8b7

Please sign in to comment.