From 271f8b7ef9cde0c1142619249c322740af8779cd Mon Sep 17 00:00:00 2001 From: Mark Feit Date: Wed, 24 Jan 2024 22:34:27 +0000 Subject: [PATCH] Make startup of services more-robust in the face of PostgreSQL not being ready. #1393 --- .../pscheduler-server/daemons/Makefile | 45 ++++------ .../daemons/service-template.m4 | 77 +++++++++++++++++ .../daemons/service-template.raw | 44 ---------- .../daemons/wait-for-database | 86 +++++++++++++++++++ .../unibuild-packaging/deb/control | 2 +- .../rpm/pscheduler-server.spec | 2 +- python-pscheduler/pscheduler/pscheduler/db.py | 3 + 7 files changed, 187 insertions(+), 72 deletions(-) create mode 100644 pscheduler-server/pscheduler-server/daemons/service-template.m4 delete mode 100644 pscheduler-server/pscheduler-server/daemons/service-template.raw create mode 100755 pscheduler-server/pscheduler-server/daemons/wait-for-database diff --git a/pscheduler-server/pscheduler-server/daemons/Makefile b/pscheduler-server/pscheduler-server/daemons/Makefile index 17a2793c16..e2a8f57abf 100644 --- a/pscheduler-server/pscheduler-server/daemons/Makefile +++ b/pscheduler-server/pscheduler-server/daemons/Makefile @@ -21,6 +21,10 @@ DAEMONS=\ ticker \ scheduler \ +DAEMON_UTILITIES=\ + wait-for-database + +DAEMON_INSTALL=$(DAEMONS) $(DAEMON_UTILITIES) COMMANDS=\ debug \ @@ -123,20 +127,9 @@ endif fi TO_CLEAN += resume - - -# Systemd 240 and later supports the 'exec' service type, which is -# preferred becuase it catches more failures sooner. For anything -# earlier, fall back on 'simple'. 
SYSTEMD_VERSION := $(shell systemctl --version | awk '$$1 == "systemd" { print $$2 }') -ifeq ($(shell expr '$(SYSTEMD_VERSION)' '>=' 240), 1) - SERVICE_TYPE := exec -else - SERVICE_TYPE := simple -endif - -$(UNITS): service-template.raw +$(UNITS): service-template.m4 ifndef CONFIGDIR @echo No CONFIGDIR specified for build @false @@ -169,17 +162,17 @@ ifndef VAR @echo No VAR specified for build @false endif - sed \ - -e 's|__CONFIGDIR__|$(CONFIGDIR)|g' \ - -e 's|__DAEMONDIR__|$(DAEMONDIR)|g' \ - -e 's|__DSN__|$(DSNFILE)|g' \ - -e 's|__PROG__|$(@:service-%=%)|g' \ - -e 's|__PSUSER__|$(PSUSER)|g' \ - -e 's|__RUNDIR__|$(RUNDIR)|g' \ - -e 's|__VAR__|$(VAR)|g' \ - -e 's|__LOGDIR__|$(LOGDIR)|g' \ - -e 's|__PGSERVICE__|$(PGSERVICE)|g' \ - -e 's|__SERVICE_TYPE__|$(SERVICE_TYPE)|g' \ + m4 \ + -D '__CONFIGDIR__=$(CONFIGDIR)' \ + -D '__DAEMONDIR__=$(DAEMONDIR)' \ + -D '__DSN__=$(DSNFILE)' \ + -D '__PROG__=$(@:service-%=%)' \ + -D '__PSUSER__=$(PSUSER)' \ + -D '__RUNDIR__=$(RUNDIR)' \ + -D '__VAR__=$(VAR)' \ + -D '__LOGDIR__=$(LOGDIR)' \ + -D '__PGSERVICE__=$(PGSERVICE)' \ + -D '__SYSTEMD_VERSION__=$(SYSTEMD_VERSION)' \ < $< > $@ @if egrep -e '__[A-Z_]+__' $@ ; then \ echo "Found un-substituted values in processed file $@" ; \ @@ -199,7 +192,7 @@ $(CONFIGS): config-template.raw TO_CLEAN += $(CONFIGS) -build: $(DAEMONS) $(UNITS) $(CONFIGS) $(COMMANDS) $(INTERNALS) +build: $(DAEMON_INSTALL) $(UNITS) $(CONFIGS) $(COMMANDS) $(INTERNALS) @true @@ -235,8 +228,8 @@ endif cp -f $(COMMANDS) $(COMMANDDIR) chmod 555 $(COMMANDS:%=$(COMMANDDIR)/%) mkdir -p $(DAEMONDIR) - cp -f $(DAEMONS) $(DAEMONDIR) - chmod 555 $(DAEMONS:%=$(DAEMONDIR)/%) + cp -f $(DAEMON_INSTALL) $(DAEMONDIR) + chmod 555 $(DAEMON_INSTALL:%=$(DAEMONDIR)/%) mkdir -p $(INTERNALSDIR) cp -f $(INTERNALS) $(INTERNALSDIR) chmod 555 $(INTERNALS:%=$(INTERNALSDIR)/%) diff --git a/pscheduler-server/pscheduler-server/daemons/service-template.m4 b/pscheduler-server/pscheduler-server/daemons/service-template.m4 new file mode 100644 
index 0000000000..d230d39a88 --- /dev/null +++ b/pscheduler-server/pscheduler-server/daemons/service-template.m4 @@ -0,0 +1,77 @@ +changequote(<<<,>>>)dnl +changecom()dnl +# +# Systemd unit for __PROG__ +# +# Systemd version __SYSTEMD_VERSION__ was installed at build time. +# +# +# Versions of systemd installed by distros we support: +# +# EL7 219 +# EL8 239 +# EL9 252 +# D10 241 +# D11 247 +# D12 252 +# U20 245 +# U22 249 + +[Unit] +Description=pScheduler Server - __PROG__ +# This forces starting and stopping in concert +PartOf=__PGSERVICE__ +After=__PGSERVICE__ +Wants=__PGSERVICE__ + +[Service] +# Systemd 240 added exec, which is better. +Type=ifelse(eval(__SYSTEMD_VERSION__ < 240),1,simple,exec) + +User=__PSUSER__ +Group=__PSUSER__ + +PermissionsStartOnly=true +LimitNOFILE=32768 +LimitNPROC=32768 + +Restart=always +RestartSec=15 + +# This is slightly longer than the database check below so failures +# will be more apparent than just a timeout. +TimeoutStartSec=130 + +# Wait for the database to become accessible. This is done because +# the PostgreSQL service can appear up when it isn't ready to take +# queries yet. That will cause this service to die. 
+ExecStartPre=__DAEMONDIR__/wait-for-database --dsn @__DSN__ --dwell 120 --retry 5 + +# Create the run directory +ExecStartPre=/bin/mkdir -p __RUNDIR__/__PROG__ +ExecStartPre=/bin/chmod 755 __RUNDIR__/__PROG__ + +# Set up some temporary space and export its location +ExecStartPre=/bin/mkdir -p __RUNDIR__/__PROG__/tmp +ExecStartPre=/bin/chmod 700 __RUNDIR__/__PROG__/tmp +Environment=TMPDIR=__RUNDIR__/__PROG__/tmp + +# Set ownership +ExecStartPre=/bin/chown -R __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__ + +# Generate options file +ExecStartPre=/bin/sh -c "if [ -r __CONFIGDIR__/__PROG__.conf ]; then opts=$(sed -e 's/#.*$//' __CONFIGDIR__/__PROG__.conf); echo OPTIONS=$opts > __RUNDIR__/__PROG__/options; chown __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__/options; fi" + +# Redirections +StandardOutput=journal +StandardError=journal + +# Start service +EnvironmentFile=-__RUNDIR__/__PROG__/options +ExecStart=__DAEMONDIR__/__PROG__ --dsn @__DSN__ $OPTIONS + +# Stop service +ExecStopPost=/bin/rm -rf __RUNDIR__/__PROG__ + +[Install] +WantedBy=multi-user.target diff --git a/pscheduler-server/pscheduler-server/daemons/service-template.raw b/pscheduler-server/pscheduler-server/daemons/service-template.raw deleted file mode 100644 index 7e82f9943f..0000000000 --- a/pscheduler-server/pscheduler-server/daemons/service-template.raw +++ /dev/null @@ -1,44 +0,0 @@ -[Unit] -Description=pScheduler server - __PROG__ -# This forces starting and stopping in concert -PartOf=__PGSERVICE__ - -[Service] -Type=__SERVICE_TYPE__ -User=__PSUSER__ -Group=__PSUSER__ -PermissionsStartOnly=true -LimitNOFILE=32768 -LimitNPROC=32768 - -Restart=always - -# Create the run directory -ExecStartPre=-/bin/mkdir -p __RUNDIR__/__PROG__ -ExecStartPre=-/bin/chmod 755 __RUNDIR__/__PROG__ - -# Set up some temporary space and export its location -ExecStartPre=-/bin/mkdir -p __RUNDIR__/__PROG__/tmp -ExecStartPre=-/bin/chmod 700 __RUNDIR__/__PROG__/tmp -Environment=TMPDIR=__RUNDIR__/__PROG__/tmp - -# Set ownership 
-ExecStartPre=-/bin/chown -R __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__ - - -# Generate options file -ExecStartPre=-/bin/sh -c "if [ -r __CONFIGDIR__/__PROG__.conf ]; then opts=$(sed -e 's/#.*$//' __CONFIGDIR__/__PROG__.conf); echo OPTIONS=$opts > __RUNDIR__/__PROG__/options; chown __PSUSER__:__PSUSER__ __RUNDIR__/__PROG__/options; fi" - -# Redirections -StandardOutput=journal -StandardError=journal - -# Start service -EnvironmentFile=-__RUNDIR__/__PROG__/options -ExecStart=__DAEMONDIR__/__PROG__ --dsn @__DSN__ $OPTIONS - -# Stop service -ExecStopPost=/bin/rm -rf __RUNDIR__/__PROG__ - -[Install] -WantedBy=multi-user.target diff --git a/pscheduler-server/pscheduler-server/daemons/wait-for-database b/pscheduler-server/pscheduler-server/daemons/wait-for-database new file mode 100755 index 0000000000..c4944e29ba --- /dev/null +++ b/pscheduler-server/pscheduler-server/daemons/wait-for-database @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# +# Check that the database is available for connections and working. +# + +import datetime +import optparse +import pscheduler +import psycopg2 +import sys +import time + +pscheduler.set_graceful_exit() + +# +# Gargle the arguments +# + +opt_parser = optparse.OptionParser() + +# Program options + +opt_parser.add_option("--dsn", + help="Database connection string, prefix with @ to read from file", + action="store", type="string", dest="dsn", + default="") +opt_parser.add_option("--dwell", + help="How long to keep trying to connect (seconds)", + action="store", type="int", dest="dwell", + default=60) +opt_parser.add_option("--retry", + help="How often to try connecting (seconds)", + action="store", type="int", dest="retry", + default=2) + +(options, args) = opt_parser.parse_args() + + +def try_database(dsn): + """ + Attempt to connect to the database and use it, returning a tuple + of True/False if successful/failed and an error message. 
+ """ + db = None + try: + db = pscheduler.PgConnection(dsn, name="check-database") + rows = list(db.query('SELECT 12345')) + except Exception as ex: + return (False, str(ex)) + finally: + if db is not None: + db.close() + + if (len(rows) != 1) or (rows[0] != (12345,)): + return(False, 'Got unexpected data: %s' % (rows)) + + return (True, 'OK') + + +attempts = 0 +dwell_until = pscheduler.time_now() + datetime.timedelta(seconds=options.dwell) + +# This will be overwritten in the loop below. +reason = 'Never tried to connect' + +while pscheduler.time_now() < dwell_until: + + status, reason = try_database(options.dsn) + + if status: + print('Successfully connected to the database.', file=sys.stderr) + exit(0) + + attempts += 1 + if attempts == 1: + print('Failed first attempt connecting to the database:', reason, file=sys.stderr) + + time.sleep(options.retry) + + +# No dice. + +print('Unable to connect within the dwell time. Last error:\n', + reason, file=sys.stderr) + +exit(1) diff --git a/pscheduler-server/pscheduler-server/unibuild-packaging/deb/control b/pscheduler-server/pscheduler-server/unibuild-packaging/deb/control index 8af228bbf7..5dc54e61df 100644 --- a/pscheduler-server/pscheduler-server/unibuild-packaging/deb/control +++ b/pscheduler-server/pscheduler-server/unibuild-packaging/deb/control @@ -13,7 +13,7 @@ Architecture: all Depends: ${misc:Depends}, python3, curl, python3-daemon, python3-flask, python3-psutil, python3-tz, python3-jsontemplate, python3-radix, python3-crontab, - python3-pscheduler (>= 4.4.0~), pscheduler-core, pscheduler-account, + python3-pscheduler (>= 5.0.8~), pscheduler-core, pscheduler-account, apache2, libapache2-mod-wsgi-py3, logrotate, psmisc, dbconfig-common, postgresql (>= 9.5), postgresql-12 | postgresql-11 | postgresql-10 | postgresql-9.6 | postgresql-9.5, diff --git a/pscheduler-server/pscheduler-server/unibuild-packaging/rpm/pscheduler-server.spec 
b/pscheduler-server/pscheduler-server/unibuild-packaging/rpm/pscheduler-server.spec index 8ea8d3f240..ea45bcf3d7 100644 --- a/pscheduler-server/pscheduler-server/unibuild-packaging/rpm/pscheduler-server.spec +++ b/pscheduler-server/pscheduler-server/unibuild-packaging/rpm/pscheduler-server.spec @@ -72,7 +72,7 @@ Requires: httpd-wsgi-socket Requires: mod_ssl Requires: mod_wsgi > 4.0 Requires: %{_pscheduler_python}-parse-crontab -Requires: %{_pscheduler_python}-pscheduler >= 5.0.0 +Requires: %{_pscheduler_python}-pscheduler >= 5.0.8 %if 0%{?el7} Requires: pytz diff --git a/python-pscheduler/pscheduler/pscheduler/db.py b/python-pscheduler/pscheduler/pscheduler/db.py index 4ff1ac2a0e..bd0f65f3df 100644 --- a/python-pscheduler/pscheduler/pscheduler/db.py +++ b/python-pscheduler/pscheduler/pscheduler/db.py @@ -96,6 +96,9 @@ def __init__(self, dsn, autocommit=True, name=None): self.pending_notifications = {} self.lock = threading.Lock() + def close(self): + self.pg.close() + # # Notifications #