diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76a09d1..2224389 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## Version 0.10.0 / 2020-02-23
+## Version 0.10.0 / 2022-02-23
 
 [full changelog](https://github.com/rnaveiras/postgres_exporter/compare/v0.9.0...v0.10.0)
 
diff --git a/README.md b/README.md
index 2c46780..a307aaa 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,9 @@ Prometheus exporter for PostgreSQL server metrics.
 | postgres_stat_database_xact_commit_total | Number of transactions in this database that have been committed | datname |
 | postgres_stat_database_xact_rollback_total | Number of transactions in this database that have been rolled back | datname |
 | postgres_stat_replication_lag_bytes | Replication Lag in bytes | application_name, client_addr, state, sync_state |
+| postgres_stat_replication_flush_lag_seconds | Time elapsed between flushing recent WAL locally and receiving notification that the standby has written and flushed it (but not yet applied it). Reported from the primary node. *Only available on Postgres 10 and later.* | application_name, client_addr, state, sync_state |
+| postgres_stat_replication_replay_lag_seconds | Time elapsed between flushing recent WAL locally and receiving notification that the standby has written, flushed and applied it. Reported from the primary node. *Only available on Postgres 10 and later.* | application_name, client_addr, state, sync_state |
+| postgres_stat_replication_write_lag_seconds | Time elapsed between flushing recent WAL locally and receiving notification that the standby has written it (but not yet flushed or applied it). Reported from the primary node. *Only available on Postgres 10 and later.* | application_name, client_addr, state, sync_state |
 | postgres_stat_vacuum_progress_heap_blks_scanned | Number of heap blocks scanned | pid, query_start, schemaname, datname, relname |
 | postgres_stat_vacuum_progress_heap_blks_total | Total number of heap blocks in the table | pid, query_start, schemaname, datname, relname |
 | postgres_stat_vacuum_progress_heap_blks_vacuumed | Number of heap blocks vacuumed | pid, query_start, schemaname, datname, relname |
@@ -75,8 +78,11 @@ Prometheus exporter for PostgreSQL server metrics.
 | postgres_stat_user_indexes_scan_total | Number of times this index has been scanned | datname, schemaname, tablename, indexname |
 | postgres_stat_user_indexes_tuple_read_total | Number of times tuples have been returned from scanning this index | datname, schemaname, tablename, indexname |
 | postgres_stat_user_indexes_tuple_fetch_total | Number of live tuples fetched by scans on this index | datname, schemaname, tablename, indexname |
+| postgres_wal_receiver_replay_lag_seconds | Replication lag in seconds on the standby, measured as `EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())` | status |
+| postgres_wal_receiver_replay_lag_bytes | Replication lag in bytes on the standby, measured as `pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float` | status |
 | postgres_up | Whether the Postgres server is up | |
+
 ### Run
 
 #### Passing in a libpq connection string
diff --git a/collector/exporter.go b/collector/exporter.go
index f30c271..4db1b87 100644
--- a/collector/exporter.go
+++ b/collector/exporter.go
@@ -101,6 +101,7 @@ func NewExporter(ctx context.Context, logger kitlog.Logger, connConfig *pgx.Conn
 			NewStatBgwriterScraper(),
 			NewStatDatabaseScraper(),
 			NewStatReplicationScraper(),
+			NewWalReceiverScraper(),
 		},
 		datnameScrapers: []Scraper{
 			NewStatVacuumProgressScraper(),
diff --git a/collector/stat_replication.go b/collector/stat_replication.go
index a2f8c9b..56940b5 100644
--- a/collector/stat_replication.go
+++ b/collector/stat_replication.go
@@ -14,22 +14,7 @@ running the backup. In both connections (state=backup and state=streaming) the
 pg_log_location_diff is null and it requires to be excluded */
const (
 	// Scrape query
-	statReplicationLagBytes9x = `
-WITH pg_replication AS (
-  SELECT application_name
-       , client_addr
-       , state
-       , sync_state
-       , ( CASE when pg_is_in_recovery()
-           THEN pg_xlog_location_diff(pg_last_xlog_receive_location(), replay_location)::float
-           ELSE pg_xlog_location_diff(pg_current_xlog_location(), replay_location)::float
-           END
-         ) AS pg_xlog_location_diff
-    FROM pg_stat_replication
-)
-SELECT * FROM pg_replication WHERE pg_xlog_location_diff IS NOT NULL /*postgres_exporter*/`
-
-	statReplicationLagBytes = `
+	statReplicationLag = `
 WITH pg_replication AS (
   SELECT application_name
        , client_addr
@@ -40,13 +25,19 @@ WITH pg_replication AS (
        , state
        , sync_state
        , ( CASE when pg_is_in_recovery()
           THEN pg_wal_lsn_diff(pg_last_wal_receive_lsn(), replay_lsn)::float
           ELSE pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)::float
           END
         ) AS pg_xlog_location_diff
+       , COALESCE(EXTRACT (EPOCH FROM write_lag), 0) as write_lag_seconds
+       , COALESCE(EXTRACT (EPOCH FROM flush_lag), 0) as flush_lag_seconds
+       , COALESCE(EXTRACT (EPOCH FROM replay_lag), 0) as replay_lag_seconds
     FROM pg_stat_replication
 )
 SELECT * FROM pg_replication WHERE pg_xlog_location_diff IS NOT NULL /*postgres_exporter*/`
 )
 
 type statReplicationScraper struct {
-	lagBytes *prometheus.Desc
+	lagBytes  *prometheus.Desc
+	writeLag  *prometheus.Desc
+	flushLag  *prometheus.Desc
+	replayLag *prometheus.Desc
 }
 
 // NewStatReplicationScraper returns a new Scraper exposing postgres pg_stat_replication
@@ -58,6 +49,24 @@ func NewStatReplicationScraper() Scraper {
 			[]string{"application_name", "client_addr", "state", "sync_state"},
 			nil,
 		),
+		writeLag: prometheus.NewDesc(
+			"postgres_stat_replication_write_lag_seconds",
+			"write_lag as reported by the pg_stat_replication view, converted to seconds",
+			[]string{"application_name", "client_addr", "state", "sync_state"},
+			nil,
+		),
+		flushLag: prometheus.NewDesc(
+			"postgres_stat_replication_flush_lag_seconds",
+			"flush_lag as reported by the pg_stat_replication view, converted to seconds",
+			[]string{"application_name", "client_addr", "state", "sync_state"},
+			nil,
+		),
+		replayLag: prometheus.NewDesc(
+			"postgres_stat_replication_replay_lag_seconds",
+			"replay_lag as reported by the pg_stat_replication view, converted to seconds",
+			[]string{"application_name", "client_addr", "state", "sync_state"},
+			nil,
+		),
 	}
 }
 
@@ -69,11 +78,7 @@ func (c *statReplicationScraper) Scrape(ctx context.Context, conn *pgx.Conn, ver
 	var rows pgx.Rows
 	var err error
 
-	if version.Gte(10) {
-		rows, err = conn.Query(ctx, statReplicationLagBytes)
-	} else {
-		rows, err = conn.Query(ctx, statReplicationLagBytes9x)
-	}
+	rows, err = conn.Query(ctx, statReplicationLag)
 
 	if err != nil {
 		return err
@@ -82,18 +87,21 @@
 	var applicationName, state, syncState string
 	var clientAddr net.IP
-	var pgXlogLocationDiff float64
+	var pgXlogLocationDiff, writeLagSeconds, flushLagSeconds, replayLagSeconds float64
 
 	for rows.Next() {
+
 		if err := rows.Scan(&applicationName,
 			&clientAddr,
 			&state,
 			&syncState,
-			&pgXlogLocationDiff); err != nil {
+			&pgXlogLocationDiff,
+			&writeLagSeconds,
+			&flushLagSeconds,
+			&replayLagSeconds); err != nil {
 			return err
 		}
-
 		// postgres_stat_replication_lag_bytes
 		ch <- prometheus.MustNewConstMetric(c.lagBytes,
 			prometheus.GaugeValue,
			pgXlogLocationDiff,
			applicationName,
@@ -102,8 +110,34 @@ func (c *statReplicationScraper) Scrape(ctx context.Context, conn *pgx.Conn, ver
 			clientAddr.String(),
 			state,
 			syncState)
-	}
 
+		// postgres_stat_replication_write_lag_seconds
+		ch <- prometheus.MustNewConstMetric(c.writeLag,
+			prometheus.GaugeValue,
+			writeLagSeconds,
+			applicationName,
+			clientAddr.String(),
+			state,
+			syncState)
+
+		// postgres_stat_replication_flush_lag_seconds
+		ch <- prometheus.MustNewConstMetric(c.flushLag,
+			prometheus.GaugeValue,
+			flushLagSeconds,
+			applicationName,
+			clientAddr.String(),
+			state,
+			syncState)
+
+		// postgres_stat_replication_replay_lag_seconds
+		ch <- prometheus.MustNewConstMetric(c.replayLag,
+			prometheus.GaugeValue,
+			replayLagSeconds,
+			applicationName,
+			clientAddr.String(),
+			state,
+			syncState)
+	}
 	err = rows.Err()
 	if err != nil {
 		return err
diff --git a/collector/stat_wal_receiver.go b/collector/stat_wal_receiver.go
new file mode 100644
index 0000000..a8ebd44
--- /dev/null
+++ b/collector/stat_wal_receiver.go
@@ -0,0 +1,99 @@
+package collector
+
+import (
+	"context"
+
+	pgx "github.com/jackc/pgx/v4"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+/* On a standby, pg_last_xact_replay_timestamp() returns NULL until at least
+one transaction has been replayed, which would make the computed replay lag
+NULL, so those rows are excluded. When the last received and last replayed
+LSNs are equal the standby is fully caught up, and the lag is reported as 0
+rather than as the time elapsed since the last replayed transaction. */
+const (
+	// Scrape query
+	statWalReceiver = `
+WITH pg_wal_receiver AS (
+  SELECT status
+       , pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float
+         as postgres_wal_receiver_replay_bytes
+       , ( CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
+           THEN 0
+           ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
+           END
+         ) as postgres_wal_receiver_replay_lag
+    FROM pg_stat_wal_receiver
+)
+SELECT * FROM pg_wal_receiver WHERE postgres_wal_receiver_replay_lag IS NOT NULL /*postgres_exporter*/`
+)
+
+type statWalReceiverScraper struct {
+	walReceiverReplayBytes *prometheus.Desc
+	walReceiverReplayLag   *prometheus.Desc
+}
+
+// NewWalReceiverScraper returns a new Scraper exposing postgres pg_stat_wal_receiver
+func NewWalReceiverScraper() Scraper {
+	return &statWalReceiverScraper{
+		walReceiverReplayBytes: prometheus.NewDesc(
+			"postgres_wal_receiver_replay_lag_bytes",
+			"Delay in standby WAL replay in bytes: pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float",
+			[]string{"status"},
+			nil,
+		),
+		walReceiverReplayLag: prometheus.NewDesc(
+			"postgres_wal_receiver_replay_lag_seconds",
+			"Delay in standby WAL replay in seconds: EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())",
+			[]string{"status"},
+			nil,
+		),
+	}
+}
+
+func (c *statWalReceiverScraper) Name() string {
+	return "StatWalReceiverScraper"
+}
+
+func (c *statWalReceiverScraper) Scrape(ctx context.Context, conn *pgx.Conn, version Version, ch chan<- prometheus.Metric) error {
+	var rows pgx.Rows
+	var err error
+
+	rows, err = conn.Query(ctx, statWalReceiver)
+
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	var status string
+	var pgWalReceiverReplayBytes, pgWalReceiverReplayLag float64
+
+	for rows.Next() {
+
+		if err := rows.Scan(&status,
+			&pgWalReceiverReplayBytes,
+			&pgWalReceiverReplayLag); err != nil {
+			return err
+		}
+
+		// postgres_wal_receiver_replay_lag_bytes
+		ch <- prometheus.MustNewConstMetric(c.walReceiverReplayBytes,
+			prometheus.GaugeValue,
+			pgWalReceiverReplayBytes,
+			status)
+
+		// postgres_wal_receiver_replay_lag_seconds
+		ch <- prometheus.MustNewConstMetric(c.walReceiverReplayLag,
+			prometheus.GaugeValue,
+			pgWalReceiverReplayLag,
+			status)
+	}
+
+	err = rows.Err()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
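
Reviewer note: below is a minimal standalone sketch (not part of the change) for exercising the new wal receiver query by hand against a standby. It scans the same three columns the scraper does, so a type mismatch or NULL-handling problem in the query surfaces immediately. The `PG_DSN` environment variable is an assumption made for illustration; any libpq connection string accepted by `pgx.Connect` works.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	pgx "github.com/jackc/pgx/v4"
)

// Same query as statWalReceiver in collector/stat_wal_receiver.go.
const statWalReceiver = `
WITH pg_wal_receiver AS (
  SELECT status
       , pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float
         as postgres_wal_receiver_replay_bytes
       , ( CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
           THEN 0
           ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
           END
         ) as postgres_wal_receiver_replay_lag
    FROM pg_stat_wal_receiver
)
SELECT * FROM pg_wal_receiver WHERE postgres_wal_receiver_replay_lag IS NOT NULL`

func main() {
	ctx := context.Background()

	// PG_DSN is a hypothetical variable name; any libpq connection string works.
	conn, err := pgx.Connect(ctx, os.Getenv("PG_DSN"))
	if err != nil {
		log.Fatalf("connect: %v", err)
	}
	defer conn.Close(ctx)

	rows, err := conn.Query(ctx, statWalReceiver)
	if err != nil {
		log.Fatalf("query: %v", err)
	}
	defer rows.Close()

	// One row per wal receiver; zero rows on a primary or before replay starts.
	for rows.Next() {
		var status string
		var replayBytes, replayLagSeconds float64
		if err := rows.Scan(&status, &replayBytes, &replayLagSeconds); err != nil {
			log.Fatalf("scan: %v", err)
		}
		fmt.Printf("status=%s replay_lag_bytes=%.0f replay_lag_seconds=%.3f\n",
			status, replayBytes, replayLagSeconds)
	}
	if err := rows.Err(); err != nil {
		log.Fatalf("rows: %v", err)
	}
}
```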