Expand metrics on pg_stat_replication to include lag expressed as time. #115
base: main
Changes from 6 commits
@@ -0,0 +1,99 @@
package collector

import (
    "context"

    pgx "github.com/jackc/pgx/v4"
    "github.com/prometheus/client_golang/prometheus"
)

/* When pg_basebackup is running in stream mode, it opens a second connection
to the server and starts streaming the transaction log in parallel while
running the backup. In both connections (state=backup and state=streaming)
pg_log_location_diff is null and needs to be excluded. */

Review comment: Where is pg_log_location_diff? Even Google has no clue what it is.

const (
    // Scrape query
    statWalReceiver = `
WITH pg_wal_receiver AS (
    SELECT status
         , pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float
           as postgres_wal_receiver_replay_bytes
         , (
           CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
                THEN 0
                ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
           END
           ) as postgres_wal_receiver_replay_lag
    FROM pg_stat_wal_receiver
)
SELECT * FROM pg_wal_receiver WHERE postgres_wal_receiver_replay_lag IS NOT NULL /*postgres_exporter*/`
)

type statWalReceiverScraper struct {
    walReceiverReplayBytes *prometheus.Desc
    walReceiverReplayLag   *prometheus.Desc
}

// NewWalReceiverScraper returns a new Scraper exposing postgres pg_stat_wal_receiver
func NewWalReceiverScraper() Scraper {
    return &statWalReceiverScraper{
        walReceiverReplayBytes: prometheus.NewDesc(
            "postgres_wal_receiver_replay_lag_bytes",
            "delay in standby wal replay bytes: pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float",
            []string{"status"},
            nil,
        ),
        walReceiverReplayLag: prometheus.NewDesc(
            "postgres_wal_receiver_replay_lag_seconds",
            "delay in standby wal replay seconds: EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp())",
            []string{"status"},
            nil,
        ),
    }
}

func (c *statWalReceiverScraper) Name() string {
    return "StatWalReceiverScraper"
}

func (c *statWalReceiverScraper) Scrape(ctx context.Context, conn *pgx.Conn, version Version, ch chan<- prometheus.Metric) error {
    var rows pgx.Rows
    var err error

    rows, err = conn.Query(ctx, statWalReceiver)
    if err != nil {
        return err
    }
    defer rows.Close()

    var status string
    var pgWalReceiverReplayBytes, pgWalReceiverReplayLag float64

    for rows.Next() {
        if err := rows.Scan(&status,
            &pgWalReceiverReplayBytes,
            &pgWalReceiverReplayLag); err != nil {
            return err
        }
        // postgres_wal_receiver_replay_lag_bytes
        ch <- prometheus.MustNewConstMetric(c.walReceiverReplayBytes,
            prometheus.GaugeValue,
            pgWalReceiverReplayBytes,
            status)
        // postgres_wal_receiver_replay_lag_seconds
        ch <- prometheus.MustNewConstMetric(c.walReceiverReplayLag,
            prometheus.GaugeValue,
            pgWalReceiverReplayLag,
            status)
    }

    err = rows.Err()
    if err != nil {
        return err
    }

    return nil
}

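Not part of the diff, but for reviewers who want to poke at this locally: a minimal smoke-test sketch. It assumes it sits in the same collector package as the new file, that a standby is reachable at a placeholder connection string, and that the zero value of Version is acceptable here; adapt all of those to the repo's actual test setup.

package collector

import (
    "context"
    "testing"

    pgx "github.com/jackc/pgx/v4"
    "github.com/prometheus/client_golang/prometheus"
)

// TestWalReceiverScraperSmoke runs the scraper once against a standby and
// logs whatever metrics come back. The connection string and the zero
// Version value are placeholders/assumptions, not part of this PR.
func TestWalReceiverScraperSmoke(t *testing.T) {
    ctx := context.Background()
    conn, err := pgx.Connect(ctx, "postgres://postgres@localhost:5433/postgres") // placeholder standby
    if err != nil {
        t.Skipf("no standby available: %v", err)
    }
    defer conn.Close(ctx)

    // pg_stat_wal_receiver has at most one row, so a small buffer is enough
    // to collect everything without a concurrent reader.
    ch := make(chan prometheus.Metric, 16)
    var v Version // zero value; assumed to be good enough for a smoke test
    if err := NewWalReceiverScraper().Scrape(ctx, conn, v, ch); err != nil {
        t.Fatalf("scrape failed: %v", err)
    }
    close(ch)

    for m := range ch {
        t.Logf("collected metric: %s", m.Desc())
    }
}
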
@ahjmorton I didn't have time to go into details here, but could you share more details about these metrics disappearing? Later you make reporting these metrics conditional on having a value. We should avoid that, as metrics that disappear from Prometheus are complex to deal with:
https://prometheus.io/docs/practices/instrumentation/#avoid-missing-metrics

If the WAL sender process hasn't had any activity for some period of time (not sure how long; might need to look at the source), Postgres nulls out those "*_lag" fields. I was trying to avoid reporting metrics for nodes that don't have a WAL sender process, but those are already eliminated by the query since they wouldn't have a row in pg_stat_replication. Going to change this to default to zero in case of null.

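For concreteness, a sketch of what that change could look like in the scrape query. This is not the final diff and the exact COALESCE placement is an assumption; the idea is to keep the row and report 0 while the lag is unknown, so the series never disappears from Prometheus.

// Hypothetical revision of the scrape query (a sketch, not the final diff):
// COALESCE the lag expressions to 0 and drop the IS NOT NULL filter so the
// series is always exported instead of going missing.
const statWalReceiver = `
WITH pg_wal_receiver AS (
    SELECT status
         , COALESCE(pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())::float, 0)
           as postgres_wal_receiver_replay_bytes
         , COALESCE(
           CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
                THEN 0
                ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
           END, 0) as postgres_wal_receiver_replay_lag
    FROM pg_stat_wal_receiver
)
SELECT * FROM pg_wal_receiver /*postgres_exporter*/`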