Export last replay age in replication collector (#1085)

The exported replication lag does not handle all failure modes, and can
report 0 for replicas that are out of sync and incapable of recovery.

A proper replacement for that metric would require a different approach
(see e.g. #1007), but for a lot of folks, simply exporting the age of
the last replay can provide a pretty strong signal that something is
amiss.

I think this solution might be preferable to #977, though the lag
metric needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <ch@bitfehler.net>
This commit is contained in:
Conrad Hoffmann 2025-02-15 15:15:44 +01:00 committed by GitHub
parent 2ee2a8fa7c
commit c3885e840a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 4 deletions

View File

@ -51,6 +51,15 @@ var (
"Indicates if the server is a replica",
[]string{}, nil,
)
pgReplicationLastReplay = prometheus.NewDesc(
prometheus.BuildFQName(
namespace,
replicationSubsystem,
"last_replay_seconds",
),
"Age of last replay in seconds",
[]string{}, nil,
)
pgReplicationQuery = `SELECT
CASE
@ -61,7 +70,8 @@ var (
CASE
WHEN pg_is_in_recovery() THEN 1
ELSE 0
END as is_replica`
END as is_replica,
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
)
func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
var lag float64
var isReplica int64
err := row.Scan(&lag, &isReplica)
var replayAge float64
err := row.Scan(&lag, &isReplica, &replayAge)
if err != nil {
return err
}
@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
pgReplicationIsReplica,
prometheus.GaugeValue, float64(isReplica),
)
ch <- prometheus.MustNewConstMetric(
pgReplicationLastReplay,
prometheus.GaugeValue, replayAge,
)
return nil
}

View File

@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) {
inst := &instance{db: db}
columns := []string{"lag", "is_replica"}
columns := []string{"lag", "is_replica", "last_replay"}
rows := sqlmock.NewRows(columns).
AddRow(1000, 1)
AddRow(1000, 1, 3)
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows)
ch := make(chan prometheus.Metric)
@ -49,6 +49,7 @@ func TestPgReplicationCollector(t *testing.T) {
expected := []MetricResult{
{labels: labelMap{}, value: 1000, metricType: dto.MetricType_GAUGE},
{labels: labelMap{}, value: 1, metricType: dto.MetricType_GAUGE},
{labels: labelMap{}, value: 3, metricType: dto.MetricType_GAUGE},
}
convey.Convey("Metrics comparison", t, func() {