mirror of
https://github.com/prometheus-community/postgres_exporter
synced 2025-04-11 03:31:26 +00:00
Export last replay age in replication collector (#1085)
The exported replication lag does not handle all failure modes and can report 0 for replicas that are out of sync and incapable of recovery. A proper replacement for that metric would require a different approach (see e.g. #1007), but for many users, simply exporting the age of the last replay provides a fairly strong signal that something is amiss.

I think this solution might be preferable to #977, though the lag metric needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <ch@bitfehler.net>
This commit is contained in:
parent
2ee2a8fa7c
commit
c3885e840a
@ -51,6 +51,15 @@ var (
|
||||
"Indicates if the server is a replica",
|
||||
[]string{}, nil,
|
||||
)
|
||||
pgReplicationLastReplay = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(
|
||||
namespace,
|
||||
replicationSubsystem,
|
||||
"last_replay_seconds",
|
||||
),
|
||||
"Age of last replay in seconds",
|
||||
[]string{}, nil,
|
||||
)
|
||||
|
||||
pgReplicationQuery = `SELECT
|
||||
CASE
|
||||
@ -61,7 +70,8 @@ var (
|
||||
CASE
|
||||
WHEN pg_is_in_recovery() THEN 1
|
||||
ELSE 0
|
||||
END as is_replica`
|
||||
END as is_replica,
|
||||
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
|
||||
)
|
||||
|
||||
func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
|
||||
@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
|
||||
|
||||
var lag float64
|
||||
var isReplica int64
|
||||
err := row.Scan(&lag, &isReplica)
|
||||
var replayAge float64
|
||||
err := row.Scan(&lag, &isReplica, &replayAge)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
|
||||
pgReplicationIsReplica,
|
||||
prometheus.GaugeValue, float64(isReplica),
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
pgReplicationLastReplay,
|
||||
prometheus.GaugeValue, replayAge,
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) {
|
||||
|
||||
inst := &instance{db: db}
|
||||
|
||||
columns := []string{"lag", "is_replica"}
|
||||
columns := []string{"lag", "is_replica", "last_replay"}
|
||||
rows := sqlmock.NewRows(columns).
|
||||
AddRow(1000, 1)
|
||||
AddRow(1000, 1, 3)
|
||||
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows)
|
||||
|
||||
ch := make(chan prometheus.Metric)
|
||||
@ -49,6 +49,7 @@ func TestPgReplicationCollector(t *testing.T) {
|
||||
expected := []MetricResult{
|
||||
{labels: labelMap{}, value: 1000, metricType: dto.MetricType_GAUGE},
|
||||
{labels: labelMap{}, value: 1, metricType: dto.MetricType_GAUGE},
|
||||
{labels: labelMap{}, value: 3, metricType: dto.MetricType_GAUGE},
|
||||
}
|
||||
|
||||
convey.Convey("Metrics comparison", t, func() {
|
||||
|
Loading…
Reference in New Issue
Block a user