Gitlab Collector: Wal Receiver Collector and Test (#844)

* Wal Receiver Collector and Test

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Add more escapes

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Corrections to wal_receiver

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Continue on null labels

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Skip nulls and log a message

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Redundant breaks

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Fix up walreceiver

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Remove extra label

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Update collector/pg_stat_walreceiver.go

Co-authored-by: Ben Kochie <superq@gmail.com>
Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Clean up the extra assignments

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

* Update collector/pg_stat_walreceiver.go

Co-authored-by: Joe Adams <github@joeadams.io>
Signed-off-by: Felix Yuan <felix.yuan@reddit.com>

---------

Signed-off-by: Felix Yuan <felix.yuan@reddit.com>
Co-authored-by: Ben Kochie <superq@gmail.com>
Co-authored-by: Joe Adams <github@joeadams.io>
This commit is contained in:
Felix Yuan 2023-07-21 11:42:08 -07:00 committed by GitHub
parent dc3e813f43
commit 2d7e152751
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 456 additions and 0 deletions

View File

@ -49,6 +49,7 @@ func readMetric(m prometheus.Metric) MetricResult {
func sanitizeQuery(q string) string {
q = strings.Join(strings.Fields(q), " ")
q = strings.Replace(q, "(", "\\(", -1)
q = strings.Replace(q, "?", "\\?", -1)
q = strings.Replace(q, ")", "\\)", -1)
q = strings.Replace(q, "[", "\\[", -1)
q = strings.Replace(q, "]", "\\]", -1)

View File

@ -0,0 +1,269 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collector
import (
"context"
"database/sql"
"fmt"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
)
func init() {
registerCollector(statWalReceiverSubsystem, defaultDisabled, NewPGStatWalReceiverCollector)
}
type PGStatWalReceiverCollector struct {
log log.Logger
}
const statWalReceiverSubsystem = "stat_wal_receiver"
func NewPGStatWalReceiverCollector(config collectorConfig) (Collector, error) {
return &PGStatWalReceiverCollector{log: config.logger}, nil
}
var (
labelCats = []string{"upstream_host", "slot_name", "status"}
statWalReceiverReceiveStartLsn = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "receive_start_lsn"),
"First write-ahead log location used when WAL receiver is started represented as a decimal",
labelCats,
prometheus.Labels{},
)
statWalReceiverReceiveStartTli = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "receive_start_tli"),
"First timeline number used when WAL receiver is started",
labelCats,
prometheus.Labels{},
)
statWalReceiverFlushedLSN = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "flushed_lsn"),
"Last write-ahead log location already received and flushed to disk, the initial value of this field being the first log location used when WAL receiver is started represented as a decimal",
labelCats,
prometheus.Labels{},
)
statWalReceiverReceivedTli = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "received_tli"),
"Timeline number of last write-ahead log location received and flushed to disk",
labelCats,
prometheus.Labels{},
)
statWalReceiverLastMsgSendTime = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "last_msg_send_time"),
"Send time of last message received from origin WAL sender",
labelCats,
prometheus.Labels{},
)
statWalReceiverLastMsgReceiptTime = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "last_msg_receipt_time"),
"Send time of last message received from origin WAL sender",
labelCats,
prometheus.Labels{},
)
statWalReceiverLatestEndLsn = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "latest_end_lsn"),
"Last write-ahead log location reported to origin WAL sender as integer",
labelCats,
prometheus.Labels{},
)
statWalReceiverLatestEndTime = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "latest_end_time"),
"Time of last write-ahead log location reported to origin WAL sender",
labelCats,
prometheus.Labels{},
)
statWalReceiverUpstreamNode = prometheus.NewDesc(
prometheus.BuildFQName(namespace, statWalReceiverSubsystem, "upstream_node"),
"Node ID of the upstream node",
labelCats,
prometheus.Labels{},
)
pgStatWalColumnQuery = `
SELECT
column_name
FROM information_schema.columns
WHERE
table_name = 'pg_stat_wal_receiver' and
column_name = 'flushed_lsn'
`
pgStatWalReceiverQueryTemplate = `
SELECT
trim(both '''' from substring(conninfo from 'host=([^ ]*)')) as upstream_host,
slot_name,
status,
(receive_start_lsn- '0/0') % (2^52)::bigint as receive_start_lsn,
%s
receive_start_tli,
received_tli,
extract(epoch from last_msg_send_time) as last_msg_send_time,
extract(epoch from last_msg_receipt_time) as last_msg_receipt_time,
(latest_end_lsn - '0/0') % (2^52)::bigint as latest_end_lsn,
extract(epoch from latest_end_time) as latest_end_time,
substring(slot_name from 'repmgr_slot_([0-9]*)') as upstream_node
FROM pg_catalog.pg_stat_wal_receiver
`
)
func (c *PGStatWalReceiverCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
db := instance.getDB()
hasFlushedLSNRows, err := db.QueryContext(ctx, pgStatWalColumnQuery)
if err != nil {
return err
}
defer hasFlushedLSNRows.Close()
hasFlushedLSN := hasFlushedLSNRows.Next()
var query string
if hasFlushedLSN {
query = fmt.Sprintf(pgStatWalReceiverQueryTemplate, "(flushed_lsn - '0/0') % (2^52)::bigint as flushed_lsn,\n")
} else {
query = fmt.Sprintf(pgStatWalReceiverQueryTemplate, "")
}
rows, err := db.QueryContext(ctx, query)
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var upstreamHost, slotName, status sql.NullString
var receiveStartLsn, receiveStartTli, flushedLsn, receivedTli, latestEndLsn, upstreamNode sql.NullInt64
var lastMsgSendTime, lastMsgReceiptTime, latestEndTime sql.NullFloat64
if hasFlushedLSN {
if err := rows.Scan(&upstreamHost, &slotName, &status, &receiveStartLsn, &receiveStartTli, &flushedLsn, &receivedTli, &lastMsgSendTime, &lastMsgReceiptTime, &latestEndLsn, &latestEndTime, &upstreamNode); err != nil {
return err
}
} else {
if err := rows.Scan(&upstreamHost, &slotName, &status, &receiveStartLsn, &receiveStartTli, &receivedTli, &lastMsgSendTime, &lastMsgReceiptTime, &latestEndLsn, &latestEndTime, &upstreamNode); err != nil {
return err
}
}
if !upstreamHost.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because upstream host is null")
continue
}
if !slotName.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because slotname host is null")
continue
}
if !status.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because status is null")
continue
}
labels := []string{upstreamHost.String, slotName.String, status.String}
if !receiveStartLsn.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because receive_start_lsn is null")
continue
}
if !receiveStartTli.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because receive_start_tli is null")
continue
}
if hasFlushedLSN && !flushedLsn.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because flushed_lsn is null")
continue
}
if !receivedTli.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because received_tli is null")
continue
}
if !lastMsgSendTime.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because last_msg_send_time is null")
continue
}
if !lastMsgReceiptTime.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because last_msg_receipt_time is null")
continue
}
if !latestEndLsn.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because latest_end_lsn is null")
continue
}
if !latestEndTime.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because latest_end_time is null")
continue
}
if !upstreamNode.Valid {
level.Debug(c.log).Log("msg", "Skipping wal receiver stats because upstream_node is null")
continue
}
ch <- prometheus.MustNewConstMetric(
statWalReceiverReceiveStartLsn,
prometheus.CounterValue,
float64(receiveStartLsn.Int64),
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverReceiveStartTli,
prometheus.GaugeValue,
float64(receiveStartTli.Int64),
labels...)
if hasFlushedLSN {
ch <- prometheus.MustNewConstMetric(
statWalReceiverFlushedLSN,
prometheus.CounterValue,
float64(flushedLsn.Int64),
labels...)
}
ch <- prometheus.MustNewConstMetric(
statWalReceiverReceivedTli,
prometheus.GaugeValue,
float64(receivedTli.Int64),
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverLastMsgSendTime,
prometheus.CounterValue,
float64(lastMsgSendTime.Float64),
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverLastMsgReceiptTime,
prometheus.CounterValue,
float64(lastMsgReceiptTime.Float64),
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverLatestEndLsn,
prometheus.CounterValue,
float64(latestEndLsn.Int64),
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverLatestEndTime,
prometheus.CounterValue,
latestEndTime.Float64,
labels...)
ch <- prometheus.MustNewConstMetric(
statWalReceiverUpstreamNode,
prometheus.GaugeValue,
float64(upstreamNode.Int64),
labels...)
}
if err := rows.Err(); err != nil {
return err
}
return nil
}

View File

@ -0,0 +1,186 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collector
import (
"context"
"fmt"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/smartystreets/goconvey/convey"
)
var queryWithFlushedLSN = fmt.Sprintf(pgStatWalReceiverQueryTemplate, "(flushed_lsn - '0/0') % (2^52)::bigint as flushed_lsn,\n")
var queryWithNoFlushedLSN = fmt.Sprintf(pgStatWalReceiverQueryTemplate, "")
func TestPGStatWalReceiverCollectorWithFlushedLSN(t *testing.T) {
db, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("Error opening a stub db connection: %s", err)
}
defer db.Close()
inst := &instance{db: db}
infoSchemaColumns := []string{
"column_name",
}
infoSchemaRows := sqlmock.NewRows(infoSchemaColumns).
AddRow(
"flushed_lsn",
)
mock.ExpectQuery(sanitizeQuery(pgStatWalColumnQuery)).WillReturnRows(infoSchemaRows)
columns := []string{
"upstream_host",
"slot_name",
"status",
"receive_start_lsn",
"receive_start_tli",
"flushed_lsn",
"received_tli",
"last_msg_send_time",
"last_msg_receipt_time",
"latest_end_lsn",
"latest_end_time",
"upstream_node",
}
rows := sqlmock.NewRows(columns).
AddRow(
"foo",
"bar",
"stopping",
1200668684563608,
1687321285,
1200668684563609,
1687321280,
1687321275,
1687321276,
1200668684563610,
1687321277,
5,
)
mock.ExpectQuery(sanitizeQuery(queryWithFlushedLSN)).WillReturnRows(rows)
ch := make(chan prometheus.Metric)
go func() {
defer close(ch)
c := PGStatWalReceiverCollector{}
if err := c.Update(context.Background(), inst, ch); err != nil {
t.Errorf("Error calling PgStatWalReceiverCollector.Update: %s", err)
}
}()
expected := []MetricResult{
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1200668684563608, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1687321285, metricType: dto.MetricType_GAUGE},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1200668684563609, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1687321280, metricType: dto.MetricType_GAUGE},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1687321275, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1687321276, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1200668684563610, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 1687321277, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "stopping"}, value: 5, metricType: dto.MetricType_GAUGE},
}
convey.Convey("Metrics comparison", t, func() {
for _, expect := range expected {
m := readMetric(<-ch)
convey.So(expect, convey.ShouldResemble, m)
}
})
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("there were unfulfilled exceptions: %s", err)
}
}
func TestPGStatWalReceiverCollectorWithNoFlushedLSN(t *testing.T) {
db, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("Error opening a stub db connection: %s", err)
}
defer db.Close()
inst := &instance{db: db}
infoSchemaColumns := []string{
"column_name",
}
infoSchemaRows := sqlmock.NewRows(infoSchemaColumns)
mock.ExpectQuery(sanitizeQuery(pgStatWalColumnQuery)).WillReturnRows(infoSchemaRows)
columns := []string{
"upstream_host",
"slot_name",
"status",
"receive_start_lsn",
"receive_start_tli",
"received_tli",
"last_msg_send_time",
"last_msg_receipt_time",
"latest_end_lsn",
"latest_end_time",
"upstream_node",
}
rows := sqlmock.NewRows(columns).
AddRow(
"foo",
"bar",
"starting",
1200668684563608,
1687321285,
1687321280,
1687321275,
1687321276,
1200668684563610,
1687321277,
5,
)
mock.ExpectQuery(sanitizeQuery(queryWithNoFlushedLSN)).WillReturnRows(rows)
ch := make(chan prometheus.Metric)
go func() {
defer close(ch)
c := PGStatWalReceiverCollector{}
if err := c.Update(context.Background(), inst, ch); err != nil {
t.Errorf("Error calling PgStatWalReceiverCollector.Update: %s", err)
}
}()
expected := []MetricResult{
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1200668684563608, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1687321285, metricType: dto.MetricType_GAUGE},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1687321280, metricType: dto.MetricType_GAUGE},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1687321275, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1687321276, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1200668684563610, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 1687321277, metricType: dto.MetricType_COUNTER},
{labels: labelMap{"upstream_host": "foo", "slot_name": "bar", "status": "starting"}, value: 5, metricType: dto.MetricType_GAUGE},
}
convey.Convey("Metrics comparison", t, func() {
for _, expect := range expected {
m := readMetric(<-ch)
convey.So(expect, convey.ShouldResemble, m)
}
})
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("there were unfulfilled exceptions: %s", err)
}
}