diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ac268c8..0572bf5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## master / unreleased +* [CHANGE] pg_stat_bgwriter counter metrics had the `_total` suffix added #556 * [ENHANCEMENT] Add pg_database_size_bytes metric #613 ## 0.10.1 / 2022-01-14 diff --git a/README.md b/README.md index 19afadbd..bb7c90d4 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,12 @@ This will build the docker image as `prometheuscommunity/postgres_exporter:${bra * `help` Show context-sensitive help (also try --help-long and --help-man). +* `collector.database` + Enable the pg_database collector. Default is `enabled` + +* `collector.bgwriter` + Enable the pg_stat_bgwriter collector. Default is `enabled` + * `web.listen-address` Address to listen on for web interface and telemetry. Default is `:9187`. diff --git a/cmd/postgres_exporter/main.go b/cmd/postgres_exporter/main.go index 987ea917..39e401e5 100644 --- a/cmd/postgres_exporter/main.go +++ b/cmd/postgres_exporter/main.go @@ -115,7 +115,12 @@ func main() { prometheus.MustRegister(exporter) - pe, err := collector.NewPostgresCollector(logger, dsn) + pe, err := collector.NewPostgresCollector( + logger, + dsn, + []string{}, + collector.WithAutoDiscoverDatabases(*autoDiscoverDatabases), + ) if err != nil { level.Error(logger).Log("msg", "Failed to create PostgresCollector", "err", err.Error()) os.Exit(1) diff --git a/cmd/postgres_exporter/postgres_exporter.go b/cmd/postgres_exporter/postgres_exporter.go index 2594a964..2574e025 100644 --- a/cmd/postgres_exporter/postgres_exporter.go +++ b/cmd/postgres_exporter/postgres_exporter.go @@ -163,23 +163,6 @@ func dumpMaps() { } var builtinMetricMaps = map[string]intermediateMetricMap{ - "pg_stat_bgwriter": { - map[string]ColumnMapping{ - "checkpoints_timed": {COUNTER, "Number of scheduled checkpoints that have been performed", nil, nil}, - "checkpoints_req": {COUNTER, "Number of requested checkpoints that have been performed", nil, nil}, - "checkpoint_write_time": {COUNTER, "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds", nil, nil}, - "checkpoint_sync_time": {COUNTER, "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds", nil, nil}, - "buffers_checkpoint": {COUNTER, "Number of buffers written during checkpoints", nil, nil}, - "buffers_clean": {COUNTER, "Number of buffers written by the background writer", nil, nil}, - "maxwritten_clean": {COUNTER, "Number of times the background writer stopped a cleaning scan because it had written too many buffers", nil, nil}, - "buffers_backend": {COUNTER, "Number of buffers written directly by a backend", nil, nil}, - "buffers_backend_fsync": {COUNTER, "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)", nil, nil}, - "buffers_alloc": {COUNTER, "Number of buffers allocated", nil, nil}, - "stats_reset": {COUNTER, "Time at which these statistics were last reset", nil, nil}, - }, - true, - 0, - }, "pg_stat_database": { map[string]ColumnMapping{ "datid": {LABEL, "OID of a database", nil, nil}, diff --git a/collector/collector.go b/collector/collector.go index f6a80b60..e75f67dd 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -87,10 +87,27 @@ type PostgresCollector struct { logger log.Logger servers map[string]*server + + // autoDiscoverDatabases 
will cause the collector to query the database + // to find other servers and also scrape them. + autoDiscoverDatabases bool } +type Option func(*PostgresCollector) error + // NewPostgresCollector creates a new PostgresCollector. -func NewPostgresCollector(logger log.Logger, dsns []string, filters ...string) (*PostgresCollector, error) { +func NewPostgresCollector(logger log.Logger, dsns []string, filters []string, options ...Option) (*PostgresCollector, error) { + p := &PostgresCollector{ + logger: logger, + } + // Apply options to customize the collector + for _, o := range options { + err := o(p) + if err != nil { + return nil, err + } + } + f := make(map[string]bool) for _, filter := range filters { enabled, exist := collectorState[filter] @@ -121,48 +138,62 @@ func NewPostgresCollector(logger log.Logger, dsns []string, filters ...string) ( } } + p.Collectors = collectors + servers := make(map[string]*server) for _, dsn := range dsns { s, err := makeServer(dsn) if err != nil { return nil, err } + // Manually provided servers are always classified as "primary" + s.isPrimary = true + + // TODO(@sysadmind): We need to discover the downstream servers and add them here. + // if p.autoDiscoverDatabases { + // } + servers[dsn] = s } - return &PostgresCollector{ - Collectors: collectors, - logger: logger, - servers: servers, - }, nil + p.servers = servers + + return p, nil +} + +func WithAutoDiscoverDatabases(discover bool) Option { + return func(p *PostgresCollector) error { + p.autoDiscoverDatabases = discover + return nil + } } // Describe implements the prometheus.Collector interface. -func (n PostgresCollector) Describe(ch chan<- *prometheus.Desc) { +func (p PostgresCollector) Describe(ch chan<- *prometheus.Desc) { ch <- scrapeDurationDesc ch <- scrapeSuccessDesc } // Collect implements the prometheus.Collector interface. -func (n PostgresCollector) Collect(ch chan<- prometheus.Metric) { +func (p PostgresCollector) Collect(ch chan<- prometheus.Metric) { ctx := context.TODO() wg := sync.WaitGroup{} - wg.Add(len(n.servers)) - for _, s := range n.servers { + wg.Add(len(p.servers)) + for _, s := range p.servers { go func(s *server) { - n.subCollect(ctx, s, ch) + p.subCollect(ctx, s, ch) wg.Done() }(s) } wg.Wait() } -func (n PostgresCollector) subCollect(ctx context.Context, server *server, ch chan<- prometheus.Metric) { +func (p PostgresCollector) subCollect(ctx context.Context, server *server, ch chan<- prometheus.Metric) { wg := sync.WaitGroup{} - wg.Add(len(n.Collectors)) - for name, c := range n.Collectors { + wg.Add(len(p.Collectors)) + for name, c := range p.Collectors { go func(name string, c Collector) { - execute(ctx, name, c, server, ch, n.logger) + execute(ctx, name, c, server, ch, p.logger) wg.Done() }(name, c) } diff --git a/collector/pg_stat_bgwriter.go b/collector/pg_stat_bgwriter.go new file mode 100644 index 00000000..7e7d09c7 --- /dev/null +++ b/collector/pg_stat_bgwriter.go @@ -0,0 +1,212 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package collector + +import ( + "context" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + registerCollector("bgwriter", defaultEnabled, NewPGStatBGWriterCollector) +} + +type PGStatBGWriterCollector struct { +} + +func NewPGStatBGWriterCollector(logger log.Logger) (Collector, error) { + return &PGStatBGWriterCollector{}, nil +} + +const bgWriterSubsystem = "stat_bgwriter" + +var statBGWriter = map[string]*prometheus.Desc{ + "checkpoints_timed": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "checkpoints_timed_total"), + "Number of scheduled checkpoints that have been performed", + []string{"server"}, + prometheus.Labels{}, + ), + "checkpoints_req": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "checkpoints_req_total"), + "Number of requested checkpoints that have been performed", + []string{"server"}, + prometheus.Labels{}, + ), + "checkpoint_write_time": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "checkpoint_write_time_total"), + "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds", + []string{"server"}, + prometheus.Labels{}, + ), + "checkpoint_sync_time": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "checkpoint_sync_time_total"), + "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds", + []string{"server"}, + prometheus.Labels{}, + ), + "buffers_checkpoint": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "buffers_checkpoint_total"), + "Number of buffers written during checkpoints", + []string{"server"}, + prometheus.Labels{}, + ), + "buffers_clean": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "buffers_clean_total"), + "Number of buffers written by the background writer", + []string{"server"}, + prometheus.Labels{}, + ), + "maxwritten_clean": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "maxwritten_clean_total"), + "Number of times the background writer stopped a cleaning scan because it had written too many buffers", + []string{"server"}, + prometheus.Labels{}, + ), + "buffers_backend": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "buffers_backend_total"), + "Number of buffers written directly by a backend", + []string{"server"}, + prometheus.Labels{}, + ), + "buffers_backend_fsync": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "buffers_backend_fsync_total"), + "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)", + []string{"server"}, + prometheus.Labels{}, + ), + "buffers_alloc": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "buffers_alloc_total"), + "Number of buffers allocated", + []string{"server"}, + prometheus.Labels{}, + ), + "stats_reset": prometheus.NewDesc( + prometheus.BuildFQName(namespace, bgWriterSubsystem, "stats_reset_total"), + "Time at which these statistics were last reset", + []string{"server"}, + prometheus.Labels{}, + ), +} + +func (PGStatBGWriterCollector) Update(ctx context.Context, server *server, ch chan<- prometheus.Metric) error { + db, err := 
server.GetDB() + if err != nil { + return err + } + + row := db.QueryRowContext(ctx, + `SELECT + checkpoints_timed + ,checkpoints_req + ,checkpoint_write_time + ,checkpoint_sync_time + ,buffers_checkpoint + ,buffers_clean + ,maxwritten_clean + ,buffers_backend + ,buffers_backend_fsync + ,buffers_alloc + ,stats_reset + FROM pg_stat_bgwriter;`) + + var cpt int + var cpr int + var cpwt int + var cpst int + var bcp int + var bc int + var mwc int + var bb int + var bbf int + var ba int + var sr time.Time + + err = row.Scan(&cpt, &cpr, &cpwt, &cpst, &bcp, &bc, &mwc, &bb, &bbf, &ba, &sr) + if err != nil { + return err + } + + ch <- prometheus.MustNewConstMetric( + statBGWriter["checkpoints_timed"], + prometheus.CounterValue, + float64(cpt), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["checkpoints_req"], + prometheus.CounterValue, + float64(cpr), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["checkpoint_write_time"], + prometheus.CounterValue, + float64(cpwt), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["checkpoint_sync_time"], + prometheus.CounterValue, + float64(cpst), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["buffers_checkpoint"], + prometheus.CounterValue, + float64(bcp), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["buffers_clean"], + prometheus.CounterValue, + float64(bc), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["maxwritten_clean"], + prometheus.CounterValue, + float64(mwc), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["buffers_backend"], + prometheus.CounterValue, + float64(bb), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["buffers_backend_fsync"], + prometheus.CounterValue, + float64(bbf), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["buffers_alloc"], + prometheus.CounterValue, + float64(ba), + server.GetName(), + ) + ch <- prometheus.MustNewConstMetric( + statBGWriter["stats_reset"], + prometheus.CounterValue, + float64(sr.Unix()), + server.GetName(), + ) + + return nil +} diff --git a/collector/server.go b/collector/server.go index fa490a2c..3c6d1c85 100644 --- a/collector/server.go +++ b/collector/server.go @@ -22,9 +22,10 @@ import ( ) type server struct { - dsn string - name string - db *sql.DB + dsn string + name string + db *sql.DB + isPrimary bool // Certain queries are only run on the primary server } func makeServer(dsn string) (*server, error) { diff --git a/postgres_mixin/dashboards/postgres-overview.json b/postgres_mixin/dashboards/postgres-overview.json index 24b7bc92..5bea4926 100644 --- a/postgres_mixin/dashboards/postgres-overview.json +++ b/postgres_mixin/dashboards/postgres-overview.json @@ -584,7 +584,7 @@ { "alias": "Buffers Allocated", "dsType": "prometheus", - "expr": "irate(pg_stat_bgwriter_buffers_alloc{instance='$instance'}[5m])", + "expr": "irate(pg_stat_bgwriter_buffers_alloc_total{instance='$instance'}[5m])", "format": "time_series", "groupBy": [ { @@ -636,7 +636,7 @@ { "alias": "Buffers Allocated", "dsType": "prometheus", - "expr": "irate(pg_stat_bgwriter_buffers_backend_fsync{instance='$instance'}[5m])", + "expr": "irate(pg_stat_bgwriter_buffers_backend_fsync_total{instance='$instance'}[5m])", "format": "time_series", "groupBy": [ { @@ -688,7 +688,7 @@ { "alias": "Buffers Allocated", "dsType": "prometheus", - "expr": 
"irate(pg_stat_bgwriter_buffers_backend{instance='$instance'}[5m])", + "expr": "irate(pg_stat_bgwriter_buffers_backend_total{instance='$instance'}[5m])", "format": "time_series", "groupBy": [ { @@ -740,7 +740,7 @@ { "alias": "Buffers Allocated", "dsType": "prometheus", - "expr": "irate(pg_stat_bgwriter_buffers_clean{instance='$instance'}[5m])", + "expr": "irate(pg_stat_bgwriter_buffers_clean_total{instance='$instance'}[5m])", "format": "time_series", "groupBy": [ { @@ -792,7 +792,7 @@ { "alias": "Buffers Allocated", "dsType": "prometheus", - "expr": "irate(pg_stat_bgwriter_buffers_checkpoint{instance='$instance'}[5m])", + "expr": "irate(pg_stat_bgwriter_buffers_checkpoint_total{instance='$instance'}[5m])", "format": "time_series", "groupBy": [ {