Merge branch 'master' into feature/test_postgres_12

Will Rouesnel 2021-01-31 22:23:07 +11:00 committed by GitHub
commit 178426c095
6 changed files with 383 additions and 40 deletions


@@ -140,6 +140,9 @@ The following environment variables configure the exporter:
* `PG_EXPORTER_EXCLUDE_DATABASES`
A comma-separated list of databases to remove when autoDiscoverDatabases is enabled. Default is an empty string.
* `PG_EXPORTER_METRIC_PREFIX`
A prefix to use for each of the default metrics exported by postgres-exporter. Default is `pg`.
Settings set by environment variables starting with `PG_` are overridden by the corresponding CLI flag, if given.
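For illustration only (not part of this change), here is a minimal sketch of how a kingpin flag like the one backing `PG_EXPORTER_METRIC_PREFIX` resolves its value: an explicit CLI flag wins, then the environment variable named via `Envar`, then the `Default`:

```go
package main

import (
	"fmt"

	"gopkg.in/alecthomas/kingpin.v2"
)

// Mirrors the exporter's flag definition; resolution order is
// --metric-prefix > PG_EXPORTER_METRIC_PREFIX > "pg".
var metricPrefix = kingpin.Flag("metric-prefix", "Prefix for the default metrics.").
	Default("pg").Envar("PG_EXPORTER_METRIC_PREFIX").String()

func main() {
	kingpin.Parse()
	fmt.Println("metric prefix:", *metricPrefix)
}
```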
### Setting the Postgres server's data source name


@@ -53,6 +53,7 @@ var (
onlyDumpMaps = kingpin.Flag("dumpmaps", "Do not run, simply dump the maps.").Bool()
constantLabelsList = kingpin.Flag("constantLabels", "A list of label=value separated by comma(,).").Default("").Envar("PG_EXPORTER_CONSTANT_LABELS").String()
excludeDatabases = kingpin.Flag("exclude-databases", "A list of databases to remove when autoDiscoverDatabases is enabled").Default("").Envar("PG_EXPORTER_EXCLUDE_DATABASES").String()
metricPrefix = kingpin.Flag("metric-prefix", "A metric prefix can be used to have non-default (not \"pg\") prefixes for each of the metrics").Default("pg").Envar("PG_EXPORTER_METRIC_PREFIX").String()
)
// Metric name parts.
@@ -80,6 +81,7 @@ const (
GAUGE ColumnUsage = iota // Use this column as a gauge
MAPPEDMETRIC ColumnUsage = iota // Use this column with the supplied mapping of text values
DURATION ColumnUsage = iota // This column should be interpreted as a text duration (and converted to milliseconds)
HISTOGRAM ColumnUsage = iota // Use this column as a histogram
)
// UnmarshalYAML implements the yaml.Unmarshaller interface.
@@ -169,6 +171,7 @@ type MetricMapNamespace struct {
// be mapped to by the collector
type MetricMap struct {
discard bool // Should metric be discarded during mapping?
histogram bool // Should metric be treated as a histogram?
vtype prometheus.ValueType // Prometheus valuetype
desc *prometheus.Desc // Prometheus descriptor
conversion func(interface{}) (float64, bool) // Conversion function to turn PG result into float64
@@ -376,7 +379,8 @@ var queryOverrides = map[string][]OverrideQuery{
('sharelock'),
('sharerowexclusivelock'),
('exclusivelock'),
('accessexclusivelock'),
('sireadlock')
) AS tmp(mode) CROSS JOIN pg_database
LEFT JOIN
(SELECT database, lower(mode) AS mode,count(*) AS count
@@ -598,6 +602,8 @@ func makeDescMap(pgVersion semver.Version, serverLabels prometheus.Labels, metri
for namespace, intermediateMappings := range metricMaps {
thisMap := make(map[string]MetricMap)
namespace = strings.Replace(namespace, "pg", *metricPrefix, 1)
// Get the constant labels
var variableLabels []string
for columnName, columnMapping := range intermediateMappings.columnMappings {
@@ -650,6 +656,27 @@ func makeDescMap(pgVersion semver.Version, serverLabels prometheus.Labels, metri
return dbToFloat64(in)
},
}
case HISTOGRAM:
thisMap[columnName] = MetricMap{
histogram: true,
vtype: prometheus.UntypedValue,
desc: prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, variableLabels, serverLabels),
conversion: func(in interface{}) (float64, bool) {
return dbToFloat64(in)
},
}
thisMap[columnName+"_bucket"] = MetricMap{
histogram: true,
discard: true,
}
thisMap[columnName+"_sum"] = MetricMap{
histogram: true,
discard: true,
}
thisMap[columnName+"_count"] = MetricMap{
histogram: true,
discard: true,
}
case MAPPEDMETRIC:
thisMap[columnName] = MetricMap{
vtype: prometheus.GaugeValue,
@@ -721,6 +748,9 @@ func stringToColumnUsage(s string) (ColumnUsage, error) {
case "GAUGE":
u = GAUGE
case "HISTOGRAM":
u = HISTOGRAM
case "MAPPEDMETRIC":
u = MAPPEDMETRIC
@@ -772,6 +802,46 @@ func dbToFloat64(t interface{}) (float64, bool) {
}
}
// Convert database/sql types to uint64 for Prometheus consumption. Null types are mapped to 0.
// string and []byte types are parsed with strconv; values that fail to parse map to 0 and !ok.
func dbToUint64(t interface{}) (uint64, bool) {
switch v := t.(type) {
case uint64:
return v, true
case int64:
return uint64(v), true
case float64:
return uint64(v), true
case time.Time:
return uint64(v.Unix()), true
case []byte:
// Try and convert to string and then parse to a uint64
strV := string(v)
result, err := strconv.ParseUint(strV, 10, 64)
if err != nil {
log.Infoln("Could not parse []byte:", err)
return 0, false
}
return result, true
case string:
result, err := strconv.ParseUint(v, 10, 64)
if err != nil {
log.Infoln("Could not parse string:", err)
return 0, false
}
return result, true
case bool:
if v {
return 1, true
}
return 0, true
case nil:
return 0, true
default:
return 0, false
}
}
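As a quick illustration of those conversion rules, a sketch of a table-driven test (hypothetical; assumes it sits in the same package as `dbToUint64`):

```go
package main

import "testing"

// Hypothetical spot-check of the documented dbToUint64 conversions.
func TestDbToUint64Sketch(t *testing.T) {
	cases := []struct {
		in   interface{}
		want uint64
		ok   bool
	}{
		{int64(42), 42, true},      // integers pass through
		{"123", 123, true},         // strings are parsed
		{[]byte("7"), 7, true},     // []byte is parsed via string
		{true, 1, true},            // booleans map to 0/1
		{nil, 0, true},             // NULL maps to 0
		{"not-a-number", 0, false}, // parse failures report !ok
	}
	for _, c := range cases {
		if got, ok := dbToUint64(c.in); got != c.want || ok != c.ok {
			t.Errorf("dbToUint64(%v) = %d, %v; want %d, %v", c.in, got, ok, c.want, c.ok)
		}
	}
}
```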
// Convert database/sql types to string for Prometheus labels. Null types are mapped to empty strings.
func dbToString(t interface{}) (string, bool) {
switch v := t.(type) {
@@ -977,7 +1047,7 @@ func (s *Servers) GetServer(dsn string) (*Server, error) {
var err error
var ok bool
errCount := 0 // start at zero because we increment before doing work
retries := 1
var server *Server
for {
if errCount++; errCount > retries {
@@ -1167,29 +1237,6 @@ func (e *Exporter) setupInternalMetrics() {
// Describe implements prometheus.Collector.
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
}
// Collect implements prometheus.Collector.
@@ -1304,13 +1351,68 @@ func queryNamespaceMapping(server *Server, namespace string, mapping MetricMapNa
continue
}
if metricMapping.histogram {
var keys []float64
err = pq.Array(&keys).Scan(columnData[idx])
if err != nil {
return []prometheus.Metric{}, []error{}, errors.New(fmt.Sprintln("Error retrieving", columnName, "buckets:", namespace, err))
}
var values []int64
valuesIdx, ok := columnIdx[columnName+"_bucket"]
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_bucket")))
continue
}
err = pq.Array(&values).Scan(columnData[valuesIdx])
if err != nil {
return []prometheus.Metric{}, []error{}, errors.New(fmt.Sprintln("Error retrieving", columnName, "bucket values:", namespace, err))
}
buckets := make(map[float64]uint64, len(keys))
for i, key := range keys {
if i >= len(values) {
break
}
buckets[key] = uint64(values[i])
}
idx, ok = columnIdx[columnName+"_sum"]
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_sum")))
continue
}
sum, ok := dbToFloat64(columnData[idx])
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName+"_sum", columnData[idx])))
continue
}
idx, ok = columnIdx[columnName+"_count"]
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_count")))
continue
}
count, ok := dbToUint64(columnData[idx])
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName+"_count", columnData[idx])))
continue
}
metric = prometheus.MustNewConstHistogram(
metricMapping.desc,
count, sum, buckets,
labels...,
)
} else {
value, ok := dbToFloat64(columnData[idx])
if !ok {
nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName, columnData[idx])))
continue
}
// Generate the metric
metric = prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value, labels...)
}
} else {
// Unknown metric. Report as untyped if scan to float64 works, else note an error too.
metricLabel := fmt.Sprintf("%s_%s", namespace, columnName)
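The two building blocks used in the histogram branch above, scanning Postgres arrays with `pq.Array` and emitting a constant histogram with `prometheus.MustNewConstHistogram`, can be exercised in isolation. A self-contained sketch with invented values:

```go
package main

import (
	"fmt"

	"github.com/lib/pq"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Postgres array columns arrive from database/sql as literals like
	// "{1,2,4,8}"; pq.Array scans them into Go slices.
	var keys []float64
	if err := pq.Array(&keys).Scan([]byte("{1,2,4,8}")); err != nil {
		panic(err)
	}
	var values []int64
	if err := pq.Array(&values).Scan([]byte("{10,25,60,100}")); err != nil {
		panic(err)
	}

	// Pair each upper bound with its cumulative count, as above.
	buckets := make(map[float64]uint64, len(keys))
	for i, key := range keys {
		if i >= len(values) {
			break
		}
		buckets[key] = uint64(values[i])
	}

	desc := prometheus.NewDesc("pg_example_histogram", "Sketch only.", nil, nil)
	// count and sum are invented here; in the exporter they come from the
	// _count and _sum columns.
	m := prometheus.MustNewConstHistogram(desc, 100, 345.0, buckets)
	fmt.Println(m.Desc())
}
```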
@@ -1515,20 +1617,36 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
}
func (e *Exporter) discoverDatabaseDSNs() []string {
// connstring syntax is complex (and not sure if even regular).
// we don't need to parse it, so just superficially validate that it starts
// with a valid-ish keyword pair
connstringRe := regexp.MustCompile(`^ *[a-zA-Z0-9]+ *= *[^= ]+`)
dsns := make(map[string]struct{})
for _, dsn := range e.dsn {
var dsnURI *url.URL
var dsnConnstring string
if strings.HasPrefix(dsn, "postgresql://") {
var err error
dsnURI, err = url.Parse(dsn)
if err != nil {
log.Errorf("Unable to parse DSN as URI (%s): %v", loggableDSN(dsn), err)
continue
}
} else if connstringRe.MatchString(dsn) {
dsnConnstring = dsn
} else {
log.Errorf("Unable to parse DSN as either URI or connstring (%s)", loggableDSN(dsn))
continue
}
server, err := e.servers.GetServer(dsn)
if err != nil {
log.Errorf("Error opening connection to database (%s): %v", loggableDSN(dsn), err)
continue
}
dsns[dsn] = struct{}{}
// If autoDiscoverDatabases is true, set first dsn as master database (Default: false)
server.master = true
@@ -1542,8 +1660,16 @@ func (e *Exporter) discoverDatabaseDSNs() []string {
if contains(e.excludeDatabases, databaseName) {
continue
}
if dsnURI != nil {
dsnURI.Path = databaseName
dsn = dsnURI.String()
} else {
// replacing one dbname with another is complicated.
// just append new dbname to override.
dsn = fmt.Sprintf("%s dbname=%s", dsnConnstring, databaseName)
}
dsns[dsn] = struct{}{}
}
}
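Condensed into standalone form, the DSN branching above looks roughly like this; the `withDatabase` helper is hypothetical, invented for illustration:

```go
package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

// Same superficial check as above: the connstring must start with a
// keyword=value pair.
var connstringRe = regexp.MustCompile(`^ *[a-zA-Z0-9]+ *= *[^= ]+`)

// withDatabase (hypothetical) retargets a DSN at another database: URIs get
// their path replaced, keyword/value connstrings get a dbname appended,
// which overrides any earlier dbname.
func withDatabase(dsn, db string) (string, error) {
	switch {
	case strings.HasPrefix(dsn, "postgresql://"):
		u, err := url.Parse(dsn)
		if err != nil {
			return "", err
		}
		u.Path = db
		return u.String(), nil
	case connstringRe.MatchString(dsn):
		return fmt.Sprintf("%s dbname=%s", dsn, db), nil
	default:
		return "", fmt.Errorf("unable to parse DSN as either URI or connstring")
	}
}

func main() {
	fmt.Println(withDatabase("postgresql://user@localhost:5432/postgres", "app"))
	fmt.Println(withDatabase("host=localhost user=postgres", "app"))
}
```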


@@ -126,3 +126,26 @@ func (s *IntegrationSuite) TestUnknownMetricParsingDoesntCrash(c *C) {
// scrape the exporter and make sure it works
exporter.scrape(ch)
}
// TestExtendQueriesDoesntCrash tests that specifying extend.query-path doesn't
// crash.
func (s *IntegrationSuite) TestExtendQueriesDoesntCrash(c *C) {
// Setup a dummy channel to consume metrics
ch := make(chan prometheus.Metric, 100)
go func() {
for range ch {
}
}()
dsn := os.Getenv("DATA_SOURCE_NAME")
c.Assert(dsn, Not(Equals), "")
exporter := NewExporter(
strings.Split(dsn, ","),
WithUserQueriesPath("../user_queries_test.yaml"),
)
c.Assert(exporter, NotNil)
// scrape the exporter and make sure it works
exporter.scrape(ch)
}


@@ -4,9 +4,11 @@ package main
import (
"io/ioutil"
"math"
"os"
"reflect"
"testing"
"time"
"github.com/blang/semver"
"github.com/prometheus/client_golang/prometheus"
@@ -287,6 +289,22 @@ func UnsetEnvironment(c *C, d string) {
c.Assert(err, IsNil)
}
type isNaNChecker struct {
*CheckerInfo
}
var IsNaN Checker = &isNaNChecker{
&CheckerInfo{Name: "IsNaN", Params: []string{"value"}},
}
func (checker *isNaNChecker) Check(params []interface{}, names []string) (result bool, error string) {
param, ok := (params[0]).(float64)
if !ok {
return false, "obtained value type is not a float"
}
return math.IsNaN(param), ""
}
// test boolean metric type gets converted to float
func (s *FunctionalSuite) TestBooleanConversionToValueAndString(c *C) {
@@ -294,6 +312,7 @@ func (s *FunctionalSuite) TestBooleanConversionToValueAndString(c *C) {
input interface{}
expectedString string
expectedValue float64
expectedCount uint64
expectedOK bool
}
@@ -302,19 +321,71 @@
input: true,
expectedString: "true",
expectedValue: 1.0,
expectedCount: 1,
expectedOK: true,
},
{
input: false,
expectedString: "false",
expectedValue: 0.0,
expectedCount: 0,
expectedOK: true,
},
{
input: nil,
expectedString: "",
expectedValue: math.NaN(),
expectedCount: 0,
expectedOK: true,
},
{
input: TestCase{},
expectedString: "",
expectedValue: math.NaN(),
expectedCount: 0,
expectedOK: false,
},
{
input: 123.0,
expectedString: "123",
expectedValue: 123.0,
expectedCount: 123,
expectedOK: true,
},
{
input: "123",
expectedString: "123",
expectedValue: 123.0,
expectedCount: 123,
expectedOK: true,
},
{
input: []byte("123"),
expectedString: "123",
expectedValue: 123.0,
expectedCount: 123,
expectedOK: true,
},
{
input: time.Unix(1600000000, 0),
expectedString: "1600000000",
expectedValue: 1600000000.0,
expectedCount: 1600000000,
expectedOK: true,
},
}
for _, cs := range cases {
value, ok := dbToFloat64(cs.input)
if math.IsNaN(cs.expectedValue) {
c.Assert(value, IsNaN)
} else {
c.Assert(value, Equals, cs.expectedValue)
}
c.Assert(ok, Equals, cs.expectedOK)
count, ok := dbToUint64(cs.input)
c.Assert(count, Equals, cs.expectedCount)
c.Assert(ok, Equals, cs.expectedOK)
str, ok := dbToString(cs.input)


@@ -0,0 +1,51 @@
random:
query: |
WITH data AS (SELECT floor(random()*10) AS d FROM generate_series(1,100)),
metrics AS (SELECT SUM(d) AS sum, COUNT(*) AS count FROM data),
buckets AS (SELECT le, SUM(CASE WHEN d <= le THEN 1 ELSE 0 END) AS d
FROM data, UNNEST(ARRAY[1, 2, 4, 8]) AS le GROUP BY le)
SELECT
sum AS histogram_sum,
count AS histogram_count,
ARRAY_AGG(le) AS histogram,
ARRAY_AGG(d) AS histogram_bucket,
ARRAY_AGG(le) AS missing,
ARRAY_AGG(le) AS missing_sum,
ARRAY_AGG(d) AS missing_sum_bucket,
ARRAY_AGG(le) AS missing_count,
ARRAY_AGG(d) AS missing_count_bucket,
sum AS missing_count_sum,
ARRAY_AGG(le) AS unexpected_sum,
ARRAY_AGG(d) AS unexpected_sum_bucket,
'data' AS unexpected_sum_sum,
ARRAY_AGG(le) AS unexpected_count,
ARRAY_AGG(d) AS unexpected_count_bucket,
sum AS unexpected_count_sum,
'nan'::varchar AS unexpected_count_count,
ARRAY_AGG(le) AS unexpected_bytes,
ARRAY_AGG(d) AS unexpected_bytes_bucket,
sum AS unexpected_bytes_sum,
'nan'::bytea AS unexpected_bytes_count
FROM metrics, buckets GROUP BY 1,2
metrics:
- histogram:
usage: "HISTOGRAM"
description: "Random data"
- missing:
usage: "HISTOGRAM"
description: "nonfatal error"
- missing_sum:
usage: "HISTOGRAM"
description: "nonfatal error"
- missing_count:
usage: "HISTOGRAM"
description: "nonfatal error"
- unexpected_sum:
usage: "HISTOGRAM"
description: "nonfatal error"
- unexpected_count:
usage: "HISTOGRAM"
description: "nonfatal error"
- unexpected_bytes:
usage: "HISTOGRAM"
description: "nonfatal error"


@@ -1,5 +1,5 @@
pg_replication:
query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag"
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
master: true
metrics:
- lag:
@@ -15,7 +15,32 @@ pg_postmaster:
description: "Time at which postmaster started"
pg_stat_user_tables:
query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
query: |
SELECT
current_database() datname,
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
vacuum_count,
autovacuum_count,
analyze_count,
autoanalyze_count
FROM
pg_stat_user_tables
metrics:
- datname:
usage: "LABEL"
@@ -203,3 +228,47 @@ pg_stat_statements:
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
pg_stat_activity:
query: |
WITH
metrics AS (
SELECT
application_name,
SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
COUNT(*) AS process_idle_seconds_count
FROM pg_stat_activity
WHERE state = 'idle'
GROUP BY application_name
),
buckets AS (
SELECT
application_name,
le,
SUM(
CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
THEN 1
ELSE 0
END
)::bigint AS bucket
FROM
pg_stat_activity,
UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
GROUP BY application_name, le
ORDER BY application_name, le
)
SELECT
application_name,
process_idle_seconds_sum,
process_idle_seconds_count,
ARRAY_AGG(le) AS process_idle_seconds,
ARRAY_AGG(bucket) AS process_idle_seconds_bucket
FROM metrics JOIN buckets USING (application_name)
GROUP BY 1, 2, 3
metrics:
- application_name:
usage: "LABEL"
description: "Application Name"
- process_idle_seconds:
usage: "HISTOGRAM"
description: "Idle time of server processes"