From 238f5c099af62ec32fb7c511361fd616e3998f2f Mon Sep 17 00:00:00 2001
From: Dmitry Ulyanov <swhile@ya.ru>
Date: Mon, 12 Aug 2019 06:35:58 +0300
Subject: [PATCH] Fix pg_up metric returns last calculated value without
 explicit resetting (#291)

If exporter is scraped by multiple Prometheuses (as we do) - Collect() could be called concurrently. In result in some cases one of Prometheuses could get pg_up = 0, because it was explicitly set to zero on first Collect call.
---
 cmd/postgres_exporter/postgres_exporter.go | 65 ++++++++++++++++------
 1 file changed, 48 insertions(+), 17 deletions(-)

diff --git a/cmd/postgres_exporter/postgres_exporter.go b/cmd/postgres_exporter/postgres_exporter.go
index 920dfc3d..3522f7ab 100644
--- a/cmd/postgres_exporter/postgres_exporter.go
+++ b/cmd/postgres_exporter/postgres_exporter.go
@@ -129,6 +129,16 @@ type MetricMap struct {
 	conversion func(interface{}) (float64, bool) // Conversion function to turn PG result into float64
 }
 
+// ErrorConnectToServer is a connection to PgSQL server error
+type ErrorConnectToServer struct {
+	Msg string
+}
+
+// Error returns error
+func (e *ErrorConnectToServer) Error() string {
+	return e.Msg
+}
+
 // TODO: revisit this with the semver system
 func dumpMaps() {
 	// TODO: make this function part of the exporter
@@ -813,21 +823,24 @@ func (s *Server) String() string {
 }
 
 // Scrape loads metrics.
-func (s *Server) Scrape(ch chan<- prometheus.Metric, errGauge prometheus.Gauge, disableSettingsMetrics bool) {
+func (s *Server) Scrape(ch chan<- prometheus.Metric, disableSettingsMetrics bool) error {
 	s.mappingMtx.RLock()
 	defer s.mappingMtx.RUnlock()
 
+	var err error
+
 	if !disableSettingsMetrics {
-		if err := querySettings(ch, s); err != nil {
-			log.Errorf("Error retrieving settings: %s", err)
-			errGauge.Inc()
+		if err = querySettings(ch, s); err != nil {
+			err = fmt.Errorf("error retrieving settings: %s", err)
 		}
 	}
 
 	errMap := queryNamespaceMappings(ch, s)
 	if len(errMap) > 0 {
-		errGauge.Inc()
+		err = fmt.Errorf("queryNamespaceMappings returned %d errors", len(errMap))
 	}
+
+	return err
 }
 
 // Servers contains a collection of servers to Postgres.
@@ -1289,16 +1302,40 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
 		e.duration.Set(time.Since(begun).Seconds())
 	}(time.Now())
 
-	e.error.Set(0)
-	e.psqlUp.Set(0)
 	e.totalScrapes.Inc()
 
 	dsns := e.dsn
 	if e.autoDiscoverDatabases {
 		dsns = e.discoverDatabaseDSNs()
 	}
+
+	var errorsCount int
+	var connectionErrorsCount int
+
 	for _, dsn := range dsns {
-		e.scrapeDSN(ch, dsn)
+		if err := e.scrapeDSN(ch, dsn); err != nil {
+			errorsCount++
+
+			log.Errorf(err.Error())
+
+			if _, ok := err.(*ErrorConnectToServer); ok {
+				connectionErrorsCount++
+			}
+		}
+	}
+
+	switch {
+	case connectionErrorsCount >= len(dsns):
+		e.psqlUp.Set(0)
+	default:
+		e.psqlUp.Set(1) // Didn't fail, can mark connection as up for this scrape.
+	}
+
+	switch errorsCount {
+	case 0:
+		e.error.Set(0)
+	default:
+		e.error.Set(1)
 	}
 }
 
@@ -1342,24 +1379,18 @@ func (e *Exporter) discoverDatabaseDSNs() []string {
 	return result
 }
 
-func (e *Exporter) scrapeDSN(ch chan<- prometheus.Metric, dsn string) {
+func (e *Exporter) scrapeDSN(ch chan<- prometheus.Metric, dsn string) error {
 	server, err := e.servers.GetServer(dsn)
 	if err != nil {
-		log.Errorf("Error opening connection to database (%s): %v", loggableDSN(dsn), err)
-		e.error.Inc()
-		return
+		return &ErrorConnectToServer{fmt.Sprintf("Error opening connection to database (%s): %s", loggableDSN(dsn), err)}
 	}
 
-	// Didn't fail, can mark connection as up for this scrape.
-	e.psqlUp.Inc()
-
 	// Check if map versions need to be updated
 	if err := e.checkMapVersions(ch, server); err != nil {
 		log.Warnln("Proceeding with outdated query maps, as the Postgres version could not be determined:", err)
-		e.error.Inc()
 	}
 
-	server.Scrape(ch, e.error, e.disableSettingsMetrics)
+	return server.Scrape(ch, e.disableSettingsMetrics)
 }
 
 // try to get the DataSource