From 043e68e067a3327fee9198f2888296995ae68ca8 Mon Sep 17 00:00:00 2001 From: Dan Molik Date: Tue, 29 Oct 2019 23:54:18 -0400 Subject: [PATCH] Add retries to getServer() (#316) Some backstory ============== I was attempting to use postgres_exporter with the official Docker container (https://hub.docker.com/_/postgres) in a Kubernetes StatefulSet, with a side-car configuration, but found that I wasn't able to connect, even when sharing the Postgres Unix listening socket between both containers. After copying the container over to an Alpine base, I quickly found out that postgres_exporter was actually starting before the main Postgres container had dropped the Unix socket onto the file system. A quick workaround is to write a bash for loop checking for the existence of the Unix socket; however, this would require maintaining a container, and besides, other users may find retries useful on startup. Implementation ============== All changes are made to the getServer function and the variables are local. I was unsure if it was worth adding command line switches, but this would allow for a more sophisticated backOff loop in the future. Hope this helps, and let me know if you would like me to change anything. --- cmd/postgres_exporter/postgres_exporter.go | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/cmd/postgres_exporter/postgres_exporter.go b/cmd/postgres_exporter/postgres_exporter.go index 3522f7ab..8176d057 100644 --- a/cmd/postgres_exporter/postgres_exporter.go +++ b/cmd/postgres_exporter/postgres_exporter.go @@ -863,17 +863,29 @@ func (s *Servers) GetServer(dsn string) (*Server, error) { s.m.Lock() defer s.m.Unlock() var err error - server, ok := s.servers[dsn] - if !ok { - server, err = NewServer(dsn, s.opts...) 
- if err != nil { + var ok bool + errCount := 0 // start at zero because we increment before doing work + retries := 3 + var server *Server + for { + if errCount++; errCount > retries { return nil, err } - s.servers[dsn] = server - } - if err = server.Ping(); err != nil { - delete(s.servers, dsn) - return nil, err + server, ok = s.servers[dsn] + if !ok { + server, err = NewServer(dsn, s.opts...) + if err != nil { + time.Sleep(time.Duration(errCount) * time.Second) + continue + } + s.servers[dsn] = server + } + if err = server.Ping(); err != nil { + delete(s.servers, dsn) + time.Sleep(time.Duration(errCount) * time.Second) + continue + } + break } return server, nil }