From cd9aa031a89136c20e82120aa026d2036e9f7ede Mon Sep 17 00:00:00 2001 From: Tim Serong Date: Thu, 23 Nov 2017 20:24:31 +1100 Subject: [PATCH] Terminate exporter process if maximum open files exceeded This is somewhat of a workaround for the exporter becoming perpetually blocked when it runs out of file descriptors if the cluster is down for too long, as mentioned in: https://github.com/digitalocean/ceph_exporter/issues/60#issuecomment-319396108 The problem is that if the MONs are down for long enough, each time prometheus scrapes the metrics, another socket is opened, but these block forever. If the cluster comes back up before we run out of FDs, the blocked requests recover. If the cluster *doesn't* come back up before we run out of FDs, the blocked requests never recover. This commit causes ceph exporter to terminate if it runs out of file descriptors, which IMO is better than blocking forever -- it'll be a noisier failure, and also if you're running ceph_exporter via systemd, systemd will then automatically trigger a service restart. Signed-off-by: Tim Serong (cherry picked from commit bb1ad364b52611e54c139cae67da9a1116aacfbf) --- exporter.go | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/exporter.go b/exporter.go index 06efe2f..c9edee3 100644 --- a/exporter.go +++ b/exporter.go @@ -18,8 +18,12 @@ package main import ( "flag" "log" + "net" "net/http" + "os" "sync" + "syscall" + "time" "github.com/ceph/go-ceph/rados" "github.com/digitalocean/ceph_exporter/collectors" @@ -27,6 +31,31 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" ) +// This horrible thing is a copy of tcpKeepAliveListener, tweaked to +// specifically check if it hits EMFILE when doing an accept, and if so, +// terminate the process. 
+ +type emfileAwareTcpListener struct { + *net.TCPListener +} + +func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) { + tc, err := ln.AcceptTCP() + if err != nil { + if oerr, ok := err.(*net.OpError); ok { + if serr, ok := oerr.Err.(*os.SyscallError); ok && serr.Err == syscall.EMFILE { + // This calls os.Exit(1) and terminates the process + log.Fatalf("%v", err) + } + } + // Default return + return + } + tc.SetKeepAlive(true) + tc.SetKeepAlivePeriod(3 * time.Minute) + return tc, nil +} + // CephExporter wraps all the ceph collectors and provides a single global // exporter to extracts metrics out of. It also ensures that the collection // is done in a thread-safe manner, the necessary requirement stated by @@ -152,7 +181,16 @@ func main() { }) log.Printf("Starting ceph exporter on %q", *addr) - if err := http.ListenAndServe(*addr, nil); err != nil { - log.Fatalf("cannot start ceph exporter: %s", err) + // Below is essentially http.ListenAndServe(), but using our custom + // emfileAwareTcpListener that will die if we run out of file descriptors + ln, err := net.Listen("tcp", *addr) + if err == nil { + err := http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener)}, nil) + if err != nil { + log.Fatalf("unable to serve requests: %s", err) + } + } + if err != nil { + log.Fatalf("unable to create listener: %s", err) } }