Terminate exporter process if maximum open files exceeded

This is somewhat of a workaround for the exporter becoming
perpetually blocked when it runs out of file descriptors if
the cluster is down for too long, as mentioned in:

  https://github.com/digitalocean/ceph_exporter/issues/60#issuecomment-319396108

The problem is that if the MONs are down for long enough,
each time prometheus scrapes the metrics, another socket is
opened, but these block forever.  If the cluster comes back
up before we run out of FDs, the blocked requests recover.
If the cluster *doesn't* come back up before we run out of
FDs, the blocked requests never recover.

This commit causes ceph exporter to terminate if it runs
out of file descriptors, which IMO is better than blocking
forever -- it'll be a noisier failure, and also if you're
running ceph_exporter via systemd, systemd will then
automatically trigger a service restart.

Signed-off-by: Tim Serong <tserong@suse.com>
(cherry picked from commit bb1ad364b5)
This commit is contained in:
Tim Serong 2017-11-23 20:24:31 +11:00 committed by Jan Fajerski
parent ccd6b7135b
commit cd9aa031a8

View File

@ -18,8 +18,12 @@ package main
import ( import (
"flag" "flag"
"log" "log"
"net"
"net/http" "net/http"
"os"
"sync" "sync"
"syscall"
"time"
"github.com/ceph/go-ceph/rados" "github.com/ceph/go-ceph/rados"
"github.com/digitalocean/ceph_exporter/collectors" "github.com/digitalocean/ceph_exporter/collectors"
@ -27,6 +31,31 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
) )
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
// specifically check if it hits EMFILE when doing an accept, and if so,
// terminate the process.
//
// Rationale: when the MONs are unreachable, every scrape opens another
// socket that blocks forever; once the process exhausts its FD limit it
// can never recover. Dying loudly lets a supervisor (e.g. systemd)
// restart the exporter instead of leaving it wedged.
type emfileAwareTcpListener struct {
	*net.TCPListener
}
// Accept waits for the next TCP connection, enabling keep-alive on it
// just like net/http's tcpKeepAliveListener. The one difference: if the
// accept fails with EMFILE (out of file descriptors), the process is
// terminated via log.Fatalf rather than limping along unable to serve.
func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) {
	conn, acceptErr := ln.AcceptTCP()
	if acceptErr != nil {
		// Dig through the error chain looking for EMFILE specifically.
		if oerr, isOpErr := acceptErr.(*net.OpError); isOpErr {
			serr, isSysErr := oerr.Err.(*os.SyscallError)
			if isSysErr && serr.Err == syscall.EMFILE {
				// log.Fatalf calls os.Exit(1), terminating the process so a
				// supervisor can restart us with a fresh FD table.
				log.Fatalf("%v", acceptErr)
			}
		}
		// Any other accept error is passed straight back to the caller.
		return nil, acceptErr
	}
	// Match net/http defaults: keep-alive probes every 3 minutes.
	// Errors are deliberately ignored, as in the stdlib original.
	conn.SetKeepAlive(true)
	conn.SetKeepAlivePeriod(3 * time.Minute)
	return conn, nil
}
// CephExporter wraps all the ceph collectors and provides a single global // CephExporter wraps all the ceph collectors and provides a single global
// exporter to extracts metrics out of. It also ensures that the collection // exporter to extracts metrics out of. It also ensures that the collection
// is done in a thread-safe manner, the necessary requirement stated by // is done in a thread-safe manner, the necessary requirement stated by
@ -152,7 +181,16 @@ func main() {
}) })
log.Printf("Starting ceph exporter on %q", *addr) log.Printf("Starting ceph exporter on %q", *addr)
if err := http.ListenAndServe(*addr, nil); err != nil { // Below is essentially http.ListenAndServe(), but using our custom
log.Fatalf("cannot start ceph exporter: %s", err) // emfileAwareTcpListener that will die if we run out of file descriptors
ln, err := net.Listen("tcp", *addr)
if err == nil {
err := http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener)}, nil)
if err != nil {
log.Fatalf("unable to serve requests: %s", err)
}
}
if err != nil {
log.Fatalf("unable to create listener: %s", err)
} }
} }