Terminate exporter process if maximum open files exceeded
This is somewhat of a workaround for the exporter becoming
perpetually blocked when it runs out of file descriptors if
the cluster is down for too long, as mentioned in:
https://github.com/digitalocean/ceph_exporter/issues/60#issuecomment-319396108
The problem is that if the MONs are down for long enough,
each time prometheus scrapes the metrics, another socket is
opened, but these block forever. If the cluster comes back
up before we run out of FDs, the blocked requests recover.
If the cluster *doesn't* come back up before we run out of
FDs, the blocked requests never recover.
This commit causes ceph exporter to terminate if it runs
out of file descriptors, which IMO is better than blocking
forever -- it'll be a noisier failure, and also if you're
running ceph_exporter via systemd, systemd will then
automatically trigger a service restart.
Signed-off-by: Tim Serong <tserong@suse.com>
(cherry picked from commit bb1ad364b5)
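
The restart-on-exit behaviour mentioned in the message depends on how the service is supervised. As a sketch only (the unit name, binary path, and timings are illustrative, not taken from this repository), a systemd unit along these lines would restart the exporter after log.Fatalf exits with status 1:

[Unit]
Description=Prometheus exporter for Ceph (illustrative unit)

[Service]
# log.Fatalf calls os.Exit(1); Restart=on-failure treats that as a failed run
ExecStart=/usr/local/bin/ceph_exporter
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target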
commit cd9aa031a8
parent ccd6b7135b
 exporter.go | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)
@@ -18,8 +18,12 @@ package main
 import (
 	"flag"
 	"log"
+	"net"
 	"net/http"
+	"os"
 	"sync"
+	"syscall"
+	"time"
 
 	"github.com/ceph/go-ceph/rados"
 	"github.com/digitalocean/ceph_exporter/collectors"
@@ -27,6 +31,31 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
+// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
+// specifically check if it hits EMFILE when doing an accept, and if so,
+// terminate the process.
+
+type emfileAwareTcpListener struct {
+	*net.TCPListener
+}
+
+func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) {
+	tc, err := ln.AcceptTCP()
+	if err != nil {
+		if oerr, ok := err.(*net.OpError); ok {
+			if serr, ok := oerr.Err.(*os.SyscallError); ok && serr.Err == syscall.EMFILE {
+				// This calls os.Exit(1) and terminates the process
+				log.Fatalf("%v", err)
+			}
+		}
+		// Default return
+		return
+	}
+	tc.SetKeepAlive(true)
+	tc.SetKeepAlivePeriod(3 * time.Minute)
+	return tc, nil
+}
+
 // CephExporter wraps all the ceph collectors and provides a single global
 // exporter to extracts metrics out of. It also ensures that the collection
 // is done in a thread-safe manner, the necessary requirement stated by
@@ -152,7 +181,16 @@ func main() {
 	})
 
 	log.Printf("Starting ceph exporter on %q", *addr)
-	if err := http.ListenAndServe(*addr, nil); err != nil {
-		log.Fatalf("cannot start ceph exporter: %s", err)
+	// Below is essentially http.ListenAndServe(), but using our custom
+	// emfileAwareTcpListener that will die if we run out of file descriptors
+	ln, err := net.Listen("tcp", *addr)
+	if err == nil {
+		err := http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener)}, nil)
+		if err != nil {
+			log.Fatalf("unable to serve requests: %s", err)
+		}
+	}
+	if err != nil {
+		log.Fatalf("unable to create listener: %s", err)
 	}
 }
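
As a side note, not part of this commit: on Go 1.13 and later the nested type assertions can be replaced by errors.Is, since *net.OpError and *os.SyscallError both unwrap to the underlying errno. A minimal, self-contained sketch of that variant (listener name and port are illustrative, not from the exporter):

package main

import (
	"errors"
	"log"
	"net"
	"net/http"
	"syscall"
	"time"
)

// emfileAwareListener mirrors the listener added in this commit, but uses
// errors.Is to walk the error chain instead of type-asserting each layer.
type emfileAwareListener struct {
	*net.TCPListener
}

func (ln emfileAwareListener) Accept() (net.Conn, error) {
	tc, err := ln.AcceptTCP()
	if err != nil {
		if errors.Is(err, syscall.EMFILE) {
			// Out of file descriptors: exit (log.Fatalf calls os.Exit(1))
			// so a supervisor such as systemd can restart the process.
			log.Fatalf("%v", err)
		}
		return nil, err
	}
	tc.SetKeepAlive(true)
	tc.SetKeepAlivePeriod(3 * time.Minute)
	return tc, nil
}

func main() {
	ln, err := net.Listen("tcp", ":9128") // port is illustrative
	if err != nil {
		log.Fatalf("unable to create listener: %s", err)
	}
	log.Fatal(http.Serve(emfileAwareListener{ln.(*net.TCPListener)}, nil))
}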