// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Command ceph_exporter provides a Prometheus exporter for a Ceph cluster.
package main
import (
2017-11-23 09:24:31 +00:00
"net"
2016-01-06 18:24:20 +00:00
"net/http"
2017-11-23 09:24:31 +00:00
"os"
2016-01-06 18:24:20 +00:00
"sync"
2017-11-23 09:24:31 +00:00
"syscall"
"time"
2016-01-06 18:24:20 +00:00
2017-09-14 11:39:52 +00:00
"github.com/digitalocean/ceph_exporter/collectors"
2020-10-28 18:42:52 +00:00
"github.com/ianschenck/envflag"
2016-01-06 18:24:20 +00:00
"github.com/prometheus/client_golang/prometheus"
2017-09-14 11:39:52 +00:00
"github.com/prometheus/client_golang/prometheus/promhttp"
2020-10-28 18:42:52 +00:00
"github.com/sirupsen/logrus"
2016-01-06 18:24:20 +00:00
)
2018-08-01 13:37:07 +00:00
const (
defaultCephClusterLabel = "ceph"
defaultCephConfigPath = "/etc/ceph/ceph.conf"
2020-10-28 18:42:52 +00:00
defaultCephUser = "admin"
defaultRadosOpTimeout = 30 * time . Second
2018-08-01 13:37:07 +00:00
)
2017-11-23 09:24:31 +00:00
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
// specifically check if it hits EMFILE when doing an accept, and if so,
// terminate the process.
2018-07-09 09:08:14 +00:00
const keepAlive time . Duration = 3 * time . Minute
2017-11-23 09:24:31 +00:00
type emfileAwareTcpListener struct {
* net . TCPListener
2020-10-28 18:42:52 +00:00
logger * logrus . Logger
2017-11-23 09:24:31 +00:00
}
func ( ln emfileAwareTcpListener ) Accept ( ) ( c net . Conn , err error ) {
tc , err := ln . AcceptTCP ( )
if err != nil {
if oerr , ok := err . ( * net . OpError ) ; ok {
if serr , ok := oerr . Err . ( * os . SyscallError ) ; ok && serr . Err == syscall . EMFILE {
2020-10-28 18:42:52 +00:00
ln . logger . WithError ( err ) . Fatal ( "running out of file descriptors" )
2017-11-23 09:24:31 +00:00
}
}
// Default return
return
}
tc . SetKeepAlive ( true )
2018-07-09 09:08:14 +00:00
tc . SetKeepAlivePeriod ( keepAlive )
2017-11-23 09:24:31 +00:00
return tc , nil
}
2016-01-06 18:24:20 +00:00
// CephExporter wraps all the ceph collectors and provides a single global
// exporter to extracts metrics out of. It also ensures that the collection
// is done in a thread-safe manner, the necessary requirement stated by
// prometheus. It also implements a prometheus.Collector interface in order
// to register it correctly.
type CephExporter struct {
mu sync . Mutex
collectors [ ] prometheus . Collector
2020-10-28 18:42:52 +00:00
logger * logrus . Logger
2016-01-06 18:24:20 +00:00
}
// Verify that the exporter implements the interface correctly.
var _ prometheus . Collector = & CephExporter { }
// NewCephExporter creates an instance to CephExporter and returns a reference
// to it. We can choose to enable a collector to extract stats out of by adding
// it to the list of collectors.
2020-10-28 18:42:52 +00:00
func NewCephExporter ( conn collectors . Conn , cluster string , config string , rgwMode int , logger * logrus . Logger ) * CephExporter {
2018-08-01 13:37:07 +00:00
c := & CephExporter {
2016-01-06 18:24:20 +00:00
collectors : [ ] prometheus . Collector {
2020-10-28 18:42:52 +00:00
collectors . NewClusterUsageCollector ( conn , cluster , logger ) ,
collectors . NewPoolUsageCollector ( conn , cluster , logger ) ,
collectors . NewPoolInfoCollector ( conn , cluster , logger ) ,
collectors . NewClusterHealthCollector ( conn , cluster , logger ) ,
collectors . NewMonitorCollector ( conn , cluster , logger ) ,
collectors . NewOSDCollector ( conn , cluster , logger ) ,
2016-01-06 18:24:20 +00:00
} ,
2020-10-28 18:42:52 +00:00
logger : logger ,
2016-01-06 18:24:20 +00:00
}
2018-08-01 13:37:07 +00:00
2018-08-10 19:43:02 +00:00
switch rgwMode {
case collectors . RGWModeForeground :
2018-08-01 13:37:07 +00:00
c . collectors = append ( c . collectors ,
2020-10-28 18:42:52 +00:00
collectors . NewRGWCollector ( cluster , config , false , logger ) ,
2018-08-01 13:37:07 +00:00
)
2018-08-10 19:43:02 +00:00
case collectors . RGWModeBackground :
c . collectors = append ( c . collectors ,
2020-10-28 18:42:52 +00:00
collectors . NewRGWCollector ( cluster , config , true , logger ) ,
2018-08-10 19:43:02 +00:00
)
case collectors . RGWModeDisabled :
// nothing to do
default :
2020-10-28 18:42:52 +00:00
logger . WithField ( "rgwMode" , rgwMode ) . Warn ( "RGW Collector Disabled do to invalid mode" )
2018-08-01 13:37:07 +00:00
}
return c
2016-01-06 18:24:20 +00:00
}
// Describe sends all the descriptors of the collectors included to
// the provided channel.
func (c *CephExporter) Describe(ch chan<- *prometheus.Desc) {
	for _, collector := range c.collectors {
		collector.Describe(ch)
	}
}
// Collect sends the collected metrics from each of the collectors to
// prometheus. Collect could be called several times concurrently
// and thus its run is protected by a single mutex.
func (c *CephExporter) Collect(ch chan<- prometheus.Metric) {
	c.mu.Lock()
	defer c.mu.Unlock()

	for _, collector := range c.collectors {
		collector.Collect(ch)
	}
}
func main ( ) {
var (
2020-10-28 18:42:52 +00:00
metricsAddr = envflag . String ( "TELEMETRY_ADDR" , ":9128" , "Host:Port for ceph_exporter's metrics endpoint" )
metricsPath = envflag . String ( "TELEMETRY_PATH" , "/metrics" , "URL path for surfacing metrics to Prometheus" )
exporterConfig = envflag . String ( "EXPORTER_CONFIG" , "/etc/ceph/exporter.yml" , "Path to ceph_exporter config" )
rgwMode = envflag . Int ( "RGW_MODE" , 0 , "Enable collection of stats from RGW (0:disabled 1:enabled 2:background)" )
2016-01-06 18:24:20 +00:00
2020-10-28 18:42:52 +00:00
logLevel = envflag . String ( "LOG_LEVEL" , "info" , "Logging level. One of: [trace, debug, info, warn, error, fatal, panic]" )
2018-08-01 13:37:07 +00:00
2020-10-28 18:42:52 +00:00
cephCluster = envflag . String ( "CEPH_CLUSTER" , defaultCephClusterLabel , "Ceph cluster name" )
cephConfig = envflag . String ( "CEPH_CONFIG" , defaultCephConfigPath , "Path to Ceph config file" )
cephUser = envflag . String ( "CEPH_USER" , defaultCephUser , "Ceph user to connect to cluster" )
cephRadosOpTimeout = envflag . Duration ( "CEPH_RADOS_OP_TIMEOUT" , defaultRadosOpTimeout , "Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit)" )
2016-01-06 18:24:20 +00:00
)
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
envflag . Parse ( )
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
logger := logrus . New ( )
logger . SetFormatter ( & logrus . TextFormatter {
FullTimestamp : true ,
} )
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
if v , err := logrus . ParseLevel ( * logLevel ) ; err != nil {
logger . WithError ( err ) . Warn ( "error setting log level" )
2016-01-06 18:24:20 +00:00
} else {
2020-10-28 18:42:52 +00:00
logger . SetLevel ( v )
}
2019-10-09 17:19:32 +00:00
2020-10-28 18:42:52 +00:00
clusterConfigs := ( [ ] * ClusterConfig ) ( nil )
2019-10-09 17:19:32 +00:00
2020-10-28 18:42:52 +00:00
if fileExists ( * exporterConfig ) {
cfg , err := ParseConfig ( * exporterConfig )
if err != nil {
logger . WithError ( err ) . WithField (
"file" , * exporterConfig ,
) . Fatal ( "error parsing ceph_exporter config file" )
2017-03-23 20:20:25 +00:00
}
2020-10-28 18:42:52 +00:00
clusterConfigs = cfg . Cluster
} else {
clusterConfigs = [ ] * ClusterConfig {
{
ClusterLabel : * cephCluster ,
User : * cephUser ,
ConfigFile : * cephConfig ,
} ,
2017-03-23 20:20:25 +00:00
}
2020-10-28 18:42:52 +00:00
}
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
for _ , cluster := range clusterConfigs {
conn := collectors . NewRadosConn (
cluster . User ,
cluster . ConfigFile ,
* cephRadosOpTimeout ,
logger )
prometheus . MustRegister ( NewCephExporter (
conn ,
cluster . ClusterLabel ,
cluster . ConfigFile ,
* rgwMode ,
logger ) )
logger . WithField ( "cluster" , cluster . ClusterLabel ) . Info ( "exporting cluster" )
2016-01-06 18:24:20 +00:00
}
2017-09-14 11:39:52 +00:00
http . Handle ( * metricsPath , promhttp . Handler ( ) )
2016-01-06 18:24:20 +00:00
http . HandleFunc ( "/" , func ( w http . ResponseWriter , r * http . Request ) {
2016-12-29 22:08:20 +00:00
w . Write ( [ ] byte ( ` < html >
< head > < title > Ceph Exporter < / title > < / head >
< body >
< h1 > Ceph Exporter < / h1 >
< p > < a href = ' ` + *metricsPath + ` ' > Metrics < / a > < / p >
< / body >
< / html > ` ) )
2016-01-06 18:24:20 +00:00
} )
2020-10-28 18:42:52 +00:00
logger . WithField ( "endpoint" , * metricsAddr ) . Info ( "starting ceph_exporter listener" )
2017-11-23 09:24:31 +00:00
// Below is essentially http.ListenAndServe(), but using our custom
// emfileAwareTcpListener that will die if we run out of file descriptors
2020-10-28 18:42:52 +00:00
ln , err := net . Listen ( "tcp" , * metricsAddr )
if err != nil {
logrus . WithError ( err ) . Fatal ( "error creating listener" )
2017-11-23 09:24:31 +00:00
}
2020-10-28 18:42:52 +00:00
err = http . Serve ( emfileAwareTcpListener { ln . ( * net . TCPListener ) , logger } , nil )
2017-11-23 09:24:31 +00:00
if err != nil {
2020-10-28 18:42:52 +00:00
logrus . WithError ( err ) . Fatal ( "error serving requests" )
2016-01-06 18:24:20 +00:00
}
}