// Copyright 2022 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Command ceph_exporter provides a Prometheus exporter for a Ceph cluster.
package main
import (
2022-03-23 18:43:13 +00:00
"crypto/tls"
2017-11-23 09:24:31 +00:00
"net"
2016-01-06 18:24:20 +00:00
"net/http"
2017-11-23 09:24:31 +00:00
"os"
"syscall"
"time"
2016-01-06 18:24:20 +00:00
2020-10-28 18:42:52 +00:00
"github.com/ianschenck/envflag"
2016-01-06 18:24:20 +00:00
"github.com/prometheus/client_golang/prometheus"
2017-09-14 11:39:52 +00:00
"github.com/prometheus/client_golang/prometheus/promhttp"
2020-10-28 18:42:52 +00:00
"github.com/sirupsen/logrus"
2022-02-16 18:00:05 +00:00
2022-02-23 23:43:46 +00:00
"github.com/digitalocean/ceph_exporter/ceph"
2022-02-24 20:12:52 +00:00
"github.com/digitalocean/ceph_exporter/rados"
2016-01-06 18:24:20 +00:00
)
2018-08-01 13:37:07 +00:00
// Defaults used by main for the CEPH_* settings when they are not overridden.
const (
	// defaultCephClusterLabel is the default value of CEPH_CLUSTER.
	defaultCephClusterLabel = "ceph"
	// defaultCephConfigPath is the default value of CEPH_CONFIG.
	defaultCephConfigPath = "/etc/ceph/ceph.conf"
	// defaultCephUser is the default value of CEPH_USER.
	defaultCephUser = "admin"
	// defaultRadosOpTimeout is the default value of CEPH_RADOS_OP_TIMEOUT.
	defaultRadosOpTimeout = 30 * time.Second
)
2017-11-23 09:24:31 +00:00
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
// specifically check if it hits EMFILE when doing an accept, and if so,
// terminate the process.
2018-07-09 09:08:14 +00:00
const keepAlive time . Duration = 3 * time . Minute
2017-11-23 09:24:31 +00:00
type emfileAwareTcpListener struct {
* net . TCPListener
2020-10-28 18:42:52 +00:00
logger * logrus . Logger
2017-11-23 09:24:31 +00:00
}
func ( ln emfileAwareTcpListener ) Accept ( ) ( c net . Conn , err error ) {
tc , err := ln . AcceptTCP ( )
if err != nil {
if oerr , ok := err . ( * net . OpError ) ; ok {
if serr , ok := oerr . Err . ( * os . SyscallError ) ; ok && serr . Err == syscall . EMFILE {
2020-10-28 18:42:52 +00:00
ln . logger . WithError ( err ) . Fatal ( "running out of file descriptors" )
2017-11-23 09:24:31 +00:00
}
}
// Default return
return
}
tc . SetKeepAlive ( true )
2018-07-09 09:08:14 +00:00
tc . SetKeepAlivePeriod ( keepAlive )
2017-11-23 09:24:31 +00:00
return tc , nil
}
2016-01-06 18:24:20 +00:00
// Verify that the exporter implements the interface correctly.
2022-02-23 23:43:46 +00:00
var _ prometheus . Collector = & ceph . Exporter { }
2016-01-06 18:24:20 +00:00
func main ( ) {
var (
2020-10-28 18:42:52 +00:00
metricsAddr = envflag . String ( "TELEMETRY_ADDR" , ":9128" , "Host:Port for ceph_exporter's metrics endpoint" )
metricsPath = envflag . String ( "TELEMETRY_PATH" , "/metrics" , "URL path for surfacing metrics to Prometheus" )
exporterConfig = envflag . String ( "EXPORTER_CONFIG" , "/etc/ceph/exporter.yml" , "Path to ceph_exporter config" )
rgwMode = envflag . Int ( "RGW_MODE" , 0 , "Enable collection of stats from RGW (0:disabled 1:enabled 2:background)" )
2016-01-06 18:24:20 +00:00
2020-10-28 18:42:52 +00:00
logLevel = envflag . String ( "LOG_LEVEL" , "info" , "Logging level. One of: [trace, debug, info, warn, error, fatal, panic]" )
2018-08-01 13:37:07 +00:00
2020-10-28 18:42:52 +00:00
cephCluster = envflag . String ( "CEPH_CLUSTER" , defaultCephClusterLabel , "Ceph cluster name" )
cephConfig = envflag . String ( "CEPH_CONFIG" , defaultCephConfigPath , "Path to Ceph config file" )
cephUser = envflag . String ( "CEPH_USER" , defaultCephUser , "Ceph user to connect to cluster" )
cephRadosOpTimeout = envflag . Duration ( "CEPH_RADOS_OP_TIMEOUT" , defaultRadosOpTimeout , "Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit)" )
2022-03-22 17:40:40 +00:00
tlsCertPath = envflag . String ( "TLS_CERT_FILE_PATH" , "" , "Path to certificate file for TLS" )
tlsKeyPath = envflag . String ( "TLS_KEY_FILE_PATH" , "" , "Path to key file for TLS" )
2016-01-06 18:24:20 +00:00
)
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
envflag . Parse ( )
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
logger := logrus . New ( )
logger . SetFormatter ( & logrus . TextFormatter {
FullTimestamp : true ,
} )
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
if v , err := logrus . ParseLevel ( * logLevel ) ; err != nil {
logger . WithError ( err ) . Warn ( "error setting log level" )
2016-01-06 18:24:20 +00:00
} else {
2020-10-28 18:42:52 +00:00
logger . SetLevel ( v )
}
2019-10-09 17:19:32 +00:00
2020-10-28 18:42:52 +00:00
clusterConfigs := ( [ ] * ClusterConfig ) ( nil )
2019-10-09 17:19:32 +00:00
2020-10-28 18:42:52 +00:00
if fileExists ( * exporterConfig ) {
cfg , err := ParseConfig ( * exporterConfig )
if err != nil {
logger . WithError ( err ) . WithField (
"file" , * exporterConfig ,
) . Fatal ( "error parsing ceph_exporter config file" )
2017-03-23 20:20:25 +00:00
}
2020-10-28 18:42:52 +00:00
clusterConfigs = cfg . Cluster
} else {
clusterConfigs = [ ] * ClusterConfig {
{
ClusterLabel : * cephCluster ,
User : * cephUser ,
ConfigFile : * cephConfig ,
} ,
2017-03-23 20:20:25 +00:00
}
2020-10-28 18:42:52 +00:00
}
2017-03-23 20:20:25 +00:00
2020-10-28 18:42:52 +00:00
for _ , cluster := range clusterConfigs {
2023-03-23 20:35:12 +00:00
conn , err := rados . NewRadosConn (
2020-10-28 18:42:52 +00:00
cluster . User ,
cluster . ConfigFile ,
* cephRadosOpTimeout ,
logger )
2023-03-23 20:35:12 +00:00
if err != nil {
2023-11-23 16:46:50 +00:00
logger . WithError ( err ) . WithField ( "cluster" , cluster . ClusterLabel ) . Fatal ( "unable to create rados connection for cluster" )
2023-03-23 20:35:12 +00:00
}
2022-02-23 23:43:46 +00:00
prometheus . MustRegister ( ceph . NewExporter (
2020-10-28 18:42:52 +00:00
conn ,
cluster . ClusterLabel ,
cluster . ConfigFile ,
2022-08-25 22:20:57 +00:00
cluster . User ,
2020-10-28 18:42:52 +00:00
* rgwMode ,
logger ) )
logger . WithField ( "cluster" , cluster . ClusterLabel ) . Info ( "exporting cluster" )
2016-01-06 18:24:20 +00:00
}
2017-09-14 11:39:52 +00:00
http . Handle ( * metricsPath , promhttp . Handler ( ) )
2016-01-06 18:24:20 +00:00
http . HandleFunc ( "/" , func ( w http . ResponseWriter , r * http . Request ) {
2016-12-29 22:08:20 +00:00
w . Write ( [ ] byte ( ` < html >
< head > < title > Ceph Exporter < / title > < / head >
< body >
< h1 > Ceph Exporter < / h1 >
< p > < a href = ' ` + *metricsPath + ` ' > Metrics < / a > < / p >
< / body >
< / html > ` ) )
2016-01-06 18:24:20 +00:00
} )
2020-10-28 18:42:52 +00:00
logger . WithField ( "endpoint" , * metricsAddr ) . Info ( "starting ceph_exporter listener" )
2017-11-23 09:24:31 +00:00
// Below is essentially http.ListenAndServe(), but using our custom
// emfileAwareTcpListener that will die if we run out of file descriptors
2020-10-28 18:42:52 +00:00
ln , err := net . Listen ( "tcp" , * metricsAddr )
if err != nil {
logrus . WithError ( err ) . Fatal ( "error creating listener" )
2017-11-23 09:24:31 +00:00
}
2020-10-28 18:42:52 +00:00
2022-03-22 17:40:40 +00:00
if len ( * tlsCertPath ) != 0 && len ( * tlsKeyPath ) != 0 {
2022-03-23 18:43:13 +00:00
server := & http . Server {
TLSConfig : & tls . Config {
GetCertificate : func ( info * tls . ClientHelloInfo ) ( * tls . Certificate , error ) {
caFiles , err := tls . LoadX509KeyPair ( * tlsCertPath , * tlsKeyPath )
if err != nil {
return nil , err
}
return & caFiles , nil
} ,
} ,
}
err = server . ServeTLS ( emfileAwareTcpListener { ln . ( * net . TCPListener ) , logger } , "" , "" )
2022-03-22 17:40:40 +00:00
if err != nil {
logrus . WithError ( err ) . Fatal ( "error serving TLS requests" )
}
} else {
err = http . Serve ( emfileAwareTcpListener { ln . ( * net . TCPListener ) , logger } , nil )
if err != nil {
logrus . WithError ( err ) . Fatal ( "error serving requests" )
}
2016-01-06 18:24:20 +00:00
}
}