// Copyright 2022 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2022-02-23 23:43:46 +00:00
package ceph
2016-01-06 18:24:20 +00:00
import (
"encoding/json"
2022-10-06 15:52:45 +00:00
"fmt"
2020-07-10 18:49:42 +00:00
"math"
2016-01-06 18:24:20 +00:00
"github.com/prometheus/client_golang/prometheus"
2020-10-28 18:42:52 +00:00
"github.com/sirupsen/logrus"
2016-01-06 18:24:20 +00:00
)
2020-06-26 19:19:59 +00:00
// PoolUsageCollector displays statistics about each pool in the Ceph cluster.
2016-01-06 18:24:20 +00:00
type PoolUsageCollector struct {
2022-02-23 23:43:46 +00:00
conn Conn
logger * logrus . Logger
version * Version
2016-01-06 18:24:20 +00:00
// UsedBytes tracks the amount of bytes currently allocated for the pool. This
// does not factor in the overcommitment made for individual images.
2022-10-06 15:52:45 +00:00
UsedBytes * prometheus . Desc
2016-01-06 18:24:20 +00:00
2016-10-19 09:45:05 +00:00
// RawUsedBytes tracks the amount of raw bytes currently used for the pool. This
// factors in the replication factor (size) of the pool.
2022-10-06 15:52:45 +00:00
RawUsedBytes * prometheus . Desc
2016-10-19 09:45:05 +00:00
2016-05-31 19:21:25 +00:00
// MaxAvail tracks the amount of bytes currently free for the pool,
// which depends on the replication settings for the pool in question.
2022-10-06 15:52:45 +00:00
MaxAvail * prometheus . Desc
2016-05-31 19:21:25 +00:00
2020-06-26 19:19:59 +00:00
// PercentUsed is the percentage of raw space available to the pool currently in use
2022-10-06 15:52:45 +00:00
PercentUsed * prometheus . Desc
2020-06-26 19:19:59 +00:00
2016-01-06 18:24:20 +00:00
// Objects shows the no. of RADOS objects created within the pool.
2022-10-06 15:52:45 +00:00
Objects * prometheus . Desc
2016-01-06 18:24:20 +00:00
2016-11-01 00:46:43 +00:00
// DirtyObjects shows the no. of RADOS dirty objects in a cache-tier pool,
2016-10-19 09:45:05 +00:00
// this doesn't make sense in a regular pool, see:
// http://lists.ceph.com/pipermail/ceph-users-ceph.com/2015-April/000557.html
2022-10-06 15:52:45 +00:00
DirtyObjects * prometheus . Desc
2016-10-19 09:45:05 +00:00
2020-09-10 18:32:03 +00:00
// UnfoundObjects shows the no. of RADOS unfound object within each pool.
2022-10-06 15:52:45 +00:00
UnfoundObjects * prometheus . Desc
2020-09-10 18:32:03 +00:00
2016-01-06 18:24:20 +00:00
// ReadIO tracks the read IO calls made for the images within each pool.
2022-10-06 15:52:45 +00:00
ReadIO * prometheus . Desc
2016-01-06 18:24:20 +00:00
2016-10-19 09:45:05 +00:00
// Readbytes tracks the read throughput made for the images within each pool.
2022-10-06 15:52:45 +00:00
ReadBytes * prometheus . Desc
2016-10-19 09:45:05 +00:00
2016-01-06 18:24:20 +00:00
// WriteIO tracks the write IO calls made for the images within each pool.
2022-10-06 15:52:45 +00:00
WriteIO * prometheus . Desc
2016-10-19 09:45:05 +00:00
// WriteBytes tracks the write throughput made for the images within each pool.
2022-10-06 15:52:45 +00:00
WriteBytes * prometheus . Desc
2016-01-06 18:24:20 +00:00
}
// NewPoolUsageCollector creates a new instance of PoolUsageCollector and returns
// its reference.
2022-02-23 23:43:46 +00:00
func NewPoolUsageCollector ( exporter * Exporter ) * PoolUsageCollector {
2016-06-06 14:40:50 +00:00
var (
subSystem = "pool"
poolLabel = [ ] string { "pool" }
)
2017-03-23 20:20:25 +00:00
labels := make ( prometheus . Labels )
2022-02-23 23:43:46 +00:00
labels [ "cluster" ] = exporter . Cluster
2017-03-23 20:20:25 +00:00
2016-01-06 18:24:20 +00:00
return & PoolUsageCollector {
2022-02-23 23:43:46 +00:00
conn : exporter . Conn ,
logger : exporter . Logger ,
version : exporter . Version ,
2016-01-06 18:24:20 +00:00
2022-10-06 15:52:45 +00:00
UsedBytes : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_used_bytes" , cephNamespace , subSystem ) , "Capacity of the pool that is currently under use" ,
poolLabel , labels ,
2016-01-06 18:24:20 +00:00
) ,
2022-10-06 15:52:45 +00:00
RawUsedBytes : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_raw_used_bytes" , cephNamespace , subSystem ) , "Raw capacity of the pool that is currently under use, this factors in the size" ,
poolLabel , labels ,
2016-10-19 09:45:05 +00:00
) ,
2022-10-06 15:52:45 +00:00
MaxAvail : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_available_bytes" , cephNamespace , subSystem ) , "Free space for the pool" ,
poolLabel , labels ,
2020-06-26 19:19:59 +00:00
) ,
2022-10-06 15:52:45 +00:00
PercentUsed : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_percent_used" , cephNamespace , subSystem ) , "Percentage of the capacity available to this pool that is used by this pool" ,
poolLabel , labels ,
2016-05-31 19:21:25 +00:00
) ,
2022-10-06 15:52:45 +00:00
Objects : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_objects_total" , cephNamespace , subSystem ) , "Total no. of objects allocated within the pool" ,
poolLabel , labels ,
2016-01-06 18:24:20 +00:00
) ,
2022-10-06 15:52:45 +00:00
DirtyObjects : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_dirty_objects_total" , cephNamespace , subSystem ) , "Total no. of dirty objects in a cache-tier pool" ,
poolLabel , labels ,
2016-10-19 09:45:05 +00:00
) ,
2022-10-06 15:52:45 +00:00
UnfoundObjects : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_unfound_objects_total" , cephNamespace , subSystem ) , "Total no. of unfound objects for the pool" ,
poolLabel , labels ,
2020-09-10 18:32:03 +00:00
) ,
2022-10-06 15:52:45 +00:00
ReadIO : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_read_total" , cephNamespace , subSystem ) , "Total read I/O calls for the pool" ,
poolLabel , labels ,
2016-01-06 18:24:20 +00:00
) ,
2022-10-06 15:52:45 +00:00
ReadBytes : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_read_bytes_total" , cephNamespace , subSystem ) , "Total read throughput for the pool" ,
poolLabel , labels ,
2016-10-19 09:45:05 +00:00
) ,
2022-10-06 15:52:45 +00:00
WriteIO : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_write_total" , cephNamespace , subSystem ) , "Total write I/O calls for the pool" ,
poolLabel , labels ,
2016-01-06 18:24:20 +00:00
) ,
2022-10-06 15:52:45 +00:00
WriteBytes : prometheus . NewDesc ( fmt . Sprintf ( "%s_%s_write_bytes_total" , cephNamespace , subSystem ) , "Total write throughput for the pool" ,
poolLabel , labels ,
2016-10-19 09:45:05 +00:00
) ,
2016-01-06 18:24:20 +00:00
}
}
// cephPoolStats is the decoding target for the JSON reply to the "df detail"
// mon command: a top-level "pools" array with one entry per pool, each
// carrying its name, id and a nested "stats" object. All statistics are
// decoded as float64 so they can be handed to Prometheus without conversion.
type cephPoolStats struct {
	Pools []struct {
		Name  string `json:"name"`
		ID    int    `json:"id"`
		Stats struct {
			BytesUsed    float64 `json:"bytes_used"`
			StoredRaw    float64 `json:"stored_raw"`
			Stored       float64 `json:"stored"`
			MaxAvail     float64 `json:"max_avail"`
			PercentUsed  float64 `json:"percent_used"`
			Objects      float64 `json:"objects"`
			DirtyObjects float64 `json:"dirty"`
			ReadIO       float64 `json:"rd"`
			ReadBytes    float64 `json:"rd_bytes"`
			WriteIO      float64 `json:"wr"`
			WriteBytes   float64 `json:"wr_bytes"`
		} `json:"stats"`
	} `json:"pools"`
}
2022-10-06 15:52:45 +00:00
func ( p * PoolUsageCollector ) collect ( ch chan <- prometheus . Metric ) error {
2016-01-06 18:24:20 +00:00
cmd := p . cephUsageCommand ( )
buf , _ , err := p . conn . MonCommand ( cmd )
if err != nil {
2020-10-28 18:42:52 +00:00
p . logger . WithError ( err ) . WithField (
"args" , string ( cmd ) ,
) . Error ( "error executing mon command" )
2016-01-06 18:24:20 +00:00
return err
}
stats := & cephPoolStats { }
if err := json . Unmarshal ( buf , stats ) ; err != nil {
return err
}
for _ , pool := range stats . Pools {
2022-10-06 15:52:45 +00:00
ch <- prometheus . MustNewConstMetric ( p . UsedBytes , prometheus . GaugeValue , pool . Stats . Stored , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . RawUsedBytes , prometheus . GaugeValue , math . Max ( pool . Stats . StoredRaw , pool . Stats . BytesUsed ) , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . MaxAvail , prometheus . GaugeValue , pool . Stats . MaxAvail , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . PercentUsed , prometheus . GaugeValue , pool . Stats . PercentUsed , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . Objects , prometheus . GaugeValue , pool . Stats . Objects , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . DirtyObjects , prometheus . GaugeValue , pool . Stats . DirtyObjects , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . ReadIO , prometheus . GaugeValue , pool . Stats . ReadIO , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . ReadBytes , prometheus . GaugeValue , pool . Stats . ReadBytes , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . WriteIO , prometheus . GaugeValue , pool . Stats . WriteIO , pool . Name )
ch <- prometheus . MustNewConstMetric ( p . WriteBytes , prometheus . GaugeValue , pool . Stats . WriteBytes , pool . Name )
2020-09-10 18:32:03 +00:00
2020-10-28 18:42:52 +00:00
st , err := p . conn . GetPoolStats ( pool . Name )
2020-09-10 18:32:03 +00:00
if err != nil {
2020-10-28 18:42:52 +00:00
p . logger . WithError ( err ) . WithField (
"pool" , pool . Name ,
) . Error ( "error getting pool stats" )
2020-09-10 18:32:03 +00:00
continue
}
2022-10-06 15:52:45 +00:00
ch <- prometheus . MustNewConstMetric ( p . UnfoundObjects , prometheus . GaugeValue , float64 ( st . ObjectsUnfound ) , pool . Name )
2016-01-06 18:24:20 +00:00
}
return nil
}
func ( p * PoolUsageCollector ) cephUsageCommand ( ) [ ] byte {
cmd , err := json . Marshal ( map [ string ] interface { } {
"prefix" : "df" ,
"detail" : "detail" ,
"format" : "json" ,
} )
if err != nil {
2020-10-28 18:42:52 +00:00
p . logger . WithError ( err ) . Panic ( "error marshalling ceph df detail" )
2016-01-06 18:24:20 +00:00
}
return cmd
}
// Describe fulfills the prometheus.Collector's interface and sends the descriptors
// of pool's metrics to the given channel.
func ( p * PoolUsageCollector ) Describe ( ch chan <- * prometheus . Desc ) {
2022-10-06 15:52:45 +00:00
ch <- p . UsedBytes
ch <- p . RawUsedBytes
ch <- p . MaxAvail
ch <- p . PercentUsed
ch <- p . Objects
ch <- p . DirtyObjects
ch <- p . UnfoundObjects
ch <- p . ReadIO
ch <- p . ReadBytes
ch <- p . WriteIO
ch <- p . WriteBytes
2016-01-06 18:24:20 +00:00
}
// Collect extracts the current values of all the metrics and sends them to the
// prometheus channel.
func ( p * PoolUsageCollector ) Collect ( ch chan <- prometheus . Metric ) {
2020-10-28 18:42:52 +00:00
p . logger . Debug ( "collecting pool usage metrics" )
2022-10-06 15:52:45 +00:00
if err := p . collect ( ch ) ; err != nil {
2020-10-28 18:42:52 +00:00
p . logger . WithError ( err ) . Error ( "error collecting pool usage metrics" )
2016-01-06 18:24:20 +00:00
return
}
}