Merge pull request #210 from digitalocean/more-new-stuff

4.0-rc1
Kyle 2022-03-23 13:31:49 -07:00 committed by GitHub
commit 00c0dacc02
9 changed files with 95 additions and 312 deletions


@ -1,17 +0,0 @@
# Copyright 2015 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
VERSION := 0.1.0
TARGET := ceph_exporter
include Makefile.COMMON


@ -1,132 +0,0 @@
# Copyright 2015 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# THE AUTHORITATIVE VERSION OF THIS MAKEFILE LIVES IN:
#
# https://github.com/prometheus/utils
#
# PLEASE MAKE ANY CHANGES THERE AND PROPAGATE THEM TO ALL PROMETHEUS
# REPOSITORIES THAT ARE USING THIS MAKEFILE.
#
# This file provides common Makefile infrastructure for several Prometheus
# components. This includes make tasks for downloading Go, setting up a
# self-contained build environment, fetching Go dependencies, building
# binaries, running tests, and doing release management. This file is intended
# to be included from a project's Makefile, which needs to define the following
# variables, at a minimum:
#
# * VERSION - The current version of the project in question.
# * TARGET - The desired name of the built binary.
#
# Many of the variables defined below are defined conditionally (using '?'),
# which allows the project's main Makefile to override any of these settings, if
# needed. See also:
#
# https://www.gnu.org/software/make/manual/html_node/Flavors.html#Flavors.
#
# The including Makefile may define any number of extra targets that are
# specific to that project.
VERSION ?= $(error VERSION not set in including Makefile)
TARGET ?= $(error TARGET not set in including Makefile)
SRC ?= $(shell find . -type f -name "*.go" ! -path "./.build/*")
GOOS ?= $(shell uname | tr A-Z a-z)
GOARCH ?= $(subst x86_64,amd64,$(patsubst i%86,386,$(shell uname -m)))
GO_VERSION ?= 1.13
# Check for the correct version of go in the path. If we find it, use it.
# Otherwise, prepare to build go locally.
ifeq ($(shell command -v "go" >/dev/null && go version | sed -e 's/^[^0-9.]*\([0-9.]*\).*/\1/'), $(GO_VERSION))
GOCC ?= $(shell command -v "go")
GOFMT ?= $(shell command -v "gofmt")
GO ?= $(GOCC)
else
GOURL ?= https://golang.org/dl
GOPKG ?= go$(GO_VERSION).$(GOOS)-$(GOARCH).tar.gz
GOROOT ?= $(CURDIR)/.build/go$(GO_VERSION)
GOCC ?= $(GOROOT)/bin/go
GOFMT ?= $(GOROOT)/bin/gofmt
GO ?= GOROOT=$(GOROOT) $(GOCC)
endif
# Use vendored dependencies if available. Otherwise try to download them.
ifneq (,$(wildcard vendor))
DEPENDENCIES := $(shell find vendor/ -type f -iname '*.go')
GO := GO15VENDOREXPERIMENT=1 $(GO)
else
GOPATH := $(CURDIR)/.build/gopath
ROOTPKG ?= github.com/prometheus/$(TARGET)
SELFLINK ?= $(GOPATH)/src/$(ROOTPKG)
DEPENDENCIES := dependencies-stamp
GO := GOPATH=$(GOPATH) $(GO)
endif
# Never honor GOBIN, should it be set at all.
unexport GOBIN
SUFFIX ?= $(GOOS)-$(GOARCH)
BINARY ?= $(TARGET)
ARCHIVE ?= $(TARGET)-$(VERSION).$(SUFFIX).tar.gz
default: $(BINARY)
$(BINARY): $(GOCC) $(SRC) $(DEPENDENCIES) Makefile Makefile.COMMON
$(GO) build $(GOFLAGS) -o $@
.PHONY: archive
archive: $(ARCHIVE)
$(ARCHIVE): $(BINARY)
tar -czf $@ $<
.PHONY: tag
tag:
git tag $(VERSION)
git push --tags
.PHONY: test
test: $(GOCC) $(DEPENDENCIES)
$(GO) test $$($(GO) list ./... | grep -v /vendor/)
.PHONY: format
format: $(GOCC)
find . -iname '*.go' | egrep -v "^\./\.build|./generated|\./vendor|\.(l|y)\.go" | xargs -n1 $(GOFMT) -w -s=true
.PHONY: clean
clean:
rm -rf $(BINARY) $(ARCHIVE) .build *-stamp
$(GOCC):
@echo Go version $(GO_VERSION) required but not found in PATH.
@echo About to download and install go$(GO_VERSION) to $(GOROOT)
@echo Abort now if you want to manually install it system-wide instead.
@echo
@sleep 5
mkdir -p .build
# The archive contains a single directory called 'go/'.
curl -L $(GOURL)/$(GOPKG) | tar -C .build -xzf -
rm -rf $(GOROOT)
mv .build/go $(GOROOT)
$(SELFLINK):
mkdir -p $(dir $@)
ln -s $(CURDIR) $@
# Download dependencies if project doesn't vendor them.
dependencies-stamp: $(GOCC) $(SRC) | $(SELFLINK)
$(GO) get -d
touch $@


@ -1,4 +1,4 @@
# Ceph Exporter [![GoDoc](https://godoc.org/github.com/digitalocean/ceph_exporter?status.svg)](https://godoc.org/github.com/digitalocean/ceph_exporter) [![Build Status](https://travis-ci.org/digitalocean/ceph_exporter.svg)](https://travis-ci.org/digitalocean/ceph_exporter) [![Coverage Status](https://coveralls.io/repos/github/digitalocean/ceph_exporter/badge.svg?branch=master&service=github)](https://coveralls.io/github/digitalocean/ceph_exporter?branch=master) [![Go Report Card](https://goreportcard.com/badge/digitalocean/ceph_exporter)](https://goreportcard.com/report/digitalocean/ceph_exporter)
# Ceph Exporter [![GoDoc](https://godoc.org/github.com/digitalocean/ceph_exporter?status.svg)](https://godoc.org/github.com/digitalocean/ceph_exporter) ![build](https://github.com/digitalocean/ceph_exporter/actions/workflows/run_build.yml/badge.svg) ![tests](https://github.com/digitalocean/ceph_exporter/actions/workflows/run_tests.yml/badge.svg) [![Go Report Card](https://goreportcard.com/badge/digitalocean/ceph_exporter)](https://goreportcard.com/report/digitalocean/ceph_exporter)
A Prometheus exporter that scrapes meta information about a running Ceph
cluster. All information from the cluster is gathered by interacting
@ -30,32 +30,38 @@ variables:
We use Ceph's [official Golang client](https://github.com/ceph/go-ceph) to run
commands on the cluster.
This `ceph_exporter` branch is tested only on Ceph Nautilus releases. It might
This `ceph_exporter` branch currently supports the Nautilus, Octopus (untested), and Pacific releases. It might
not work as expected with older or non-LTS versions of Ceph.
## Environment Variables
Name | Description | Default
---- | ---- | ----
`TELEMETRY_ADDR` | Host:Port for ceph_exporter's metrics endpoint | `*:9128`
`TELEMETRY_PATH` | URL Path for surfacing metrics to Prometheus | `/metrics`
`EXPORTER_CONFIG` | Path to ceph_exporter configuration file | `/etc/ceph/exporter.yml`
`RGW_MODE` | Enable collection of stats from RGW (0:disabled 1:enabled 2:background) | `0`
`CEPH_CLUSTER` | Ceph cluster name | `ceph`
`CEPH_CONFIG` | Path to Ceph configuration file | `/etc/ceph/ceph.conf`
`CEPH_USER` | Ceph user to connect to cluster | `admin`
`CEPH_RADOS_OP_TIMEOUT` | Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit) | `30s`
`LOG_LEVEL` | logging level. One of: [trace, debug, info, warn, error, fatal, panic] | `info`
| Name | Description | Default |
|-------------------------|------------------------------------------------------------------------------------------------|--------------------------|
| `TELEMETRY_ADDR` | Host:Port for ceph_exporter's metrics endpoint | `*:9128` |
| `TELEMETRY_PATH` | URL Path for surfacing metrics to Prometheus | `/metrics` |
| `EXPORTER_CONFIG` | Path to ceph_exporter configuration file | `/etc/ceph/exporter.yml` |
| `RGW_MODE` | Enable collection of stats from RGW (0:disabled 1:enabled 2:background) | `0` |
| `CEPH_CLUSTER` | Ceph cluster name | `ceph` |
| `CEPH_CONFIG` | Path to Ceph configuration file | `/etc/ceph/ceph.conf` |
| `CEPH_USER` | Ceph user to connect to cluster | `admin` |
| `CEPH_RADOS_OP_TIMEOUT` | Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit) | `30s` |
| `LOG_LEVEL` | Logging level. One of: [trace, debug, info, warn, error, fatal, panic] | `info` |
| `TLS_CERT_FILE_PATH` | Path to the x509 certificate file for enabling TLS (the key file path must also be specified) | |
| `TLS_KEY_FILE_PATH` | Path to the x509 key file for enabling TLS (the cert file path must also be specified) | |
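For example, a hypothetical invocation enabling TLS (the certificate paths are placeholders; both variables must be set together):

```
$ TLS_CERT_FILE_PATH=/etc/ceph_exporter/tls.crt \
  TLS_KEY_FILE_PATH=/etc/ceph_exporter/tls.key \
  ./ceph_exporter
```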
## Installation
Typical way of installing in Go should work.
The typical Go way of installing or building should work provided you have the [cgo dependencies](https://github.com/ceph/go-ceph#installation).
```
$ go install
$ go install -tags nautilus
```
A Makefile is provided in case you find a need for it.
```
$ go build -o ceph_exporter -tags nautilus
```
We build the client with support for Nautilus specifically, but the binary will work with Octopus and Pacific as well.
## Docker Image


@ -278,7 +278,7 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
labels := make(prometheus.Labels)
labels["cluster"] = exporter.Cluster
return &ClusterHealthCollector{
collector := &ClusterHealthCollector{
conn: exporter.Conn,
logger: exporter.Logger,
version: exporter.Version,
@ -898,6 +898,15 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
labels,
),
}
if exporter.Version.IsAtLeast(Pacific) {
// Pacific adds the DAEMON_OLD_VERSION health check, which indicates that
// multiple versions of Ceph have been running for longer than
// mon_warn_older_version_delay; we interpret this as a critical warning (2).
collector.healthChecksMap["DAEMON_OLD_VERSION"] = 2
}
return collector
}
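The version gate above relies on a `Version.IsAtLeast` helper. A minimal sketch of the comparison it implies, assuming the `Major`/`Minor`/`Patch` fields seen in the tests below (`Pacific` would be such a `*Version` sentinel; the real method may differ):

```go
package ceph

// Version mirrors the fields used elsewhere in this diff (Major, Minor, Patch).
type Version struct {
	Major, Minor, Patch int
}

// IsAtLeast reports whether v is the same as or newer than other.
func (v *Version) IsAtLeast(other *Version) bool {
	if v.Major != other.Major {
		return v.Major > other.Major
	}
	if v.Minor != other.Minor {
		return v.Minor > other.Minor
	}
	return v.Patch >= other.Patch
}
```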
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
@ -989,9 +998,8 @@ type cephHealthStats struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
OverallStatus string `json:"overall_status"`
Status string `json:"status"`
Checks map[string]struct {
Status string `json:"status"`
Checks map[string]struct {
Severity string `json:"severity"`
Summary struct {
Message string `json:"message"`
@ -1034,18 +1042,6 @@ type cephHealthStats struct {
} `json:"servicemap"`
}
type cephHealthDetailStats struct {
Checks map[string]struct {
Details []struct {
Message string `json:"message"`
} `json:"detail"`
Summary struct {
Message string `json:"message"`
} `json:"summary"`
Severity string `json:"severity"`
} `json:"checks"`
}
func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
cmd := c.cephUsageCommand(jsonFormat)
buf, _, err := c.conn.MonCommand(cmd)
@ -1068,23 +1064,6 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
}
}
switch stats.Health.OverallStatus {
case CephHealthOK:
c.HealthStatus.Set(0)
c.HealthStatusInterpreter.Set(0)
case CephHealthWarn:
c.HealthStatus.Set(1)
c.HealthStatusInterpreter.Set(2)
case CephHealthErr:
c.HealthStatus.Set(2)
c.HealthStatusInterpreter.Set(3)
default:
c.HealthStatus.Set(2)
c.HealthStatusInterpreter.Set(3)
}
// This will be set only if Luminous is running. Will be
// ignored otherwise.
switch stats.Health.Status {
case CephHealthOK:
c.HealthStatus.Set(0)
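The removed switch on `overall_status` duplicated what the surviving switch on `status` does; per the updated tests below, the effective mapping is HEALTH_OK to (0, 0), HEALTH_WARN to (1, 2), and HEALTH_ERR to (2, 3). A compact sketch of that mapping (`healthToGauges` is illustrative, not a function in this codebase):

```go
package ceph

// healthToGauges returns the values fed to the HealthStatus and
// HealthStatusInterpreter gauges for a given Ceph health status string.
func healthToGauges(status string) (healthStatus, interpreted float64) {
	switch status {
	case "HEALTH_OK":
		return 0, 0
	case "HEALTH_WARN":
		return 1, 2
	default: // HEALTH_ERR and anything unrecognized
		return 2, 3
	}
}
```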
@ -1435,18 +1414,6 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
return cmd
}
func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "health",
"detail": "detail",
"format": jsonFormat,
})
if err != nil {
c.logger.WithError(err).Panic("error marshalling ceph health detail")
}
return cmd
}
func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
cmd := c.cephUsageCommand(plainFormat)
buf, _, err := c.conn.MonCommand(cmd)


@ -192,7 +192,7 @@ func TestClusterHealthCollector(t *testing.T) {
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_OK" } }`,
"health": { "status": "HEALTH_OK" } }`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 0`),
},
@ -209,7 +209,7 @@ func TestClusterHealthCollector(t *testing.T) {
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_WARN", "status": "HEALTH_OK } }`,
"health": { "status": "HEALTH_OK } }`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 0`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
@ -245,7 +245,7 @@ func TestClusterHealthCollector(t *testing.T) {
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_WARN" } }`,
"health": { "status": "HEALTH_WARN" } }`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 1`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
@ -263,7 +263,7 @@ func TestClusterHealthCollector(t *testing.T) {
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_ERR" } }`,
"health": { "status": "HEALTH_ERR" } }`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 2`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 3`),
@ -815,8 +815,7 @@ $ sudo ceph -s
[]byte(tt.input), "", nil,
)
collector := NewClusterHealthCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New()})
collector.version = &Version{Major: 14, Minor: 2, Patch: 0}
collector := NewClusterHealthCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: &Version{Major: 14, Minor: 2, Patch: 0}})
err := prometheus.Register(collector)
require.NoError(t, err)
defer prometheus.Unregister(collector)


@ -5,8 +5,6 @@ import (
"encoding/json"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"time"
@ -540,19 +538,16 @@ type cephOSDTree struct {
} `json:"stray"`
}
type osdNode struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
}
type cephOSDTreeDown struct {
Nodes []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
} `json:"nodes"`
Stray []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
} `json:"stray"`
Nodes []osdNode `json:"nodes"`
Stray []osdNode `json:"stray"`
}
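The refactor above collapses two identical anonymous structs into the shared `osdNode` type; decoding behavior is unchanged. A self-contained sketch (the JSON fragment is made up for illustration):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type osdNode struct {
	ID     int64  `json:"id"`
	Name   string `json:"name"`
	Type   string `json:"type"`
	Status string `json:"status"`
}

type cephOSDTreeDown struct {
	Nodes []osdNode `json:"nodes"`
	Stray []osdNode `json:"stray"`
}

func main() {
	raw := []byte(`{"nodes":[{"id":0,"name":"osd.0","type":"osd","status":"down"}],"stray":[]}`)
	var tree cephOSDTreeDown
	if err := json.Unmarshal(raw, &tree); err != nil {
		panic(err)
	}
	fmt.Println(tree.Nodes[0].Name) // osd.0
}
```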
type cephPGDumpBrief struct {
@ -564,24 +559,6 @@ type cephPGDumpBrief struct {
} `json:"pg_stats"`
}
type cephPGQuery struct {
State string `json:"state"`
Info struct {
Stats struct {
StatSum struct {
NumObjectsRecovered int64 `json:"num_objects_recovered"`
} `json:"stat_sum"`
} `json:"stats"`
} `json:"info"`
RecoveryState []struct {
Name string `json:"name"`
EnterTime string `json:"enter_time"`
RecoverProgress *struct {
BackfillTargets []string `json:"backfill_targets"`
} `json:"recovery_progress"`
} `json:"recovery_state"`
}
type cephOSDLabel struct {
ID int64 `json:"id"`
Name string `json:"name"`
@ -595,40 +572,6 @@ type cephOSDLabel struct {
parent int64 // parent id when building tables
}
// backfillTargets returns a map from PG query result containing OSDs and
// corresponding shards that are being backfilled.
func (c cephPGQuery) backfillTargets() map[int64]int64 {
osdRegExp := regexp.MustCompile(`^(\d+)\((\d+)\)$`)
targets := make(map[int64]int64)
for _, state := range c.RecoveryState {
if state.RecoverProgress == nil {
continue
}
for _, osd := range state.RecoverProgress.BackfillTargets {
m := osdRegExp.FindStringSubmatch(osd)
if m == nil {
continue
}
osdID, err := strconv.ParseInt(m[1], 10, 64)
if err != nil {
continue
}
shard, err := strconv.ParseInt(m[2], 10, 64)
if err != nil {
continue
}
targets[osdID] = shard
}
}
return targets
}
func (o *OSDCollector) collectOSDDF() error {
args := o.cephOSDDFCommand()
buf, _, err := o.conn.MgrCommand(args)
@ -904,7 +847,6 @@ func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error {
}
downItems := append(osdDown.Nodes, osdDown.Stray...)
for _, downItem := range downItems {
if downItem.Type != "osd" {
continue
@ -1118,18 +1060,6 @@ func (o *OSDCollector) cephPGDumpCommand() [][]byte {
return [][]byte{cmd}
}
func (o *OSDCollector) cephPGQueryCommand(pgid string) []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "query",
"pgid": pgid,
"format": jsonFormat,
})
if err != nil {
o.logger.WithError(err).Panic("error marshalling ceph pg query")
}
return cmd
}
func (o *OSDCollector) collectPGStates(ch chan<- prometheus.Metric) error {
// - See if there are PGs that we're tracking that are now active
// - See if there are new ones to add


@ -16,6 +16,7 @@ package ceph
import (
"encoding/json"
"errors"
"math"
"strconv"
@ -273,23 +274,29 @@ func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric) {
}
func (p *PoolInfoCollector) getExpansionFactor(pool poolInfo) float64 {
if ef, ok := p.getECExpansionFactor(pool); ok {
ef, err := p.getECExpansionFactor(pool)
if err == nil {
return ef
} else {
// Non-EC pool (or unable to get profile info); assume that it's replicated.
logrus.WithError(err).Warn("failed to get ec expansion factor")
return pool.ActualSize
}
// Non-EC pool (or unable to get profile info); assume that it's replicated.
return pool.ActualSize
}
func (p *PoolInfoCollector) getECExpansionFactor(pool poolInfo) (float64, bool) {
func (p *PoolInfoCollector) getECExpansionFactor(pool poolInfo) (float64, error) {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd erasure-code-profile get",
"name": pool.Profile,
"format": "json",
})
if err != nil {
return -1, err
}
buf, _, err := p.conn.MonCommand(cmd)
if err != nil {
return -1, false
return -1, err
}
type ecInfo struct {
@ -299,8 +306,12 @@ func (p *PoolInfoCollector) getECExpansionFactor(pool poolInfo) (float64, bool)
ecStats := ecInfo{}
err = json.Unmarshal(buf, &ecStats)
if err != nil || ecStats.K == "" || ecStats.M == "" {
return -1, false
if err != nil {
return -1, err
}
if ecStats.K == "" || ecStats.M == "" {
return -1, errors.New("missing stats")
}
k, _ := strconv.ParseFloat(ecStats.K, 64)
@ -308,7 +319,7 @@ func (p *PoolInfoCollector) getECExpansionFactor(pool poolInfo) (float64, bool)
expansionFactor := (k + m) / k
roundedExpansion := math.Round(expansionFactor*100) / 100
return roundedExpansion, true
return roundedExpansion, nil
}
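As a worked example of the formula above: an EC profile with k=4 data chunks and m=2 coding chunks expands data by (4+2)/4 = 1.5.

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	k, m := 4.0, 2.0 // e.g. an EC 4+2 profile
	expansionFactor := (k + m) / k
	fmt.Println(math.Round(expansionFactor*100) / 100) // 1.5
}
```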
func (p *PoolInfoCollector) getCrushRuleToRootMappings() map[int64]string {


@ -162,7 +162,7 @@ func (r *RGWCollector) collect() error {
return err
}
tasks := make([]rgwTaskGC, 0, 0)
tasks := make([]rgwTaskGC, 0)
err = json.Unmarshal(data, &tasks)
if err != nil {
return err

main.go

@ -16,7 +16,7 @@
package main
import (
"errors"
"crypto/tls"
"net"
"net/http"
"os"
@ -39,10 +39,6 @@ const (
defaultRadosOpTimeout = 30 * time.Second
)
var (
errCephVersionUnsupported = errors.New("ceph version unsupported")
)
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
// specifically check if it hits EMFILE when doing an accept, and if so,
// terminate the process.
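A minimal sketch of the pattern that comment describes, assuming the listener embeds `*net.TCPListener` and a logrus logger as the `emfileAwareTcpListener{ln.(*net.TCPListener), logger}` literals below suggest (the real type predates this PR and may differ):

```go
package main

import (
	"errors"
	"net"
	"syscall"

	"github.com/sirupsen/logrus"
)

type emfileAwareTcpListener struct {
	*net.TCPListener
	logger *logrus.Logger
}

func (l emfileAwareTcpListener) Accept() (net.Conn, error) {
	conn, err := l.AcceptTCP()
	if err != nil {
		if errors.Is(err, syscall.EMFILE) {
			// Out of file descriptors: die and let the supervisor restart us
			// rather than spinning on a permanently failing accept loop.
			l.logger.Fatal("EMFILE on accept, terminating")
		}
		return nil, err
	}
	return conn, nil
}
```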
@ -85,6 +81,9 @@ func main() {
cephConfig = envflag.String("CEPH_CONFIG", defaultCephConfigPath, "Path to Ceph config file")
cephUser = envflag.String("CEPH_USER", defaultCephUser, "Ceph user to connect to cluster")
cephRadosOpTimeout = envflag.Duration("CEPH_RADOS_OP_TIMEOUT", defaultRadosOpTimeout, "Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit)")
tlsCertPath = envflag.String("TLS_CERT_FILE_PATH", "", "Path to certificate file for TLS")
tlsKeyPath = envflag.String("TLS_KEY_FILE_PATH", "", "Path to key file for TLS")
)
envflag.Parse()
@ -157,8 +156,28 @@ func main() {
logrus.WithError(err).Fatal("error creating listener")
}
err = http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener), logger}, nil)
if err != nil {
logrus.WithError(err).Fatal("error serving requests")
if len(*tlsCertPath) != 0 && len(*tlsKeyPath) != 0 {
server := &http.Server{
TLSConfig: &tls.Config{
GetCertificate: func(info *tls.ClientHelloInfo) (*tls.Certificate, error) {
caFiles, err := tls.LoadX509KeyPair(*tlsCertPath, *tlsKeyPath)
if err != nil {
return nil, err
}
return &caFiles, nil
},
},
}
err = server.ServeTLS(emfileAwareTcpListener{ln.(*net.TCPListener), logger}, "", "")
if err != nil {
logrus.WithError(err).Fatal("error serving TLS requests")
}
} else {
err = http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener), logger}, nil)
if err != nil {
logrus.WithError(err).Fatal("error serving requests")
}
}
}
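Loading the key pair inside `GetCertificate` re-reads the files on every TLS handshake, which lets certificates be rotated on disk without restarting the exporter, at the cost of per-handshake disk I/O. A sketch of the load-once alternative (not what this PR does; paths are placeholders):

```go
package main

import (
	"crypto/tls"
	"log"
	"net/http"
)

func main() {
	// Load the pair once at startup; rotation then requires a restart.
	cert, err := tls.LoadX509KeyPair("/etc/ceph_exporter/tls.crt", "/etc/ceph_exporter/tls.key")
	if err != nil {
		log.Fatal(err)
	}
	server := &http.Server{
		Addr:      ":9128",
		TLSConfig: &tls.Config{Certificates: []tls.Certificate{cert}},
	}
	log.Fatal(server.ListenAndServeTLS("", ""))
}
```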