Add Ceph Exporter

This commit is contained in:
Vaibhav Bhembre 2016-01-06 13:24:20 -05:00
commit 9cda67d44a
17 changed files with 2113 additions and 0 deletions

24
.gitignore vendored Normal file
View File

@ -0,0 +1,24 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof

45
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,45 @@
Contributing
============
Please use this guide before making any contributions to this repository.
Preliminary
-----------
* All code **must** be [`gofmt`](https://golang.org/cmd/gofmt/)'d, [`golint`](https://github.com/golang/lint)'d and [`go vet`](https://golang.org/cmd/vet/)'d before being committed.
* Code **should** have test coverage to ensure its correctness.
PRs
---
**Commits**
Keep individual commits descriptive. Prefix the subject line with the collector
name and a colon. Anyone viewing the git history should be able to tell what
the commit does from those first 80 characters alone. Feel free to elaborate
in the commit body, but keep the first 80 characters on point.
Good Commit:
```
monitor: expose metrics for clock skew
- scrape monitor's skew value from ceph's status
```
Bad Commit:
```
new monitor metrics
```
Use your own discretion when deciding whether or not to squash multiple commits
in a PR to a single commit. However, each commit should contain a single,
logical unit of change, and a descriptive message.
Resources
---------
* [Effective Go](https://golang.org/doc/effective_go.html)
* [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
* [How to Write Go Code](https://golang.org/doc/code.html)
* [Twelve Go Best Practices](https://talks.golang.org/2013/bestpractices.slide)

201
LICENSE Normal file
View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

17
Makefile Normal file
View File

@ -0,0 +1,17 @@
# Copyright 2015 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# VERSION and TARGET are the two variables that Makefile.COMMON requires the
# including Makefile to define.
VERSION := 0.1.0
TARGET := ceph_exporter
# NOTE(review): Makefile.COMMON defaults ROOTPKG to
# github.com/prometheus/<TARGET> unless it is overridden before this include;
# confirm that this matches the import path of this project.
include Makefile.COMMON

119
Makefile.COMMON Normal file
View File

@ -0,0 +1,119 @@
# Copyright 2015 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# THE AUTHORITATIVE VERSION OF THIS MAKEFILE LIVES IN:
#
# https://github.com/prometheus/utils
#
# PLEASE MAKE ANY CHANGES THERE AND PROPAGATE THEM TO ALL PROMETHEUS
# REPOSITORIES THAT ARE USING THIS MAKEFILE.
#
# This file provides common Makefile infrastructure for several Prometheus
# components. This includes make tasks for downloading Go, setting up a
# self-contained build environment, fetching Go dependencies, building
# binaries, running tests, and doing release management. This file is intended
# to be included from a project's Makefile, which needs to define the following
# variables, at a minimum:
#
# * VERSION - The current version of the project in question.
# * TARGET - The desired name of the built binary.
#
# Many of the variables defined below are defined conditionally (using '?'),
# which allows the project's main Makefile to override any of these settings, if
# needed. See also:
#
# https://www.gnu.org/software/make/manual/html_node/Flavors.html#Flavors.
#
# The including Makefile may define any number of extra targets that are
# specific to that project.
# VERSION and TARGET abort the build with an error when they are expanded
# without having been set by the including Makefile.
VERSION ?= $(error VERSION not set in including Makefile)
TARGET ?= $(error TARGET not set in including Makefile)
# SRC lists every .go file below the current directory, except those under
# ./.build.
SRC ?= $(shell find . -type f -name "*.go" ! -path "./.build/*")
# GOOS is the lower-cased output of uname; GOARCH is the output of uname -m
# with x86_64 mapped to amd64 and i?86 mapped to 386.
GOOS := $(shell uname | tr A-Z a-z)
GOARCH := $(subst x86_64,amd64,$(patsubst i%86,386,$(shell uname -m)))
# On darwin the release suffix carries the product version reported by sw_vers.
ifeq ($(GOOS),darwin)
RELEASE_SUFFIX ?= -osx$(shell sw_vers -productVersion)
endif
GO_VERSION ?= 1.4.2
# Use the go found in PATH when its version equals GO_VERSION; otherwise point
# GOROOT at a private location below .build.
ifeq ($(shell type go >/dev/null && go version | sed 's/.*go\([0-9.]*\).*/\1/'), $(GO_VERSION))
GOROOT := $(shell go env GOROOT)
else
GOROOT := $(CURDIR)/.build/go$(GO_VERSION)
endif
# GOURL and GOPKG together form the URL the Go toolchain archive is fetched from.
GOURL ?= https://golang.org/dl
GOPKG ?= go$(GO_VERSION).$(GOOS)-$(GOARCH)$(RELEASE_SUFFIX).tar.gz
# GOPATH always lives below .build; GO runs GOCC with GOROOT and GOPATH set
# explicitly.
GOPATH := $(CURDIR)/.build/gopath
GOCC ?= $(GOROOT)/bin/go
GO ?= GOROOT=$(GOROOT) GOPATH=$(GOPATH) $(GOCC)
GOFMT ?= $(GOROOT)/bin/gofmt
# Never honor GOBIN, should it be set at all.
unexport GOBIN
# Names of the build artifacts.
SUFFIX ?= $(GOOS)-$(GOARCH)
BINARY ?= $(TARGET)
ARCHIVE ?= $(TARGET)-$(VERSION).$(SUFFIX).tar.gz
# ROOTPKG is the import path of the project; SELFLINK is the location inside
# GOPATH that corresponds to it.
ROOTPKG ?= github.com/prometheus/$(TARGET)
SELFLINK ?= $(GOPATH)/src/$(ROOTPKG)
# The default goal builds the binary.
default: $(BINARY)
# Announces the download, waits five seconds, then fetches the Go archive and
# unpacks it into GOROOT.
$(GOCC):
@echo Go version $(GO_VERSION) required but not found in PATH.
@echo About to download and install go$(GO_VERSION) to $(GOROOT)
@echo Abort now if you want to manually install it system-wide instead.
@echo
@sleep 5
mkdir -p $(GOROOT)
curl -L $(GOURL)/$(GOPKG) | tar -C $(GOROOT) --strip 1 -xz
# Symlinks the current directory into GOPATH at the location of ROOTPKG.
$(SELFLINK):
mkdir -p $(dir $@)
ln -s $(CURDIR) $@
# Fetches the Go dependencies and records that in a stamp file; SELFLINK is an
# order-only prerequisite.
dependencies-stamp: $(GOCC) $(SRC) | $(SELFLINK)
$(GO) get -d
touch $@
# Builds the binary; it depends on the sources, the dependency stamp and both
# Makefiles.
$(BINARY): $(GOCC) $(SRC) dependencies-stamp Makefile Makefile.COMMON
$(GO) build $(GOFLAGS) -o $@
# Packs the binary into a gzipped tar archive.
.PHONY: archive
archive: $(ARCHIVE)
$(ARCHIVE): $(BINARY)
tar -czf $@ $<
# Creates a git tag named after VERSION and pushes all tags.
.PHONY: tag
tag:
git tag $(VERSION)
git push --tags
# Runs the tests of all packages.
.PHONY: test
test: $(GOCC) dependencies-stamp
$(GO) test ./...
# Runs gofmt with -w and -s over the .go files that are not filtered out by
# the egrep expression.
.PHONY: format
format: $(GOCC)
find . -iname '*.go' | egrep -v "^\./\.build|./generated|\./Godeps|\.(l|y)\.go" | xargs -n1 $(GOFMT) -w -s=true
# Removes the binary, the archive, the .build directory and all stamp files.
.PHONY: clean
clean:
rm -rf $(BINARY) $(ARCHIVE) .build *-stamp

51
README.md Normal file
View File

@ -0,0 +1,51 @@
# Ceph Exporter
Prometheus exporter that scrapes meta information about a running Ceph cluster. All the information is gathered from the cluster by interacting with the monitors through an appropriate wrapper over `rados_mon_command()`. Hence, no additional setup is necessary other than having a working Ceph cluster.
## Dependencies
You should ideally run this exporter from a client that can talk to
Ceph. Like any other Ceph client, it needs the following files to run
correctly.
* `ceph.conf` containing your ceph configuration.
* `ceph.<user>.keyring` in order to authenticate to your cluster.
Ceph exporter will automatically pick those up if they are present in
any of the [default
locations](http://docs.ceph.com/docs/master/rados/configuration/ceph-conf/#the-configuration-file). Otherwise, you will need to provide the configuration manually using the `--ceph.config` flag.
We use Ceph's [official Golang client](https://github.com/ceph/go-ceph) to run commands on the cluster.
## Flags
Name | Description | Default
---- | ---- | ----
telemetry.addr | Host:Port pair to run exporter on | `*:9190`
telemetry.path | URL Path for surfacing metrics to prometheus | `/metrics`
ceph.config | Path to ceph configuration file | ""
## Installation
The typical way of installing a Go program should work.
```
go install
```
A Makefile is provided in case you find a need for it.
## Contributing
Please refer to the [CONTRIBUTING](CONTRIBUTING.md) guide for more
information on how to submit your changes to this repository.
## Sample view
If you have [promdash](https://github.com/prometheus/promdash) set up you
can generate views like:
![](sample.png)
---
Copyright © 2016 DigitalOcean™ Inc.

169
collectors/cluster_usage.go Normal file
View File

@ -0,0 +1,169 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"encoding/json"
"log"
"github.com/prometheus/client_golang/prometheus"
)
const (
// cephNamespace is the value passed as Namespace in the GaugeOpts of the
// gauges that the collectors define.
cephNamespace = "ceph"
)
// A ClusterUsageCollector is used to gather all the global stats about a given
// ceph cluster. It is sometimes essential to know how fast the cluster is growing
// or shrinking as a whole in order to zero in on the cause. The pool specific
// stats are provided separately.
type ClusterUsageCollector struct {
// conn is the connection whose MonCommand method is used to run the
// usage command.
conn Conn
// GlobalCapacity displays the total storage capacity of the cluster. This
// information is based on the actual no. of objects that are allocated. It
// does not take overcommitment into consideration.
GlobalCapacity prometheus.Gauge
// UsedCapacity shows the storage under use.
UsedCapacity prometheus.Gauge
// AvailableCapacity shows the remaining capacity of the cluster that is left unallocated.
AvailableCapacity prometheus.Gauge
// Objects shows the total no. of RADOS objects that are currently allocated.
Objects prometheus.Gauge
}
// NewClusterUsageCollector returns a ClusterUsageCollector that issues its
// commands on conn. The gauge behind every cluster usage metric is defined
// here, all of them under the ceph namespace.
func NewClusterUsageCollector(conn Conn) *ClusterUsageCollector {
	// newGauge defines one gauge in the ceph namespace.
	newGauge := func(name, help string) prometheus.Gauge {
		return prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: cephNamespace,
			Name:      name,
			Help:      help,
		})
	}

	collector := &ClusterUsageCollector{conn: conn}
	collector.GlobalCapacity = newGauge("cluster_capacity_bytes", "Total capacity of the cluster")
	collector.UsedCapacity = newGauge("cluster_used_bytes", "Capacity of the cluster currently in use")
	collector.AvailableCapacity = newGauge("cluster_available_bytes", "Available space within the cluster")
	collector.Objects = newGauge("cluster_objects", "No. of rados objects within the cluster")
	return collector
}
// metricsList returns every metric owned by the collector, in the order
// capacity, used, available, objects.
func (c *ClusterUsageCollector) metricsList() []prometheus.Metric {
	metrics := make([]prometheus.Metric, 0, 4)
	metrics = append(metrics,
		c.GlobalCapacity,
		c.UsedCapacity,
		c.AvailableCapacity,
		c.Objects,
	)
	return metrics
}
// cephClusterStats is the structure into which collect unmarshals the JSON
// output of the usage command. Only the fields under the "stats" key are
// decoded; they are kept as json.Number and converted to float64 by collect.
type cephClusterStats struct {
Stats struct {
TotalBytes json.Number `json:"total_bytes"`
TotalUsedBytes json.Number `json:"total_used_bytes"`
TotalAvailBytes json.Number `json:"total_avail_bytes"`
TotalObjects json.Number `json:"total_objects"`
} `json:"stats"`
}
// collect runs the usage command on the connection, decodes the JSON reply
// and stores each decoded number in its gauge. It returns the first error
// met while running the command, decoding the reply or converting a number;
// gauges that were set before the error keep their new value.
func (c *ClusterUsageCollector) collect() error {
	reply, _, err := c.conn.MonCommand(c.cephUsageCommand())
	if err != nil {
		return err
	}

	var decoded cephClusterStats
	if err := json.Unmarshal(reply, &decoded); err != nil {
		return err
	}

	// Each number is converted and stored before the next one is looked at.
	for _, field := range []struct {
		raw   json.Number
		gauge prometheus.Gauge
	}{
		{decoded.Stats.TotalBytes, c.GlobalCapacity},
		{decoded.Stats.TotalUsedBytes, c.UsedCapacity},
		{decoded.Stats.TotalAvailBytes, c.AvailableCapacity},
		{decoded.Stats.TotalObjects, c.Objects},
	} {
		value, err := field.raw.Float64()
		if err != nil {
			return err
		}
		field.gauge.Set(value)
	}
	return nil
}
// cephUsageCommand returns the JSON-encoded "df" command, with detail and
// JSON output requested, that collect hands to MonCommand.
func (c *ClusterUsageCollector) cephUsageCommand() []byte {
	args := map[string]interface{}{
		"prefix": "df",
		"detail": "detail",
		"format": "json",
	}

	encoded, err := json.Marshal(args)
	if err != nil {
		// The input is hard-coded, so a failure here is not something
		// the caller could handle.
		panic(err)
	}
	return encoded
}
// Describe pushes the descriptor of every cluster usage metric onto the
// provided channel. The metric values themselves are sent by Collect.
func (c *ClusterUsageCollector) Describe(ch chan<- *prometheus.Desc) {
	metrics := c.metricsList()
	for i := range metrics {
		ch <- metrics[i].Desc()
	}
}
// Collect refreshes the cluster usage gauges and pushes each of them onto the
// provided prometheus Metric channel. When the refresh fails the error is
// logged and nothing is sent.
func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric) {
	err := c.collect()
	if err != nil {
		log.Println("failed collecting metrics:", err)
		return
	}

	metrics := c.metricsList()
	for i := range metrics {
		ch <- metrics[i]
	}
}

View File

@ -0,0 +1,69 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/prometheus"
)
// TestClusterUsage registers a ClusterUsageCollector backed by a stubbed
// connection, scrapes it through the prometheus HTTP handler and verifies
// that every cluster usage metric is exposed with the stubbed value.
func TestClusterUsage(t *testing.T) {
	// input is the canned JSON reply handed back by the stubbed connection.
	const input = `
{
	"stats": {
		"total_bytes": 10,
		"total_used_bytes": 6,
		"total_avail_bytes": 4,
		"total_objects": 1
	}
}`

	collector := NewClusterUsageCollector(NewNoopConn(input))
	if err := prometheus.Register(collector); err != nil {
		t.Fatalf("collector failed to register: %s", err)
	}
	// The collector lives in the global default registry; remove it again so
	// it does not leak into other tests or into repeated runs of this one.
	defer prometheus.Unregister(collector)

	server := httptest.NewServer(prometheus.Handler())
	defer server.Close()

	resp, err := http.Get(server.URL)
	if err != nil {
		t.Fatalf("unexpected failed response from prometheus: %s", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("unexpected status code from prometheus: %d", resp.StatusCode)
	}

	buf, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("failed reading server response: %s", err)
	}

	for _, re := range []*regexp.Regexp{
		regexp.MustCompile(`ceph_cluster_capacity_bytes 10`),
		regexp.MustCompile(`ceph_cluster_used_bytes 6`),
		regexp.MustCompile(`ceph_cluster_available_bytes 4`),
		regexp.MustCompile(`ceph_cluster_objects 1`),
	} {
		if !re.Match(buf) {
			t.Errorf("failed matching: %q", re)
		}
	}
}

69
collectors/conn.go Normal file
View File

@ -0,0 +1,69 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import "github.com/ceph/go-ceph/rados"
// Conn declares only those methods of *rados.Conn that are used in this
// repository. This keeps the rest of the implementation clean, as
// *rados.Conn doesn't need to show up everywhere (it being more of an
// implementation detail in reality). It also makes mocking easier when
// unit-testing the collectors.
type Conn interface {
ReadDefaultConfigFile() error
Connect() error
Shutdown()
// MonCommand takes the encoded command; the collectors use the first
// return value as the command output.
MonCommand([]byte) ([]byte, string, error)
}
// Compile-time check that *rados.Conn satisfies Conn.
var _ Conn = (*rados.Conn)(nil)
// NoopConn is the stub we use for mocking rados Conn. Unit testing
// each individual collector becomes a lot easier with it.
type NoopConn struct {
// output is the string that MonCommand returns as the command output.
output string
}
// Compile-time check that the testing stub *NoopConn satisfies Conn as well.
var _ Conn = (*NoopConn)(nil)
// NewNoopConn returns a *NoopConn whose MonCommand hands back output,
// whatever command it is given.
func NewNoopConn(output string) *NoopConn {
	stub := NoopConn{output: output}
	return &stub
}
// ReadDefaultConfigFile does nothing and always returns nil. It exists so
// that NoopConn has the method of the same prototype required by Conn.
func (n *NoopConn) ReadDefaultConfigFile() error {
return nil
}
// Connect does nothing and always returns nil. It exists so that NoopConn
// has the method of the same prototype required by Conn.
func (n *NoopConn) Connect() error {
return nil
}
// Shutdown does nothing. It exists so that NoopConn has the method of the
// same prototype required by Conn.
func (n *NoopConn) Shutdown() {}
// MonCommand ignores the command it is given and returns the output string
// held by the NoopConn as is, together with an empty string and a nil error,
// making it seem like a command ran and produced that output.
func (n *NoopConn) MonCommand(_ []byte) ([]byte, string, error) {
	reply := []byte(n.output)
	return reply, "", nil
}

235
collectors/health.go Normal file
View File

@ -0,0 +1,235 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"encoding/json"
"log"
"regexp"
"strconv"
"github.com/prometheus/client_golang/prometheus"
)
// ClusterHealthCollector collects information about the health of an overall cluster.
// It surfaces changes in the ceph parameters, unlike ClusterUsageCollector,
// which surfaces data usage.
type ClusterHealthCollector struct {
// conn is the connection whose MonCommand method is used to run the
// health command.
conn Conn
// DegradedPGs is the no. of PGs in a degraded state.
DegradedPGs prometheus.Gauge
// UncleanPGs is the no. of PGs in an unclean state.
UncleanPGs prometheus.Gauge
// UndersizedPGs is the no. of undersized PGs in the cluster.
UndersizedPGs prometheus.Gauge
// StalePGs is the no. of stale PGs in the cluster.
StalePGs prometheus.Gauge
// DegradedObjectsCount is the no. of degraded objects across all PGs.
DegradedObjectsCount prometheus.Gauge
// OSDsDown is the count of OSDs that are in DOWN state.
OSDsDown prometheus.Gauge
}
// NewClusterHealthCollector returns a ClusterHealthCollector that issues its
// commands on conn. The gauge behind every health metric is defined here,
// all of them under the ceph namespace.
func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
	// newGauge defines one gauge in the ceph namespace.
	newGauge := func(name, help string) prometheus.Gauge {
		return prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: cephNamespace,
			Name:      name,
			Help:      help,
		})
	}

	collector := &ClusterHealthCollector{conn: conn}
	collector.DegradedPGs = newGauge("degraded_pgs", "No. of PGs in a degraded state")
	collector.UncleanPGs = newGauge("unclean_pgs", "No. of PGs in an unclean state")
	collector.UndersizedPGs = newGauge("undersized_pgs", "No. of undersized PGs in the cluster")
	collector.StalePGs = newGauge("stale_pgs", "No. of stale PGs in the cluster")
	collector.DegradedObjectsCount = newGauge("degraded_objects", "No. of degraded objects across all PGs")
	collector.OSDsDown = newGauge("osds_down", "Count of OSDs that are in DOWN state")
	return collector
}
// metricsList returns every metric owned by the collector, in the order in
// which the fields are declared on ClusterHealthCollector.
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
	metrics := make([]prometheus.Metric, 0, 6)
	metrics = append(metrics,
		c.DegradedPGs,
		c.UncleanPGs,
		c.UndersizedPGs,
		c.StalePGs,
		c.DegradedObjectsCount,
		c.OSDsDown,
	)
	return metrics
}
// cephHealthStats is the structure into which collect unmarshals the JSON
// output of the health command. Only the "summary" list is decoded; collect
// matches its regular expressions against the Summary text of every entry.
type cephHealthStats struct {
Summary []struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
}
// Patterns recognised in the summary messages of the health command output.
// They are compiled once at package initialisation rather than on every
// scrape. In each pattern the first capture group holds the count that is
// exported.
var (
	healthDegradedPGsRegex     = regexp.MustCompile(`([\d]+) pgs degraded`)
	healthUncleanPGsRegex      = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
	healthUndersizedPGsRegex   = regexp.MustCompile(`([\d]+) pgs undersized`)
	healthStalePGsRegex        = regexp.MustCompile(`([\d]+) pgs stale`)
	healthDegradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
	healthOSDsDownRegex        = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
)

// collect runs the health command on the connection, decodes the JSON reply
// and updates the gauges from the summary messages it contains. Every gauge
// is first reset to 0, so a condition that is no longer reported reads as 0.
// It returns the first error met while running the command, decoding the
// reply or parsing a matched count.
func (c *ClusterHealthCollector) collect() error {
	cmd := c.cephUsageCommand()
	buf, _, err := c.conn.MonCommand(cmd)
	if err != nil {
		return err
	}

	stats := &cephHealthStats{}
	if err := json.Unmarshal(buf, stats); err != nil {
		return err
	}

	for _, metric := range c.metricsList() {
		if gauge, ok := metric.(prometheus.Gauge); ok {
			gauge.Set(0)
		}
	}

	// Each pattern is paired with the gauge that receives its first capture
	// group.
	patterns := []struct {
		re    *regexp.Regexp
		gauge prometheus.Gauge
	}{
		{healthDegradedPGsRegex, c.DegradedPGs},
		{healthUncleanPGsRegex, c.UncleanPGs},
		{healthUndersizedPGsRegex, c.UndersizedPGs},
		{healthStalePGsRegex, c.StalePGs},
		{healthDegradedObjectsRegex, c.DegradedObjectsCount},
		{healthOSDsDownRegex, c.OSDsDown},
	}

	for _, s := range stats.Summary {
		for _, p := range patterns {
			matched := p.re.FindStringSubmatch(s.Summary)
			if matched == nil {
				continue
			}
			// Parse straight to float64: the gauge stores a float64 anyway,
			// and this does not fail for counts that exceed the range of
			// int on the platform.
			v, err := strconv.ParseFloat(matched[1], 64)
			if err != nil {
				return err
			}
			p.gauge.Set(v)
		}
	}
	return nil
}
// cephUsageCommand returns the JSON-encoded command that collect hands to
// MonCommand. Despite its name, the command it builds is "health", with
// detail and JSON output requested.
func (c *ClusterHealthCollector) cephUsageCommand() []byte {
	args := map[string]interface{}{
		"prefix": "health",
		"detail": "detail",
		"format": "json",
	}

	encoded, err := json.Marshal(args)
	if err != nil {
		// The input is hard-coded, so a failure here is not something
		// the caller could handle.
		panic(err)
	}
	return encoded
}
// Describe pushes the descriptor of every metric of ClusterHealthCollector
// onto the provided prometheus channel.
func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
	metrics := c.metricsList()
	for i := range metrics {
		ch <- metrics[i].Desc()
	}
}
// Collect refreshes the health metrics and then sends each of them to the
// provided prometheus channel. When the refresh fails the error is logged and
// nothing is sent. It requires the caller to handle synchronization.
func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
	err := c.collect()
	if err != nil {
		log.Println("failed collecting metrics:", err)
		return
	}

	metrics := c.metricsList()
	for i := range metrics {
		ch <- metrics[i]
	}
}

97
collectors/health_test.go Normal file
View File

@ -0,0 +1,97 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/prometheus"
)
// TestClusterHealthCollector feeds one canned health summary at a time to a
// ClusterHealthCollector, scrapes the prometheus handler over HTTP and checks
// that the expected sample shows up in the response body.
func TestClusterHealthCollector(t *testing.T) {
	type testCase struct {
		input   string
		regexes []*regexp.Regexp
	}

	cases := []testCase{
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`degraded_pgs 5`),
			},
		},
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`unclean_pgs 6`),
			},
		},
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`undersized_pgs 7`),
			},
		},
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`stale_pgs 8`),
			},
		},
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`degraded_objects 10`),
			},
		},
		{
			input: `{"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`osds_down 3`),
			},
		},
	}

	// scrape runs a single case in its own function so that the deferred
	// unregister/close calls fire before the next case registers a collector.
	scrape := func(tc testCase) {
		collector := NewClusterHealthCollector(NewNoopConn(tc.input))
		if err := prometheus.Register(collector); err != nil {
			t.Fatalf("collector failed to register: %s", err)
		}
		defer prometheus.Unregister(collector)

		server := httptest.NewServer(prometheus.Handler())
		defer server.Close()

		resp, err := http.Get(server.URL)
		if err != nil {
			t.Fatalf("unexpected failed response from prometheus: %s", err)
		}
		defer resp.Body.Close()

		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Fatalf("failed reading server response: %s", err)
		}

		for _, pattern := range tc.regexes {
			if pattern.Match(body) {
				continue
			}
			t.Errorf("failed matching: %q", pattern)
		}
	}

	for _, tc := range cases {
		scrape(tc)
	}
}

352
collectors/monitors.go Normal file
View File

@ -0,0 +1,352 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"encoding/json"
"log"
"github.com/prometheus/client_golang/prometheus"
)
// MonitorCollector extracts statistics about the monitors running within a
// Ceph cluster. Most values are reported once per monitor instance, so they
// are kept in vector metrics partitioned by the "monitor" label; only the
// quorum size is a single gauge.
type MonitorCollector struct {
// conn is the connection used to send the monitor command whose reply is
// parsed into the metrics below.
conn Conn
// TotalKBs displays the total storage a given monitor node has. collect
// fills it from the reported kb_total multiplied by 1e3 and it is exported
// as monitor_capacity_bytes.
TotalKBs *prometheus.GaugeVec
// UsedKBs depicts how much of the total storage our monitor process
// has utilized. collect fills it from kb_used multiplied by 1e3 and it is
// exported as monitor_used_bytes.
UsedKBs *prometheus.GaugeVec
// AvailKBs shows the space left unused. collect fills it from kb_avail
// multiplied by 1e3 and it is exported as monitor_avail_bytes.
AvailKBs *prometheus.GaugeVec
// PercentAvail shows the amount of unused space as a percentage of total
// space, taken unchanged from avail_percent.
PercentAvail *prometheus.GaugeVec
// Store exposes information about the monitor's internal backing store.
Store Store
// ClockSkew shows how far the monitor clocks have skewed from each other. This
// is an important metric because the functioning of Ceph's paxos depends on
// the clocks being aligned as close to each other as possible.
ClockSkew *prometheus.GaugeVec
// Latency displays the time the monitors take to communicate between themselves.
Latency *prometheus.GaugeVec
// NodesinQuorum shows the size of the working monitor quorum, i.e. the
// number of entries in the reported quorum list. Any change in this metric
// can imply a significant issue in the cluster if it is not manually changed.
NodesinQuorum prometheus.Gauge
}
// Store displays information about the Monitor's FileStore. It is responsible
// for storing all the meta information about the cluster, including monmaps,
// osdmaps, pgmaps, etc. along with logs and other data. Every field is a gauge
// vector partitioned by the "monitor" label and is filled from the
// store_stats section of each monitor's report.
type Store struct {
// TotalBytes displays the current size of the FileStore (bytes_total).
TotalBytes *prometheus.GaugeVec
// SSTBytes shows the amount used by LevelDB's sorted-string tables
// (bytes_sst).
SSTBytes *prometheus.GaugeVec
// LogBytes shows the amount used by logs (bytes_log).
LogBytes *prometheus.GaugeVec
// MiscBytes shows the amount used by miscellaneous information
// (bytes_misc).
MiscBytes *prometheus.GaugeVec
}
// NewMonitorCollector returns a MonitorCollector bound to conn with all of its
// metrics instantiated. Every metric lives in the ceph namespace; all but the
// quorum gauge are vectors keyed by the "monitor" label.
func NewMonitorCollector(conn Conn) *MonitorCollector {
	// perMonitor builds a gauge vector partitioned by the "monitor" label.
	perMonitor := func(name, help string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{
			Namespace: cephNamespace,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewGaugeVec(opts, []string{"monitor"})
	}

	collector := &MonitorCollector{conn: conn}

	// Disk usage of the node hosting the monitor.
	collector.TotalKBs = perMonitor(
		"monitor_capacity_bytes",
		"Total storage capacity of the monitor node",
	)
	collector.UsedKBs = perMonitor(
		"monitor_used_bytes",
		"Storage of the monitor node that is currently allocated for use",
	)
	collector.AvailKBs = perMonitor(
		"monitor_avail_bytes",
		"Total unused storage capacity that the monitor node has left",
	)
	collector.PercentAvail = perMonitor(
		"monitor_avail_percent",
		"Percentage of total unused storage capacity that the monitor node has left",
	)

	// Breakdown of the monitor's backing store.
	collector.Store.TotalBytes = perMonitor(
		"monitor_store_capacity_bytes",
		"Total capacity of the FileStore backing the monitor daemon",
	)
	collector.Store.SSTBytes = perMonitor(
		"monitor_store_sst_bytes",
		"Capacity of the FileStore used only for raw SSTs",
	)
	collector.Store.LogBytes = perMonitor(
		"monitor_store_log_bytes",
		"Capacity of the FileStore used only for logging",
	)
	collector.Store.MiscBytes = perMonitor(
		"monitor_store_misc_bytes",
		"Capacity of the FileStore used only for storing miscellaneous information",
	)

	// Time-check results.
	collector.ClockSkew = perMonitor(
		"monitor_clock_skew_seconds",
		"Clock skew the monitor node is incurring",
	)
	collector.Latency = perMonitor(
		"monitor_latency_seconds",
		"Latency the monitor node is incurring",
	)

	// Cluster-wide quorum size; not partitioned by monitor.
	collector.NodesinQuorum = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: cephNamespace,
		Name:      "monitor_quorum_count",
		Help:      "The total size of the monitor quorum",
	})

	return collector
}
// collectorList returns every vector metric of the MonitorCollector, in the
// order in which they are described and collected.
func (m *MonitorCollector) collectorList() []prometheus.Collector {
	list := make([]prometheus.Collector, 0, 10)
	list = append(list, m.TotalKBs, m.UsedKBs, m.AvailKBs, m.PercentAvail)
	list = append(list, m.Store.TotalBytes, m.Store.SSTBytes, m.Store.LogBytes, m.Store.MiscBytes)
	list = append(list, m.ClockSkew, m.Latency)
	return list
}
// metricsList returns the MonitorCollector metrics that are plain
// prometheus.Metric values rather than vectors.
func (m *MonitorCollector) metricsList() []prometheus.Metric {
	var list []prometheus.Metric
	list = append(list, m.NodesinQuorum)
	return list
}
// cephMonitorStats mirrors the parts of the JSON reply to the "status"
// monitor command that MonitorCollector reads. Numeric values are decoded as
// json.Number and converted to float64 by collect.
type cephMonitorStats struct {
Health struct {
Health struct {
HealthServices []struct {
// Mons carries the disk usage and backing-store statistics reported
// for each monitor.
Mons []struct {
Name string `json:"name"`
KBTotal json.Number `json:"kb_total"`
KBUsed json.Number `json:"kb_used"`
KBAvail json.Number `json:"kb_avail"`
AvailPercent json.Number `json:"avail_percent"`
StoreStats struct {
BytesTotal json.Number `json:"bytes_total"`
BytesSST json.Number `json:"bytes_sst"`
BytesLog json.Number `json:"bytes_log"`
BytesMisc json.Number `json:"bytes_misc"`
} `json:"store_stats"`
} `json:"mons"`
} `json:"health_services"`
} `json:"health"`
TimeChecks struct {
// Mons carries the clock skew and latency reported for each monitor.
Mons []struct {
Name string `json:"name"`
Skew json.Number `json:"skew"`
Latency json.Number `json:"latency"`
} `json:"mons"`
} `json:"timechecks"`
} `json:"health"`
// Quorum lists the quorum members; only its length is used.
Quorum []int `json:"quorum"`
}
// collect sends the "status" monitor command, decodes the reply and updates
// every monitor metric from it. It returns the first error hit while talking
// to the cluster, decoding the reply or parsing a number; metrics updated
// before that point keep their new values.
func (m *MonitorCollector) collect() error {
	cmd := m.cephUsageCommand()
	reply, _, err := m.conn.MonCommand(cmd)
	if err != nil {
		return err
	}

	var stats cephMonitorStats
	if err = json.Unmarshal(reply, &stats); err != nil {
		return err
	}

	// reading couples a raw JSON number with the gauge vector that receives
	// it; scale is the factor applied to the parsed value before it is stored.
	type reading struct {
		raw   json.Number
		gauge *prometheus.GaugeVec
		scale float64
	}

	// publish parses the readings in order and stores each one on the series
	// of the named monitor, stopping at the first value that fails to parse.
	publish := func(monitor string, readings ...reading) error {
		for _, r := range readings {
			value, err := r.raw.Float64()
			if err != nil {
				return err
			}
			r.gauge.WithLabelValues(monitor).Set(value * r.scale)
		}
		return nil
	}

	for _, service := range stats.Health.Health.HealthServices {
		for _, mon := range service.Mons {
			store := mon.StoreStats
			// The kb_* values are scaled by 1e3 to be exported as bytes.
			// NOTE(review): confirm whether ceph reports these in units of
			// 1000 or 1024 bytes.
			err := publish(mon.Name,
				reading{mon.KBTotal, m.TotalKBs, 1e3},
				reading{mon.KBUsed, m.UsedKBs, 1e3},
				reading{mon.KBAvail, m.AvailKBs, 1e3},
				reading{mon.AvailPercent, m.PercentAvail, 1},
				reading{store.BytesTotal, m.Store.TotalBytes, 1},
				reading{store.BytesSST, m.Store.SSTBytes, 1},
				reading{store.BytesLog, m.Store.LogBytes, 1},
				reading{store.BytesMisc, m.Store.MiscBytes, 1},
			)
			if err != nil {
				return err
			}
		}
	}

	for _, mon := range stats.Health.TimeChecks.Mons {
		err := publish(mon.Name,
			reading{mon.Skew, m.ClockSkew, 1},
			reading{mon.Latency, m.Latency, 1},
		)
		if err != nil {
			return err
		}
	}

	// The quorum size is simply the number of members listed in the reply.
	m.NodesinQuorum.Set(float64(len(stats.Quorum)))
	return nil
}
// cephUsageCommand builds the JSON-encoded monitor command that requests the
// cluster status in JSON format.
func (m *MonitorCollector) cephUsageCommand() []byte {
	request := map[string]interface{}{
		"prefix": "status",
		"format": "json",
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		// The request is a fixed literal, so failing to encode it can only
		// be a programming error; panic instead of returning it.
		panic(err)
	}
	return encoded
}
// Describe sends the descriptors of all monitor metrics to the provided
// channel: first those of the vector metrics, then those of the plain ones.
func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) {
	vectors := m.collectorList()
	for i := range vectors {
		vectors[i].Describe(ch)
	}

	plain := m.metricsList()
	for i := range plain {
		ch <- plain[i].Desc()
	}
}
// Collect refreshes the monitor metrics and sends them to the prometheus
// channel: first the vector metrics, then the plain ones. When the refresh
// fails the error is logged and nothing is sent.
func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) {
	err := m.collect()
	if err != nil {
		log.Println("failed collecting metrics:", err)
		return
	}

	vectors := m.collectorList()
	for i := range vectors {
		vectors[i].Collect(ch)
	}

	plain := m.metricsList()
	for i := range plain {
		ch <- plain[i]
	}
}

292
collectors/monitors_test.go Normal file
View File

@ -0,0 +1,292 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/prometheus"
)
// TestMonitorCollector builds a MonitorCollector on top of NewNoopConn with a
// canned five-monitor status document as input, registers it with prometheus,
// scrapes the prometheus handler through an httptest server and checks that
// every expected sample is present in the response body.
func TestMonitorCollector(t *testing.T) {
for _, tt := range []struct {
input string
regexes []*regexp.Regexp
}{
{
`
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "test-mon01",
"kb_total": 412718256,
"kb_used": 1812852,
"kb_avail": 389917500,
"avail_percent": 94,
"last_updated": "2015-12-28 15:54:03.763348",
"store_stats": {
"bytes_total": 1781282079,
"bytes_sst": 1,
"bytes_log": 609694,
"bytes_misc": 1780672385,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "test-mon02",
"kb_total": 412718256,
"kb_used": 1875304,
"kb_avail": 389855048,
"avail_percent": 94,
"last_updated": "2015-12-28 15:53:53.808657",
"store_stats": {
"bytes_total": 1844348214,
"bytes_sst": 2,
"bytes_log": 871605,
"bytes_misc": 1843476609,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "test-mon03",
"kb_total": 412718256,
"kb_used": 2095356,
"kb_avail": 389634996,
"avail_percent": 94,
"last_updated": "2015-12-28 15:53:06.292749",
"store_stats": {
"bytes_total": 2069468587,
"bytes_sst": 3,
"bytes_log": 871605,
"bytes_misc": 2068596982,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "test-mon04",
"kb_total": 412718256,
"kb_used": 1726276,
"kb_avail": 390004076,
"avail_percent": 94,
"last_updated": "2015-12-28 15:53:10.770775",
"store_stats": {
"bytes_total": 1691972147,
"bytes_sst": 4,
"bytes_log": 871605,
"bytes_misc": 1691100542,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "test-mon05",
"kb_total": 412718256,
"kb_used": 1883228,
"kb_avail": 389847124,
"avail_percent": 94,
"last_updated": "2015-12-28 15:53:11.407033",
"store_stats": {
"bytes_total": 1852485942,
"bytes_sst": 5,
"bytes_log": 871605,
"bytes_misc": 1851614337,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 70,
"round": 3362,
"round_status": "finished",
"mons": [
{
"name": "test-mon01",
"skew": 0.000000,
"latency": 0.000000,
"health": "HEALTH_OK"
},
{
"name": "test-mon02",
"skew": -0.000002,
"latency": 0.000815,
"health": "HEALTH_OK"
},
{
"name": "test-mon03",
"skew": -0.000002,
"latency": 0.000829,
"health": "HEALTH_OK"
},
{
"name": "test-mon04",
"skew": -0.000019,
"latency": 0.000609,
"health": "HEALTH_OK"
},
{
"name": "test-mon05",
"skew": -0.000628,
"latency": 0.000659,
"health": "HEALTH_OK"
}
]
},
"summary": [],
"overall_status": "HEALTH_OK",
"detail": []
},
"fsid": "6C9BF03E-044E-4EEB-9C5F-145A54ECF7DB",
"election_epoch": 70,
"quorum": [
0,
1,
2,
3,
4
],
"monmap": {
"epoch": 12,
"fsid": "6C9BF03E-044E-4EEB-9C5F-145A54ECF7DB",
"modified": "2015-11-25 07:58:56.388352",
"created": "0.000000",
"mons": [
{
"rank": 0,
"name": "test-mon01",
"addr": "10.123.1.25:6789\/0"
},
{
"rank": 1,
"name": "test-mon02",
"addr": "10.123.1.26:6789\/0"
},
{
"rank": 2,
"name": "test-mon03",
"addr": "10.123.2.25:6789\/0"
},
{
"rank": 3,
"name": "test-mon04",
"addr": "10.123.2.26:6789\/0"
},
{
"rank": 4,
"name": "test-mon05",
"addr": "10.123.2.27:6789\/0"
}
]
}
}
`,
// Expected sample lines. The collector multiplies the kb_* inputs by 1e3,
// so e.g. kb_avail 389917500 is expected as 3.899175e+11; patterns ending
// in "e" deliberately stop before the exponent. The "." in these patterns
// is unescaped and therefore matches any character.
[]*regexp.Regexp{
regexp.MustCompile(`ceph_monitor_avail_bytes{monitor="test-mon01"} 3.899175e`),
regexp.MustCompile(`ceph_monitor_avail_bytes{monitor="test-mon02"} 3.89855048e`),
regexp.MustCompile(`ceph_monitor_avail_bytes{monitor="test-mon03"} 3.89634996e`),
regexp.MustCompile(`ceph_monitor_avail_bytes{monitor="test-mon04"} 3.90004076e`),
regexp.MustCompile(`ceph_monitor_avail_bytes{monitor="test-mon05"} 3.89847124e`),
regexp.MustCompile(`ceph_monitor_avail_percent{monitor="test-mon01"} 94`),
regexp.MustCompile(`ceph_monitor_avail_percent{monitor="test-mon02"} 94`),
regexp.MustCompile(`ceph_monitor_avail_percent{monitor="test-mon03"} 94`),
regexp.MustCompile(`ceph_monitor_avail_percent{monitor="test-mon04"} 94`),
regexp.MustCompile(`ceph_monitor_avail_percent{monitor="test-mon05"} 94`),
regexp.MustCompile(`ceph_monitor_clock_skew_seconds{monitor="test-mon01"} 0`),
regexp.MustCompile(`ceph_monitor_clock_skew_seconds{monitor="test-mon02"} -2e-06`),
regexp.MustCompile(`ceph_monitor_clock_skew_seconds{monitor="test-mon03"} -2e-06`),
regexp.MustCompile(`ceph_monitor_clock_skew_seconds{monitor="test-mon04"} -1.9e-05`),
regexp.MustCompile(`ceph_monitor_clock_skew_seconds{monitor="test-mon05"} -0.000628`),
regexp.MustCompile(`ceph_monitor_latency_seconds{monitor="test-mon01"} 0`),
regexp.MustCompile(`ceph_monitor_latency_seconds{monitor="test-mon02"} 0.000815`),
regexp.MustCompile(`ceph_monitor_latency_seconds{monitor="test-mon03"} 0.000829`),
regexp.MustCompile(`ceph_monitor_latency_seconds{monitor="test-mon04"} 0.000609`),
regexp.MustCompile(`ceph_monitor_latency_seconds{monitor="test-mon05"} 0.000659`),
regexp.MustCompile(`ceph_monitor_quorum_count 5`),
regexp.MustCompile(`ceph_monitor_store_log_bytes{monitor="test-mon01"} 609694`),
regexp.MustCompile(`ceph_monitor_store_log_bytes{monitor="test-mon02"} 871605`),
regexp.MustCompile(`ceph_monitor_store_log_bytes{monitor="test-mon03"} 871605`),
regexp.MustCompile(`ceph_monitor_store_log_bytes{monitor="test-mon04"} 871605`),
regexp.MustCompile(`ceph_monitor_store_log_bytes{monitor="test-mon05"} 871605`),
regexp.MustCompile(`ceph_monitor_store_misc_bytes{monitor="test-mon01"} 1.780672385e`),
regexp.MustCompile(`ceph_monitor_store_misc_bytes{monitor="test-mon02"} 1.843476609e`),
regexp.MustCompile(`ceph_monitor_store_misc_bytes{monitor="test-mon03"} 2.068596982e`),
regexp.MustCompile(`ceph_monitor_store_misc_bytes{monitor="test-mon04"} 1.691100542e`),
regexp.MustCompile(`ceph_monitor_store_misc_bytes{monitor="test-mon05"} 1.851614337e`),
regexp.MustCompile(`ceph_monitor_store_sst_bytes{monitor="test-mon01"} 1`),
regexp.MustCompile(`ceph_monitor_store_sst_bytes{monitor="test-mon02"} 2`),
regexp.MustCompile(`ceph_monitor_store_sst_bytes{monitor="test-mon03"} 3`),
regexp.MustCompile(`ceph_monitor_store_sst_bytes{monitor="test-mon04"} 4`),
regexp.MustCompile(`ceph_monitor_store_sst_bytes{monitor="test-mon05"} 5`),
regexp.MustCompile(`ceph_monitor_store_capacity_bytes{monitor="test-mon01"} 1.781282079e`),
regexp.MustCompile(`ceph_monitor_store_capacity_bytes{monitor="test-mon02"} 1.844348214e`),
regexp.MustCompile(`ceph_monitor_store_capacity_bytes{monitor="test-mon03"} 2.069468587e`),
regexp.MustCompile(`ceph_monitor_store_capacity_bytes{monitor="test-mon04"} 1.691972147e`),
regexp.MustCompile(`ceph_monitor_store_capacity_bytes{monitor="test-mon05"} 1.852485942e`),
regexp.MustCompile(`ceph_monitor_capacity_bytes{monitor="test-mon01"} 4.12718256e`),
regexp.MustCompile(`ceph_monitor_capacity_bytes{monitor="test-mon02"} 4.12718256e`),
regexp.MustCompile(`ceph_monitor_capacity_bytes{monitor="test-mon03"} 4.12718256e`),
regexp.MustCompile(`ceph_monitor_capacity_bytes{monitor="test-mon04"} 4.12718256e`),
regexp.MustCompile(`ceph_monitor_capacity_bytes{monitor="test-mon05"} 4.12718256e`),
regexp.MustCompile(`ceph_monitor_used_bytes{monitor="test-mon01"} 1.812852e`),
regexp.MustCompile(`ceph_monitor_used_bytes{monitor="test-mon02"} 1.875304e`),
regexp.MustCompile(`ceph_monitor_used_bytes{monitor="test-mon03"} 2.095356e`),
regexp.MustCompile(`ceph_monitor_used_bytes{monitor="test-mon04"} 1.726276e`),
regexp.MustCompile(`ceph_monitor_used_bytes{monitor="test-mon05"} 1.883228e`),
},
},
} {
// Each case runs inside its own function so that the deferred
// unregister/close calls fire before the next case starts.
func() {
collector := NewMonitorCollector(NewNoopConn(tt.input))
if err := prometheus.Register(collector); err != nil {
t.Fatalf("collector failed to register: %s", err)
}
defer prometheus.Unregister(collector)
// Serve the default prometheus handler and scrape it once.
server := httptest.NewServer(prometheus.Handler())
defer server.Close()
resp, err := http.Get(server.URL)
if err != nil {
t.Fatalf("unexpected failed response from prometheus: %s", err)
}
defer resp.Body.Close()
buf, err := ioutil.ReadAll(resp.Body)
if err != nil {
t.Fatalf("failed reading server response: %s", err)
}
// Every pattern must match somewhere in the scraped body.
for _, re := range tt.regexes {
if !re.Match(buf) {
t.Errorf("failed matching: %q", re)
}
}
}()
}
}

185
collectors/pool_usage.go Normal file
View File

@ -0,0 +1,185 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"encoding/json"
"errors"
"log"
"github.com/prometheus/client_golang/prometheus"
)
// PoolUsageCollector displays statistics about each pool we have created
// in the ceph cluster. Every metric is a vector partitioned by the "pool"
// label.
type PoolUsageCollector struct {
// conn is the connection used to send the monitor command whose reply is
// parsed into the metrics below.
conn Conn
// UsedBytes tracks the amount of bytes currently allocated for the pool. This
// does not factor in the overcommitment made for individual images.
UsedBytes *prometheus.GaugeVec
// Objects shows the no. of RADOS objects created within the pool.
Objects *prometheus.GaugeVec
// ReadIO tracks the read IO calls made for the images within each pool. It
// is set from the pool's reported "rd" value.
ReadIO *prometheus.CounterVec
// WriteIO tracks the write IO calls made for the images within each pool.
// It is set from the pool's reported "wr" value.
WriteIO *prometheus.CounterVec
}
// NewPoolUsageCollector returns a PoolUsageCollector bound to conn with all of
// its metrics instantiated in the ceph namespace and keyed by the "pool"
// label.
func NewPoolUsageCollector(conn Conn) *PoolUsageCollector {
	// perPoolGauge builds a gauge vector partitioned by the "pool" label.
	perPoolGauge := func(name, help string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{
			Namespace: cephNamespace,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewGaugeVec(opts, []string{"pool"})
	}
	// perPoolCounter builds a counter vector partitioned by the "pool" label.
	perPoolCounter := func(name, help string) *prometheus.CounterVec {
		opts := prometheus.CounterOpts{
			Namespace: cephNamespace,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewCounterVec(opts, []string{"pool"})
	}

	collector := &PoolUsageCollector{conn: conn}
	collector.UsedBytes = perPoolGauge(
		"pool_used_bytes",
		"Capacity of the pool that is currently under use",
	)
	collector.Objects = perPoolGauge(
		"pool_objects_total",
		"Total no. of objects allocated within the pool",
	)
	collector.ReadIO = perPoolCounter(
		"pool_read_total",
		"Total read i/o calls the pool has been subject to",
	)
	collector.WriteIO = perPoolCounter(
		"pool_write_total",
		"Total write i/o calls the pool has been subject to",
	)
	return collector
}
// collectorList returns every vector metric of the PoolUsageCollector, in the
// order in which they are described and collected.
func (p *PoolUsageCollector) collectorList() []prometheus.Collector {
	list := make([]prometheus.Collector, 0, 4)
	list = append(list, p.UsedBytes, p.Objects)
	list = append(list, p.ReadIO, p.WriteIO)
	return list
}
// cephPoolStats mirrors the parts of the JSON reply to the detailed "df"
// monitor command that PoolUsageCollector reads. Numeric statistics are
// decoded as json.Number and converted to float64 by collect.
type cephPoolStats struct {
// Pools holds one entry per pool present in the reply.
Pools []struct {
Name string `json:"name"`
ID int `json:"id"`
Stats struct {
BytesUsed json.Number `json:"bytes_used"`
Objects json.Number `json:"objects"`
Read json.Number `json:"rd"`
Write json.Number `json:"wr"`
} `json:"stats"`
} `json:"pools"`
}
// collect sends the detailed "df" monitor command, decodes the reply and
// updates the per-pool metrics from it. It fails when the command or decoding
// fails, when the reply lists no pools, or when a statistic cannot be parsed
// as a number; metrics updated before a parse failure keep their new values.
func (p *PoolUsageCollector) collect() error {
	cmd := p.cephUsageCommand()
	reply, _, err := p.conn.MonCommand(cmd)
	if err != nil {
		return err
	}

	var stats cephPoolStats
	if err = json.Unmarshal(reply, &stats); err != nil {
		return err
	}

	if len(stats.Pools) == 0 {
		return errors.New("no pools found in the cluster to report stats on")
	}

	for _, pool := range stats.Pools {
		name := pool.Name

		// Each update pairs a raw number with the action that stores it. The
		// actions are closures so that a series is only touched after its own
		// value has been parsed successfully.
		updates := []struct {
			raw   json.Number
			store func(float64)
		}{
			{pool.Stats.BytesUsed, func(v float64) { p.UsedBytes.WithLabelValues(name).Set(v) }},
			{pool.Stats.Objects, func(v float64) { p.Objects.WithLabelValues(name).Set(v) }},
			{pool.Stats.Read, func(v float64) { p.ReadIO.WithLabelValues(name).Set(v) }},
			{pool.Stats.Write, func(v float64) { p.WriteIO.WithLabelValues(name).Set(v) }},
		}

		for _, u := range updates {
			value, err := u.raw.Float64()
			if err != nil {
				return err
			}
			u.store(value)
		}
	}
	return nil
}
// cephUsageCommand builds the JSON-encoded monitor command that requests the
// detailed "df" report in JSON format.
func (p *PoolUsageCollector) cephUsageCommand() []byte {
	request := map[string]interface{}{
		"prefix": "df",
		"detail": "detail",
		"format": "json",
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		// The request is a fixed literal, so failing to encode it can only
		// be a programming error; panic instead of returning it.
		panic(err)
	}
	return encoded
}
// Describe fulfills the prometheus.Collector interface by forwarding the
// descriptors of every pool metric to the given channel.
func (p *PoolUsageCollector) Describe(ch chan<- *prometheus.Desc) {
	vectors := p.collectorList()
	for i := range vectors {
		vectors[i].Describe(ch)
	}
}
// Collect refreshes the pool metrics and sends their current values to the
// prometheus channel. When the refresh fails the error is logged and nothing
// is sent.
func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric) {
	err := p.collect()
	if err != nil {
		log.Println("failed collecting metrics:", err)
		return
	}

	vectors := p.collectorList()
	for i := range vectors {
		vectors[i].Collect(ch)
	}
}

View File

@ -0,0 +1,73 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/prometheus"
)
// TestPoolUsageCollector feeds a canned single-pool document to a
// PoolUsageCollector, scrapes the prometheus handler over HTTP and checks
// that the expected per-pool samples show up in the response body.
func TestPoolUsageCollector(t *testing.T) {
	type testCase struct {
		input   string
		regexes []*regexp.Regexp
	}

	cases := []testCase{
		{
			input: `
{"pools": [
{"name": "rbd", "id": 11, "stats": {"bytes_used": 20, "objects": 5, "rd": 4, "wr": 6}}
]}`,
			regexes: []*regexp.Regexp{
				regexp.MustCompile(`pool_used_bytes{pool="rbd"} 20`),
				regexp.MustCompile(`pool_objects_total{pool="rbd"} 5`),
				regexp.MustCompile(`pool_read_total{pool="rbd"} 4`),
				regexp.MustCompile(`pool_write_total{pool="rbd"} 6`),
			},
		},
	}

	// scrape runs a single case in its own function so that the deferred
	// unregister/close calls fire before the next case registers a collector.
	scrape := func(tc testCase) {
		collector := NewPoolUsageCollector(NewNoopConn(tc.input))
		if err := prometheus.Register(collector); err != nil {
			t.Fatalf("collector failed to register: %s", err)
		}
		defer prometheus.Unregister(collector)

		server := httptest.NewServer(prometheus.Handler())
		defer server.Close()

		resp, err := http.Get(server.URL)
		if err != nil {
			t.Fatalf("unexpected failed response from prometheus: %s", err)
		}
		defer resp.Body.Close()

		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Fatalf("failed reading server response: %s", err)
		}

		for _, pattern := range tc.regexes {
			if pattern.Match(body) {
				continue
			}
			t.Errorf("failed matching: %q", pattern)
		}
	}

	for _, tc := range cases {
		scrape(tc)
	}
}

115
exporter.go Normal file
View File

@ -0,0 +1,115 @@
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"flag"
"log"
"net/http"
"sync"
ceph_collectors "github.com/digitalocean/ceph_exporter/collectors"
"github.com/ceph/go-ceph/rados"
"github.com/prometheus/client_golang/prometheus"
)
// CephExporter wraps all the ceph collectors and provides a single global
// exporter to extract metrics out of. Its Collect method serializes
// collection runs with a mutex, the thread-safety requirement stated by
// prometheus. It implements the prometheus.Collector interface so that it
// can be registered as a single collector.
type CephExporter struct {
// mu is held for the duration of Collect so that only one collection pass
// runs at a time.
mu sync.Mutex
// collectors are the individual ceph collectors, visited in order by
// Describe and Collect.
collectors []prometheus.Collector
}
// Compile-time check that *CephExporter implements prometheus.Collector.
var _ prometheus.Collector = &CephExporter{}
// NewCephExporter returns a CephExporter whose collectors all share the given
// ceph connection. A collector is enabled by adding it to the list built
// here.
func NewCephExporter(conn *rados.Conn) *CephExporter {
	exporter := new(CephExporter)
	exporter.collectors = []prometheus.Collector{
		ceph_collectors.NewClusterUsageCollector(conn),
		ceph_collectors.NewPoolUsageCollector(conn),
		ceph_collectors.NewClusterHealthCollector(conn),
		ceph_collectors.NewMonitorCollector(conn),
	}
	return exporter
}
// Describe asks each wrapped collector, in order, to send its descriptors to
// the provided channel.
func (c *CephExporter) Describe(ch chan<- *prometheus.Desc) {
	for i := range c.collectors {
		c.collectors[i].Describe(ch)
	}
}
// Collect asks each wrapped collector, in order, to send its metrics to
// prometheus. Collect could be called several times concurrently, so the
// whole pass runs while holding the exporter's mutex.
func (c *CephExporter) Collect(ch chan<- prometheus.Metric) {
	c.mu.Lock()
	defer c.mu.Unlock()

	for i := range c.collectors {
		c.collectors[i].Collect(ch)
	}
}
// main parses the command-line flags, connects to the ceph cluster, registers
// the CephExporter with prometheus and serves the metrics over HTTP until the
// server stops. Any failure along the way is fatal.
func main() {
	var (
		addr        = flag.String("telemetry.addr", ":9190", "host:port for ceph exporter")
		metricsPath = flag.String("telemetry.path", "/metrics", "URL path for surfacing collected metrics")
		cephConfig  = flag.String("ceph.config", "", "path to ceph config file")
	)
	flag.Parse()

	conn, err := rados.NewConn()
	if err != nil {
		log.Fatalf("cannot create new ceph connection: %s", err)
	}

	// An explicitly given config file wins; otherwise the default one is read.
	if *cephConfig != "" {
		err = conn.ReadConfigFile(*cephConfig)
	} else {
		err = conn.ReadDefaultConfigFile()
	}
	if err != nil {
		log.Fatalf("cannot read ceph config file: %s", err)
	}

	if err := conn.Connect(); err != nil {
		log.Fatalf("cannot connect to ceph cluster: %s", err)
	}

	prometheus.MustRegister(NewCephExporter(conn))

	http.Handle(*metricsPath, prometheus.Handler())
	// Send every other path to the metrics page. When the metrics themselves
	// are served at "/", that pattern is already taken and registering it a
	// second time would make the ServeMux panic, so the redirect is skipped.
	if *metricsPath != "/" {
		http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
			http.Redirect(w, r, *metricsPath, http.StatusMovedPermanently)
		})
	}

	log.Printf("Starting ceph exporter on %q", *addr)
	err = http.ListenAndServe(*addr, nil)

	// log.Fatalf exits through os.Exit, which does not run deferred calls, so
	// the cluster connection is shut down explicitly before reporting the
	// error instead of relying on defer.
	conn.Shutdown()
	if err != nil {
		log.Fatalf("cannot start ceph exporter: %s", err)
	}
}

BIN
sample.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB