mirror of
https://github.com/prometheus/node_exporter
synced 2024-12-18 04:15:06 +00:00
381f32b1c5
Despite being quite hard to provoke (< 10% in my testing), the btrfs collector would occasionally leave stale FDs relating to btrfs mountpoints, making the filesystems unable to be unmounted. Fixes: #2772. Signed-off-by: Daniel Swarbrick <daniel.swarbrick@gmail.com>
413 lines
12 KiB
Go
413 lines
12 KiB
Go
// Copyright 2019 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//go:build !nobtrfs
|
|
// +build !nobtrfs
|
|
|
|
package collector
|
|
|
|
import (
	"errors"
	"fmt"
	"path"
	"strings"
	"syscall"

	dennwc "github.com/dennwc/btrfs"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/procfs/btrfs"
)
|
|
|
|
// A btrfsCollector is a Collector which gathers metrics from Btrfs filesystems.
|
|
type btrfsCollector struct {
|
|
fs btrfs.FS
|
|
logger log.Logger
|
|
}
|
|
|
|
func init() {
|
|
registerCollector("btrfs", defaultEnabled, NewBtrfsCollector)
|
|
}
|
|
|
|
// NewBtrfsCollector returns a new Collector exposing Btrfs statistics.
|
|
func NewBtrfsCollector(logger log.Logger) (Collector, error) {
|
|
fs, err := btrfs.NewFS(*sysPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open sysfs: %w", err)
|
|
}
|
|
|
|
return &btrfsCollector{
|
|
fs: fs,
|
|
logger: logger,
|
|
}, nil
|
|
}
|
|
|
|
// Update retrieves and exports Btrfs statistics.
|
|
// It implements Collector.
|
|
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error {
|
|
stats, err := c.fs.Stats()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err)
|
|
}
|
|
|
|
ioctlStatsMap, err := c.getIoctlStats()
|
|
if err != nil {
|
|
level.Debug(c.logger).Log(
|
|
"msg", "Error querying btrfs device stats with ioctl",
|
|
"err", err)
|
|
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats)
|
|
}
|
|
|
|
for _, s := range stats {
|
|
// match up procfs and ioctl info by filesystem UUID (without dashes)
|
|
var fsUUID = strings.Replace(s.UUID, "-", "", -1)
|
|
ioctlStats := ioctlStatsMap[fsUUID]
|
|
c.updateBtrfsStats(ch, s, ioctlStats)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// btrfsIoctlFsDevStats holds what getIoctlDeviceStats collected for a single
// device of a Btrfs filesystem.
type btrfsIoctlFsDevStats struct {
	// Identity of the device: its path and its device UUID string.
	path, uuid string

	// Space accounting for the device, in bytes.
	bytesUsed, totalBytes uint64

	// Error counters. These match the following upstream lists:
	// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
	// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
	writeErrs, readErrs, flushErrs, corruptionErrs, generationErrs uint64
}
|
|
|
|
type btrfsIoctlFsStats struct {
|
|
uuid string
|
|
devices []btrfsIoctlFsDevStats
|
|
}
|
|
|
|
func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) {
|
|
// Instead of introducing more ioctl calls to scan for all btrfs
|
|
// filesystems re-use our mount point utils to find known mounts
|
|
mountsList, err := mountPointDetails(c.logger)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Track devices we have successfully scanned, by device path.
|
|
devicesDone := make(map[string]struct{})
|
|
// Filesystems scann results by UUID.
|
|
fsStats := make(map[string]*btrfsIoctlFsStats)
|
|
|
|
for _, mount := range mountsList {
|
|
if mount.fsType != "btrfs" {
|
|
continue
|
|
}
|
|
|
|
if _, found := devicesDone[mount.device]; found {
|
|
// We already found this filesystem by another mount point.
|
|
continue
|
|
}
|
|
|
|
mountPath := rootfsFilePath(mount.mountPoint)
|
|
|
|
fs, err := dennwc.Open(mountPath, true)
|
|
if err != nil {
|
|
// Failed to open this mount point, maybe we didn't have permission
|
|
// maybe we'll find another mount point for this FS later.
|
|
level.Debug(c.logger).Log(
|
|
"msg", "Error inspecting btrfs mountpoint",
|
|
"mountPoint", mountPath,
|
|
"err", err)
|
|
continue
|
|
}
|
|
defer fs.Close()
|
|
|
|
fsInfo, err := fs.Info()
|
|
if err != nil {
|
|
// Failed to get the FS info for some reason,
|
|
// perhaps it'll work with a different mount point
|
|
level.Debug(c.logger).Log(
|
|
"msg", "Error querying btrfs filesystem",
|
|
"mountPoint", mountPath,
|
|
"err", err)
|
|
continue
|
|
}
|
|
|
|
fsID := fsInfo.FSID.String()
|
|
if _, found := fsStats[fsID]; found {
|
|
// We already found this filesystem by another mount point
|
|
continue
|
|
}
|
|
|
|
deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo)
|
|
if err != nil {
|
|
level.Debug(c.logger).Log(
|
|
"msg", "Error querying btrfs device stats",
|
|
"mountPoint", mountPath,
|
|
"err", err)
|
|
continue
|
|
}
|
|
|
|
devicesDone[mount.device] = struct{}{}
|
|
fsStats[fsID] = &btrfsIoctlFsStats{
|
|
uuid: fsID,
|
|
devices: deviceStats,
|
|
}
|
|
}
|
|
|
|
return fsStats, nil
|
|
}
|
|
|
|
func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) {
|
|
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices)
|
|
|
|
for i := uint64(0); i <= fsInfo.MaxID; i++ {
|
|
deviceInfo, err := fs.GetDevInfo(i)
|
|
|
|
if err != nil {
|
|
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV {
|
|
// Device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this.
|
|
continue
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
deviceStats, err := fs.GetDevStats(i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
devices = append(devices, btrfsIoctlFsDevStats{
|
|
path: deviceInfo.Path,
|
|
uuid: deviceInfo.UUID.String(),
|
|
bytesUsed: deviceInfo.BytesUsed,
|
|
totalBytes: deviceInfo.TotalBytes,
|
|
|
|
writeErrs: deviceStats.WriteErrs,
|
|
readErrs: deviceStats.ReadErrs,
|
|
flushErrs: deviceStats.FlushErrs,
|
|
corruptionErrs: deviceStats.CorruptionErrs,
|
|
generationErrs: deviceStats.GenerationErrs,
|
|
})
|
|
|
|
if uint64(len(devices)) == fsInfo.NumDevices {
|
|
break
|
|
}
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
|
|
type btrfsMetric struct {
|
|
name string
|
|
metricType prometheus.ValueType
|
|
desc string
|
|
value float64
|
|
extraLabel []string
|
|
extraLabelValue []string
|
|
}
|
|
|
|
// updateBtrfsStats collects statistics for one bcache ID.
|
|
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) {
|
|
const subsystem = "btrfs"
|
|
|
|
// Basic information about the filesystem.
|
|
devLabels := []string{"uuid"}
|
|
|
|
// Retrieve the metrics.
|
|
metrics := c.getMetrics(s, ioctlStats)
|
|
|
|
// Convert all gathered metrics to Prometheus Metrics and add to channel.
|
|
for _, m := range metrics {
|
|
labels := append(devLabels, m.extraLabel...)
|
|
|
|
desc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, m.name),
|
|
m.desc,
|
|
labels,
|
|
nil,
|
|
)
|
|
|
|
labelValues := []string{s.UUID}
|
|
if len(m.extraLabelValue) > 0 {
|
|
labelValues = append(labelValues, m.extraLabelValue...)
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
desc,
|
|
m.metricType,
|
|
m.value,
|
|
labelValues...,
|
|
)
|
|
}
|
|
}
|
|
|
|
// getMetrics returns metrics for the given Btrfs statistics.
|
|
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric {
|
|
metrics := []btrfsMetric{
|
|
{
|
|
name: "info",
|
|
desc: "Filesystem information",
|
|
value: 1,
|
|
metricType: prometheus.GaugeValue,
|
|
extraLabel: []string{"label"},
|
|
extraLabelValue: []string{s.Label},
|
|
},
|
|
{
|
|
name: "global_rsv_size_bytes",
|
|
desc: "Size of global reserve.",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(s.Allocation.GlobalRsvSize),
|
|
},
|
|
}
|
|
|
|
// Information about data, metadata and system data.
|
|
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...)
|
|
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...)
|
|
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...)
|
|
|
|
// Information about devices.
|
|
if ioctlStats == nil {
|
|
for n, dev := range s.Devices {
|
|
metrics = append(metrics, btrfsMetric{
|
|
name: "device_size_bytes",
|
|
desc: "Size of a device that is part of the filesystem.",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(dev.Size),
|
|
extraLabel: []string{"device"},
|
|
extraLabelValue: []string{n},
|
|
})
|
|
}
|
|
return metrics
|
|
}
|
|
|
|
for _, dev := range ioctlStats.devices {
|
|
// trim the path prefix from the device name so the value should match
|
|
// the value used in the fallback branch above.
|
|
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
|
|
_, device := path.Split(dev.path)
|
|
|
|
extraLabels := []string{"device", "btrfs_dev_uuid"}
|
|
extraLabelValues := []string{device, dev.uuid}
|
|
|
|
metrics = append(metrics,
|
|
btrfsMetric{
|
|
name: "device_size_bytes",
|
|
desc: "Size of a device that is part of the filesystem.",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(dev.totalBytes),
|
|
extraLabel: extraLabels,
|
|
extraLabelValue: extraLabelValues,
|
|
},
|
|
// A bytes available metric is probably more useful than a
|
|
// bytes used metric, because large numbers of bytes will
|
|
// suffer from floating point representation issues
|
|
// and we probably care more about the number when it's low anyway
|
|
btrfsMetric{
|
|
name: "device_unused_bytes",
|
|
desc: "Unused bytes unused on a device that is part of the filesystem.",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(dev.totalBytes - dev.bytesUsed),
|
|
extraLabel: extraLabels,
|
|
extraLabelValue: extraLabelValues,
|
|
})
|
|
|
|
errorLabels := append([]string{"type"}, extraLabels...)
|
|
values := []uint64{
|
|
dev.writeErrs,
|
|
dev.readErrs,
|
|
dev.flushErrs,
|
|
dev.corruptionErrs,
|
|
dev.generationErrs,
|
|
}
|
|
btrfsErrorTypeNames := []string{
|
|
"write",
|
|
"read",
|
|
"flush",
|
|
"corruption",
|
|
"generation",
|
|
}
|
|
|
|
for i, errorType := range btrfsErrorTypeNames {
|
|
metrics = append(metrics,
|
|
btrfsMetric{
|
|
name: "device_errors_total",
|
|
desc: "Errors reported for the device",
|
|
metricType: prometheus.CounterValue,
|
|
value: float64(values[i]),
|
|
extraLabel: errorLabels,
|
|
extraLabelValue: append([]string{errorType}, extraLabelValues...),
|
|
})
|
|
}
|
|
}
|
|
|
|
return metrics
|
|
}
|
|
|
|
// getAllocationStats returns allocation metrics for the given Btrfs Allocation statistics.
|
|
func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats) []btrfsMetric {
|
|
metrics := []btrfsMetric{
|
|
{
|
|
name: "reserved_bytes",
|
|
desc: "Amount of space reserved for a data type",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(s.ReservedBytes),
|
|
extraLabel: []string{"block_group_type"},
|
|
extraLabelValue: []string{a},
|
|
},
|
|
}
|
|
|
|
// Add all layout statistics.
|
|
for layout, stats := range s.Layouts {
|
|
metrics = append(metrics, c.getLayoutStats(a, layout, stats)...)
|
|
}
|
|
|
|
return metrics
|
|
}
|
|
|
|
// getLayoutStats returns metrics for a data layout.
|
|
func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btrfsMetric {
|
|
return []btrfsMetric{
|
|
{
|
|
name: "used_bytes",
|
|
desc: "Amount of used space by a layout/data type",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(s.UsedBytes),
|
|
extraLabel: []string{"block_group_type", "mode"},
|
|
extraLabelValue: []string{a, l},
|
|
},
|
|
{
|
|
name: "size_bytes",
|
|
desc: "Amount of space allocated for a layout/data type",
|
|
metricType: prometheus.GaugeValue,
|
|
value: float64(s.TotalBytes),
|
|
extraLabel: []string{"block_group_type", "mode"},
|
|
extraLabelValue: []string{a, l},
|
|
},
|
|
{
|
|
name: "allocation_ratio",
|
|
desc: "Data allocation ratio for a layout/data type",
|
|
metricType: prometheus.GaugeValue,
|
|
value: s.Ratio,
|
|
extraLabel: []string{"block_group_type", "mode"},
|
|
extraLabelValue: []string{a, l},
|
|
},
|
|
}
|
|
}
|