From 9f2aa24e12bb95cba49c903d82740d6299b5ce35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Gro=C3=9Fe=20Sundrup?=
Date: Mon, 7 Sep 2015 15:49:30 +0200
Subject: [PATCH] Add collector for metrics of linux software raids

---
 README.md                 |   1 +
 collector/fixtures/mdstat |  26 ++++
 collector/mdadm.go        | 279 ++++++++++++++++++++++++++++++++++++++
 collector/mdadm_test.go   |  33 +++++
 node_exporter.go          |   2 +-
 5 files changed, 340 insertions(+), 1 deletion(-)
 create mode 100644 collector/fixtures/mdstat
 create mode 100644 collector/mdadm.go
 create mode 100644 collector/mdadm_test.go

diff --git a/README.md b/README.md
index f2dd5fe6..378d00ee 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ netstat | Exposes network statistics from `/proc/net/netstat`. This is the same
 stat | Exposes various statistics from `/proc/stat`. This includes CPU usage, boot time, forks and interrupts.
 textfile | Exposes statistics read from local disk. The `--collector.textfile.directory` flag must be set.
 time | Exposes the current system time.
+mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if `/proc/mdstat` is not present).
 
 ### Disabled by default
 
diff --git a/collector/fixtures/mdstat b/collector/fixtures/mdstat
new file mode 100644
index 00000000..da5c691c
--- /dev/null
+++ b/collector/fixtures/mdstat
@@ -0,0 +1,26 @@
+Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10]
+md3 : active raid6 sda1[8] sdh1[7] sdg1[6] sdf1[5] sde1[11] sdd1[3] sdc1[10] sdb1[9]
+ 5853468288 blocks super 1.2 level 6, 64k chunk, algorithm 2 [8/8] [UUUUUUUU]
+
+md127 : active raid1 sdi2[0] sdj2[1]
+ 312319552 blocks [2/2] [UU]
+
+md0 : active raid1 sdi1[0] sdj1[1]
+ 248896 blocks [2/2] [UU]
+
+md4 : inactive raid1 sda3[0] sdb3[1]
+ 4883648 blocks [2/2] [UU]
+
+md6 : active raid1 sdb2[2] sda2[0]
+ 195310144 blocks [2/1] [U_]
+ [=>...................]
recovery = 8.5% (16775552/195310144) finish=17.0min speed=259783K/sec
+
+md8 : active raid1 sdb1[1] sda1[0]
+ 195310144 blocks [2/2] [UU]
+ [=>...................] resync = 8.5% (16775552/195310144) finish=17.0min speed=259783K/sec
+
+md7 : active raid6 sdb1[0] sde1[3] sdd1[2] sdc1[1]
+ 7813735424 blocks super 1.2 level 6, 512k chunk, algorithm 2 [4/3] [U_UU]
+ bitmap: 0/30 pages [0KB], 65536KB chunk
+
+unused devices:
diff --git a/collector/mdadm.go b/collector/mdadm.go
new file mode 100644
index 00000000..78f33d56
--- /dev/null
+++ b/collector/mdadm.go
@@ -0,0 +1,279 @@
+// +build !nomdadm
+
+package collector
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/log"
+)
+
+var (
+	// statusfile is the mdstat file the collector reads.
+	statusfile = "/proc/mdstat"
+	// statuslineRE captures the block count and the two disk counts of a
+	// statusline such as "312319552 blocks [2/2] [UU]".
+	statuslineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`)
+	// buildlineRE captures the first number of the "(synced/total)" pair in a
+	// recovery or resync progress line.
+	buildlineRE = regexp.MustCompile(`\((\d+)/\d+\)`)
+)
+
+// mdStatus holds the values parsed from mdstat for a single md-device.
+type mdStatus struct {
+	mdName       string // name of the md-device, e.g. "md0"
+	isActive     bool   // whether the deviceline reports the device as "active"
+	disksActive  int64  // number of active disks
+	disksTotal   int64  // total number of disks
+	blocksTotal  int64  // size of the device in blocks
+	blocksSynced int64  // blocks already synced; equals blocksTotal when not syncing
+}
+
+// mdadmCollector collects metrics about the md-devices listed in statusfile.
+type mdadmCollector struct{}
+
+// init adds the collector's constructor to Factories under the name "mdadm".
+func init() {
+	Factories["mdadm"] = NewMdadmCollector
+}
+
+// evalStatusline parses the statusline of an md-device, e.g.
+// "312319552 blocks [2/2] [UU]", and returns the number of active disks, the
+// total number of disks and the size of the device in blocks.
+func evalStatusline(statusline string) (active, total, size int64, err error) {
+	matches := statuslineRE.FindStringSubmatch(statusline)
+
+	// FindStringSubmatch returns nil when nothing matches and otherwise the
+	// whole match plus one entry per capture group, hence the 3+1.
+	if len(matches) != 3+1 {
+		return 0, 0, 0, fmt.Errorf("too few matches found in statusline: %s", statusline)
+	}
+
+	size, err = strconv.ParseInt(matches[1], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+
+	// The bracket pair reads [total/active].
+	total, err = strconv.ParseInt(matches[2], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+	active, err = strconv.ParseInt(matches[3], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+
+	return active, total, size, nil
+}
+
+// evalBuildline returns the number of blocks that have already been synced,
+// taken from the progress line of a recovering or resyncing md-device, e.g.
+// "[=>...] recovery = 8.5% (16775552/195310144) finish=17.0min ...".
+func evalBuildline(buildline string) (int64, error) {
+	matches := buildlineRE.FindStringSubmatch(buildline)
+
+	// Either nil (no match) or the whole match plus the single capture group.
+	if len(matches) != 1+1 {
+		return 0, fmt.Errorf("too few matches found in buildline: %s", buildline)
+	}
+
+	syncedSize, err := strconv.ParseInt(matches[1], 10, 64)
+	if err != nil {
+		return 0, fmt.Errorf("%s in buildline: %s", err, buildline)
+	}
+
+	return syncedSize, nil
+}
+
+// parseMdstat reads the mdstat-formatted file at mdStatusFilePath and returns
+// one mdStatus per md-device listed in it. If the file cannot be read or
+// parsed, no devices are returned along with the error.
+func parseMdstat(mdStatusFilePath string) ([]mdStatus, error) {
+	content, err := ioutil.ReadFile(mdStatusFilePath)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err)
+	}
+
+	lines := strings.Split(string(content), "\n")
+
+	// Each md has at least the deviceline, statusline and one empty line
+	// afterwards, so we will probably have something of the order
+	// len(lines)/3 devices and use that for preallocation.
+	mdStates := make([]mdStatus, 0, len(lines)/3)
+
+	for i, l := range lines {
+		if l == "" {
+			// Skip entirely empty lines.
+			continue
+		}
+
+		if l[0] == ' ' {
+			// Those lines are not the beginning of a md-section.
+			continue
+		}
+
+		if strings.HasPrefix(l, "Personalities") || strings.HasPrefix(l, "unused") {
+			// We aren't interested in lines with general info.
+			continue
+		}
+
+		// A deviceline looks like "md0 : active raid1 sdi1[0] sdj1[1]".
+		mainLine := strings.Fields(l)
+		if len(mainLine) < 3 {
+			return nil, fmt.Errorf("error parsing mdline: %s", l)
+		}
+		currentMD := mainLine[0]            // name of md-device
+		isActive := mainLine[2] == "active" // activity status of said md-device
+
+		if len(lines) <= i+3 {
+			return nil, fmt.Errorf("error parsing %s: entry for %s has fewer lines than expected", mdStatusFilePath, currentMD)
+		}
+
+		// The statusline is always present directly below the deviceline.
+		active, total, size, err := evalStatusline(lines[i+1])
+		if err != nil {
+			return nil, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err)
+		}
+
+		// An optional bitmap line may sit between the statusline and the
+		// syncing-line; skip it if present.
+		j := i + 2
+		if strings.Contains(lines[j], "bitmap") {
+			j++
+		}
+
+		// If device is syncing at the moment, get the number of currently
+		// synced blocks, otherwise that number equals the size of the device.
+		syncedBlocks := size
+		if strings.Contains(lines[j], "recovery") || strings.Contains(lines[j], "resync") {
+			syncedBlocks, err = evalBuildline(lines[j])
+			if err != nil {
+				return nil, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err)
+			}
+		}
+
+		mdStates = append(mdStates, mdStatus{currentMD, isActive, active, total, size, syncedBlocks})
+	}
+
+	return mdStates, nil
+}
+
+// NewMdadmCollector returns a new mdadm collector. The collector itself is an
+// empty struct as we only use throwaway-metrics.
+func NewMdadmCollector() (Collector, error) {
+	return &mdadmCollector{}, nil
+}
+
+// Descriptors of the per-device metrics; all of them carry a "device" label.
+var (
+	isActiveDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "is_active"),
+		"Indicator whether the md-device is active or not.",
+		[]string{"device"},
+		nil,
+	)
+
+	disksActiveDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "disks_active"),
+		"Number of active disks of device.",
+		[]string{"device"},
+		nil,
+	)
+
+	disksTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "disks"),
+		"Total number of disks of device.",
+		[]string{"device"},
+		nil,
+	)
+
+	blocksTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "blocks"),
+		"Total number of blocks on device.",
+		[]string{"device"},
+		nil,
+	)
+
+	blocksSyncedDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "blocks_synced"),
+		"Number of blocks synced on device.",
+		[]string{"device"},
+		nil,
+	)
+)
+
+// Update parses statusfile and sends the metrics of every md-device found in
+// it to ch. A missing statusfile is not treated as an error, it merely means
+// that there is nothing to report.
+func (c *mdadmCollector) Update(ch chan<- prometheus.Metric) (err error) {
+	// take care we don't crash on non-existent statusfiles
+	_, err = os.Stat(statusfile)
+	if os.IsNotExist(err) {
+		// no such file or directory, nothing to do, just return
+		return nil
+	}
+	if err != nil { // now things get weird, better to return
+		return err
+	}
+
+	// First parse mdstat-file...
+	mdstate, err := parseMdstat(statusfile)
+	if err != nil {
+		// parseMdstat already prefixes its errors with what it failed to parse.
+		return err
+	}
+
+	// ... and then plug the result into the metrics to be exported.
+	for _, mds := range mdstate {
+		log.Debugf("collecting metrics for device %s", mds.mdName)
+
+		isActiveFloat := 0.0
+		if mds.isActive {
+			isActiveFloat = 1
+		}
+
+		ch <- prometheus.MustNewConstMetric(
+			isActiveDesc,
+			prometheus.GaugeValue,
+			isActiveFloat,
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			disksActiveDesc,
+			prometheus.GaugeValue,
+			float64(mds.disksActive),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			disksTotalDesc,
+			prometheus.GaugeValue,
+			float64(mds.disksTotal),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			blocksTotalDesc,
+			prometheus.GaugeValue,
+			float64(mds.blocksTotal),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			blocksSyncedDesc,
+			prometheus.GaugeValue,
+			float64(mds.blocksSynced),
+			mds.mdName,
+		)
+	}
+
+	return nil
+}
diff --git a/collector/mdadm_test.go b/collector/mdadm_test.go
new file mode 100644
index 00000000..1c83733e
--- /dev/null
+++ b/collector/mdadm_test.go
@@ -0,0 +1,33 @@
+package collector
+
+import (
+	"testing"
+)
+
+// TestMdadm checks that the mdstat fixture is parsed into the expected values.
+func TestMdadm(t *testing.T) {
+	mdStates, err := parseMdstat("fixtures/mdstat")
+	if err != nil {
+		t.Fatalf("parsing of reference-file failed entirely: %s", err)
+	}
+
+	refs := map[string]mdStatus{
+		"md3":   {"md3", true, 8, 8, 5853468288, 5853468288},
+		"md127": {"md127", true, 2, 2, 312319552, 312319552},
+		"md0":   {"md0", true, 2, 2, 248896, 248896},
+		"md4":   {"md4", false, 2, 2, 4883648, 4883648},
+		"md6":   {"md6", true, 1, 2, 195310144, 16775552},
+		"md8":   {"md8", true, 2, 2, 195310144, 16775552},
+		"md7":   {"md7", true, 3, 4, 7813735424, 7813735424},
+	}
+
+	for _, md := range mdStates {
+		if md != refs[md.mdName] {
+			t.Errorf("failed parsing md-device %s correctly: want %v, got %v", md.mdName, refs[md.mdName], md)
+		}
+	}
+
+	if len(mdStates) != len(refs) {
+		t.Errorf("expected number of parsed md-device to be %d, but was %d", len(refs), len(mdStates))
+	}
+}
diff --git a/node_exporter.go b/node_exporter.go
index 872e5021..5e3d834b 100644
--- a/node_exporter.go
+++ b/node_exporter.go
@@ -28,7 +28,7 @@ var (
 	memProfile        = flag.String("debug.memprofile-file", "", "Write memory profile to this file upon receipt of SIGUSR1.")
 	listenAddress     = flag.String("web.listen-address", ":9100", "Address on which to expose metrics and web interface.")
 	metricsPath       = flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.")
-	enabledCollectors = flag.String("collectors.enabled", "diskstats,filefd,filesystem,loadavg,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname", "Comma-separated list of collectors to use.")
+	enabledCollectors = flag.String("collectors.enabled", "diskstats,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname", "Comma-separated list of collectors to use.")
 	printCollectors   = flag.Bool("collectors.print", false, "If true, print available collectors and exit.")
 	authUser          = flag.String("auth.user", "", "Username for basic auth.")
 	authPass          = flag.String("auth.pass", "", "Password for basic auth.")