From f47abc5d063b177206f3b1b3c565956c4da733d8 Mon Sep 17 00:00:00 2001 From: Johannes 'fish' Ziemke Date: Tue, 8 Jul 2014 16:24:29 +0200 Subject: [PATCH] Add MegaCLI collector This collector exports the following metrics: - raid_drive_temperature_celsius: drive temperature - raid_drive_count: drive error and event counters - raid_adapter_disk_presence: disk presence per adapter --- collector/collector.go | 1 + collector/fixtures/megacli_adapter.txt | 280 +++++++++++++++++++++++++ collector/fixtures/megacli_disks.txt | 197 +++++++++++++++++ collector/megacli.go | 233 ++++++++++++++++++++ collector/megacli_test.go | 54 +++++ node_exporter.conf | 7 +- 6 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 collector/fixtures/megacli_adapter.txt create mode 100644 collector/fixtures/megacli_disks.txt create mode 100644 collector/megacli.go create mode 100644 collector/megacli_test.go diff --git a/collector/collector.go b/collector/collector.go index 47deb2e9..2e781987 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -18,5 +18,6 @@ type Collector interface { // time.) type Config struct { + Config map[string]string `json:"config"` Attributes map[string]string `json:"attributes"` } diff --git a/collector/fixtures/megacli_adapter.txt b/collector/fixtures/megacli_adapter.txt new file mode 100644 index 00000000..97c55766 --- /dev/null +++ b/collector/fixtures/megacli_adapter.txt @@ -0,0 +1,280 @@ +Adapter #0 + +============================================================================== + Versions + ================ +Product Name : PERC 6/i Integrated +Serial No : 1234567890123456 +FW Package Build: 6.3.3.0002 + + Mfg. Data + ================ +Mfg. 
Date : 06/24/08 +Rework Date : 06/24/08 +Revision No : +Battery FRU : N/A + + Image Versions in Flash: + ================ +FW Version : 1.22.52-1909 +BIOS Version : 2.04.00 +WebBIOS Version : 1.1-46-e_15-Rel +Ctrl-R Version : 1.02-015B +Preboot CLI Version: 01.00-022:#%00005 +Boot Block Version : 1.00.00.01-0011 + + Pending Images in Flash + ================ +None + + PCI Info + ================ +Controller Id : 0000 +Vendor Id : 1000 +Device Id : 0060 +SubVendorId : 1028 +SubDeviceId : 1f0c + +Host Interface : PCIE + +Link Speed : 0 +Number of Frontend Port: 0 +Device Interface : PCIE + +Number of Backend Port: 8 +Port : Address +0 5000c50028f2083d +1 5000c50023cb3f39 +2 5000c50023cea805 +3 5000c50029124491 +4 0000000000000000 +5 0000000000000000 +6 0000000000000000 +7 0000000000000000 + + HW Configuration + ================ +SAS Address : 5a4badb01e219100 +BBU : Present +Alarm : Absent +NVRAM : Present +Serial Debugger : Present +Memory : Present +Flash : Present +Memory Size : 256MB +TPM : Absent +On board Expander: Absent +Upgrade Key : Absent +Temperature sensor for ROC : Absent +Temperature sensor for controller : Absent + + + Settings + ================ +Current Time : 14:55:23 7/4, 2014 +Predictive Fail Poll Interval : 300sec +Interrupt Throttle Active Count : 16 +Interrupt Throttle Completion : 50us +Rebuild Rate : 30% +PR Rate : 30% +BGI Rate : 30% +Check Consistency Rate : 30% +Reconstruction Rate : 30% +Cache Flush Interval : 4s +Max Drives to Spinup at One Time : 2 +Delay Among Spinup Groups : 12s +Physical Drive Coercion Mode : 128MB +Cluster Mode : Disabled +Alarm : Disabled +Auto Rebuild : Enabled +Battery Warning : Enabled +Ecc Bucket Size : 15 +Ecc Bucket Leak Rate : 1440 Minutes +Restore HotSpare on Insertion : Disabled +Expose Enclosure Devices : Disabled +Maintain PD Fail History : Disabled +Host Request Reordering : Enabled +Auto Detect BackPlane Enabled : SGPIO/i2c SEP +Load Balance Mode : Auto +Use FDE Only : No +Security Key Assigned : No 
+Security Key Failed : No +Security Key Not Backedup : No +Default LD PowerSave Policy : Controller Defined +Maximum number of direct attached drives to spin up in 1 min : 0 +Auto Enhanced Import : No +Any Offline VD Cache Preserved : No +Allow Boot with Preserved Cache : No +Disable Online Controller Reset : No +PFK in NVRAM : No +Use disk activity for locate : No +POST delay : 90 seconds +BIOS Error Handling : Stop On Errors +Current Boot Mode :Normal + Capabilities + ================ +RAID Level Supported : RAID0, RAID1, RAID5, RAID6, RAID00, RAID10, RAID50, RAID60, PRL 11, PRL 11 with spanning, SRL 3 supported, PRL11-RLQ0 DDF layout with no span, PRL11-RLQ0 DDF layout with span +Supported Drives : SAS, SATA + +Allowed Mixing: + +Mix in Enclosure Allowed + + Status + ================ +ECC Bucket Count : 0 + + Limitations + ================ +Max Arms Per VD : 32 +Max Spans Per VD : 8 +Max Arrays : 128 +Max Number of VDs : 64 +Max Parallel Commands : 1008 +Max SGE Count : 80 +Max Data Transfer Size : 8192 sectors +Max Strips PerIO : 42 +Max LD per array : 16 +Min Strip Size : 8 KB +Max Strip Size : 1.0 MB +Max Configurable CacheCade Size: 0 GB +Current Size of CacheCade : 0 GB +Current Size of FW Cache : 0 MB + + Device Present + ================ +Virtual Drives : 1 + Degraded : 0 + Offline : 0 +Physical Devices : 5 + Disks : 4 + Critical Disks : 0 + Failed Disks : 0 + + Supported Adapter Operations + ================ +Rebuild Rate : Yes +CC Rate : Yes +BGI Rate : Yes +Reconstruct Rate : Yes +Patrol Read Rate : Yes +Alarm Control : Yes +Cluster Support : No +BBU : Yes +Spanning : Yes +Dedicated Hot Spare : Yes +Revertible Hot Spares : Yes +Foreign Config Import : Yes +Self Diagnostic : Yes +Allow Mixed Redundancy on Array : No +Global Hot Spares : Yes +Deny SCSI Passthrough : No +Deny SMP Passthrough : No +Deny STP Passthrough : No +Support Security : No +Snapshot Enabled : No +Support the OCE without adding drives : No +Support PFK : No +Support PI : No +Support 
Boot Time PFK Change : No +Disable Online PFK Change : No +Support Shield State : No +Block SSD Write Disk Cache Change: No + + Supported VD Operations + ================ +Read Policy : Yes +Write Policy : Yes +IO Policy : Yes +Access Policy : Yes +Disk Cache Policy : Yes +Reconstruction : Yes +Deny Locate : No +Deny CC : No +Allow Ctrl Encryption: No +Enable LDBBM : No +Support Breakmirror : No +Power Savings : No + + Supported PD Operations + ================ +Force Online : Yes +Force Offline : Yes +Force Rebuild : Yes +Deny Force Failed : No +Deny Force Good/Bad : No +Deny Missing Replace : No +Deny Clear : No +Deny Locate : No +Support Temperature : No +NCQ : No +Disable Copyback : No +Enable JBOD : No +Enable Copyback on SMART : No +Enable Copyback to SSD on SMART Error : No +Enable SSD Patrol Read : No +PR Correct Unconfigured Areas : Yes + Error Counters + ================ +Memory Correctable Errors : 0 +Memory Uncorrectable Errors : 0 + + Cluster Information + ================ +Cluster Permitted : No +Cluster Active : No + + Default Settings + ================ +Phy Polarity : 0 +Phy PolaritySplit : 0 +Background Rate : 30 +Strip Size : 64kB +Flush Time : 4 seconds +Write Policy : WB +Read Policy : None +Cache When BBU Bad : Disabled +Cached IO : No +SMART Mode : Mode 6 +Alarm Disable : No +Coercion Mode : 128MB +ZCR Config : Unknown +Dirty LED Shows Drive Activity : No +BIOS Continue on Error : 0 +Spin Down Mode : None +Allowed Device Type : SAS/SATA Mix +Allow Mix in Enclosure : Yes +Allow HDD SAS/SATA Mix in VD : No +Allow SSD SAS/SATA Mix in VD : No +Allow HDD/SSD Mix in VD : No +Allow SATA in Cluster : No +Max Chained Enclosures : 1 +Disable Ctrl-R : No +Enable Web BIOS : No +Direct PD Mapping : Yes +BIOS Enumerate VDs : Yes +Restore Hot Spare on Insertion : No +Expose Enclosure Devices : No +Maintain PD Fail History : No +Disable Puncturing : No +Zero Based Enclosure Enumeration : Yes +PreBoot CLI Enabled : No +LED Show Drive Activity : No +Cluster 
Disable : Yes +SAS Disable : No +Auto Detect BackPlane Enable : SGPIO/i2c SEP +Use FDE Only : No +Enable Led Header : No +Delay during POST : 0 +EnableCrashDump : No +Disable Online Controller Reset : No +EnableLDBBM : No +Un-Certified Hard Disk Drives : Block +Treat Single span R1E as R10 : No +Max LD per array : 16 +Power Saving option : All power saving options are enabled +Default spin down time in minutes: 0 +Enable JBOD : No +Time taken to detect CME : 60s + +Exit Code: 0x00 diff --git a/collector/fixtures/megacli_disks.txt b/collector/fixtures/megacli_disks.txt new file mode 100644 index 00000000..a3374250 --- /dev/null +++ b/collector/fixtures/megacli_disks.txt @@ -0,0 +1,197 @@ + +Adapter #0 + +Enclosure Device ID: 32 +Slot Number: 0 +Drive's position: DiskGroup: 0, Span: 0, Arm: 0 +Enclosure position: N/A +Device Id: 0 +WWN: +Sequence Number: 2 +Media Error Count: 0 +Other Error Count: 0 +Predictive Failure Count: 0 +Last Predictive Failure Event Seq Number: 0 +PD Type: SAS + +Raw Size: 419.186 GB [0x3465f870 Sectors] +Non Coerced Size: 418.686 GB [0x3455f870 Sectors] +Coerced Size: 418.625 GB [0x34540000 Sectors] +Sector Size: 0 +Firmware state: Online, Spun Up +Device Firmware Level: ES64 +Shield Counter: 0 +Successful diagnostics completion on : N/A +SAS Address(0): 0x5000c50028f2083d +SAS Address(1): 0x0 +Connected Port Number: 0(path0) +Inquiry Data: SEAGATE ST3450857SS ES643SK26856 +FDE Capable: Not Capable +FDE Enable: Disable +Secured: Unsecured +Locked: Unlocked +Needs EKM Attention: No +Foreign State: None +Device Speed: Unknown +Link Speed: Unknown +Media Type: Hard Disk Device +Drive Temperature :37C (98.60 F) +PI Eligibility: No +Drive is formatted for PI information: No +PI: No PI +Port-0 : +Port status: Active +Port's Linkspeed: Unknown +Port-1 : +Port status: Active +Port's Linkspeed: Unknown +Drive has flagged a S.M.A.R.T alert : No + + + +Enclosure Device ID: 32 +Slot Number: 1 +Drive's position: DiskGroup: 0, Span: 0, Arm: 1 +Enclosure 
position: N/A +Device Id: 1 +WWN: +Sequence Number: 2 +Media Error Count: 0 +Other Error Count: 0 +Predictive Failure Count: 0 +Last Predictive Failure Event Seq Number: 0 +PD Type: SAS + +Raw Size: 419.186 GB [0x3465f870 Sectors] +Non Coerced Size: 418.686 GB [0x3455f870 Sectors] +Coerced Size: 418.625 GB [0x34540000 Sectors] +Sector Size: 0 +Firmware state: Online, Spun Up +Device Firmware Level: ES62 +Shield Counter: 0 +Successful diagnostics completion on : N/A +SAS Address(0): 0x5000c50023cb3f39 +SAS Address(1): 0x0 +Connected Port Number: 1(path0) +Inquiry Data: SEAGATE ST3450857SS ES623SK16HLC +FDE Capable: Not Capable +FDE Enable: Disable +Secured: Unsecured +Locked: Unlocked +Needs EKM Attention: No +Foreign State: None +Device Speed: Unknown +Link Speed: Unknown +Media Type: Hard Disk Device +Drive Temperature :37C (98.60 F) +PI Eligibility: No +Drive is formatted for PI information: No +PI: No PI +Port-0 : +Port status: Active +Port's Linkspeed: Unknown +Port-1 : +Port status: Active +Port's Linkspeed: Unknown +Drive has flagged a S.M.A.R.T alert : No + + + +Enclosure Device ID: 32 +Slot Number: 2 +Drive's position: DiskGroup: 0, Span: 1, Arm: 0 +Enclosure position: N/A +Device Id: 2 +WWN: +Sequence Number: 2 +Media Error Count: 0 +Other Error Count: 0 +Predictive Failure Count: 0 +Last Predictive Failure Event Seq Number: 0 +PD Type: SAS + +Raw Size: 419.186 GB [0x3465f870 Sectors] +Non Coerced Size: 418.686 GB [0x3455f870 Sectors] +Coerced Size: 418.625 GB [0x34540000 Sectors] +Sector Size: 0 +Firmware state: Online, Spun Up +Device Firmware Level: ES62 +Shield Counter: 0 +Successful diagnostics completion on : N/A +SAS Address(0): 0x5000c50023cea805 +SAS Address(1): 0x0 +Connected Port Number: 2(path0) +Inquiry Data: SEAGATE ST3450857SS ES623SK189BR +FDE Capable: Not Capable +FDE Enable: Disable +Secured: Unsecured +Locked: Unlocked +Needs EKM Attention: No +Foreign State: None +Device Speed: Unknown +Link Speed: Unknown +Media Type: Hard Disk Device 
+Drive Temperature :39C (102.20 F) +PI Eligibility: No +Drive is formatted for PI information: No +PI: No PI +Port-0 : +Port status: Active +Port's Linkspeed: Unknown +Port-1 : +Port status: Active +Port's Linkspeed: Unknown +Drive has flagged a S.M.A.R.T alert : No + + + +Enclosure Device ID: 32 +Slot Number: 3 +Drive's position: DiskGroup: 0, Span: 1, Arm: 1 +Enclosure position: N/A +Device Id: 3 +WWN: +Sequence Number: 2 +Media Error Count: 0 +Other Error Count: 0 +Predictive Failure Count: 23 +Last Predictive Failure Event Seq Number: 0 +PD Type: SAS + +Raw Size: 419.186 GB [0x3465f870 Sectors] +Non Coerced Size: 418.686 GB [0x3455f870 Sectors] +Coerced Size: 418.625 GB [0x34540000 Sectors] +Sector Size: 0 +Firmware state: Online, Spun Up +Device Firmware Level: ES64 +Shield Counter: 0 +Successful diagnostics completion on : N/A +SAS Address(0): 0x5000c50029124491 +SAS Address(1): 0x0 +Connected Port Number: 3(path0) +Inquiry Data: SEAGATE ST3450857SS ES643SK27GQ9 +FDE Capable: Not Capable +FDE Enable: Disable +Secured: Unsecured +Locked: Unlocked +Needs EKM Attention: No +Foreign State: None +Device Speed: Unknown +Link Speed: Unknown +Media Type: Hard Disk Device +Drive Temperature :38C (100.40 F) +PI Eligibility: No +Drive is formatted for PI information: No +PI: No PI +Port-0 : +Port status: Active +Port's Linkspeed: Unknown +Port-1 : +Port status: Active +Port's Linkspeed: Unknown +Drive has flagged a S.M.A.R.T alert : No + + + + +Exit Code: 0x00
diff --git a/collector/megacli.go b/collector/megacli.go
new file mode 100644
index 00000000..e80c129f
--- /dev/null
+++ b/collector/megacli.go
@@ -0,0 +1,272 @@
+// +build megacli
+
+package collector
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os/exec"
+	"strconv"
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+const (
+	defaultMegaCli   = "megacli"
+	adapterHeaderSep = "================"
+)
+
+var (
+	driveTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Name:      "raid_drive_temperature_celsius",
+		Help:      "megacli: drive temperature",
+	}, []string{"enclosure", "slot"})
+
+	driveCounters = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Name:      "raid_drive_count",
+		Help:      "megacli: drive error and event counters",
+	}, []string{"enclosure", "slot", "type"})
+
+	drivePresence = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Name:      "raid_adapter_disk_presence",
+		Help:      "megacli: disk presence per adapter",
+	}, []string{"type"})
+
+	// counters lists the "-PDList" fields exported through driveCounters.
+	counters = []string{"Media Error Count", "Other Error Count", "Predictive Failure Count"}
+)
+
+func init() {
+	Factories["megacli"] = NewMegaCliCollector
+}
+
+// parseMegaCliDisks reads "megacli -PDList" style output from r, closes r and
+// returns the raw fields of every drive keyed by enclosure ID, then slot
+// number, then field name.
+func parseMegaCliDisks(r io.ReadCloser) (map[int]map[int]map[string]string, error) {
+	defer r.Close()
+	stats := map[int]map[int]map[string]string{}
+	scanner := bufio.NewScanner(r)
+
+	curEnc := -1
+	curSlot := -1
+	for scanner.Scan() {
+		var err error
+		text := strings.TrimSpace(scanner.Text())
+		parts := strings.SplitN(text, ":", 2)
+		if len(parts) != 2 { // Adapter #X
+			continue
+		}
+		key := strings.TrimSpace(parts[0])
+		value := strings.TrimSpace(parts[1])
+		switch {
+		case key == "Enclosure Device ID":
+			curEnc, err = strconv.Atoi(value)
+			if err != nil {
+				return nil, err
+			}
+		case key == "Slot Number":
+			curSlot, err = strconv.Atoi(value)
+			if err != nil {
+				return nil, err
+			}
+		case curSlot != -1 && curEnc != -1:
+			if _, ok := stats[curEnc]; !ok {
+				stats[curEnc] = map[int]map[string]string{}
+			}
+			if _, ok := stats[curEnc][curSlot]; !ok {
+				stats[curEnc][curSlot] = map[string]string{}
+			}
+			stats[curEnc][curSlot][key] = value
+		}
+	}
+	// Scan also returns false on a read error; report it instead of
+	// returning truncated data as if it were complete.
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return stats, nil
+}
+
+// parseMegaCliAdapter reads "megacli -AdpAllInfo" style output from r, closes
+// r and returns the raw fields keyed by section title, then field name. A
+// section title is the line directly above an adapterHeaderSep line.
+func parseMegaCliAdapter(r io.ReadCloser) (map[string]map[string]string, error) {
+	defer r.Close()
+	raidStats := map[string]map[string]string{}
+	scanner := bufio.NewScanner(r)
+	header := ""
+	last := ""
+	for scanner.Scan() {
+		text := strings.TrimSpace(scanner.Text())
+		if text == adapterHeaderSep {
+			header = last
+			raidStats[header] = map[string]string{}
+			continue
+		}
+		last = text
+		if header == "" { // skip Adapter #X and separator
+			continue
+		}
+		parts := strings.SplitN(text, ":", 2)
+		if len(parts) != 2 { // these section never include anything we are interested in
+			continue
+		}
+		key := strings.TrimSpace(parts[0])
+		value := strings.TrimSpace(parts[1])
+
+		raidStats[header][key] = value
+	}
+	// Scan also returns false on a read error; report it instead of
+	// returning truncated data as if it were complete.
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return raidStats, nil
+}
+
+// parseDriveTemperature extracts the Celsius reading from a "Drive Temperature"
+// value such as "37C (98.60 F)". A value without a "C" unit marker (including
+// the empty string of a missing field) yields an error rather than a panic.
+func parseDriveTemperature(raw string) (float64, error) {
+	end := strings.Index(raw, "C")
+	if end < 0 {
+		return 0, fmt.Errorf("unexpected drive temperature %q", raw)
+	}
+	return strconv.ParseFloat(strings.TrimSpace(raw[:end]), 64)
+}
+
+type megaCliCollector struct {
+	config Config
+	cli    string
+}
+
+// NewMegaCliCollector takes a config struct and returns a new Collector
+// exposing RAID status through megacli. The command defaults to "megacli" and
+// can be overridden with the "megacli_command" config entry.
+func NewMegaCliCollector(config Config) (Collector, error) {
+	cli := defaultMegaCli
+	if config.Config["megacli_command"] != "" {
+		cli = config.Config["megacli_command"]
+	}
+
+	c := megaCliCollector{
+		config: config,
+		cli:    cli,
+	}
+
+	if _, err := prometheus.RegisterOrGet(driveTemperature); err != nil {
+		return nil, err
+	}
+	if _, err := prometheus.RegisterOrGet(driveCounters); err != nil {
+		return nil, err
+	}
+	if _, err := prometheus.RegisterOrGet(drivePresence); err != nil {
+		return nil, err
+	}
+	return &c, nil
+}
+
+// Update refreshes the adapter metrics and then the drive metrics, returning
+// the number of samples set. Drives are not queried if the adapter step fails.
+func (c *megaCliCollector) Update() (updates int, err error) {
+	au, err := c.updateAdapter()
+	if err != nil {
+		return au, err
+	}
+	du, err := c.updateDisks()
+	return au + du, err
+}
+
+// updateAdapter runs "<cli> -AdpAllInfo -aALL" and exports every field of the
+// "Device Present" section through drivePresence.
+func (c *megaCliCollector) updateAdapter() (int, error) {
+	cmd := exec.Command(c.cli, "-AdpAllInfo", "-aALL")
+	pipe, err := cmd.StdoutPipe()
+	if err != nil {
+		return 0, err
+	}
+
+	if err := cmd.Start(); err != nil {
+		return 0, err
+	}
+
+	stats, err := parseMegaCliAdapter(pipe)
+	if err != nil {
+		// Reap the child so it does not linger as a zombie; the parse
+		// error is the one worth reporting, so Wait's result is dropped.
+		_ = cmd.Wait()
+		return 0, err
+	}
+	if err := cmd.Wait(); err != nil {
+		return 0, err
+	}
+
+	updates := 0
+	for k, v := range stats["Device Present"] {
+		value, err := strconv.ParseFloat(v, 64)
+		if err != nil {
+			return updates, err
+		}
+		drivePresence.WithLabelValues(k).Set(value)
+		updates++
+	}
+	return updates, nil
+}
+
+// updateDisks runs "<cli> -PDList -aALL" and exports the temperature and the
+// error/event counters of every drive, labelled by enclosure and slot.
+func (c *megaCliCollector) updateDisks() (int, error) {
+	cmd := exec.Command(c.cli, "-PDList", "-aALL")
+	pipe, err := cmd.StdoutPipe()
+	if err != nil {
+		return 0, err
+	}
+
+	if err := cmd.Start(); err != nil {
+		return 0, err
+	}
+
+	stats, err := parseMegaCliDisks(pipe)
+	if err != nil {
+		// Reap the child so it does not linger as a zombie; the parse
+		// error is the one worth reporting, so Wait's result is dropped.
+		_ = cmd.Wait()
+		return 0, err
+	}
+	if err := cmd.Wait(); err != nil {
+		return 0, err
+	}
+
+	updates := 0
+	for enc, encStats := range stats {
+		for slot, slotStats := range encStats {
+			encStr := strconv.Itoa(enc)
+			slotStr := strconv.Itoa(slot)
+
+			t, err := parseDriveTemperature(slotStats["Drive Temperature"])
+			if err != nil {
+				return updates, fmt.Errorf("enclosure %s slot %s: %v", encStr, slotStr, err)
+			}
+			driveTemperature.WithLabelValues(encStr, slotStr).Set(t)
+			updates++
+
+			// Named "name" rather than "c" so the receiver is not shadowed.
+			for _, name := range counters {
+				counter, err := strconv.ParseFloat(slotStats[name], 64)
+				if err != nil {
+					return updates, err
+				}
+
+				driveCounters.WithLabelValues(encStr, slotStr, name).Set(counter)
+				updates++
+			}
+		}
+	}
+	return updates, nil
+}
diff --git a/collector/megacli_test.go b/collector/megacli_test.go
new file mode 100644
index 00000000..74f56998
--- /dev/null
+++ b/collector/megacli_test.go
@@ -0,0 +1,70 @@
+// +build megacli
+
+package collector
+
+import (
+	"os"
+	"testing"
+)
+
+const (
+	testMegaCliAdapter = "fixtures/megacli_adapter.txt"
+	testMegaCliDisks   = "fixtures/megacli_disks.txt"
+
+	physicalDevicesExpected = "5"
+	virtualDevicesDegraded  = "0"
+)
+
+func TestMegaCliAdapter(t *testing.T) {
+	data, err := os.Open(testMegaCliAdapter)
+	if err != nil {
+		t.Fatal(err)
+	}
+	stats, err := parseMegaCliAdapter(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if stats["Device Present"]["Physical Devices"] != physicalDevicesExpected {
+		t.Fatalf("Unexpected device count: %s != %s", stats["Device Present"]["Physical Devices"], physicalDevicesExpected)
+	}
+
+	if stats["Device Present"]["Degraded"] != virtualDevicesDegraded {
+		t.Fatalf("Unexpected degraded count: %s != %s", stats["Device Present"]["Degraded"], virtualDevicesDegraded)
+	}
+}
+
+func TestMegaCliDisks(t *testing.T) {
+	data, err := os.Open(testMegaCliDisks)
+	if err != nil {
+		t.Fatal(err)
+	}
+	stats, err := parseMegaCliDisks(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if stats[32][0]["Drive Temperature"] != "37C (98.60 F)" {
+		t.Fatalf("Unexpected drive temperature: %s", stats[32][0]["Drive Temperature"])
+	}
+
+	if stats[32][3]["Predictive Failure Count"] != "23" {
+		t.Fatalf("Unexpected predictive failure count: %s", stats[32][3]["Predictive Failure Count"])
+	}
+}
+
+func TestParseDriveTemperature(t *testing.T) {
+	got, err := parseDriveTemperature("37C (98.60 F)")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got != 37 {
+		t.Fatalf("Unexpected drive temperature: %v", got)
+	}
+
+	for _, raw := range []string{"", "N/A"} {
+		if _, err := parseDriveTemperature(raw); err == nil {
+			t.Fatalf("Expected an error for %q", raw)
+		}
+	}
+}
diff --git a/node_exporter.conf b/node_exporter.conf index 8ce3c679..c3567d39 100644 --- a/node_exporter.conf +++ b/node_exporter.conf @@ -1,7 +1,10 @@ { "attributes" : { + "default" : "1", "web_server" : "1", - "zone" : "a", - "default" : "1" + "zone" : "a" + }, + "config" : { + "megacli_command" : "megacli.sh" } }