diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d6abb2e..9bd7ee41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * The cpufreq metrics now separate the `cpufreq` and `scaling` data based on what the driver provides. #1248 * The labels for the network_up metric have changed, see issue #1236 * Bonding collector now uses `mii_status` instead of `operstatus` #1124 +* Several systemd metrics have been turned off by default to improve performance #1254 + These include unit_tasks_current, unit_tasks_max, service_restart_total, and unit_start_time_seconds ### Changes @@ -16,6 +18,7 @@ * [CHANGE] Add TCPSynRetrans to netstat default filter #1143 * [CHANGE] Add a limit to the number of in-flight requests #1166 * [CHANGE] Add separate cpufreq and scaling metrics #1248 +* [CHANGE] Several systemd metrics have been turned off by default to improve performance #1254 * [ENHANCEMENT] Add Infiniband counters #1120 * [ENHANCEMENT] Move network_up labels into new metric network_info #1236 * [FEATURE] Add a flag to disable exporter metrics #1148 diff --git a/collector/systemd_linux.go b/collector/systemd_linux.go index b34c938c..faccdede 100644 --- a/collector/systemd_linux.go +++ b/collector/systemd_linux.go @@ -20,6 +20,8 @@ import ( "math" "regexp" "strings" + "sync" + "time" "github.com/coreos/go-systemd/dbus" "github.com/prometheus/client_golang/prometheus" @@ -28,9 +30,12 @@ import ( ) var ( - unitWhitelist = kingpin.Flag("collector.systemd.unit-whitelist", "Regexp of systemd units to whitelist. Units must both match whitelist and not match blacklist to be included.").Default(".+").String() - unitBlacklist = kingpin.Flag("collector.systemd.unit-blacklist", "Regexp of systemd units to blacklist. 
Units must both match whitelist and not match blacklist to be included.").Default(".+\\.scope").String() - systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus.").Bool() + unitWhitelist = kingpin.Flag("collector.systemd.unit-whitelist", "Regexp of systemd units to whitelist. Units must both match whitelist and not match blacklist to be included.").Default(".+").String() + unitBlacklist = kingpin.Flag("collector.systemd.unit-blacklist", "Regexp of systemd units to blacklist. Units must both match whitelist and not match blacklist to be included.").Default(".+\\.scope").String() + systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus.").Bool() + enableTaskMetrics = kingpin.Flag("collector.systemd.enable-task-metrics", "Enables service unit tasks metrics unit_tasks_current and unit_tasks_max").Bool() + enableRestartsMetrics = kingpin.Flag("collector.systemd.enable-restarts-metrics", "Enables service unit metric service_restart_total").Bool() + enableStartTimeMetrics = kingpin.Flag("collector.systemd.enable-start-time-metrics", "Enables service unit metric unit_start_time_seconds").Bool() ) type systemdCollector struct { @@ -118,34 +123,102 @@ func NewSystemdCollector() (Collector, error) { }, nil } +// Update gathers metrics from systemd. Dbus collection is done in parallel +// to reduce wait time for responses. 
func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error { - allUnits, err := c.getAllUnits() + begin := time.Now() + conn, err := c.newDbus() + if err != nil { + return fmt.Errorf("couldn't get dbus connection: %s", err) + } + defer conn.Close() + + allUnits, err := c.getAllUnits(conn) if err != nil { return fmt.Errorf("couldn't get units: %s", err) } + log.Debugf("systemd getAllUnits took %f", time.Since(begin).Seconds()) + begin = time.Now() summary := summarizeUnits(allUnits) c.collectSummaryMetrics(ch, summary) + log.Debugf("systemd collectSummaryMetrics took %f", time.Since(begin).Seconds()) + begin = time.Now() units := filterUnits(allUnits, c.unitWhitelistPattern, c.unitBlacklistPattern) - c.collectUnitStatusMetrics(ch, units) - c.collectUnitStartTimeMetrics(ch, units) - c.collectUnitTasksCurrentMetrics(ch, units) - c.collectUnitTasksMaxMetrics(ch, units) - c.collectTimers(ch, units) - c.collectSockets(ch, units) + log.Debugf("systemd filterUnits took %f", time.Since(begin).Seconds()) - systemState, err := c.getSystemState() - if err != nil { - return fmt.Errorf("couldn't get system state: %s", err) + var wg sync.WaitGroup + defer wg.Wait() + + wg.Add(1) + go func() { + defer wg.Done() + begin := time.Now() + c.collectUnitStatusMetrics(conn, ch, units) + log.Debugf("systemd collectUnitStatusMetrics took %f", time.Since(begin).Seconds()) + }() + + if *enableStartTimeMetrics { + wg.Add(1) + go func() { + defer wg.Done() + begin := time.Now() + c.collectUnitStartTimeMetrics(conn, ch, units) + log.Debugf("systemd collectUnitStartTimeMetrics took %f", time.Since(begin).Seconds()) + }() } - c.collectSystemState(ch, systemState) - return nil + if *enableTaskMetrics { + wg.Add(1) + go func() { + defer wg.Done() + begin := time.Now() + c.collectUnitTasksMetrics(conn, ch, units) + log.Debugf("systemd collectUnitTasksMetrics took %f", time.Since(begin).Seconds()) + }() + } + + wg.Add(1) + go func() { + defer wg.Done() + begin := time.Now() + 
c.collectTimers(conn, ch, units) + log.Debugf("systemd collectTimers took %f", time.Since(begin).Seconds()) + }() + + wg.Add(1) + go func() { + defer wg.Done() + begin := time.Now() + c.collectSockets(conn, ch, units) + log.Debugf("systemd collectSockets took %f", time.Since(begin).Seconds()) + }() + + begin = time.Now() + err = c.collectSystemState(conn, ch) + log.Debugf("systemd collectSystemState took %f", time.Since(begin).Seconds()) + return err } -func (c *systemdCollector) collectUnitStatusMetrics(ch chan<- prometheus.Metric, units []unit) { +func (c *systemdCollector) collectUnitStatusMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { + serviceType := "" + if strings.HasSuffix(unit.Name, ".service") { + serviceTypeProperty, err := conn.GetUnitTypeProperty(unit.Name, "Service", "Type") + if err != nil { + log.Debugf("couldn't get unit '%s' Type: %s", unit.Name, err) + } else { + serviceType = serviceTypeProperty.Value.Value().(string) + } + } else if strings.HasSuffix(unit.Name, ".mount") { + serviceTypeProperty, err := conn.GetUnitTypeProperty(unit.Name, "Mount", "Type") + if err != nil { + log.Debugf("couldn't get unit '%s' Type: %s", unit.Name, err) + } else { + serviceType = serviceTypeProperty.Value.Value().(string) + } + } for _, stateName := range unitStatesName { isActive := 0.0 if stateName == unit.ActiveState { @@ -153,73 +226,126 @@ func (c *systemdCollector) collectUnitStatusMetrics(ch chan<- prometheus.Metric, } ch <- prometheus.MustNewConstMetric( c.unitDesc, prometheus.GaugeValue, isActive, - unit.Name, stateName, unit.serviceType) + unit.Name, stateName, serviceType) } - if strings.HasSuffix(unit.Name, ".service") && unit.nRestarts != nil { - ch <- prometheus.MustNewConstMetric( - c.nRestartsDesc, prometheus.CounterValue, - float64(*unit.nRestarts), unit.Name) + if *enableRestartsMetrics && strings.HasSuffix(unit.Name, ".service") { + // NRestarts wasn't added until systemd 235. 
+ restartsCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "NRestarts") + if err != nil { + log.Debugf("couldn't get unit '%s' NRestarts: %s", unit.Name, err) + } else { + ch <- prometheus.MustNewConstMetric( + c.nRestartsDesc, prometheus.CounterValue, + float64(restartsCount.Value.Value().(uint32)), unit.Name) + } } } } -func (c *systemdCollector) collectSockets(ch chan<- prometheus.Metric, units []unit) { +func (c *systemdCollector) collectSockets(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if !strings.HasSuffix(unit.Name, ".socket") { continue } + acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted") + if err != nil { + log.Debugf("couldn't get unit '%s' NAccepted: %s", unit.Name, err) + continue + } ch <- prometheus.MustNewConstMetric( c.socketAcceptedConnectionsDesc, prometheus.CounterValue, - float64(unit.acceptedConnections), unit.Name) + float64(acceptedConnectionCount.Value.Value().(uint32)), unit.Name) + + currentConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NConnections") + if err != nil { + log.Debugf("couldn't get unit '%s' NConnections: %s", unit.Name, err) + continue + } ch <- prometheus.MustNewConstMetric( c.socketCurrentConnectionsDesc, prometheus.GaugeValue, - float64(unit.currentConnections), unit.Name) - if unit.refusedConnections != nil { + float64(currentConnectionCount.Value.Value().(uint32)), unit.Name) + + // NRefused wasn't added until systemd 239. 
+ refusedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NRefused") + if err != nil { + //log.Debugf("couldn't get unit '%s' NRefused: %s", unit.Name, err) + } else { ch <- prometheus.MustNewConstMetric( c.socketRefusedConnectionsDesc, prometheus.GaugeValue, - float64(*unit.refusedConnections), unit.Name) + float64(refusedConnectionCount.Value.Value().(uint32)), unit.Name) } } } -func (c *systemdCollector) collectUnitStartTimeMetrics(ch chan<- prometheus.Metric, units []unit) { +func (c *systemdCollector) collectUnitStartTimeMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) { + var startTimeUsec uint64 + for _, unit := range units { + if unit.ActiveState != "active" { + startTimeUsec = 0 + } else { + timestampValue, err := conn.GetUnitProperty(unit.Name, "ActiveEnterTimestamp") + if err != nil { + log.Debugf("couldn't get unit '%s' StartTimeUsec: %s", unit.Name, err) + continue + } + startTimeUsec = timestampValue.Value.Value().(uint64) + } + ch <- prometheus.MustNewConstMetric( c.unitStartTimeDesc, prometheus.GaugeValue, - float64(unit.startTimeUsec)/1e6, unit.Name) + float64(startTimeUsec)/1e6, unit.Name) } } -func (c *systemdCollector) collectUnitTasksCurrentMetrics(ch chan<- prometheus.Metric, units []unit) { +func (c *systemdCollector) collectUnitTasksMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) { + var val uint64 for _, unit := range units { - if unit.tasksCurrent != nil { - ch <- prometheus.MustNewConstMetric( - c.unitTasksCurrentDesc, prometheus.GaugeValue, - float64(*unit.tasksCurrent), unit.Name) + if strings.HasSuffix(unit.Name, ".service") { + tasksCurrentCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksCurrent") + if err != nil { + log.Debugf("couldn't get unit '%s' TasksCurrent: %s", unit.Name, err) + } else { + val = tasksCurrentCount.Value.Value().(uint64) + // Skip the tasksCurrent metric when dbus reports MaxUint64. 
+ if val != math.MaxUint64 { + ch <- prometheus.MustNewConstMetric( + c.unitTasksCurrentDesc, prometheus.GaugeValue, + float64(val), unit.Name) + } + } + tasksMaxCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksMax") + if err != nil { + log.Debugf("couldn't get unit '%s' TasksMax: %s", unit.Name, err) + } else { + val = tasksMaxCount.Value.Value().(uint64) + // Skip the tasksMax metric when dbus reports MaxUint64. + if val != math.MaxUint64 { + ch <- prometheus.MustNewConstMetric( + c.unitTasksMaxDesc, prometheus.GaugeValue, + float64(val), unit.Name) + } + } } } } -func (c *systemdCollector) collectUnitTasksMaxMetrics(ch chan<- prometheus.Metric, units []unit) { - for _, unit := range units { - if unit.tasksMax != nil { - ch <- prometheus.MustNewConstMetric( - c.unitTasksMaxDesc, prometheus.GaugeValue, - float64(*unit.tasksMax), unit.Name) - } - } -} - -func (c *systemdCollector) collectTimers(ch chan<- prometheus.Metric, units []unit) { +func (c *systemdCollector) collectTimers(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if !strings.HasSuffix(unit.Name, ".timer") { continue } + lastTriggerValue, err := conn.GetUnitTypeProperty(unit.Name, "Timer", "LastTriggerUSec") + if err != nil { + log.Debugf("couldn't get unit '%s' LastTriggerUSec: %s", unit.Name, err) + continue + } + ch <- prometheus.MustNewConstMetric( c.timerLastTriggerDesc, prometheus.GaugeValue, - float64(unit.lastTriggerUsec)/1e6, unit.Name) + float64(lastTriggerValue.Value.Value().(uint64))/1e6, unit.Name) } } @@ -230,12 +356,17 @@ func (c *systemdCollector) collectSummaryMetrics(ch chan<- prometheus.Metric, su } } -func (c *systemdCollector) collectSystemState(ch chan<- prometheus.Metric, systemState string) { +func (c *systemdCollector) collectSystemState(conn *dbus.Conn, ch chan<- prometheus.Metric) error { + systemState, err := conn.GetManagerProperty("SystemState") + if err != nil { + return fmt.Errorf("couldn't get system state: %s", 
err) + } isSystemRunning := 0.0 if systemState == `"running"` { isSystemRunning = 1.0 } ch <- prometheus.MustNewConstMetric(c.systemRunningDesc, prometheus.GaugeValue, isSystemRunning) + return nil } func (c *systemdCollector) newDbus() (*dbus.Conn, error) { @@ -247,37 +378,10 @@ func (c *systemdCollector) newDbus() (*dbus.Conn, error) { type unit struct { dbus.UnitStatus - lastTriggerUsec uint64 - startTimeUsec uint64 - tasksCurrent *uint64 - tasksMax *uint64 - nRestarts *uint32 - serviceType string - acceptedConnections uint32 - currentConnections uint32 - refusedConnections *uint32 } -// unitType gets the suffix after the last "." in the -// unit name and capitalizes the first letter -func (u *unit) unitType() string { - suffixIndex := strings.LastIndex(u.Name, ".") + 1 - if suffixIndex < 1 || suffixIndex > len(u.Name) { - return "" - } - return strings.Title(u.Name[suffixIndex:]) -} - -func (c *systemdCollector) getAllUnits() ([]unit, error) { - conn, err := c.newDbus() - if err != nil { - return nil, fmt.Errorf("couldn't get dbus connection: %s", err) - } - defer conn.Close() - - // Filter out any units that are not installed and are pulled in only as dependencies. 
+func (c *systemdCollector) getAllUnits(conn *dbus.Conn) ([]unit, error) { allUnits, err := conn.ListUnits() - if err != nil { return nil, err } @@ -287,96 +391,6 @@ func (c *systemdCollector) getAllUnits() ([]unit, error) { unit := unit{ UnitStatus: status, } - unitType := unit.unitType() - if unitType == "Service" || unitType == "Mount" { - serviceType, err := conn.GetUnitTypeProperty(unit.Name, unitType, "Type") - if err != nil { - log.Debugf("couldn't get type for unit '%s': %s", unit.Name, err) - } else { - unit.serviceType = serviceType.Value.Value().(string) - } - } - if strings.HasSuffix(unit.Name, ".timer") { - lastTriggerValue, err := conn.GetUnitTypeProperty(unit.Name, "Timer", "LastTriggerUSec") - if err != nil { - log.Debugf("couldn't get unit '%s' LastTriggerUSec: %s", unit.Name, err) - continue - } - - unit.lastTriggerUsec = lastTriggerValue.Value.Value().(uint64) - } - if strings.HasSuffix(unit.Name, ".service") { - // NRestarts wasn't added until systemd 235. - restartsCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "NRestarts") - if err != nil { - log.Debugf("couldn't get unit '%s' NRestarts: %s", unit.Name, err) - } else { - nRestarts := restartsCount.Value.Value().(uint32) - unit.nRestarts = &nRestarts - } - - tasksCurrentCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksCurrent") - if err != nil { - log.Debugf("couldn't get unit '%s' TasksCurrent: %s", unit.Name, err) - } else { - val := tasksCurrentCount.Value.Value().(uint64) - // Don't set if tasksCurrent if dbus reports MaxUint64. - if val != math.MaxUint64 { - unit.tasksCurrent = &val - } - } - - tasksMaxCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksMax") - if err != nil { - log.Debugf("couldn't get unit '%s' TasksMax: %s", unit.Name, err) - } else { - val := tasksMaxCount.Value.Value().(uint64) - // Don't set if tasksMax if dbus reports MaxUint64. 
- if val != math.MaxUint64 { - unit.tasksMax = &val - } - } - - } - - if strings.HasSuffix(unit.Name, ".socket") { - acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted") - if err != nil { - log.Debugf("couldn't get unit '%s' NAccepted: %s", unit.Name, err) - continue - } - - unit.acceptedConnections = acceptedConnectionCount.Value.Value().(uint32) - - currentConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NConnections") - if err != nil { - log.Debugf("couldn't get unit '%s' NConnections: %s", unit.Name, err) - continue - } - unit.currentConnections = currentConnectionCount.Value.Value().(uint32) - - // NRefused wasn't added until systemd 239. - refusedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NRefused") - if err != nil { - log.Debugf("couldn't get unit '%s' NRefused: %s", unit.Name, err) - } else { - nRefused := refusedConnectionCount.Value.Value().(uint32) - unit.refusedConnections = &nRefused - } - } - - if unit.ActiveState != "active" { - unit.startTimeUsec = 0 - } else { - timestampValue, err := conn.GetUnitProperty(unit.Name, "ActiveEnterTimestamp") - if err != nil { - log.Debugf("couldn't get unit '%s' StartTimeUsec: %s", unit.Name, err) - continue - } - - unit.startTimeUsec = timestampValue.Value.Value().(uint64) - } - result = append(result, unit) } @@ -410,13 +424,3 @@ func filterUnits(units []unit, whitelistPattern, blacklistPattern *regexp.Regexp return filtered } - -func (c *systemdCollector) getSystemState() (state string, err error) { - conn, err := c.newDbus() - if err != nil { - return "", fmt.Errorf("couldn't get dbus connection: %s", err) - } - state, err = conn.GetManagerProperty("SystemState") - conn.Close() - return state, err -} diff --git a/collector/systemd_linux_test.go b/collector/systemd_linux_test.go index 26257aa5..b2298d64 100644 --- a/collector/systemd_linux_test.go +++ b/collector/systemd_linux_test.go @@ -18,7 +18,6 @@ import ( "testing" 
"github.com/coreos/go-systemd/dbus" - "github.com/prometheus/client_golang/prometheus" ) // Creates mock UnitLists @@ -87,26 +86,6 @@ func getUnitListFixtures() [][]unit { return [][]unit{fixture1, fixture2} } -func TestSystemdCollectorDoesntCrash(t *testing.T) { - c, err := NewSystemdCollector() - if err != nil { - t.Fatal(err) - } - sink := make(chan prometheus.Metric) - go func() { - for { - <-sink - } - }() - - fixtures := getUnitListFixtures() - collector := (c).(*systemdCollector) - for _, units := range fixtures { - collector.collectUnitStatusMetrics(sink, units) - collector.collectSockets(sink, units) - } -} - func TestSystemdIgnoreFilter(t *testing.T) { fixtures := getUnitListFixtures() whitelistPattern := regexp.MustCompile("^foo$")