Create dedicated metric for owner_node

Signed-off-by: Alexandre JARDON <28548335+webalexeu@users.noreply.github.com>
This commit is contained in:
Alexandre JARDON 2024-02-16 08:42:25 +01:00
parent eab87292c1
commit 9b5568354c
5 changed files with 115 additions and 62 deletions

View File

@ -36,7 +36,7 @@ Name | Description | Type | Labels
### Example metric
Query the state of all cluster resource owned by node1
```
windows_mscluster_resource_state{owner_node="node1"}
windows_mscluster_resource_owner_node{node_name="node1"}
```
## Useful queries

View File

@ -34,7 +34,7 @@ Name | Description | Type | Labels
### Example metric
Query the state of all cluster group owned by node1
```
windows_mscluster_resourcegroup_state{owner_node="node1"}
windows_mscluster_resourcegroup_owner_node{node_name="node1"}
```
## Useful queries

View File

@ -15,6 +15,9 @@ type Config struct{}
var ConfigDefaults = Config{}
// Variable used by mscluster_resource and mscluster_resourcegroup
var NodeName []string
// A collector is a Prometheus collector for WMI MSCluster_Node metrics
type collector struct {
logger log.Logger
@ -175,6 +178,8 @@ func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric)
return err
}
NodeName = []string{}
for _, v := range dst {
ch <- prometheus.MustNewConstMetric(
@ -274,6 +279,8 @@ func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric)
float64(v.StatusInformation),
v.Name,
)
NodeName = append(NodeName, v.Name)
}
return nil

View File

@ -1,6 +1,7 @@
package mscluster_resource
import (
"github.com/prometheus-community/windows_exporter/pkg/collector/mscluster_node"
"github.com/prometheus-community/windows_exporter/pkg/types"
"github.com/prometheus-community/windows_exporter/pkg/wmi"
@ -26,6 +27,7 @@ type collector struct {
IsAlivePollInterval *prometheus.Desc
LooksAlivePollInterval *prometheus.Desc
MonitorProcessId *prometheus.Desc
OwnerNode *prometheus.Desc
PendingTimeout *prometheus.Desc
ResourceClass *prometheus.Desc
RestartAction *prometheus.Desc
@ -63,97 +65,103 @@ func (c *collector) Build() error {
c.Characteristics = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "characteristics"),
"Provides the characteristics of the object.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.DeadlockTimeout = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "deadlock_timeout"),
"Indicates the length of time to wait, in milliseconds, before declaring a deadlock in any call into a resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.EmbeddedFailureAction = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "embedded_failure_action"),
"The time, in milliseconds, that a resource should remain in a failed state before the Cluster service attempts to restart it.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.Flags = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "flags"),
"Provides access to the flags set for the object.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.IsAlivePollInterval = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "is_alive_poll_interval"),
"Provides access to the resource's IsAlivePollInterval property, which is the recommended interval in milliseconds at which the Cluster Service should poll the resource to determine whether it is operational. If the property is set to 0xFFFFFFFF, the Cluster Service uses the IsAlivePollInterval property for the resource type associated with the resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.LooksAlivePollInterval = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "looks_alive_poll_interval"),
"Provides access to the resource's LooksAlivePollInterval property, which is the recommended interval in milliseconds at which the Cluster Service should poll the resource to determine whether it appears operational. If the property is set to 0xFFFFFFFF, the Cluster Service uses the LooksAlivePollInterval property for the resource type associated with the resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.MonitorProcessId = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "monitor_process_id"),
"Provides the process ID of the resource host service that is currently hosting the resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.OwnerNode = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "owner_node"),
"The node hosting the resource. 0: Not hosted; 1: Hosted",
[]string{"type", "owner_group", "node_name", "name"},
nil,
)
c.PendingTimeout = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "pending_timeout"),
"Provides access to the resource's PendingTimeout property. If a resource cannot be brought online or taken offline in the number of milliseconds specified by the PendingTimeout property, the resource is forcibly terminated.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.ResourceClass = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "resource_class"),
"Gets or sets the resource class of a resource. 0: Unknown; 1: Storage; 2: Network; 32768: Unknown ",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.RestartAction = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "restart_action"),
"Provides access to the resource's RestartAction property, which is the action to be taken by the Cluster Service if the resource fails.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.RestartDelay = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "restart_delay"),
"Indicates the time delay before a failed resource is restarted.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.RestartPeriod = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "restart_period"),
"Provides access to the resource's RestartPeriod property, which is interval of time, in milliseconds, during which a specified number of restart attempts can be made on a nonresponsive resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.RestartThreshold = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "restart_threshold"),
"Provides access to the resource's RestartThreshold property which is the maximum number of restart attempts that can be made on a resource within an interval defined by the RestartPeriod property before the Cluster Service initiates the action specified by the RestartAction property.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.RetryPeriodOnFailure = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "retry_period_on_failure"),
"Provides access to the resource's RetryPeriodOnFailure property, which is the interval of time (in milliseconds) that a resource should remain in a failed state before the Cluster service attempts to restart it.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.State = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "state"),
"The current state of the resource. -1: Unknown; 0: Inherited; 1: Initializing; 2: Online; 3: Offline; 4: Failed; 128: Pending; 129: Online Pending; 130: Offline Pending ",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
c.Subclass = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "subclass"),
"Provides the list of references to nodes that can be the owner of this resource.",
[]string{"type", "owner_group", "owner_node", "name"},
[]string{"type", "owner_group", "name"},
nil,
)
return nil
@ -200,112 +208,127 @@ func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric)
c.Characteristics,
prometheus.GaugeValue,
float64(v.Characteristics),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.DeadlockTimeout,
prometheus.GaugeValue,
float64(v.DeadlockTimeout),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.EmbeddedFailureAction,
prometheus.GaugeValue,
float64(v.EmbeddedFailureAction),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.Flags,
prometheus.GaugeValue,
float64(v.Flags),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.IsAlivePollInterval,
prometheus.GaugeValue,
float64(v.IsAlivePollInterval),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.LooksAlivePollInterval,
prometheus.GaugeValue,
float64(v.LooksAlivePollInterval),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.MonitorProcessId,
prometheus.GaugeValue,
float64(v.MonitorProcessId),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
if mscluster_node.NodeName != nil {
for _, node_name := range mscluster_node.NodeName {
isCurrentState := 0.0
if v.OwnerNode == node_name {
isCurrentState = 1.0
}
ch <- prometheus.MustNewConstMetric(
c.OwnerNode,
prometheus.GaugeValue,
isCurrentState,
v.Type, v.OwnerGroup, node_name, v.Name,
)
}
}
ch <- prometheus.MustNewConstMetric(
c.PendingTimeout,
prometheus.GaugeValue,
float64(v.PendingTimeout),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.ResourceClass,
prometheus.GaugeValue,
float64(v.ResourceClass),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.RestartAction,
prometheus.GaugeValue,
float64(v.RestartAction),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.RestartDelay,
prometheus.GaugeValue,
float64(v.RestartDelay),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.RestartPeriod,
prometheus.GaugeValue,
float64(v.RestartPeriod),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.RestartThreshold,
prometheus.GaugeValue,
float64(v.RestartThreshold),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.RetryPeriodOnFailure,
prometheus.GaugeValue,
float64(v.RetryPeriodOnFailure),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.State,
prometheus.GaugeValue,
float64(v.State),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.Subclass,
prometheus.GaugeValue,
float64(v.Subclass),
v.Type, v.OwnerGroup, v.OwnerNode, v.Name,
v.Type, v.OwnerGroup, v.Name,
)
}

View File

@ -1,6 +1,7 @@
package mscluster_resourcegroup
import (
"github.com/prometheus-community/windows_exporter/pkg/collector/mscluster_node"
"github.com/prometheus-community/windows_exporter/pkg/types"
"github.com/prometheus-community/windows_exporter/pkg/wmi"
@ -31,6 +32,7 @@ type collector struct {
Flags *prometheus.Desc
GroupType *prometheus.Desc
PlacementOptions *prometheus.Desc
OwnerNode *prometheus.Desc
Priority *prometheus.Desc
ResiliencyPeriod *prometheus.Desc
State *prometheus.Desc
@ -62,79 +64,85 @@ func (c *collector) Build() error {
c.AutoFailbackType = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "auto_failback_type"),
"Provides access to the group's AutoFailbackType property.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.Characteristics = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "characteristics"),
"Provides the characteristics of the group.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.ColdStartSetting = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "cold_start_setting"),
"Indicates whether a group can start after a cluster cold start.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.DefaultOwner = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "default_owner"),
"Number of the last node the resource group was activated on or explicitly moved to.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.FailbackWindowEnd = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "failback_window_end"),
"The FailbackWindowEnd property provides the latest time that the group can be moved back to the node identified as its preferred node.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.FailbackWindowStart = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "failback_window_start"),
"The FailbackWindowStart property provides the earliest time (that is, local time as kept by the cluster) that the group can be moved back to the node identified as its preferred node.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.FailoverPeriod = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "failover_period"),
"The FailoverPeriod property specifies a number of hours during which a maximum number of failover attempts, specified by the FailoverThreshold property, can occur.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.FailoverThreshold = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "failover_threshold"),
"The FailoverThreshold property specifies the maximum number of failover attempts.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.Flags = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "flags"),
"Provides access to the flags set for the group. ",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.GroupType = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "group_type"),
"The Type of the resource group.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.OwnerNode = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "owner_node"),
"The node hosting the resource group. 0: Not hosted; 1: Hosted",
[]string{"node_name", "name"},
nil,
)
c.Priority = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "priority"),
"Priority value of the resource group",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.ResiliencyPeriod = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "resiliency_period"),
"The resiliency period for this group, in seconds.",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
c.State = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "state"),
"The current state of the resource group. -1: Unknown; 0: Online; 1: Offline; 2: Failed; 3: Partial Online; 4: Pending",
[]string{"owner_node", "name"},
[]string{"name"},
nil,
)
return nil
@ -143,8 +151,7 @@ func (c *collector) Build() error {
// MSCluster_ResourceGroup docs:
// - https://docs.microsoft.com/en-us/previous-versions/windows/desktop/cluswmi/mscluster-resourcegroup
type MSCluster_ResourceGroup struct {
Name string
OwnerNode string
Name string
AutoFailbackType uint
Characteristics uint
@ -156,6 +163,7 @@ type MSCluster_ResourceGroup struct {
FailoverThreshold uint
Flags uint
GroupType uint
OwnerNode string
Priority uint
ResiliencyPeriod uint
State uint
@ -176,91 +184,106 @@ func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric)
c.AutoFailbackType,
prometheus.GaugeValue,
float64(v.AutoFailbackType),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.Characteristics,
prometheus.GaugeValue,
float64(v.Characteristics),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.ColdStartSetting,
prometheus.GaugeValue,
float64(v.ColdStartSetting),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.DefaultOwner,
prometheus.GaugeValue,
float64(v.DefaultOwner),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.FailbackWindowEnd,
prometheus.GaugeValue,
float64(v.FailbackWindowEnd),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.FailbackWindowStart,
prometheus.GaugeValue,
float64(v.FailbackWindowStart),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.FailoverPeriod,
prometheus.GaugeValue,
float64(v.FailoverPeriod),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.FailoverThreshold,
prometheus.GaugeValue,
float64(v.FailoverThreshold),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.Flags,
prometheus.GaugeValue,
float64(v.Flags),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.GroupType,
prometheus.GaugeValue,
float64(v.GroupType),
v.OwnerNode, v.Name,
v.Name,
)
if mscluster_node.NodeName != nil {
for _, node_name := range mscluster_node.NodeName {
isCurrentState := 0.0
if v.OwnerNode == node_name {
isCurrentState = 1.0
}
ch <- prometheus.MustNewConstMetric(
c.OwnerNode,
prometheus.GaugeValue,
isCurrentState,
node_name, v.Name,
)
}
}
ch <- prometheus.MustNewConstMetric(
c.Priority,
prometheus.GaugeValue,
float64(v.Priority),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.ResiliencyPeriod,
prometheus.GaugeValue,
float64(v.ResiliencyPeriod),
v.OwnerNode, v.Name,
v.Name,
)
ch <- prometheus.MustNewConstMetric(
c.State,
prometheus.GaugeValue,
float64(v.State),
v.OwnerNode, v.Name,
v.Name,
)
}