154 lines
5.0 KiB
Go
154 lines
5.0 KiB
Go
// Copyright 2018 Prometheus Team
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package cluster
|
|
|
|
import (
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gogo/protobuf/proto"
|
|
"github.com/hashicorp/memberlist"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
"github.com/prometheus/alertmanager/cluster/clusterpb"
|
|
)
|
|
|
|
// Channel allows clients to send messages for a specific state type that will be
|
|
// broadcasted in a best-effort manner.
|
|
type Channel struct {
|
|
key string
|
|
send func([]byte)
|
|
peers func() []*memberlist.Node
|
|
sendOversize func(*memberlist.Node, []byte) error
|
|
|
|
msgc chan []byte
|
|
logger *slog.Logger
|
|
|
|
oversizeGossipMessageFailureTotal prometheus.Counter
|
|
oversizeGossipMessageDroppedTotal prometheus.Counter
|
|
oversizeGossipMessageSentTotal prometheus.Counter
|
|
oversizeGossipDuration prometheus.Histogram
|
|
}
|
|
|
|
// NewChannel creates a new Channel struct, which handles sending normal and
|
|
// oversize messages to peers.
|
|
func NewChannel(
|
|
key string,
|
|
send func([]byte),
|
|
peers func() []*memberlist.Node,
|
|
sendOversize func(*memberlist.Node, []byte) error,
|
|
logger *slog.Logger,
|
|
stopc chan struct{},
|
|
reg prometheus.Registerer,
|
|
) *Channel {
|
|
oversizeGossipMessageFailureTotal := prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "alertmanager_oversized_gossip_message_failure_total",
|
|
Help: "Number of oversized gossip message sends that failed.",
|
|
ConstLabels: prometheus.Labels{"key": key},
|
|
})
|
|
oversizeGossipMessageSentTotal := prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "alertmanager_oversized_gossip_message_sent_total",
|
|
Help: "Number of oversized gossip message sent.",
|
|
ConstLabels: prometheus.Labels{"key": key},
|
|
})
|
|
oversizeGossipMessageDroppedTotal := prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "alertmanager_oversized_gossip_message_dropped_total",
|
|
Help: "Number of oversized gossip messages that were dropped due to a full message queue.",
|
|
ConstLabels: prometheus.Labels{"key": key},
|
|
})
|
|
oversizeGossipDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "alertmanager_oversize_gossip_message_duration_seconds",
|
|
Help: "Duration of oversized gossip message requests.",
|
|
ConstLabels: prometheus.Labels{"key": key},
|
|
Buckets: prometheus.DefBuckets,
|
|
NativeHistogramBucketFactor: 1.1,
|
|
NativeHistogramMaxBucketNumber: 100,
|
|
NativeHistogramMinResetDuration: 1 * time.Hour,
|
|
})
|
|
|
|
reg.MustRegister(oversizeGossipDuration, oversizeGossipMessageFailureTotal, oversizeGossipMessageDroppedTotal, oversizeGossipMessageSentTotal)
|
|
|
|
c := &Channel{
|
|
key: key,
|
|
send: send,
|
|
peers: peers,
|
|
logger: logger,
|
|
msgc: make(chan []byte, 200),
|
|
sendOversize: sendOversize,
|
|
oversizeGossipMessageFailureTotal: oversizeGossipMessageFailureTotal,
|
|
oversizeGossipMessageDroppedTotal: oversizeGossipMessageDroppedTotal,
|
|
oversizeGossipMessageSentTotal: oversizeGossipMessageSentTotal,
|
|
oversizeGossipDuration: oversizeGossipDuration,
|
|
}
|
|
|
|
go c.handleOverSizedMessages(stopc)
|
|
|
|
return c
|
|
}
|
|
|
|
// handleOverSizedMessages prevents memberlist from opening too many parallel
|
|
// TCP connections to its peers.
|
|
func (c *Channel) handleOverSizedMessages(stopc chan struct{}) {
|
|
var wg sync.WaitGroup
|
|
for {
|
|
select {
|
|
case b := <-c.msgc:
|
|
for _, n := range c.peers() {
|
|
wg.Add(1)
|
|
go func(n *memberlist.Node) {
|
|
defer wg.Done()
|
|
c.oversizeGossipMessageSentTotal.Inc()
|
|
start := time.Now()
|
|
if err := c.sendOversize(n, b); err != nil {
|
|
c.logger.Debug("failed to send reliable", "key", c.key, "node", n, "err", err)
|
|
c.oversizeGossipMessageFailureTotal.Inc()
|
|
return
|
|
}
|
|
c.oversizeGossipDuration.Observe(time.Since(start).Seconds())
|
|
}(n)
|
|
}
|
|
|
|
wg.Wait()
|
|
case <-stopc:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Broadcast enqueues a message for broadcasting.
|
|
func (c *Channel) Broadcast(b []byte) {
|
|
b, err := proto.Marshal(&clusterpb.Part{Key: c.key, Data: b})
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
if OversizedMessage(b) {
|
|
select {
|
|
case c.msgc <- b:
|
|
default:
|
|
c.logger.Debug("oversized gossip channel full")
|
|
c.oversizeGossipMessageDroppedTotal.Inc()
|
|
}
|
|
} else {
|
|
c.send(b)
|
|
}
|
|
}
|
|
|
|
// OversizedMessage indicates whether or not the byte payload should be sent
|
|
// via TCP.
|
|
func OversizedMessage(b []byte) bool {
|
|
return len(b) > MaxGossipPacketSize/2
|
|
}
|