// Copyright 2013 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2012-11-24 11:33:34 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2013-01-22 18:32:56 +00:00
|
|
|
"flag"
|
2013-06-25 12:02:27 +00:00
|
|
|
"os"
|
|
|
|
"os/signal"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2013-08-12 15:18:02 +00:00
|
|
|
"github.com/golang/glog"
|
2013-06-25 12:02:27 +00:00
|
|
|
"github.com/prometheus/client_golang/extraction"
|
|
|
|
|
2013-01-27 17:49:45 +00:00
|
|
|
"github.com/prometheus/prometheus/config"
|
2013-07-30 15:18:07 +00:00
|
|
|
"github.com/prometheus/prometheus/notification"
|
2013-01-27 17:49:45 +00:00
|
|
|
"github.com/prometheus/prometheus/retrieval"
|
|
|
|
"github.com/prometheus/prometheus/rules"
|
2013-02-08 17:03:26 +00:00
|
|
|
"github.com/prometheus/prometheus/storage/metric"
|
2013-02-08 13:49:55 +00:00
|
|
|
"github.com/prometheus/prometheus/web"
|
2013-05-05 17:32:04 +00:00
|
|
|
"github.com/prometheus/prometheus/web/api"
|
2012-11-24 11:33:34 +00:00
|
|
|
)
|
|
|
|
|
2013-06-25 12:02:27 +00:00
|
|
|
// deletionBatchSize is the batch size main passes to prometheus.delete, which
// uses it as the deletion processor's MaximumMutationPoolBatch.
const deletionBatchSize = 100
|
2013-05-13 08:53:24 +00:00
|
|
|
|
2013-01-22 18:32:56 +00:00
|
|
|
// Commandline flags.
//
// Every value below is registered on the default flag set at package
// initialisation and is only meaningful after flag.Parse has run in main.
var (
	// General server settings and queue sizing.
	printVersion                 = flag.Bool("version", false, "print version information")
	configFile                   = flag.String("configFile", "prometheus.conf", "Prometheus configuration file name.")
	metricsStoragePath           = flag.String("metricsStoragePath", "/tmp/metrics", "Base path for metrics storage.")
	samplesQueueCapacity         = flag.Int("samplesQueueCapacity", 4096, "The size of the unwritten samples queue.")
	concurrentRetrievalAllowance = flag.Int("concurrentRetrievalAllowance", 15, "The number of concurrent metrics retrieval requests allowed.")
	diskAppendQueueCapacity      = flag.Int("queue.diskAppendCapacity", 1000000, "The size of the queue for items that are pending writing to disk.")
	memoryAppendQueueCapacity    = flag.Int("queue.memoryAppendCapacity", 10000, "The size of the queue for items that are pending writing to memory.")

	// How often each compaction pass is triggered. The defaults are ten
	// minutes times 3, 5 and 7 respectively.
	headCompactInterval = flag.Duration("compact.headInterval", 10*3*time.Minute, "The amount of time between head compactions.")
	bodyCompactInterval = flag.Duration("compact.bodyInterval", 10*5*time.Minute, "The amount of time between body compactions.")
	tailCompactInterval = flag.Duration("compact.tailInterval", 10*7*time.Minute, "The amount of time between tail compactions.")

	// Minimum sample group size handed to each compaction pass.
	headGroupSize = flag.Int("compact.headGroupSize", 50, "The minimum group size for head samples.")
	bodyGroupSize = flag.Int("compact.bodyGroupSize", 250, "The minimum group size for body samples.")
	tailGroupSize = flag.Int("compact.tailGroupSize", 5000, "The minimum group size for tail samples.")

	// Age threshold handed to each compaction pass.
	headAge = flag.Duration("compact.headAgeInclusiveness", 5*time.Minute, "The relative inclusiveness of head samples.")
	bodyAge = flag.Duration("compact.bodyAgeInclusiveness", time.Hour, "The relative inclusiveness of body samples.")
	tailAge = flag.Duration("compact.tailAgeInclusiveness", 24*time.Hour, "The relative inclusiveness of tail samples.")

	// Deletion of old values: how often it runs and the age cut-off it uses.
	deleteInterval = flag.Duration("delete.interval", 10*11*time.Minute, "The amount of time between deletion of old values.")
	deleteAge      = flag.Duration("delete.ageMaximum", 10*24*time.Hour, "The relative maximum age for values before they are deleted.")

	// In-memory arena settings, passed to the tiered storage constructor.
	arenaFlushInterval = flag.Duration("arena.flushInterval", 15*time.Minute, "The period at which the in-memory arena is flushed to disk.")
	arenaTTL           = flag.Duration("arena.ttl", 10*time.Minute, "The relative age of values to purge to disk from memory.")

	// Alert manager integration.
	alertmanagerUrl           = flag.String("alertmanager.url", "", "The URL of the alert manager to send notifications to.")
	notificationQueueCapacity = flag.Int("alertmanager.notificationQueueCapacity", 100, "The size of the queue for pending alert manager notifications.")
)
|
|
|
|
|
2013-04-29 09:17:56 +00:00
|
|
|
type prometheus struct {
|
2013-08-05 15:31:49 +00:00
|
|
|
headCompactionTimer *time.Ticker
|
|
|
|
bodyCompactionTimer *time.Ticker
|
|
|
|
tailCompactionTimer *time.Ticker
|
|
|
|
deletionTimer *time.Ticker
|
|
|
|
|
2013-05-13 08:53:24 +00:00
|
|
|
curationMutex sync.Mutex
|
2013-05-07 15:14:04 +00:00
|
|
|
curationState chan metric.CurationState
|
|
|
|
stopBackgroundOperations chan bool
|
|
|
|
|
2013-06-25 12:02:27 +00:00
|
|
|
unwrittenSamples chan *extraction.Result
|
2013-05-07 15:14:04 +00:00
|
|
|
|
2013-07-30 15:18:07 +00:00
|
|
|
ruleManager rules.RuleManager
|
2013-08-09 17:32:55 +00:00
|
|
|
notifications chan notification.NotificationReqs
|
2013-07-30 15:18:07 +00:00
|
|
|
storage *metric.TieredStorage
|
2013-04-29 09:17:56 +00:00
|
|
|
}
|
|
|
|
|
2013-05-07 15:14:04 +00:00
|
|
|
func (p *prometheus) interruptHandler() {
|
2013-04-29 09:17:56 +00:00
|
|
|
notifier := make(chan os.Signal)
|
|
|
|
signal.Notify(notifier, os.Interrupt)
|
|
|
|
|
|
|
|
<-notifier
|
|
|
|
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Warning("Received SIGINT; Exiting gracefully...")
|
2013-04-29 09:17:56 +00:00
|
|
|
p.close()
|
|
|
|
os.Exit(0)
|
|
|
|
}
|
|
|
|
|
2013-05-07 15:14:04 +00:00
|
|
|
func (p *prometheus) compact(olderThan time.Duration, groupSize int) error {
|
2013-05-13 08:53:24 +00:00
|
|
|
p.curationMutex.Lock()
|
|
|
|
defer p.curationMutex.Unlock()
|
2013-05-07 15:14:04 +00:00
|
|
|
|
|
|
|
processor := &metric.CompactionProcessor{
|
|
|
|
MaximumMutationPoolBatch: groupSize * 3,
|
|
|
|
MinimumGroupSize: groupSize,
|
|
|
|
}
|
|
|
|
|
|
|
|
curator := metric.Curator{
|
|
|
|
Stop: p.stopBackgroundOperations,
|
|
|
|
}
|
|
|
|
|
|
|
|
return curator.Run(olderThan, time.Now(), processor, p.storage.DiskStorage.CurationRemarks, p.storage.DiskStorage.MetricSamples, p.storage.DiskStorage.MetricHighWatermarks, p.curationState)
|
|
|
|
}
|
|
|
|
|
2013-05-13 08:53:24 +00:00
|
|
|
func (p *prometheus) delete(olderThan time.Duration, batchSize int) error {
|
|
|
|
p.curationMutex.Lock()
|
|
|
|
defer p.curationMutex.Unlock()
|
|
|
|
|
|
|
|
processor := &metric.DeletionProcessor{
|
|
|
|
MaximumMutationPoolBatch: batchSize,
|
|
|
|
}
|
|
|
|
|
|
|
|
curator := metric.Curator{
|
|
|
|
Stop: p.stopBackgroundOperations,
|
|
|
|
}
|
|
|
|
|
|
|
|
return curator.Run(olderThan, time.Now(), processor, p.storage.DiskStorage.CurationRemarks, p.storage.DiskStorage.MetricSamples, p.storage.DiskStorage.MetricHighWatermarks, p.curationState)
|
|
|
|
}
|
|
|
|
|
2013-05-07 15:14:04 +00:00
|
|
|
func (p *prometheus) close() {
|
|
|
|
if p.headCompactionTimer != nil {
|
|
|
|
p.headCompactionTimer.Stop()
|
|
|
|
}
|
|
|
|
if p.bodyCompactionTimer != nil {
|
|
|
|
p.bodyCompactionTimer.Stop()
|
|
|
|
}
|
|
|
|
if p.tailCompactionTimer != nil {
|
|
|
|
p.tailCompactionTimer.Stop()
|
|
|
|
}
|
2013-05-13 08:53:24 +00:00
|
|
|
if p.deletionTimer != nil {
|
|
|
|
p.deletionTimer.Stop()
|
|
|
|
}
|
2013-05-07 15:14:04 +00:00
|
|
|
|
|
|
|
if len(p.stopBackgroundOperations) == 0 {
|
|
|
|
p.stopBackgroundOperations <- true
|
|
|
|
}
|
|
|
|
|
2013-05-13 08:53:24 +00:00
|
|
|
p.curationMutex.Lock()
|
2013-05-07 15:14:04 +00:00
|
|
|
|
2013-07-30 15:18:07 +00:00
|
|
|
p.ruleManager.Stop()
|
2013-04-29 09:17:56 +00:00
|
|
|
p.storage.Close()
|
2013-07-30 15:18:07 +00:00
|
|
|
|
|
|
|
close(p.notifications)
|
2013-05-07 15:14:04 +00:00
|
|
|
close(p.stopBackgroundOperations)
|
|
|
|
close(p.curationState)
|
2013-04-29 09:17:56 +00:00
|
|
|
}
|
|
|
|
|
2012-11-24 11:33:34 +00:00
|
|
|
func main() {
|
2013-04-29 09:17:56 +00:00
|
|
|
// TODO(all): Future additions to main should be, where applicable, glumped
|
|
|
|
// into the prometheus struct above---at least where the scoping of the entire
|
|
|
|
// server is concerned.
|
2013-01-22 18:32:56 +00:00
|
|
|
flag.Parse()
|
2013-04-25 09:47:48 +00:00
|
|
|
|
2013-04-25 11:14:50 +00:00
|
|
|
versionInfoTmpl.Execute(os.Stdout, BuildInfo)
|
|
|
|
|
2013-04-25 09:47:48 +00:00
|
|
|
if *printVersion {
|
|
|
|
os.Exit(0)
|
|
|
|
}
|
|
|
|
|
2013-01-22 18:32:56 +00:00
|
|
|
conf, err := config.LoadFromFile(*configFile)
|
2013-01-07 22:24:26 +00:00
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Fatalf("Error loading configuration from %s: %v", *configFile, err)
|
2013-01-07 22:24:26 +00:00
|
|
|
}
|
|
|
|
|
2013-05-14 15:50:52 +00:00
|
|
|
ts, err := metric.NewTieredStorage(uint(*diskAppendQueueCapacity), 100, *arenaFlushInterval, *arenaTTL, *metricsStoragePath)
|
2013-03-27 10:25:05 +00:00
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Fatal("Error opening storage:", err)
|
2013-05-02 16:27:12 +00:00
|
|
|
}
|
2013-04-29 09:17:56 +00:00
|
|
|
|
2013-06-25 12:02:27 +00:00
|
|
|
unwrittenSamples := make(chan *extraction.Result, *samplesQueueCapacity)
|
2013-05-05 17:32:04 +00:00
|
|
|
curationState := make(chan metric.CurationState, 1)
|
2013-05-07 15:14:04 +00:00
|
|
|
// Coprime numbers, fool!
|
|
|
|
headCompactionTimer := time.NewTicker(*headCompactInterval)
|
|
|
|
bodyCompactionTimer := time.NewTicker(*bodyCompactInterval)
|
|
|
|
tailCompactionTimer := time.NewTicker(*tailCompactInterval)
|
2013-05-13 08:53:24 +00:00
|
|
|
deletionTimer := time.NewTicker(*deleteInterval)
|
2013-05-05 17:32:04 +00:00
|
|
|
|
|
|
|
// Queue depth will need to be exposed
|
2013-06-25 12:02:27 +00:00
|
|
|
targetManager := retrieval.NewTargetManager(unwrittenSamples, *concurrentRetrievalAllowance)
|
2013-05-05 17:32:04 +00:00
|
|
|
targetManager.AddTargetsFromConfig(conf)
|
|
|
|
|
2013-08-09 17:32:55 +00:00
|
|
|
notifications := make(chan notification.NotificationReqs, *notificationQueueCapacity)
|
2013-07-30 15:18:07 +00:00
|
|
|
|
2013-06-11 09:00:55 +00:00
|
|
|
// Queue depth will need to be exposed
|
2013-07-30 15:18:07 +00:00
|
|
|
ruleManager := rules.NewRuleManager(unwrittenSamples, notifications, conf.EvaluationInterval(), ts)
|
2013-08-12 15:18:02 +00:00
|
|
|
if err := ruleManager.AddRulesFromConfig(conf); err != nil {
|
|
|
|
glog.Fatal("Error loading rule files:", err)
|
2013-06-11 09:00:55 +00:00
|
|
|
}
|
|
|
|
go ruleManager.Run()
|
|
|
|
|
2013-08-09 16:09:44 +00:00
|
|
|
prometheusUrl := web.MustBuildServerUrl()
|
|
|
|
notificationHandler := notification.NewNotificationHandler(*alertmanagerUrl, prometheusUrl, notifications)
|
2013-07-30 15:18:07 +00:00
|
|
|
go notificationHandler.Run()
|
|
|
|
|
2013-05-14 09:21:27 +00:00
|
|
|
flags := map[string]string{}
|
|
|
|
|
|
|
|
flag.VisitAll(func(f *flag.Flag) {
|
|
|
|
flags[f.Name] = f.Value.String()
|
|
|
|
})
|
|
|
|
|
2013-05-05 17:32:04 +00:00
|
|
|
statusHandler := &web.StatusHandler{
|
2013-05-14 09:21:27 +00:00
|
|
|
PrometheusStatus: &web.PrometheusStatus{
|
|
|
|
BuildInfo: BuildInfo,
|
|
|
|
Config: conf.String(),
|
2013-06-13 14:10:05 +00:00
|
|
|
RuleManager: ruleManager,
|
2013-05-14 09:21:27 +00:00
|
|
|
TargetPools: targetManager.Pools(),
|
|
|
|
Flags: flags,
|
2013-05-24 08:44:34 +00:00
|
|
|
Birth: time.Now(),
|
2013-05-14 09:21:27 +00:00
|
|
|
},
|
2013-05-05 17:32:04 +00:00
|
|
|
CurationState: curationState,
|
|
|
|
}
|
|
|
|
|
2013-06-13 14:10:05 +00:00
|
|
|
alertsHandler := &web.AlertsHandler{
|
|
|
|
RuleManager: ruleManager,
|
|
|
|
}
|
|
|
|
|
2013-05-14 09:21:27 +00:00
|
|
|
databasesHandler := &web.DatabasesHandler{
|
2013-08-05 16:34:19 +00:00
|
|
|
Provider: ts.DiskStorage,
|
|
|
|
RefreshInterval: 5 * time.Minute,
|
2013-05-14 09:21:27 +00:00
|
|
|
}
|
2013-05-05 17:32:04 +00:00
|
|
|
|
|
|
|
metricsService := &api.MetricsService{
|
|
|
|
Config: &conf,
|
|
|
|
TargetManager: targetManager,
|
|
|
|
Storage: ts,
|
|
|
|
}
|
|
|
|
|
|
|
|
webService := &web.WebService{
|
2013-05-14 09:21:27 +00:00
|
|
|
StatusHandler: statusHandler,
|
|
|
|
MetricsHandler: metricsService,
|
|
|
|
DatabasesHandler: databasesHandler,
|
2013-06-13 14:10:05 +00:00
|
|
|
AlertsHandler: alertsHandler,
|
2013-05-05 17:32:04 +00:00
|
|
|
}
|
2013-04-29 09:17:56 +00:00
|
|
|
|
2013-06-25 12:02:27 +00:00
|
|
|
prometheus := &prometheus{
|
2013-05-13 08:53:24 +00:00
|
|
|
bodyCompactionTimer: bodyCompactionTimer,
|
|
|
|
headCompactionTimer: headCompactionTimer,
|
|
|
|
tailCompactionTimer: tailCompactionTimer,
|
|
|
|
|
|
|
|
deletionTimer: deletionTimer,
|
|
|
|
|
2013-08-05 15:31:49 +00:00
|
|
|
curationState: curationState,
|
2013-05-14 09:21:27 +00:00
|
|
|
|
2013-06-25 12:02:27 +00:00
|
|
|
unwrittenSamples: unwrittenSamples,
|
2013-05-13 08:53:24 +00:00
|
|
|
|
2013-05-07 15:14:04 +00:00
|
|
|
stopBackgroundOperations: make(chan bool, 1),
|
2013-05-13 08:53:24 +00:00
|
|
|
|
2013-07-30 15:18:07 +00:00
|
|
|
ruleManager: ruleManager,
|
|
|
|
notifications: notifications,
|
|
|
|
storage: ts,
|
2013-04-29 09:17:56 +00:00
|
|
|
}
|
|
|
|
defer prometheus.close()
|
|
|
|
|
2013-06-06 08:42:21 +00:00
|
|
|
storageStarted := make(chan bool)
|
|
|
|
go ts.Serve(storageStarted)
|
|
|
|
<-storageStarted
|
|
|
|
|
2013-04-29 09:17:56 +00:00
|
|
|
go prometheus.interruptHandler()
|
2012-12-11 19:46:16 +00:00
|
|
|
|
2013-05-07 15:14:04 +00:00
|
|
|
go func() {
|
|
|
|
for _ = range prometheus.headCompactionTimer.C {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Starting head compaction...")
|
2013-05-07 15:14:04 +00:00
|
|
|
err := prometheus.compact(*headAge, *headGroupSize)
|
|
|
|
|
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Error("could not compact:", err)
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Done")
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
for _ = range prometheus.bodyCompactionTimer.C {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Starting body compaction...")
|
2013-05-07 15:14:04 +00:00
|
|
|
err := prometheus.compact(*bodyAge, *bodyGroupSize)
|
|
|
|
|
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Error("could not compact:", err)
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Done")
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
for _ = range prometheus.tailCompactionTimer.C {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Starting tail compaction...")
|
2013-05-07 15:14:04 +00:00
|
|
|
err := prometheus.compact(*tailAge, *tailGroupSize)
|
|
|
|
|
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Error("could not compact:", err)
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Done")
|
2013-05-07 15:14:04 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2013-05-13 08:53:24 +00:00
|
|
|
go func() {
|
|
|
|
for _ = range prometheus.deletionTimer.C {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Starting deletion of stale values...")
|
2013-05-13 08:53:24 +00:00
|
|
|
err := prometheus.delete(*deleteAge, deletionBatchSize)
|
|
|
|
|
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Error("could not delete:", err)
|
2013-05-13 08:53:24 +00:00
|
|
|
}
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Info("Done")
|
2013-05-13 08:53:24 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2013-05-05 17:32:04 +00:00
|
|
|
go func() {
|
|
|
|
err := webService.ServeForever()
|
|
|
|
if err != nil {
|
2013-08-12 15:18:02 +00:00
|
|
|
glog.Fatal(err)
|
2013-05-05 17:32:04 +00:00
|
|
|
}
|
|
|
|
}()
|
2013-01-04 16:55:58 +00:00
|
|
|
|
2013-04-29 09:17:56 +00:00
|
|
|
// TODO(all): Migrate this into prometheus.serve().
|
2013-06-25 12:02:27 +00:00
|
|
|
for block := range unwrittenSamples {
|
|
|
|
if block.Err == nil {
|
|
|
|
ts.AppendSamples(block.Samples)
|
2012-12-25 12:50:36 +00:00
|
|
|
}
|
2012-11-24 11:33:34 +00:00
|
|
|
}
|
|
|
|
}
|