diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 75efeca80..25426b9a3 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -71,10 +71,19 @@ var ( Name: "prometheus_config_last_reload_success_timestamp_seconds", Help: "Timestamp of the last successful configuration reload.", }) + + defaultRetentionString = "15d" + defaultRetentionDuration model.Duration ) func init() { prometheus.MustRegister(version.NewCollector("prometheus")) + + var err error + defaultRetentionDuration, err = model.ParseDuration(defaultRetentionString) + if err != nil { + panic(err) + } } func main() { @@ -83,6 +92,11 @@ func main() { runtime.SetMutexProfileFraction(20) } + var ( + oldFlagRetentionDuration model.Duration + newFlagRetentionDuration model.Duration + ) + cfg := struct { configFile string @@ -171,8 +185,14 @@ func main() { "Size at which to split the tsdb WAL segment files (e.g. 100MB)"). Hidden().PlaceHolder("").BytesVar(&cfg.tsdb.WALSegmentSize) - a.Flag("storage.tsdb.retention", "How long to retain samples in storage."). - Default("15d").SetValue(&cfg.tsdb.Retention) + a.Flag("storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead"). + Default(defaultRetentionString).SetValue(&oldFlagRetentionDuration) + + a.Flag("storage.tsdb.retention.time", "How long to retain samples in storage. Overrides \"storage.tsdb.retention\" if this flag is set to anything other than default."). + Default(defaultRetentionString).SetValue(&newFlagRetentionDuration) + + a.Flag("storage.tsdb.retention.size", "[EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units supported: KB, MB, GB, TB, PB. This flag is experimental and can be changed in future releases."). + Default("0").BytesVar(&cfg.tsdb.MaxBytes) a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory."). 
Default("false").BoolVar(&cfg.tsdb.NoLockfile) @@ -244,8 +264,10 @@ func main() { // RoutePrefix must always be at least '/'. cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/") + cfg.tsdb.RetentionDuration = chooseRetention(oldFlagRetentionDuration, newFlagRetentionDuration) + if cfg.tsdb.MaxBlockDuration == 0 { - cfg.tsdb.MaxBlockDuration = cfg.tsdb.Retention / 10 + cfg.tsdb.MaxBlockDuration = cfg.tsdb.RetentionDuration / 10 } promql.LookbackDelta = time.Duration(cfg.lookbackDelta) @@ -253,6 +275,10 @@ func main() { logger := promlog.New(&cfg.promlogConfig) + if oldFlagRetentionDuration != defaultRetentionDuration { + level.Warn(logger).Log("deprecation_notice", `"storage.tsdb.retention" flag is deprecated use "storage.tsdb.retention.time" instead.`) + } + // Above level 6, the k8s client would log bearer tokens in clear-text. klog.ClampLevel(6) klog.SetLogger(log.With(logger, "component", "k8s_client_runtime")) @@ -757,3 +783,19 @@ func sendAlerts(s sender, externalURL string) rules.NotifyFunc { } } } + +// chooseRetention is some roundabout code to support both RetentionDuration and Retention (for different flags). +// If Retention is 15d, then it means that the default value is set and the value of RetentionDuration is used. +func chooseRetention(oldFlagDuration, newFlagDuration model.Duration) model.Duration { + retention := oldFlagDuration + if retention == defaultRetentionDuration { + retention = newFlagDuration + } + + // Further newFlag takes precedence if it's set to anything other than default. 
+ if newFlagDuration != defaultRetentionDuration { + retention = newFlagDuration + } + + return retention +} diff --git a/cmd/prometheus/main_test.go b/cmd/prometheus/main_test.go index e82e55bc3..0671d759e 100644 --- a/cmd/prometheus/main_test.go +++ b/cmd/prometheus/main_test.go @@ -25,6 +25,7 @@ import ( "testing" "time" + "github.com/prometheus/common/model" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/rules" @@ -284,3 +285,31 @@ func TestWALSegmentSizeBounds(t *testing.T) { } } } + +func TestChooseRetention(t *testing.T) { + retention1, err := model.ParseDuration("20d") + testutil.Ok(t, err) + retention2, err := model.ParseDuration("30d") + testutil.Ok(t, err) + + cases := []struct { + oldFlagRetention model.Duration + newFlagRetention model.Duration + + chosen model.Duration + }{ + // Both are default (unset flags). + {defaultRetentionDuration, defaultRetentionDuration, defaultRetentionDuration}, + // Old flag is set and new flag is unset. + {retention1, defaultRetentionDuration, retention1}, + // Old flag is unset and new flag is set. + {defaultRetentionDuration, retention2, retention2}, + // Both flags are set. + {retention1, retention2, retention2}, + } + + for _, tc := range cases { + retention := chooseRetention(tc.oldFlagRetention, tc.newFlagRetention) + testutil.Equals(t, tc.chosen, retention) + } +} diff --git a/docs/storage.md b/docs/storage.md index 21cac5e89..9f6b0e913 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -52,7 +52,9 @@ For further details on file format, see [TSDB format](https://github.com/prometh Prometheus has several flags that allow configuring the local storage. The most important ones are: * `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`. -* `--storage.tsdb.retention`: This determines when to remove old data. Defaults to `15d`. 
+* `--storage.tsdb.retention.time`: This determines when to remove old data. Defaults to `15d`. Overrides `storage.tsdb.retention` if this flag is set to anything other than default. +* `--storage.tsdb.retention.size`: [EXPERIMENTAL] This determines the maximum number of bytes that storage blocks can use (note that this does not include the WAL size, which can be substantial). The oldest data will be removed first. Defaults to `0` or disabled. This flag is experimental and can be changed in future releases. Units supported: KB, MB, GB, TB, PB. Ex: "512MB" +* `--storage.tsdb.retention`: This flag has been deprecated in favour of `storage.tsdb.retention.time`. On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula: @@ -64,6 +66,8 @@ To tune the rate of ingested samples per second, you can either reduce the numbe If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage. +If both time and size retention policies are specified, whichever policy triggers first will be used at that instant. + ## Remote storage integrations Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems. diff --git a/storage/tsdb/tsdb.go b/storage/tsdb/tsdb.go index f18036216..4b45dd47b 100644 --- a/storage/tsdb/tsdb.go +++ b/storage/tsdb/tsdb.go @@ -119,7 +119,10 @@ type Options struct { WALSegmentSize units.Base2Bytes // Duration for how long to retain data. 
- Retention model.Duration + RetentionDuration model.Duration + + // Maximum number of bytes to be retained. + MaxBytes units.Base2Bytes // Disable creation and consideration of lockfile. NoLockfile bool @@ -183,7 +186,8 @@ func Open(path string, l log.Logger, r prometheus.Registerer, opts *Options) (*t db, err := tsdb.Open(path, l, r, &tsdb.Options{ WALSegmentSize: int(opts.WALSegmentSize), - RetentionDuration: uint64(time.Duration(opts.Retention).Seconds() * 1000), + RetentionDuration: uint64(time.Duration(opts.RetentionDuration).Seconds() * 1000), + MaxBytes: int64(opts.MaxBytes), BlockRanges: rngs, NoLockfile: opts.NoLockfile, })