Add flag for size based retention (#5109)

* Add flag for size based retention

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Deprecate the old retention flag for a new one.

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Add ability to take a suffix for size flag

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Address feedback

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>
This commit is contained in:
Goutham Veeramachaneni 2019-01-18 19:18:36 +05:30 committed by GitHub
parent 3bd41cc92c
commit 384cba1211
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 85 additions and 6 deletions

View File

@ -71,10 +71,19 @@ var (
Name: "prometheus_config_last_reload_success_timestamp_seconds", Name: "prometheus_config_last_reload_success_timestamp_seconds",
Help: "Timestamp of the last successful configuration reload.", Help: "Timestamp of the last successful configuration reload.",
}) })
defaultRetentionString = "15d"
defaultRetentionDuration model.Duration
) )
func init() { func init() {
prometheus.MustRegister(version.NewCollector("prometheus")) prometheus.MustRegister(version.NewCollector("prometheus"))
var err error
defaultRetentionDuration, err = model.ParseDuration(defaultRetentionString)
if err != nil {
panic(err)
}
} }
func main() { func main() {
@ -83,6 +92,11 @@ func main() {
runtime.SetMutexProfileFraction(20) runtime.SetMutexProfileFraction(20)
} }
var (
oldFlagRetentionDuration model.Duration
newFlagRetentionDuration model.Duration
)
cfg := struct { cfg := struct {
configFile string configFile string
@ -171,8 +185,14 @@ func main() {
"Size at which to split the tsdb WAL segment files (e.g. 100MB)"). "Size at which to split the tsdb WAL segment files (e.g. 100MB)").
Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize) Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize)
a.Flag("storage.tsdb.retention", "How long to retain samples in storage."). a.Flag("storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead").
Default("15d").SetValue(&cfg.tsdb.Retention) Default(defaultRetentionString).SetValue(&oldFlagRetentionDuration)
a.Flag("storage.tsdb.retention.time", "How long to retain samples in storage. Overrides \"storage.tsdb.retention\" if this flag is set to anything other than default.").
Default(defaultRetentionString).SetValue(&newFlagRetentionDuration)
a.Flag("storage.tsdb.retention.size", "[EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units supported: KB, MB, GB, TB, PB. This flag is experimental and can be changed in future releases.").
Default("0").BytesVar(&cfg.tsdb.MaxBytes)
a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory."). a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory.").
Default("false").BoolVar(&cfg.tsdb.NoLockfile) Default("false").BoolVar(&cfg.tsdb.NoLockfile)
@ -244,8 +264,10 @@ func main() {
// RoutePrefix must always be at least '/'. // RoutePrefix must always be at least '/'.
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/") cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
cfg.tsdb.RetentionDuration = chooseRetention(oldFlagRetentionDuration, newFlagRetentionDuration)
if cfg.tsdb.MaxBlockDuration == 0 { if cfg.tsdb.MaxBlockDuration == 0 {
cfg.tsdb.MaxBlockDuration = cfg.tsdb.Retention / 10 cfg.tsdb.MaxBlockDuration = cfg.tsdb.RetentionDuration / 10
} }
promql.LookbackDelta = time.Duration(cfg.lookbackDelta) promql.LookbackDelta = time.Duration(cfg.lookbackDelta)
@ -253,6 +275,10 @@ func main() {
logger := promlog.New(&cfg.promlogConfig) logger := promlog.New(&cfg.promlogConfig)
if oldFlagRetentionDuration != defaultRetentionDuration {
level.Warn(logger).Log("deprecation_notice", `"storage.tsdb.retention" flag is deprecated use "storage.tsdb.retention.time" instead.`)
}
// Above level 6, the k8s client would log bearer tokens in clear-text. // Above level 6, the k8s client would log bearer tokens in clear-text.
klog.ClampLevel(6) klog.ClampLevel(6)
klog.SetLogger(log.With(logger, "component", "k8s_client_runtime")) klog.SetLogger(log.With(logger, "component", "k8s_client_runtime"))
@ -757,3 +783,19 @@ func sendAlerts(s sender, externalURL string) rules.NotifyFunc {
} }
} }
} }
// chooseRetention picks the effective retention period from the two flags
// that can set it: the deprecated "storage.tsdb.retention" (oldFlagDuration)
// and its replacement "storage.tsdb.retention.time" (newFlagDuration).
//
// A flag is treated as unset when its value equals defaultRetentionDuration,
// so a flag explicitly set to the default value cannot be told apart from one
// that was never passed. The new flag wins whenever it differs from the
// default; otherwise the old flag's value is returned, which is itself the
// default when neither flag was changed.
func chooseRetention(oldFlagDuration, newFlagDuration model.Duration) model.Duration {
	if newFlagDuration != defaultRetentionDuration {
		return newFlagDuration
	}
	// The new flag is at its default, so whatever the old flag holds
	// (customised or default) is the answer.
	return oldFlagDuration
}

View File

@ -25,6 +25,7 @@ import (
"testing" "testing"
"time" "time"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/notifier"
"github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/rules"
@ -284,3 +285,31 @@ func TestWALSegmentSizeBounds(t *testing.T) {
} }
} }
} }
// TestChooseRetention verifies which of the old and new retention flag values
// chooseRetention returns for every set/unset combination of the two flags.
// A flag counts as "unset" when it carries defaultRetentionDuration.
func TestChooseRetention(t *testing.T) {
	twentyDays, err := model.ParseDuration("20d")
	testutil.Ok(t, err)
	thirtyDays, err := model.ParseDuration("30d")
	testutil.Ok(t, err)

	type retentionCase struct {
		oldFlag model.Duration
		newFlag model.Duration
		want    model.Duration
	}

	table := []retentionCase{
		// Both are default (unset flags).
		{oldFlag: defaultRetentionDuration, newFlag: defaultRetentionDuration, want: defaultRetentionDuration},
		// Old flag is set and new flag is unset.
		{oldFlag: twentyDays, newFlag: defaultRetentionDuration, want: twentyDays},
		// Old flag is unset and new flag is set.
		{oldFlag: defaultRetentionDuration, newFlag: thirtyDays, want: thirtyDays},
		// Both flags are set: the new flag takes precedence.
		{oldFlag: twentyDays, newFlag: thirtyDays, want: thirtyDays},
	}

	for i := range table {
		got := chooseRetention(table[i].oldFlag, table[i].newFlag)
		testutil.Equals(t, table[i].want, got)
	}
}

View File

@ -52,7 +52,9 @@ For further details on file format, see [TSDB format](https://github.com/prometh
Prometheus has several flags that allow configuring the local storage. The most important ones are: Prometheus has several flags that allow configuring the local storage. The most important ones are:
* `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`. * `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`.
* `--storage.tsdb.retention`: This determines when to remove old data. Defaults to `15d`. * `--storage.tsdb.retention.time`: This determines when to remove old data. Defaults to `15d`. Overrides `storage.tsdb.retention` if this flag is set to anything other than default.
* `--storage.tsdb.retention.size`: [EXPERIMENTAL] This determines the maximum number of bytes that storage blocks can use (note that this does not include the WAL size, which can be substantial). The oldest data will be removed first. Defaults to `0` or disabled. This flag is experimental and can be changed in future releases. Units supported: KB, MB, GB, TB, PB. Ex: "512MB"
* `--storage.tsdb.retention`: This flag has been deprecated in favour of `storage.tsdb.retention.time`.
On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula: On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula:
@ -64,6 +66,8 @@ To tune the rate of ingested samples per second, you can either reduce the numbe
If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage. If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage.
If both time and size retention policies are specified, whichever policy triggers first will be used at that instant.
## Remote storage integrations ## Remote storage integrations
Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems. Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems.

View File

@ -119,7 +119,10 @@ type Options struct {
WALSegmentSize units.Base2Bytes WALSegmentSize units.Base2Bytes
// Duration for how long to retain data. // Duration for how long to retain data.
Retention model.Duration RetentionDuration model.Duration
// Maximum number of bytes to be retained.
MaxBytes units.Base2Bytes
// Disable creation and consideration of lockfile. // Disable creation and consideration of lockfile.
NoLockfile bool NoLockfile bool
@ -183,7 +186,8 @@ func Open(path string, l log.Logger, r prometheus.Registerer, opts *Options) (*t
db, err := tsdb.Open(path, l, r, &tsdb.Options{ db, err := tsdb.Open(path, l, r, &tsdb.Options{
WALSegmentSize: int(opts.WALSegmentSize), WALSegmentSize: int(opts.WALSegmentSize),
RetentionDuration: uint64(time.Duration(opts.Retention).Seconds() * 1000), RetentionDuration: uint64(time.Duration(opts.RetentionDuration).Seconds() * 1000),
MaxBytes: int64(opts.MaxBytes),
BlockRanges: rngs, BlockRanges: rngs,
NoLockfile: opts.NoLockfile, NoLockfile: opts.NoLockfile,
}) })