Add flag for size based retention (#5109)
* Add flag for size based retention Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com> * Deprecate the old retention flag for a new one. Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com> * Add ability to take a suffix for size flag Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com> * Address feedback Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>
This commit is contained in:
parent
3bd41cc92c
commit
384cba1211
|
@ -71,10 +71,19 @@ var (
|
||||||
Name: "prometheus_config_last_reload_success_timestamp_seconds",
|
Name: "prometheus_config_last_reload_success_timestamp_seconds",
|
||||||
Help: "Timestamp of the last successful configuration reload.",
|
Help: "Timestamp of the last successful configuration reload.",
|
||||||
})
|
})
|
||||||
|
|
||||||
|
defaultRetentionString = "15d"
|
||||||
|
defaultRetentionDuration model.Duration
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
prometheus.MustRegister(version.NewCollector("prometheus"))
|
prometheus.MustRegister(version.NewCollector("prometheus"))
|
||||||
|
|
||||||
|
var err error
|
||||||
|
defaultRetentionDuration, err = model.ParseDuration(defaultRetentionString)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -83,6 +92,11 @@ func main() {
|
||||||
runtime.SetMutexProfileFraction(20)
|
runtime.SetMutexProfileFraction(20)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
oldFlagRetentionDuration model.Duration
|
||||||
|
newFlagRetentionDuration model.Duration
|
||||||
|
)
|
||||||
|
|
||||||
cfg := struct {
|
cfg := struct {
|
||||||
configFile string
|
configFile string
|
||||||
|
|
||||||
|
@ -171,8 +185,14 @@ func main() {
|
||||||
"Size at which to split the tsdb WAL segment files (e.g. 100MB)").
|
"Size at which to split the tsdb WAL segment files (e.g. 100MB)").
|
||||||
Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize)
|
Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize)
|
||||||
|
|
||||||
a.Flag("storage.tsdb.retention", "How long to retain samples in storage.").
|
a.Flag("storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead").
|
||||||
Default("15d").SetValue(&cfg.tsdb.Retention)
|
Default(defaultRetentionString).SetValue(&oldFlagRetentionDuration)
|
||||||
|
|
||||||
|
a.Flag("storage.tsdb.retention.time", "How long to retain samples in storage. Overrides \"storage.tsdb.retention\" if this flag is set to anything other than default.").
|
||||||
|
Default(defaultRetentionString).SetValue(&newFlagRetentionDuration)
|
||||||
|
|
||||||
|
a.Flag("storage.tsdb.retention.size", "[EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units supported: KB, MB, GB, TB, PB. This flag is experimental and can be changed in future releases.").
|
||||||
|
Default("0").BytesVar(&cfg.tsdb.MaxBytes)
|
||||||
|
|
||||||
a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory.").
|
a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory.").
|
||||||
Default("false").BoolVar(&cfg.tsdb.NoLockfile)
|
Default("false").BoolVar(&cfg.tsdb.NoLockfile)
|
||||||
|
@ -244,8 +264,10 @@ func main() {
|
||||||
// RoutePrefix must always be at least '/'.
|
// RoutePrefix must always be at least '/'.
|
||||||
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
|
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
|
||||||
|
|
||||||
|
cfg.tsdb.RetentionDuration = chooseRetention(oldFlagRetentionDuration, newFlagRetentionDuration)
|
||||||
|
|
||||||
if cfg.tsdb.MaxBlockDuration == 0 {
|
if cfg.tsdb.MaxBlockDuration == 0 {
|
||||||
cfg.tsdb.MaxBlockDuration = cfg.tsdb.Retention / 10
|
cfg.tsdb.MaxBlockDuration = cfg.tsdb.RetentionDuration / 10
|
||||||
}
|
}
|
||||||
|
|
||||||
promql.LookbackDelta = time.Duration(cfg.lookbackDelta)
|
promql.LookbackDelta = time.Duration(cfg.lookbackDelta)
|
||||||
|
@ -253,6 +275,10 @@ func main() {
|
||||||
|
|
||||||
logger := promlog.New(&cfg.promlogConfig)
|
logger := promlog.New(&cfg.promlogConfig)
|
||||||
|
|
||||||
|
if oldFlagRetentionDuration != defaultRetentionDuration {
|
||||||
|
level.Warn(logger).Log("deprecation_notice", `"storage.tsdb.retention" flag is deprecated use "storage.tsdb.retention.time" instead.`)
|
||||||
|
}
|
||||||
|
|
||||||
// Above level 6, the k8s client would log bearer tokens in clear-text.
|
// Above level 6, the k8s client would log bearer tokens in clear-text.
|
||||||
klog.ClampLevel(6)
|
klog.ClampLevel(6)
|
||||||
klog.SetLogger(log.With(logger, "component", "k8s_client_runtime"))
|
klog.SetLogger(log.With(logger, "component", "k8s_client_runtime"))
|
||||||
|
@ -757,3 +783,19 @@ func sendAlerts(s sender, externalURL string) rules.NotifyFunc {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// chooseRetention picks the effective retention from the deprecated "storage.tsdb.retention" flag (oldFlagDuration) and its replacement "storage.tsdb.retention.time" (newFlagDuration).
|
||||||
|
// A flag equal to defaultRetentionDuration is treated as unset, so the old flag's value is returned only when the new flag is still at the default.
|
||||||
|
func chooseRetention(oldFlagDuration, newFlagDuration model.Duration) model.Duration {
|
||||||
|
retention := oldFlagDuration
|
||||||
|
if retention == defaultRetentionDuration {
|
||||||
|
retention = newFlagDuration
|
||||||
|
}
|
||||||
|
|
||||||
|
// The new flag takes precedence over the old one whenever it differs from the default.
|
||||||
|
if newFlagDuration != defaultRetentionDuration {
|
||||||
|
retention = newFlagDuration
|
||||||
|
}
|
||||||
|
|
||||||
|
return retention
|
||||||
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/prometheus/common/model"
|
||||||
"github.com/prometheus/prometheus/notifier"
|
"github.com/prometheus/prometheus/notifier"
|
||||||
"github.com/prometheus/prometheus/pkg/labels"
|
"github.com/prometheus/prometheus/pkg/labels"
|
||||||
"github.com/prometheus/prometheus/rules"
|
"github.com/prometheus/prometheus/rules"
|
||||||
|
@ -284,3 +285,31 @@ func TestWALSegmentSizeBounds(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestChooseRetention checks which value chooseRetention returns for every set/unset combination of the old and new retention flags.
func TestChooseRetention(t *testing.T) {
|
||||||
|
retention1, err := model.ParseDuration("20d")
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
retention2, err := model.ParseDuration("30d")
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
oldFlagRetention model.Duration
|
||||||
|
newFlagRetention model.Duration
|
||||||
|
|
||||||
|
chosen model.Duration
|
||||||
|
}{
|
||||||
|
// Both are default (unset flags): the default is chosen.
|
||||||
|
{defaultRetentionDuration, defaultRetentionDuration, defaultRetentionDuration},
|
||||||
|
// Old flag is set and new flag is unset: the old flag's value is chosen.
|
||||||
|
{retention1, defaultRetentionDuration, retention1},
|
||||||
|
// Old flag is unset and new flag is set: the new flag's value is chosen.
|
||||||
|
{defaultRetentionDuration, retention2, retention2},
|
||||||
|
// Both flags are set: the new flag's value is chosen.
|
||||||
|
{retention1, retention2, retention2},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
retention := chooseRetention(tc.oldFlagRetention, tc.newFlagRetention)
|
||||||
|
testutil.Equals(t, tc.chosen, retention)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -52,7 +52,9 @@ For further details on file format, see [TSDB format](https://github.com/prometh
|
||||||
Prometheus has several flags that allow configuring the local storage. The most important ones are:
|
Prometheus has several flags that allow configuring the local storage. The most important ones are:
|
||||||
|
|
||||||
* `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`.
|
* `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`.
|
||||||
* `--storage.tsdb.retention`: This determines when to remove old data. Defaults to `15d`.
|
* `--storage.tsdb.retention.time`: This determines when to remove old data. Defaults to `15d`. Overrides `storage.tsdb.retention` if this flag is set to anything other than default.
|
||||||
|
* `--storage.tsdb.retention.size`: [EXPERIMENTAL] This determines the maximum number of bytes that storage blocks can use (note that this does not include the WAL size, which can be substantial). The oldest data will be removed first. Defaults to `0` (disabled). This flag is experimental and can be changed in future releases. Units supported: KB, MB, GB, TB, PB. Ex: "512MB"
|
||||||
|
* `--storage.tsdb.retention`: This flag has been deprecated in favour of `storage.tsdb.retention.time`.
|
||||||
|
|
||||||
On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula:
|
On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula:
|
||||||
|
|
||||||
|
@ -64,6 +66,8 @@ To tune the rate of ingested samples per second, you can either reduce the numbe
|
||||||
|
|
||||||
If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage.
|
If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage.
|
||||||
|
|
||||||
|
If both time and size retention policies are specified, whichever policy triggers first will be used at that instant.
|
||||||
|
|
||||||
## Remote storage integrations
|
## Remote storage integrations
|
||||||
|
|
||||||
Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems.
|
Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems.
|
||||||
|
|
|
@ -119,7 +119,10 @@ type Options struct {
|
||||||
WALSegmentSize units.Base2Bytes
|
WALSegmentSize units.Base2Bytes
|
||||||
|
|
||||||
// Duration for how long to retain data.
|
// Duration for how long to retain data.
|
||||||
Retention model.Duration
|
RetentionDuration model.Duration
|
||||||
|
|
||||||
|
// Maximum number of bytes to be retained.
|
||||||
|
MaxBytes units.Base2Bytes
|
||||||
|
|
||||||
// Disable creation and consideration of lockfile.
|
// Disable creation and consideration of lockfile.
|
||||||
NoLockfile bool
|
NoLockfile bool
|
||||||
|
@ -183,7 +186,8 @@ func Open(path string, l log.Logger, r prometheus.Registerer, opts *Options) (*t
|
||||||
|
|
||||||
db, err := tsdb.Open(path, l, r, &tsdb.Options{
|
db, err := tsdb.Open(path, l, r, &tsdb.Options{
|
||||||
WALSegmentSize: int(opts.WALSegmentSize),
|
WALSegmentSize: int(opts.WALSegmentSize),
|
||||||
RetentionDuration: uint64(time.Duration(opts.Retention).Seconds() * 1000),
|
RetentionDuration: uint64(time.Duration(opts.RetentionDuration).Seconds() * 1000),
|
||||||
|
MaxBytes: int64(opts.MaxBytes),
|
||||||
BlockRanges: rngs,
|
BlockRanges: rngs,
|
||||||
NoLockfile: opts.NoLockfile,
|
NoLockfile: opts.NoLockfile,
|
||||||
})
|
})
|
||||||
|
|
Loading…
Reference in New Issue