Fix the "failed compaction" metric. (#613)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
This commit is contained in:
Krasi Georgiev 2019-05-30 13:57:28 +02:00 committed by GitHub
parent 13c80a5979
commit 882162d5b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 9 deletions

View File

@ -84,7 +84,6 @@ type LeveledCompactor struct {
type compactorMetrics struct {
ran prometheus.Counter
populatingBlocks prometheus.Gauge
failed prometheus.Counter
overlappingBlocks prometheus.Counter
duration prometheus.Histogram
chunkSize prometheus.Histogram
@ -103,10 +102,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
Name: "prometheus_tsdb_compaction_populating_block",
Help: "Set to 1 when a block is currently being written to the disk.",
})
m.failed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_compactions_failed_total",
Help: "Total number of compactions that failed for the partition.",
})
m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_vertical_compactions_total",
Help: "Total number of compactions done on overlapping blocks.",
@ -136,7 +131,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
r.MustRegister(
m.ran,
m.populatingBlocks,
m.failed,
m.overlappingBlocks,
m.duration,
m.chunkRange,
@ -541,9 +535,6 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe
if err := os.RemoveAll(tmp); err != nil {
level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error())
}
if err != nil {
c.metrics.failed.Inc()
}
c.metrics.ran.Inc()
c.metrics.duration.Observe(time.Since(t).Seconds())
}(time.Now())

View File

@ -1042,6 +1042,7 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "initial 'failed db reload' count metrics mismatch")
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "initial `compactions` count metric mismatch")
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "initial `compactions failed` count metric mismatch")
// Do the compaction and check the metrics.
// Compaction should succeed, but the reload should fail and
@ -1049,6 +1050,8 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
testutil.NotOk(t, db.compact())
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "'failed db reload' count metrics mismatch")
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "`compaction` count metric mismatch")
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "`compactions failed` count metric mismatch")
actBlocks, err = blockDirs(db.Dir())
testutil.Ok(t, err)
testutil.Equals(t, expBlocks, len(actBlocks)-1, "block count should be the same as before the compaction") // -1 to exclude the corrupted block.

11
db.go
View File

@ -147,6 +147,7 @@ type dbMetrics struct {
reloads prometheus.Counter
reloadsFailed prometheus.Counter
compactionsTriggered prometheus.Counter
compactionsFailed prometheus.Counter
timeRetentionCount prometheus.Counter
compactionsSkipped prometheus.Counter
startTime prometheus.GaugeFunc
@ -191,6 +192,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
Name: "prometheus_tsdb_compactions_triggered_total",
Help: "Total number of triggered compactions for the partition.",
})
m.compactionsFailed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_compactions_failed_total",
Help: "Total number of compactions that failed for the partition.",
})
m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_time_retentions_total",
Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
@ -231,6 +236,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
m.reloadsFailed,
m.timeRetentionCount,
m.compactionsTriggered,
m.compactionsFailed,
m.startTime,
m.tombCleanTimer,
m.blocksBytes,
@ -411,6 +417,11 @@ func (a dbAppender) Commit() error {
func (db *DB) compact() (err error) {
db.cmtx.Lock()
defer db.cmtx.Unlock()
defer func() {
if err != nil {
db.metrics.compactionsFailed.Inc()
}
}()
// Check whether we have pending head blocks that are ready to be persisted.
// They have the highest priority.
for {