Add metrics to count inconsistencies and fp collisions.

This commit is contained in:
beorn7 2015-05-21 17:50:06 +02:00
parent c44e7cd105
commit 3b9ab546e6
3 changed files with 64 additions and 31 deletions

View File

@ -7,6 +7,7 @@ import (
"sync" "sync"
"sync/atomic" "sync/atomic"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/log" "github.com/prometheus/log"
clientmodel "github.com/prometheus/client_golang/model" clientmodel "github.com/prometheus/client_golang/model"
@ -31,22 +32,31 @@ type fpMapper struct {
fpToSeries *seriesMap fpToSeries *seriesMap
p *persistence p *persistence
mappingsCounter prometheus.Counter
} }
// newFPMapper loads the collision map from the persistence and // newFPMapper loads the collision map from the persistence and
// returns an fpMapper ready to use. // returns an fpMapper ready to use.
func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) { func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
r := &fpMapper{ m := &fpMapper{
fpToSeries: fpToSeries, fpToSeries: fpToSeries,
p: p, p: p,
mappingsCounter: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "fingerprint_mappings_total",
Help: "The total number of fingerprints being mapped to avoid collisions.",
}),
} }
mappings, nextFP, err := p.loadFPMappings() mappings, nextFP, err := p.loadFPMappings()
if err != nil { if err != nil {
return nil, err return nil, err
} }
r.mappings = mappings m.mappings = mappings
r.highestMappedFP = nextFP m.mappingsCounter.Set(float64(len(m.mappings)))
return r, nil m.highestMappedFP = nextFP
return m, nil
} }
// mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and // mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and
@ -55,33 +65,33 @@ func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
// //
// If an error is encountered, it is returned together with the unchanged raw // If an error is encountered, it is returned together with the unchanged raw
// fingerprint. // fingerprint.
func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clientmodel.Fingerprint, error) { func (m *fpMapper) mapFP(fp clientmodel.Fingerprint, metric clientmodel.Metric) (clientmodel.Fingerprint, error) {
// First check if we are in the reserved FP space, in which case this is // First check if we are in the reserved FP space, in which case this is
// automatically a collision that has to be mapped. // automatically a collision that has to be mapped.
if fp <= maxMappedFP { if fp <= maxMappedFP {
return r.maybeAddMapping(fp, m) return m.maybeAddMapping(fp, metric)
} }
// Then check the most likely case: This fp belongs to a series that is // Then check the most likely case: This fp belongs to a series that is
// already in memory. // already in memory.
s, ok := r.fpToSeries.get(fp) s, ok := m.fpToSeries.get(fp)
if ok { if ok {
// FP exists in memory, but is it for the same metric? // FP exists in memory, but is it for the same metric?
if m.Equal(s.metric) { if metric.Equal(s.metric) {
// Yupp. We are done. // Yupp. We are done.
return fp, nil return fp, nil
} }
// Collision detected! // Collision detected!
return r.maybeAddMapping(fp, m) return m.maybeAddMapping(fp, metric)
} }
// Metric is not in memory. Before doing the expensive archive lookup, // Metric is not in memory. Before doing the expensive archive lookup,
// check if we have a mapping for this metric in place already. // check if we have a mapping for this metric in place already.
r.mtx.RLock() m.mtx.RLock()
mappedFPs, fpAlreadyMapped := r.mappings[fp] mappedFPs, fpAlreadyMapped := m.mappings[fp]
r.mtx.RUnlock() m.mtx.RUnlock()
if fpAlreadyMapped { if fpAlreadyMapped {
// We indeed have mapped fp historically. // We indeed have mapped fp historically.
ms := metricToUniqueString(m) ms := metricToUniqueString(metric)
// fp is locked by the caller, so no further locking of // fp is locked by the caller, so no further locking of
// 'collisions' required (it is specific to fp). // 'collisions' required (it is specific to fp).
mappedFP, ok := mappedFPs[ms] mappedFP, ok := mappedFPs[ms]
@ -93,18 +103,18 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
// If we are here, FP does not exist in memory and is either not mapped // If we are here, FP does not exist in memory and is either not mapped
// at all, or existing mappings for FP are not for m. Check if we have // at all, or existing mappings for FP are not for m. Check if we have
// something for FP in the archive. // something for FP in the archive.
archivedMetric, err := r.p.archivedMetric(fp) archivedMetric, err := m.p.archivedMetric(fp)
if err != nil { if err != nil {
return fp, err return fp, err
} }
if archivedMetric != nil { if archivedMetric != nil {
// FP exists in archive, but is it for the same metric? // FP exists in archive, but is it for the same metric?
if m.Equal(archivedMetric) { if metric.Equal(archivedMetric) {
// Yupp. We are done. // Yupp. We are done.
return fp, nil return fp, nil
} }
// Collision detected! // Collision detected!
return r.maybeAddMapping(fp, m) return m.maybeAddMapping(fp, metric)
} }
// As fp does not exist, neither in memory nor in archive, we can safely // As fp does not exist, neither in memory nor in archive, we can safely
// keep it unmapped. // keep it unmapped.
@ -114,14 +124,14 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
// maybeAddMapping is only used internally. It takes a detected collision and // maybeAddMapping is only used internally. It takes a detected collision and
// adds it to the collisions map if not yet there. In any case, it returns the // adds it to the collisions map if not yet there. In any case, it returns the
// truly unique fingerprint for the colliding metric. // truly unique fingerprint for the colliding metric.
func (r *fpMapper) maybeAddMapping( func (m *fpMapper) maybeAddMapping(
fp clientmodel.Fingerprint, fp clientmodel.Fingerprint,
collidingMetric clientmodel.Metric, collidingMetric clientmodel.Metric,
) (clientmodel.Fingerprint, error) { ) (clientmodel.Fingerprint, error) {
ms := metricToUniqueString(collidingMetric) ms := metricToUniqueString(collidingMetric)
r.mtx.RLock() m.mtx.RLock()
mappedFPs, ok := r.mappings[fp] mappedFPs, ok := m.mappings[fp]
r.mtx.RUnlock() m.mtx.RUnlock()
if ok { if ok {
// fp is locked by the caller, so no further locking required. // fp is locked by the caller, so no further locking required.
mappedFP, ok := mappedFPs[ms] mappedFP, ok := mappedFPs[ms]
@ -129,12 +139,12 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, nil // Existing mapping. return mappedFP, nil // Existing mapping.
} }
// A new mapping has to be created. // A new mapping has to be created.
mappedFP = r.nextMappedFP() mappedFP = m.nextMappedFP()
mappedFPs[ms] = mappedFP mappedFPs[ms] = mappedFP
r.mtx.RLock() m.mtx.RLock()
// Checkpoint mappings after each change. // Checkpoint mappings after each change.
err := r.p.checkpointFPMappings(r.mappings) err := m.p.checkpointFPMappings(m.mappings)
r.mtx.RUnlock() m.mtx.RUnlock()
log.Infof( log.Infof(
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.", "Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
fp, collidingMetric, mappedFP, fp, collidingMetric, mappedFP,
@ -142,13 +152,14 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, err return mappedFP, err
} }
// This is the first collision for fp. // This is the first collision for fp.
mappedFP := r.nextMappedFP() mappedFP := m.nextMappedFP()
mappedFPs = map[string]clientmodel.Fingerprint{ms: mappedFP} mappedFPs = map[string]clientmodel.Fingerprint{ms: mappedFP}
r.mtx.Lock() m.mtx.Lock()
r.mappings[fp] = mappedFPs m.mappings[fp] = mappedFPs
m.mappingsCounter.Inc()
// Checkpoint mappings after each change. // Checkpoint mappings after each change.
err := r.p.checkpointFPMappings(r.mappings) err := m.p.checkpointFPMappings(m.mappings)
r.mtx.Unlock() m.mtx.Unlock()
log.Infof( log.Infof(
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.", "Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
fp, collidingMetric, mappedFP, fp, collidingMetric, mappedFP,
@ -156,14 +167,24 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, err return mappedFP, err
} }
func (r *fpMapper) nextMappedFP() clientmodel.Fingerprint { func (m *fpMapper) nextMappedFP() clientmodel.Fingerprint {
mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&r.highestMappedFP), 1)) mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&m.highestMappedFP), 1))
if mappedFP > maxMappedFP { if mappedFP > maxMappedFP {
panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP)) panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP))
} }
return mappedFP return mappedFP
} }
// Describe implements prometheus.Collector.
func (m *fpMapper) Describe(ch chan<- *prometheus.Desc) {
ch <- m.mappingsCounter.Desc()
}
// Collect implements prometheus.Collector.
func (m *fpMapper) Collect(ch chan<- prometheus.Metric) {
ch <- m.mappingsCounter
}
// metricToUniqueString turns a metric into a string in a reproducible and // metricToUniqueString turns a metric into a string in a reproducible and
// unique way, i.e. the same metric will always create the same string, and // unique way, i.e. the same metric will always create the same string, and
// different metrics will always create different strings. In a way, it is the // different metrics will always create different strings. In a way, it is the

View File

@ -120,6 +120,7 @@ type persistence struct {
indexingBatchSizes prometheus.Summary indexingBatchSizes prometheus.Summary
indexingBatchDuration prometheus.Summary indexingBatchDuration prometheus.Summary
checkpointDuration prometheus.Gauge checkpointDuration prometheus.Gauge
dirtyCounter prometheus.Counter
dirtyMtx sync.Mutex // Protects dirty and becameDirty. dirtyMtx sync.Mutex // Protects dirty and becameDirty.
dirty bool // true if persistence was started in dirty state. dirty bool // true if persistence was started in dirty state.
@ -237,6 +238,12 @@ func newPersistence(basePath string, dirty, pedanticChecks bool, shouldSync sync
Name: "checkpoint_duration_milliseconds", Name: "checkpoint_duration_milliseconds",
Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.", Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.",
}), }),
dirtyCounter: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "inconsistencies_total",
Help: "A counter incremented each time an inconsistency in the local storage is detected. If this is greater zero, restart the server as soon as possible.",
}),
dirty: dirty, dirty: dirty,
pedanticChecks: pedanticChecks, pedanticChecks: pedanticChecks,
dirtyFileName: dirtyPath, dirtyFileName: dirtyPath,
@ -282,6 +289,7 @@ func (p *persistence) Describe(ch chan<- *prometheus.Desc) {
p.indexingBatchSizes.Describe(ch) p.indexingBatchSizes.Describe(ch)
p.indexingBatchDuration.Describe(ch) p.indexingBatchDuration.Describe(ch)
ch <- p.checkpointDuration.Desc() ch <- p.checkpointDuration.Desc()
ch <- p.dirtyCounter.Desc()
} }
// Collect implements prometheus.Collector. // Collect implements prometheus.Collector.
@ -293,6 +301,7 @@ func (p *persistence) Collect(ch chan<- prometheus.Metric) {
p.indexingBatchSizes.Collect(ch) p.indexingBatchSizes.Collect(ch)
p.indexingBatchDuration.Collect(ch) p.indexingBatchDuration.Collect(ch)
ch <- p.checkpointDuration ch <- p.checkpointDuration
ch <- p.dirtyCounter
} }
// isDirty returns the dirty flag in a goroutine-safe way. // isDirty returns the dirty flag in a goroutine-safe way.
@ -307,6 +316,7 @@ func (p *persistence) isDirty() bool {
// dirty during our runtime, there is no way back. If we were dirty from the // dirty during our runtime, there is no way back. If we were dirty from the
// start, a clean-up might make us clean again.) // start, a clean-up might make us clean again.)
func (p *persistence) setDirty(dirty bool) { func (p *persistence) setDirty(dirty bool) {
p.dirtyCounter.Inc()
p.dirtyMtx.Lock() p.dirtyMtx.Lock()
defer p.dirtyMtx.Unlock() defer p.dirtyMtx.Unlock()
if p.becameDirty { if p.becameDirty {

View File

@ -1003,6 +1003,7 @@ func (s *memorySeriesStorage) persistenceBacklogScore() float64 {
// Describe implements prometheus.Collector. // Describe implements prometheus.Collector.
func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) { func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
s.persistence.Describe(ch) s.persistence.Describe(ch)
s.mapper.Describe(ch)
ch <- s.persistErrors.Desc() ch <- s.persistErrors.Desc()
ch <- maxChunksToPersistDesc ch <- maxChunksToPersistDesc
@ -1018,6 +1019,7 @@ func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
// Collect implements prometheus.Collector. // Collect implements prometheus.Collector.
func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) { func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) {
s.persistence.Collect(ch) s.persistence.Collect(ch)
s.mapper.Collect(ch)
ch <- s.persistErrors ch <- s.persistErrors
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(