Add metrics to count inconsistencies and fp collisions.
This commit is contained in:
parent
c44e7cd105
commit
3b9ab546e6
|
@ -7,6 +7,7 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/log"
|
"github.com/prometheus/log"
|
||||||
|
|
||||||
clientmodel "github.com/prometheus/client_golang/model"
|
clientmodel "github.com/prometheus/client_golang/model"
|
||||||
|
@ -31,22 +32,31 @@ type fpMapper struct {
|
||||||
|
|
||||||
fpToSeries *seriesMap
|
fpToSeries *seriesMap
|
||||||
p *persistence
|
p *persistence
|
||||||
|
|
||||||
|
mappingsCounter prometheus.Counter
|
||||||
}
|
}
|
||||||
|
|
||||||
// newFPMapper loads the collision map from the persistence and
|
// newFPMapper loads the collision map from the persistence and
|
||||||
// returns an fpMapper ready to use.
|
// returns an fpMapper ready to use.
|
||||||
func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
|
func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
|
||||||
r := &fpMapper{
|
m := &fpMapper{
|
||||||
fpToSeries: fpToSeries,
|
fpToSeries: fpToSeries,
|
||||||
p: p,
|
p: p,
|
||||||
|
mappingsCounter: prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: subsystem,
|
||||||
|
Name: "fingerprint_mappings_total",
|
||||||
|
Help: "The total number of fingerprints being mapped to avoid collisions.",
|
||||||
|
}),
|
||||||
}
|
}
|
||||||
mappings, nextFP, err := p.loadFPMappings()
|
mappings, nextFP, err := p.loadFPMappings()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
r.mappings = mappings
|
m.mappings = mappings
|
||||||
r.highestMappedFP = nextFP
|
m.mappingsCounter.Set(float64(len(m.mappings)))
|
||||||
return r, nil
|
m.highestMappedFP = nextFP
|
||||||
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and
|
// mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and
|
||||||
|
@ -55,33 +65,33 @@ func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
|
||||||
//
|
//
|
||||||
// If an error is encountered, it is returned together with the unchanged raw
|
// If an error is encountered, it is returned together with the unchanged raw
|
||||||
// fingerprint.
|
// fingerprint.
|
||||||
func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clientmodel.Fingerprint, error) {
|
func (m *fpMapper) mapFP(fp clientmodel.Fingerprint, metric clientmodel.Metric) (clientmodel.Fingerprint, error) {
|
||||||
// First check if we are in the reserved FP space, in which case this is
|
// First check if we are in the reserved FP space, in which case this is
|
||||||
// automatically a collision that has to be mapped.
|
// automatically a collision that has to be mapped.
|
||||||
if fp <= maxMappedFP {
|
if fp <= maxMappedFP {
|
||||||
return r.maybeAddMapping(fp, m)
|
return m.maybeAddMapping(fp, metric)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Then check the most likely case: This fp belongs to a series that is
|
// Then check the most likely case: This fp belongs to a series that is
|
||||||
// already in memory.
|
// already in memory.
|
||||||
s, ok := r.fpToSeries.get(fp)
|
s, ok := m.fpToSeries.get(fp)
|
||||||
if ok {
|
if ok {
|
||||||
// FP exists in memory, but is it for the same metric?
|
// FP exists in memory, but is it for the same metric?
|
||||||
if m.Equal(s.metric) {
|
if metric.Equal(s.metric) {
|
||||||
// Yupp. We are done.
|
// Yupp. We are done.
|
||||||
return fp, nil
|
return fp, nil
|
||||||
}
|
}
|
||||||
// Collision detected!
|
// Collision detected!
|
||||||
return r.maybeAddMapping(fp, m)
|
return m.maybeAddMapping(fp, metric)
|
||||||
}
|
}
|
||||||
// Metric is not in memory. Before doing the expensive archive lookup,
|
// Metric is not in memory. Before doing the expensive archive lookup,
|
||||||
// check if we have a mapping for this metric in place already.
|
// check if we have a mapping for this metric in place already.
|
||||||
r.mtx.RLock()
|
m.mtx.RLock()
|
||||||
mappedFPs, fpAlreadyMapped := r.mappings[fp]
|
mappedFPs, fpAlreadyMapped := m.mappings[fp]
|
||||||
r.mtx.RUnlock()
|
m.mtx.RUnlock()
|
||||||
if fpAlreadyMapped {
|
if fpAlreadyMapped {
|
||||||
// We indeed have mapped fp historically.
|
// We indeed have mapped fp historically.
|
||||||
ms := metricToUniqueString(m)
|
ms := metricToUniqueString(metric)
|
||||||
// fp is locked by the caller, so no further locking of
|
// fp is locked by the caller, so no further locking of
|
||||||
// 'collisions' required (it is specific to fp).
|
// 'collisions' required (it is specific to fp).
|
||||||
mappedFP, ok := mappedFPs[ms]
|
mappedFP, ok := mappedFPs[ms]
|
||||||
|
@ -93,18 +103,18 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
|
||||||
// If we are here, FP does not exist in memory and is either not mapped
|
// If we are here, FP does not exist in memory and is either not mapped
|
||||||
// at all, or existing mappings for FP are not for m. Check if we have
|
// at all, or existing mappings for FP are not for m. Check if we have
|
||||||
// something for FP in the archive.
|
// something for FP in the archive.
|
||||||
archivedMetric, err := r.p.archivedMetric(fp)
|
archivedMetric, err := m.p.archivedMetric(fp)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fp, err
|
return fp, err
|
||||||
}
|
}
|
||||||
if archivedMetric != nil {
|
if archivedMetric != nil {
|
||||||
// FP exists in archive, but is it for the same metric?
|
// FP exists in archive, but is it for the same metric?
|
||||||
if m.Equal(archivedMetric) {
|
if metric.Equal(archivedMetric) {
|
||||||
// Yupp. We are done.
|
// Yupp. We are done.
|
||||||
return fp, nil
|
return fp, nil
|
||||||
}
|
}
|
||||||
// Collision detected!
|
// Collision detected!
|
||||||
return r.maybeAddMapping(fp, m)
|
return m.maybeAddMapping(fp, metric)
|
||||||
}
|
}
|
||||||
// As fp does not exist, neither in memory nor in archive, we can safely
|
// As fp does not exist, neither in memory nor in archive, we can safely
|
||||||
// keep it unmapped.
|
// keep it unmapped.
|
||||||
|
@ -114,14 +124,14 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
|
||||||
// maybeAddMapping is only used internally. It takes a detected collision and
|
// maybeAddMapping is only used internally. It takes a detected collision and
|
||||||
// adds it to the collisions map if not yet there. In any case, it returns the
|
// adds it to the collisions map if not yet there. In any case, it returns the
|
||||||
// truly unique fingerprint for the colliding metric.
|
// truly unique fingerprint for the colliding metric.
|
||||||
func (r *fpMapper) maybeAddMapping(
|
func (m *fpMapper) maybeAddMapping(
|
||||||
fp clientmodel.Fingerprint,
|
fp clientmodel.Fingerprint,
|
||||||
collidingMetric clientmodel.Metric,
|
collidingMetric clientmodel.Metric,
|
||||||
) (clientmodel.Fingerprint, error) {
|
) (clientmodel.Fingerprint, error) {
|
||||||
ms := metricToUniqueString(collidingMetric)
|
ms := metricToUniqueString(collidingMetric)
|
||||||
r.mtx.RLock()
|
m.mtx.RLock()
|
||||||
mappedFPs, ok := r.mappings[fp]
|
mappedFPs, ok := m.mappings[fp]
|
||||||
r.mtx.RUnlock()
|
m.mtx.RUnlock()
|
||||||
if ok {
|
if ok {
|
||||||
// fp is locked by the caller, so no further locking required.
|
// fp is locked by the caller, so no further locking required.
|
||||||
mappedFP, ok := mappedFPs[ms]
|
mappedFP, ok := mappedFPs[ms]
|
||||||
|
@ -129,12 +139,12 @@ func (r *fpMapper) maybeAddMapping(
|
||||||
return mappedFP, nil // Existing mapping.
|
return mappedFP, nil // Existing mapping.
|
||||||
}
|
}
|
||||||
// A new mapping has to be created.
|
// A new mapping has to be created.
|
||||||
mappedFP = r.nextMappedFP()
|
mappedFP = m.nextMappedFP()
|
||||||
mappedFPs[ms] = mappedFP
|
mappedFPs[ms] = mappedFP
|
||||||
r.mtx.RLock()
|
m.mtx.RLock()
|
||||||
// Checkpoint mappings after each change.
|
// Checkpoint mappings after each change.
|
||||||
err := r.p.checkpointFPMappings(r.mappings)
|
err := m.p.checkpointFPMappings(m.mappings)
|
||||||
r.mtx.RUnlock()
|
m.mtx.RUnlock()
|
||||||
log.Infof(
|
log.Infof(
|
||||||
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
|
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
|
||||||
fp, collidingMetric, mappedFP,
|
fp, collidingMetric, mappedFP,
|
||||||
|
@ -142,13 +152,14 @@ func (r *fpMapper) maybeAddMapping(
|
||||||
return mappedFP, err
|
return mappedFP, err
|
||||||
}
|
}
|
||||||
// This is the first collision for fp.
|
// This is the first collision for fp.
|
||||||
mappedFP := r.nextMappedFP()
|
mappedFP := m.nextMappedFP()
|
||||||
mappedFPs = map[string]clientmodel.Fingerprint{ms: mappedFP}
|
mappedFPs = map[string]clientmodel.Fingerprint{ms: mappedFP}
|
||||||
r.mtx.Lock()
|
m.mtx.Lock()
|
||||||
r.mappings[fp] = mappedFPs
|
m.mappings[fp] = mappedFPs
|
||||||
|
m.mappingsCounter.Inc()
|
||||||
// Checkpoint mappings after each change.
|
// Checkpoint mappings after each change.
|
||||||
err := r.p.checkpointFPMappings(r.mappings)
|
err := m.p.checkpointFPMappings(m.mappings)
|
||||||
r.mtx.Unlock()
|
m.mtx.Unlock()
|
||||||
log.Infof(
|
log.Infof(
|
||||||
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
|
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
|
||||||
fp, collidingMetric, mappedFP,
|
fp, collidingMetric, mappedFP,
|
||||||
|
@ -156,14 +167,24 @@ func (r *fpMapper) maybeAddMapping(
|
||||||
return mappedFP, err
|
return mappedFP, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *fpMapper) nextMappedFP() clientmodel.Fingerprint {
|
func (m *fpMapper) nextMappedFP() clientmodel.Fingerprint {
|
||||||
mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&r.highestMappedFP), 1))
|
mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&m.highestMappedFP), 1))
|
||||||
if mappedFP > maxMappedFP {
|
if mappedFP > maxMappedFP {
|
||||||
panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP))
|
panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP))
|
||||||
}
|
}
|
||||||
return mappedFP
|
return mappedFP
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Describe implements prometheus.Collector.
|
||||||
|
func (m *fpMapper) Describe(ch chan<- *prometheus.Desc) {
|
||||||
|
ch <- m.mappingsCounter.Desc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect implements prometheus.Collector.
|
||||||
|
func (m *fpMapper) Collect(ch chan<- prometheus.Metric) {
|
||||||
|
ch <- m.mappingsCounter
|
||||||
|
}
|
||||||
|
|
||||||
// metricToUniqueString turns a metric into a string in a reproducible and
|
// metricToUniqueString turns a metric into a string in a reproducible and
|
||||||
// unique way, i.e. the same metric will always create the same string, and
|
// unique way, i.e. the same metric will always create the same string, and
|
||||||
// different metrics will always create different strings. In a way, it is the
|
// different metrics will always create different strings. In a way, it is the
|
||||||
|
|
|
@ -120,6 +120,7 @@ type persistence struct {
|
||||||
indexingBatchSizes prometheus.Summary
|
indexingBatchSizes prometheus.Summary
|
||||||
indexingBatchDuration prometheus.Summary
|
indexingBatchDuration prometheus.Summary
|
||||||
checkpointDuration prometheus.Gauge
|
checkpointDuration prometheus.Gauge
|
||||||
|
dirtyCounter prometheus.Counter
|
||||||
|
|
||||||
dirtyMtx sync.Mutex // Protects dirty and becameDirty.
|
dirtyMtx sync.Mutex // Protects dirty and becameDirty.
|
||||||
dirty bool // true if persistence was started in dirty state.
|
dirty bool // true if persistence was started in dirty state.
|
||||||
|
@ -237,6 +238,12 @@ func newPersistence(basePath string, dirty, pedanticChecks bool, shouldSync sync
|
||||||
Name: "checkpoint_duration_milliseconds",
|
Name: "checkpoint_duration_milliseconds",
|
||||||
Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.",
|
Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.",
|
||||||
}),
|
}),
|
||||||
|
dirtyCounter: prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: subsystem,
|
||||||
|
Name: "inconsistencies_total",
|
||||||
|
Help: "A counter incremented each time an inconsistency in the local storage is detected. If this is greater zero, restart the server as soon as possible.",
|
||||||
|
}),
|
||||||
dirty: dirty,
|
dirty: dirty,
|
||||||
pedanticChecks: pedanticChecks,
|
pedanticChecks: pedanticChecks,
|
||||||
dirtyFileName: dirtyPath,
|
dirtyFileName: dirtyPath,
|
||||||
|
@ -282,6 +289,7 @@ func (p *persistence) Describe(ch chan<- *prometheus.Desc) {
|
||||||
p.indexingBatchSizes.Describe(ch)
|
p.indexingBatchSizes.Describe(ch)
|
||||||
p.indexingBatchDuration.Describe(ch)
|
p.indexingBatchDuration.Describe(ch)
|
||||||
ch <- p.checkpointDuration.Desc()
|
ch <- p.checkpointDuration.Desc()
|
||||||
|
ch <- p.dirtyCounter.Desc()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect implements prometheus.Collector.
|
// Collect implements prometheus.Collector.
|
||||||
|
@ -293,6 +301,7 @@ func (p *persistence) Collect(ch chan<- prometheus.Metric) {
|
||||||
p.indexingBatchSizes.Collect(ch)
|
p.indexingBatchSizes.Collect(ch)
|
||||||
p.indexingBatchDuration.Collect(ch)
|
p.indexingBatchDuration.Collect(ch)
|
||||||
ch <- p.checkpointDuration
|
ch <- p.checkpointDuration
|
||||||
|
ch <- p.dirtyCounter
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDirty returns the dirty flag in a goroutine-safe way.
|
// isDirty returns the dirty flag in a goroutine-safe way.
|
||||||
|
@ -307,6 +316,7 @@ func (p *persistence) isDirty() bool {
|
||||||
// dirty during our runtime, there is no way back. If we were dirty from the
|
// dirty during our runtime, there is no way back. If we were dirty from the
|
||||||
// start, a clean-up might make us clean again.)
|
// start, a clean-up might make us clean again.)
|
||||||
func (p *persistence) setDirty(dirty bool) {
|
func (p *persistence) setDirty(dirty bool) {
|
||||||
|
p.dirtyCounter.Inc()
|
||||||
p.dirtyMtx.Lock()
|
p.dirtyMtx.Lock()
|
||||||
defer p.dirtyMtx.Unlock()
|
defer p.dirtyMtx.Unlock()
|
||||||
if p.becameDirty {
|
if p.becameDirty {
|
||||||
|
|
|
@ -1003,6 +1003,7 @@ func (s *memorySeriesStorage) persistenceBacklogScore() float64 {
|
||||||
// Describe implements prometheus.Collector.
|
// Describe implements prometheus.Collector.
|
||||||
func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
|
func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
|
||||||
s.persistence.Describe(ch)
|
s.persistence.Describe(ch)
|
||||||
|
s.mapper.Describe(ch)
|
||||||
|
|
||||||
ch <- s.persistErrors.Desc()
|
ch <- s.persistErrors.Desc()
|
||||||
ch <- maxChunksToPersistDesc
|
ch <- maxChunksToPersistDesc
|
||||||
|
@ -1018,6 +1019,7 @@ func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
|
||||||
// Collect implements prometheus.Collector.
|
// Collect implements prometheus.Collector.
|
||||||
func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) {
|
func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) {
|
||||||
s.persistence.Collect(ch)
|
s.persistence.Collect(ch)
|
||||||
|
s.mapper.Collect(ch)
|
||||||
|
|
||||||
ch <- s.persistErrors
|
ch <- s.persistErrors
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
|
|
Loading…
Reference in New Issue