2015-01-21 19:07:45 +00:00
|
|
|
// Copyright 2014 The Prometheus Authors
|
2014-09-19 16:18:44 +00:00
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2014-09-16 13:47:24 +00:00
|
|
|
package local
|
2014-06-06 09:55:53 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"sort"
|
|
|
|
"sync"
|
2015-03-09 01:33:10 +00:00
|
|
|
"time"
|
2014-06-06 09:55:53 +00:00
|
|
|
|
2015-08-20 15:18:46 +00:00
|
|
|
"github.com/prometheus/common/model"
|
2014-06-06 09:55:53 +00:00
|
|
|
|
|
|
|
"github.com/prometheus/prometheus/storage/metric"
|
|
|
|
)
|
|
|
|
|
2015-03-09 01:33:10 +00:00
|
|
|
const (
|
|
|
|
// chunkDescEvictionFactor is a factor used for chunkDesc eviction (as opposed
|
|
|
|
// to evictions of chunks, see method evictOlderThan. A chunk takes about 20x
|
|
|
|
// more memory than a chunkDesc. With a chunkDescEvictionFactor of 10, not more
|
|
|
|
// than a third of the total memory taken by a series will be used for
|
|
|
|
// chunkDescs.
|
|
|
|
chunkDescEvictionFactor = 10
|
|
|
|
|
|
|
|
headChunkTimeout = time.Hour // Close head chunk if not touched for that long.
|
|
|
|
)
|
2014-10-21 15:53:53 +00:00
|
|
|
|
2014-10-07 17:11:24 +00:00
|
|
|
// fingerprintSeriesPair pairs a fingerprint with a memorySeries pointer.
|
|
|
|
type fingerprintSeriesPair struct {
|
2015-08-20 15:18:46 +00:00
|
|
|
fp model.Fingerprint
|
2014-10-07 17:11:24 +00:00
|
|
|
series *memorySeries
|
|
|
|
}
|
|
|
|
|
|
|
|
// seriesMap maps fingerprints to memory series. All its methods are
|
|
|
|
// goroutine-safe. A SeriesMap is effectively is a goroutine-safe version of
|
2015-08-20 15:18:46 +00:00
|
|
|
// map[model.Fingerprint]*memorySeries.
|
2014-10-07 17:11:24 +00:00
|
|
|
type seriesMap struct {
|
|
|
|
mtx sync.RWMutex
|
2015-08-20 15:18:46 +00:00
|
|
|
m map[model.Fingerprint]*memorySeries
|
2014-10-07 17:11:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// newSeriesMap returns a newly allocated empty seriesMap. To create a seriesMap
|
|
|
|
// based on a prefilled map, use an explicit initializer.
|
2014-10-08 11:49:42 +00:00
|
|
|
func newSeriesMap() *seriesMap {
|
2015-08-20 15:18:46 +00:00
|
|
|
return &seriesMap{m: make(map[model.Fingerprint]*memorySeries)}
|
2014-10-07 17:11:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// length returns the number of mappings in the seriesMap.
|
2014-10-08 11:49:42 +00:00
|
|
|
func (sm *seriesMap) length() int {
|
2014-10-07 17:11:24 +00:00
|
|
|
sm.mtx.RLock()
|
|
|
|
defer sm.mtx.RUnlock()
|
|
|
|
|
|
|
|
return len(sm.m)
|
|
|
|
}
|
|
|
|
|
|
|
|
// get returns a memorySeries for a fingerprint. Return values have the same
|
|
|
|
// semantics as the native Go map.
|
2015-08-20 15:18:46 +00:00
|
|
|
func (sm *seriesMap) get(fp model.Fingerprint) (s *memorySeries, ok bool) {
|
2014-10-07 17:11:24 +00:00
|
|
|
sm.mtx.RLock()
|
|
|
|
defer sm.mtx.RUnlock()
|
|
|
|
|
|
|
|
s, ok = sm.m[fp]
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2014-10-10 17:16:07 +00:00
|
|
|
// put adds a mapping to the seriesMap. It panics if s == nil.
|
2015-08-20 15:18:46 +00:00
|
|
|
func (sm *seriesMap) put(fp model.Fingerprint, s *memorySeries) {
|
2014-10-07 17:11:24 +00:00
|
|
|
sm.mtx.Lock()
|
|
|
|
defer sm.mtx.Unlock()
|
|
|
|
|
2014-10-10 17:16:07 +00:00
|
|
|
if s == nil {
|
|
|
|
panic("tried to add nil pointer to seriesMap")
|
|
|
|
}
|
2014-10-07 17:11:24 +00:00
|
|
|
sm.m[fp] = s
|
|
|
|
}
|
|
|
|
|
|
|
|
// del removes a mapping from the series Map.
|
2015-08-20 15:18:46 +00:00
|
|
|
func (sm *seriesMap) del(fp model.Fingerprint) {
|
2014-10-07 17:11:24 +00:00
|
|
|
sm.mtx.Lock()
|
|
|
|
defer sm.mtx.Unlock()
|
|
|
|
|
|
|
|
delete(sm.m, fp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// iter returns a channel that produces all mappings in the seriesMap. The
|
|
|
|
// channel will be closed once all fingerprints have been received. Not
|
|
|
|
// consuming all fingerprints from the channel will leak a goroutine. The
|
2014-10-07 18:09:56 +00:00
|
|
|
// semantics of concurrent modification of seriesMap is the similar as the one
|
|
|
|
// for iterating over a map with a 'range' clause. However, if the next element
|
|
|
|
// in iteration order is removed after the current element has been received
|
|
|
|
// from the channel, it will still be produced by the channel.
|
2014-10-08 11:49:42 +00:00
|
|
|
func (sm *seriesMap) iter() <-chan fingerprintSeriesPair {
|
2014-10-07 17:11:24 +00:00
|
|
|
ch := make(chan fingerprintSeriesPair)
|
|
|
|
go func() {
|
|
|
|
sm.mtx.RLock()
|
|
|
|
for fp, s := range sm.m {
|
|
|
|
sm.mtx.RUnlock()
|
|
|
|
ch <- fingerprintSeriesPair{fp, s}
|
|
|
|
sm.mtx.RLock()
|
|
|
|
}
|
|
|
|
sm.mtx.RUnlock()
|
2014-10-07 18:09:56 +00:00
|
|
|
close(ch)
|
2014-10-07 17:11:24 +00:00
|
|
|
}()
|
|
|
|
return ch
|
|
|
|
}
|
|
|
|
|
|
|
|
// fpIter returns a channel that produces all fingerprints in the seriesMap. The
|
|
|
|
// channel will be closed once all fingerprints have been received. Not
|
|
|
|
// consuming all fingerprints from the channel will leak a goroutine. The
|
2014-10-07 18:09:56 +00:00
|
|
|
// semantics of concurrent modification of seriesMap is the similar as the one
|
|
|
|
// for iterating over a map with a 'range' clause. However, if the next element
|
|
|
|
// in iteration order is removed after the current element has been received
|
|
|
|
// from the channel, it will still be produced by the channel.
|
2015-08-20 15:18:46 +00:00
|
|
|
func (sm *seriesMap) fpIter() <-chan model.Fingerprint {
|
|
|
|
ch := make(chan model.Fingerprint)
|
2014-10-07 17:11:24 +00:00
|
|
|
go func() {
|
|
|
|
sm.mtx.RLock()
|
|
|
|
for fp := range sm.m {
|
|
|
|
sm.mtx.RUnlock()
|
|
|
|
ch <- fp
|
|
|
|
sm.mtx.RLock()
|
|
|
|
}
|
|
|
|
sm.mtx.RUnlock()
|
2014-10-07 18:09:56 +00:00
|
|
|
close(ch)
|
2014-10-07 17:11:24 +00:00
|
|
|
}()
|
|
|
|
return ch
|
|
|
|
}
|
|
|
|
|
2014-06-06 09:55:53 +00:00
|
|
|
type memorySeries struct {
|
2015-08-20 15:18:46 +00:00
|
|
|
metric model.Metric
|
2014-09-10 16:41:52 +00:00
|
|
|
// Sorted by start time, overlapping chunk ranges are forbidden.
|
2014-10-15 13:53:05 +00:00
|
|
|
chunkDescs []*chunkDesc
|
2015-03-09 01:33:10 +00:00
|
|
|
// The index (within chunkDescs above) of the first chunkDesc that
|
|
|
|
// points to a non-persisted chunk. If all chunks are persisted, then
|
|
|
|
// persistWatermark == len(chunkDescs).
|
|
|
|
persistWatermark int
|
2015-03-19 11:59:26 +00:00
|
|
|
// The modification time of the series file. The zero value of time.Time
|
|
|
|
// is used to mark an unknown modification time.
|
|
|
|
modTime time.Time
|
2014-10-27 19:40:48 +00:00
|
|
|
// The chunkDescs in memory might not have all the chunkDescs for the
|
|
|
|
// chunks that are persisted to disk. The missing chunkDescs are all
|
|
|
|
// contiguous and at the tail end. chunkDescsOffset is the index of the
|
|
|
|
// chunk on disk that corresponds to the first chunkDesc in memory. If
|
|
|
|
// it is 0, the chunkDescs are all loaded. A value of -1 denotes a
|
|
|
|
// special case: There are chunks on disk, but the offset to the
|
2015-03-09 01:33:10 +00:00
|
|
|
// chunkDescs in memory is unknown. Also, in this special case, there is
|
|
|
|
// no overlap between chunks on disk and chunks in memory (implying that
|
|
|
|
// upon first persisting of a chunk in memory, the offset has to be
|
|
|
|
// set).
|
2014-10-27 19:40:48 +00:00
|
|
|
chunkDescsOffset int
|
2014-11-05 19:02:45 +00:00
|
|
|
// The savedFirstTime field is used as a fallback when the
|
|
|
|
// chunkDescsOffset is not 0. It can be used to save the firstTime of the
|
|
|
|
// first chunk before its chunk desc is evicted. In doubt, this field is
|
|
|
|
// just set to the oldest possible timestamp.
|
2015-08-20 15:18:46 +00:00
|
|
|
savedFirstTime model.Time
|
2016-02-19 17:16:41 +00:00
|
|
|
// The timestamp of the last sample in this series. Needed for fast
|
|
|
|
// access for federation and to ensure timestamp monotonicity during
|
|
|
|
// ingestion.
|
2015-08-20 15:18:46 +00:00
|
|
|
lastTime model.Time
|
2016-02-19 17:16:41 +00:00
|
|
|
// The last ingested sample value. Needed for fast access for
|
|
|
|
// federation.
|
|
|
|
lastSampleValue model.SampleValue
|
|
|
|
// Whether lastSampleValue has been set already.
|
|
|
|
lastSampleValueSet bool
|
2015-03-09 01:33:10 +00:00
|
|
|
// Whether the current head chunk has already been finished. If true,
|
|
|
|
// the current head chunk must not be modified anymore.
|
|
|
|
headChunkClosed bool
|
2014-10-27 14:55:44 +00:00
|
|
|
// Whether the current head chunk is used by an iterator. In that case,
|
2015-03-09 01:33:10 +00:00
|
|
|
// a non-closed head chunk has to be cloned before more samples are
|
2014-10-27 14:55:44 +00:00
|
|
|
// appended.
|
|
|
|
headChunkUsedByIterator bool
|
2015-03-09 01:33:10 +00:00
|
|
|
// Whether the series is inconsistent with the last checkpoint in a way
|
2015-03-18 18:09:07 +00:00
|
|
|
// that would require a disk seek during crash recovery.
|
2015-03-09 01:33:10 +00:00
|
|
|
dirty bool
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2014-10-07 17:11:24 +00:00
|
|
|
// newMemorySeries returns a pointer to a newly allocated memorySeries for the
|
2015-07-13 19:12:27 +00:00
|
|
|
// given metric. chunkDescs and modTime in the new series are set according to
|
|
|
|
// the provided parameters. chunkDescs can be nil or empty if this is a
|
|
|
|
// genuinely new time series (i.e. not one that is being unarchived). In that
|
|
|
|
// case, headChunkClosed is set to false, and firstTime and lastTime are both
|
2015-08-20 15:18:46 +00:00
|
|
|
// set to model.Earliest. The zero value for modTime can be used if the
|
2015-07-13 19:12:27 +00:00
|
|
|
// modification time of the series file is unknown (e.g. if this is a genuinely
|
|
|
|
// new series).
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
func newMemorySeries(m model.Metric, chunkDescs []*chunkDesc, modTime time.Time) (*memorySeries, error) {
|
|
|
|
var err error
|
2015-08-20 15:18:46 +00:00
|
|
|
firstTime := model.Earliest
|
|
|
|
lastTime := model.Earliest
|
2015-07-13 19:12:27 +00:00
|
|
|
if len(chunkDescs) > 0 {
|
|
|
|
firstTime = chunkDescs[0].firstTime()
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
if lastTime, err = chunkDescs[len(chunkDescs)-1].lastTime(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2014-11-05 19:02:45 +00:00
|
|
|
}
|
2015-07-13 19:12:27 +00:00
|
|
|
return &memorySeries{
|
|
|
|
metric: m,
|
|
|
|
chunkDescs: chunkDescs,
|
|
|
|
headChunkClosed: len(chunkDescs) > 0,
|
|
|
|
savedFirstTime: firstTime,
|
|
|
|
lastTime: lastTime,
|
|
|
|
persistWatermark: len(chunkDescs),
|
|
|
|
modTime: modTime,
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
}, nil
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2015-03-09 01:33:10 +00:00
|
|
|
// add adds a sample pair to the series. It returns the number of newly
|
|
|
|
// completed chunks (which are now eligible for persistence).
|
|
|
|
//
|
2014-10-07 17:11:24 +00:00
|
|
|
// The caller must have locked the fingerprint of the series.
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
func (s *memorySeries) add(v model.SamplePair) (int, error) {
|
2015-03-09 01:33:10 +00:00
|
|
|
if len(s.chunkDescs) == 0 || s.headChunkClosed {
|
2016-02-11 16:36:13 +00:00
|
|
|
newHead := newChunkDesc(newChunk(), v.Timestamp)
|
2014-09-16 13:47:24 +00:00
|
|
|
s.chunkDescs = append(s.chunkDescs, newHead)
|
2015-03-09 01:33:10 +00:00
|
|
|
s.headChunkClosed = false
|
2015-05-20 17:13:06 +00:00
|
|
|
} else if s.headChunkUsedByIterator && s.head().refCount() > 1 {
|
2014-10-27 14:55:44 +00:00
|
|
|
// We only need to clone the head chunk if the current head
|
|
|
|
// chunk was used in an iterator at all and if the refCount is
|
|
|
|
// still greater than the 1 we always have because the head
|
|
|
|
// chunk is not yet persisted. The latter is just an
|
|
|
|
// approximation. We will still clone unnecessarily if an older
|
|
|
|
// iterator using a previous version of the head chunk is still
|
|
|
|
// around and keep the head chunk pinned. We needed to track
|
|
|
|
// pins by version of the head chunk, which is probably not
|
|
|
|
// worth the effort.
|
|
|
|
chunkOps.WithLabelValues(clone).Inc()
|
|
|
|
// No locking needed here because a non-persisted head chunk can
|
|
|
|
// not get evicted concurrently.
|
2015-05-20 17:13:06 +00:00
|
|
|
s.head().c = s.head().c.clone()
|
2014-10-27 14:55:44 +00:00
|
|
|
s.headChunkUsedByIterator = false
|
2014-09-16 13:47:24 +00:00
|
|
|
}
|
|
|
|
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
chunks, err := s.head().add(v)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
2015-05-20 17:13:06 +00:00
|
|
|
s.head().c = chunks[0]
|
2014-06-06 09:55:53 +00:00
|
|
|
|
2015-03-09 01:33:10 +00:00
|
|
|
for _, c := range chunks[1:] {
|
2016-02-11 16:36:13 +00:00
|
|
|
s.chunkDescs = append(s.chunkDescs, newChunkDesc(c, c.firstTime()))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Populate lastTime of now-closed chunks.
|
|
|
|
for _, cd := range s.chunkDescs[len(s.chunkDescs)-len(chunks) : len(s.chunkDescs)-1] {
|
|
|
|
cd.maybePopulateLastTime()
|
2015-03-09 01:33:10 +00:00
|
|
|
}
|
2015-07-13 19:12:27 +00:00
|
|
|
|
|
|
|
s.lastTime = v.Timestamp
|
2016-02-19 17:16:41 +00:00
|
|
|
s.lastSampleValue = v.Value
|
|
|
|
s.lastSampleValueSet = true
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return len(chunks) - 1, nil
|
2015-03-09 01:33:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// maybeCloseHeadChunk closes the head chunk if it has not been touched for the
|
2015-03-18 18:09:07 +00:00
|
|
|
// duration of headChunkTimeout. It returns whether the head chunk was closed.
|
2015-03-09 01:33:10 +00:00
|
|
|
// If the head chunk is already closed, the method is a no-op and returns false.
|
|
|
|
//
|
|
|
|
// The caller must have locked the fingerprint of the series.
|
|
|
|
func (s *memorySeries) maybeCloseHeadChunk() bool {
|
|
|
|
if s.headChunkClosed {
|
|
|
|
return false
|
|
|
|
}
|
2015-07-13 19:12:27 +00:00
|
|
|
if time.Now().Sub(s.lastTime.Time()) > headChunkTimeout {
|
2015-03-09 01:33:10 +00:00
|
|
|
s.headChunkClosed = true
|
|
|
|
// Since we cannot modify the head chunk from now on, we
|
|
|
|
// don't need to bother with cloning anymore.
|
|
|
|
s.headChunkUsedByIterator = false
|
2016-02-11 16:36:13 +00:00
|
|
|
s.head().maybePopulateLastTime()
|
2015-03-09 01:33:10 +00:00
|
|
|
return true
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
2015-03-09 01:33:10 +00:00
|
|
|
return false
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2014-11-13 19:50:25 +00:00
|
|
|
// evictChunkDescs evicts chunkDescs if there are chunkDescEvictionFactor times
|
|
|
|
// more than non-evicted chunks. iOldestNotEvicted is the index within the
|
|
|
|
// current chunkDescs of the oldest chunk that is not evicted.
|
|
|
|
func (s *memorySeries) evictChunkDescs(iOldestNotEvicted int) {
|
|
|
|
lenToKeep := chunkDescEvictionFactor * (len(s.chunkDescs) - iOldestNotEvicted)
|
|
|
|
if lenToKeep < len(s.chunkDescs) {
|
|
|
|
s.savedFirstTime = s.firstTime()
|
|
|
|
lenEvicted := len(s.chunkDescs) - lenToKeep
|
|
|
|
s.chunkDescsOffset += lenEvicted
|
2015-03-09 01:33:10 +00:00
|
|
|
s.persistWatermark -= lenEvicted
|
2014-11-13 19:50:25 +00:00
|
|
|
chunkDescOps.WithLabelValues(evict).Add(float64(lenEvicted))
|
2014-11-27 17:25:03 +00:00
|
|
|
numMemChunkDescs.Sub(float64(lenEvicted))
|
2014-11-13 19:50:25 +00:00
|
|
|
s.chunkDescs = append(
|
|
|
|
make([]*chunkDesc, 0, lenToKeep),
|
|
|
|
s.chunkDescs[lenEvicted:]...,
|
|
|
|
)
|
2015-03-09 01:33:10 +00:00
|
|
|
s.dirty = true
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-09 01:33:10 +00:00
|
|
|
// dropChunks removes chunkDescs older than t. The caller must have locked the
|
|
|
|
// fingerprint of the series.
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
func (s *memorySeries) dropChunks(t model.Time) error {
|
2014-06-06 09:55:53 +00:00
|
|
|
keepIdx := len(s.chunkDescs)
|
|
|
|
for i, cd := range s.chunkDescs {
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
lt, err := cd.lastTime()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if !lt.Before(t) {
|
2014-06-06 09:55:53 +00:00
|
|
|
keepIdx = i
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2014-10-27 19:40:48 +00:00
|
|
|
if keepIdx > 0 {
|
2015-03-09 01:33:10 +00:00
|
|
|
s.chunkDescs = append(
|
|
|
|
make([]*chunkDesc, 0, len(s.chunkDescs)-keepIdx),
|
|
|
|
s.chunkDescs[keepIdx:]...,
|
|
|
|
)
|
|
|
|
s.persistWatermark -= keepIdx
|
|
|
|
if s.persistWatermark < 0 {
|
|
|
|
panic("dropped unpersisted chunks from memory")
|
|
|
|
}
|
|
|
|
if s.chunkDescsOffset != -1 {
|
|
|
|
s.chunkDescsOffset += keepIdx
|
|
|
|
}
|
2014-11-27 17:25:03 +00:00
|
|
|
numMemChunkDescs.Sub(float64(keepIdx))
|
2015-03-09 01:33:10 +00:00
|
|
|
s.dirty = true
|
2014-10-27 19:40:48 +00:00
|
|
|
}
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return nil
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2014-10-07 17:11:24 +00:00
|
|
|
// preloadChunks is an internal helper method.
|
2015-05-06 14:53:12 +00:00
|
|
|
func (s *memorySeries) preloadChunks(
|
2015-08-20 15:18:46 +00:00
|
|
|
indexes []int, fp model.Fingerprint, mss *memorySeriesStorage,
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
) ([]*chunkDesc, SeriesIterator, error) {
|
2014-06-06 09:55:53 +00:00
|
|
|
loadIndexes := []int{}
|
2014-10-15 13:53:05 +00:00
|
|
|
pinnedChunkDescs := make([]*chunkDesc, 0, len(indexes))
|
2014-06-06 09:55:53 +00:00
|
|
|
for _, idx := range indexes {
|
2014-10-22 17:21:23 +00:00
|
|
|
cd := s.chunkDescs[idx]
|
|
|
|
pinnedChunkDescs = append(pinnedChunkDescs, cd)
|
2014-11-13 19:50:25 +00:00
|
|
|
cd.pin(mss.evictRequests) // Have to pin everything first to prevent immediate eviction on chunk loading.
|
2014-10-22 17:21:23 +00:00
|
|
|
if cd.isEvicted() {
|
2014-06-06 09:55:53 +00:00
|
|
|
loadIndexes = append(loadIndexes, idx)
|
|
|
|
}
|
|
|
|
}
|
2014-10-22 17:21:23 +00:00
|
|
|
chunkOps.WithLabelValues(pin).Add(float64(len(pinnedChunkDescs)))
|
2014-06-06 09:55:53 +00:00
|
|
|
|
|
|
|
if len(loadIndexes) > 0 {
|
2014-10-27 19:40:48 +00:00
|
|
|
if s.chunkDescsOffset == -1 {
|
|
|
|
panic("requested loading chunks from persistence in a situation where we must not have persisted data for chunk descriptors in memory")
|
|
|
|
}
|
2014-11-27 19:46:45 +00:00
|
|
|
chunks, err := mss.loadChunks(fp, loadIndexes, s.chunkDescsOffset)
|
2014-06-06 09:55:53 +00:00
|
|
|
if err != nil {
|
2014-10-22 17:21:23 +00:00
|
|
|
// Unpin the chunks since we won't return them as pinned chunks now.
|
2014-08-20 13:05:58 +00:00
|
|
|
for _, cd := range pinnedChunkDescs {
|
2014-11-13 19:50:25 +00:00
|
|
|
cd.unpin(mss.evictRequests)
|
2014-08-20 13:05:58 +00:00
|
|
|
}
|
2014-10-22 17:21:23 +00:00
|
|
|
chunkOps.WithLabelValues(unpin).Add(float64(len(pinnedChunkDescs)))
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
return nil, nopIter, err
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
for i, c := range chunks {
|
2014-10-22 17:21:23 +00:00
|
|
|
s.chunkDescs[loadIndexes[i]].setChunk(c)
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
if !s.headChunkClosed && indexes[len(indexes)-1] == len(s.chunkDescs)-1 {
|
|
|
|
s.headChunkUsedByIterator = true
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
curriedQuarantineSeries := func(err error) {
|
|
|
|
mss.quarantineSeries(fp, s.metric, err)
|
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
iter := &boundedIterator{
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
it: s.newIterator(pinnedChunkDescs, curriedQuarantineSeries),
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
start: model.Now().Add(-mss.dropAfter),
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
return pinnedChunkDescs, iter, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// newIterator returns a new SeriesIterator for the provided chunkDescs (which
|
2016-02-19 17:35:30 +00:00
|
|
|
// must be pinned).
|
|
|
|
//
|
|
|
|
// The caller must have locked the fingerprint of the memorySeries.
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
func (s *memorySeries) newIterator(pinnedChunkDescs []*chunkDesc, quarantine func(error)) SeriesIterator {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
chunks := make([]chunk, 0, len(pinnedChunkDescs))
|
|
|
|
for _, cd := range pinnedChunkDescs {
|
|
|
|
// It's OK to directly access cd.c here (without locking) as the
|
|
|
|
// series FP is locked and the chunk is pinned.
|
|
|
|
chunks = append(chunks, cd.c)
|
|
|
|
}
|
|
|
|
return &memorySeriesIterator{
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
chunks: chunks,
|
|
|
|
chunkIts: make([]chunkIterator, len(chunks)),
|
|
|
|
quarantine: quarantine,
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
}
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2014-10-07 17:11:24 +00:00
|
|
|
// preloadChunksForRange loads chunks for the given range from the persistence.
|
|
|
|
// The caller must have locked the fingerprint of the series.
|
2014-10-14 16:23:32 +00:00
|
|
|
func (s *memorySeries) preloadChunksForRange(
|
2016-02-19 17:35:30 +00:00
|
|
|
fp model.Fingerprint,
|
2015-08-20 15:18:46 +00:00
|
|
|
from model.Time, through model.Time,
|
2016-02-19 17:35:30 +00:00
|
|
|
lastSampleOnly bool,
|
|
|
|
mss *memorySeriesStorage,
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
) ([]*chunkDesc, SeriesIterator, error) {
|
2016-02-19 17:35:30 +00:00
|
|
|
// If we have to preload for only one sample, and we have a
|
|
|
|
// lastSamplePair in the series, and thas last samplePair is in the
|
|
|
|
// interval, just take it in a singleSampleSeriesIterator. No need to
|
|
|
|
// pin or load anything.
|
|
|
|
if lastSampleOnly {
|
|
|
|
lastSample := s.lastSamplePair()
|
|
|
|
if !through.Before(lastSample.Timestamp) &&
|
|
|
|
!from.After(lastSample.Timestamp) &&
|
|
|
|
lastSample != ZeroSamplePair {
|
|
|
|
iter := &boundedIterator{
|
|
|
|
it: &singleSampleSeriesIterator{samplePair: lastSample},
|
|
|
|
start: model.Now().Add(-mss.dropAfter),
|
|
|
|
}
|
|
|
|
return nil, iter, nil
|
|
|
|
}
|
|
|
|
}
|
2015-08-20 15:18:46 +00:00
|
|
|
firstChunkDescTime := model.Latest
|
2014-10-14 16:23:32 +00:00
|
|
|
if len(s.chunkDescs) > 0 {
|
|
|
|
firstChunkDescTime = s.chunkDescs[0].firstTime()
|
|
|
|
}
|
2014-10-27 19:40:48 +00:00
|
|
|
if s.chunkDescsOffset != 0 && from.Before(firstChunkDescTime) {
|
2015-07-06 23:10:14 +00:00
|
|
|
cds, err := mss.loadChunkDescs(fp, s.persistWatermark)
|
2014-10-14 16:23:32 +00:00
|
|
|
if err != nil {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
return nil, nopIter, err
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
2014-10-14 16:23:32 +00:00
|
|
|
s.chunkDescs = append(cds, s.chunkDescs...)
|
2014-10-27 19:40:48 +00:00
|
|
|
s.chunkDescsOffset = 0
|
2015-03-09 01:33:10 +00:00
|
|
|
s.persistWatermark += len(cds)
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
firstChunkDescTime = s.chunkDescs[0].firstTime()
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
if len(s.chunkDescs) == 0 || through.Before(firstChunkDescTime) {
|
|
|
|
return nil, nopIter, nil
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Find first chunk with start time after "from".
|
|
|
|
fromIdx := sort.Search(len(s.chunkDescs), func(i int) bool {
|
|
|
|
return s.chunkDescs[i].firstTime().After(from)
|
|
|
|
})
|
|
|
|
// Find first chunk with start time after "through".
|
|
|
|
throughIdx := sort.Search(len(s.chunkDescs), func(i int) bool {
|
|
|
|
return s.chunkDescs[i].firstTime().After(through)
|
|
|
|
})
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
if fromIdx == len(s.chunkDescs) {
|
|
|
|
// Even the last chunk starts before "from". Find out if the
|
|
|
|
// series ends before "from" and we don't need to do anything.
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
lt, err := s.chunkDescs[len(s.chunkDescs)-1].lastTime()
|
|
|
|
if err != nil {
|
|
|
|
return nil, nopIter, err
|
|
|
|
}
|
|
|
|
if lt.Before(from) {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
return nil, nopIter, nil
|
|
|
|
}
|
|
|
|
}
|
2014-06-06 09:55:53 +00:00
|
|
|
if fromIdx > 0 {
|
|
|
|
fromIdx--
|
|
|
|
}
|
|
|
|
if throughIdx == len(s.chunkDescs) {
|
|
|
|
throughIdx--
|
|
|
|
}
|
|
|
|
|
|
|
|
pinIndexes := make([]int, 0, throughIdx-fromIdx+1)
|
|
|
|
for i := fromIdx; i <= throughIdx; i++ {
|
|
|
|
pinIndexes = append(pinIndexes, i)
|
|
|
|
}
|
2015-05-06 14:53:12 +00:00
|
|
|
return s.preloadChunks(pinIndexes, fp, mss)
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2014-10-28 18:01:41 +00:00
|
|
|
// head returns a pointer to the head chunk descriptor. The caller must have
|
Fix a bug handling freshly unarchived series.
Usually, if you unarchive a series, it is to add something to it,
which will create a new head chunk. However, if a series in
unarchived, and before anything is added to it, it is handled by the
maintenance loop, it will be archived again. In that case, we have to
load the chunkDescs to know the lastTime of the series to be
archived. Usually, this case will happen only rarely (as a race, has
never happened so far, possibly because the locking around unarchiving
and the subsequent sample append is smart enough). However, during
crash recovery, we sometimes treat series as "freshly unarchived"
without directly appending a sample. We might add more cases of that
type later, so better deal with archiving properly and load chunkDescs
if required.
2015-01-08 15:10:31 +00:00
|
|
|
// locked the fingerprint of the memorySeries. This method will panic if this
|
|
|
|
// series has no chunk descriptors.
|
2014-06-06 09:55:53 +00:00
|
|
|
func (s *memorySeries) head() *chunkDesc {
|
|
|
|
return s.chunkDescs[len(s.chunkDescs)-1]
|
|
|
|
}
|
|
|
|
|
2016-02-19 17:16:41 +00:00
|
|
|
// firstTime returns the timestamp of the first sample in the series.
|
|
|
|
//
|
|
|
|
// The caller must have locked the fingerprint of the memorySeries.
|
2015-08-20 15:18:46 +00:00
|
|
|
func (s *memorySeries) firstTime() model.Time {
|
2014-11-05 19:02:45 +00:00
|
|
|
if s.chunkDescsOffset == 0 && len(s.chunkDescs) > 0 {
|
|
|
|
return s.chunkDescs[0].firstTime()
|
|
|
|
}
|
|
|
|
return s.savedFirstTime
|
2014-09-15 17:24:26 +00:00
|
|
|
}
|
|
|
|
|
2016-02-19 17:16:41 +00:00
|
|
|
// lastSamplePair returns the last ingested SamplePair. It returns
|
|
|
|
// ZeroSamplePair if this memorySeries has never received a sample (via the add
|
|
|
|
// method), which is the case for freshly unarchived series or newly created
|
|
|
|
// ones and also for all series after a server restart. However, in that case,
|
|
|
|
// series will most likely be considered stale anyway.
|
|
|
|
//
|
|
|
|
// The caller must have locked the fingerprint of the memorySeries.
|
|
|
|
func (s *memorySeries) lastSamplePair() model.SamplePair {
|
|
|
|
if !s.lastSampleValueSet {
|
|
|
|
return ZeroSamplePair
|
|
|
|
}
|
|
|
|
return model.SamplePair{
|
|
|
|
Timestamp: s.lastTime,
|
|
|
|
Value: s.lastSampleValue,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-20 17:13:06 +00:00
|
|
|
// chunksToPersist returns a slice of chunkDescs eligible for persistence. It's
|
|
|
|
// the caller's responsibility to actually persist the returned chunks
|
|
|
|
// afterwards. The method sets the persistWatermark and the dirty flag
|
|
|
|
// accordingly.
|
2015-03-09 01:33:10 +00:00
|
|
|
//
|
|
|
|
// The caller must have locked the fingerprint of the series.
|
2015-05-20 17:13:06 +00:00
|
|
|
func (s *memorySeries) chunksToPersist() []*chunkDesc {
|
2015-03-09 01:33:10 +00:00
|
|
|
newWatermark := len(s.chunkDescs)
|
|
|
|
if !s.headChunkClosed {
|
|
|
|
newWatermark--
|
|
|
|
}
|
|
|
|
if newWatermark == s.persistWatermark {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
cds := s.chunkDescs[s.persistWatermark:newWatermark]
|
|
|
|
s.dirty = true
|
|
|
|
s.persistWatermark = newWatermark
|
|
|
|
return cds
|
|
|
|
}
|
|
|
|
|
2014-10-14 11:52:39 +00:00
|
|
|
// memorySeriesIterator implements SeriesIterator.
|
|
|
|
type memorySeriesIterator struct {
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
chunkIt chunkIterator // Last chunkIterator used by ValueAtOrBeforeTime.
|
|
|
|
chunkIts []chunkIterator // Caches chunkIterators.
|
|
|
|
chunks []chunk
|
|
|
|
quarantine func(error) // Call to quarantine the series this iterator belongs to.
|
2014-10-14 11:52:39 +00:00
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// ValueAtOrBeforeTime implements SeriesIterator.
|
|
|
|
func (it *memorySeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair {
|
2014-06-06 09:55:53 +00:00
|
|
|
// The most common case. We are iterating through a chunk.
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
if it.chunkIt != nil {
|
|
|
|
containsT, err := it.chunkIt.contains(t)
|
|
|
|
if err != nil {
|
|
|
|
it.quarantine(err)
|
|
|
|
return ZeroSamplePair
|
|
|
|
}
|
|
|
|
if containsT {
|
|
|
|
value, err := it.chunkIt.valueAtOrBeforeTime(t)
|
|
|
|
if err != nil {
|
|
|
|
it.quarantine(err)
|
|
|
|
return ZeroSamplePair
|
|
|
|
}
|
|
|
|
return value
|
|
|
|
}
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if len(it.chunks) == 0 {
|
2016-02-19 15:46:11 +00:00
|
|
|
return ZeroSamplePair
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// Find the last chunk where firstTime() is before or equal to t.
|
2015-04-14 11:46:38 +00:00
|
|
|
l := len(it.chunks) - 1
|
2014-06-06 09:55:53 +00:00
|
|
|
i := sort.Search(len(it.chunks), func(i int) bool {
|
2015-04-14 11:46:38 +00:00
|
|
|
return !it.chunks[l-i].firstTime().After(t)
|
2014-06-06 09:55:53 +00:00
|
|
|
})
|
|
|
|
if i == len(it.chunks) {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// Even the first chunk starts after t.
|
2016-02-19 15:46:11 +00:00
|
|
|
return ZeroSamplePair
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
2015-05-20 17:13:06 +00:00
|
|
|
it.chunkIt = it.chunkIterator(l - i)
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
value, err := it.chunkIt.valueAtOrBeforeTime(t)
|
|
|
|
if err != nil {
|
|
|
|
it.quarantine(err)
|
|
|
|
return ZeroSamplePair
|
|
|
|
}
|
|
|
|
return value
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2015-05-20 17:13:06 +00:00
|
|
|
// RangeValues implements SeriesIterator.
|
2015-08-22 12:52:35 +00:00
|
|
|
func (it *memorySeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
|
2015-04-14 11:46:38 +00:00
|
|
|
// Find the first chunk for which the first sample is within the interval.
|
2014-06-06 09:55:53 +00:00
|
|
|
i := sort.Search(len(it.chunks), func(i int) bool {
|
2015-04-14 11:46:38 +00:00
|
|
|
return !it.chunks[i].firstTime().Before(in.OldestInclusive)
|
2014-06-06 09:55:53 +00:00
|
|
|
})
|
2015-04-14 11:46:38 +00:00
|
|
|
// Only now check the last timestamp of the previous chunk (which is
|
|
|
|
// fairly expensive).
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
if i > 0 {
|
|
|
|
lt, err := it.chunkIterator(i - 1).lastTimestamp()
|
|
|
|
if err != nil {
|
|
|
|
it.quarantine(err)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if !lt.Before(in.OldestInclusive) {
|
|
|
|
i--
|
|
|
|
}
|
2015-04-14 11:46:38 +00:00
|
|
|
}
|
|
|
|
|
2015-08-22 12:52:35 +00:00
|
|
|
values := []model.SamplePair{}
|
2015-05-04 18:16:01 +00:00
|
|
|
for j, c := range it.chunks[i:] {
|
2014-06-06 09:55:53 +00:00
|
|
|
if c.firstTime().After(in.NewestInclusive) {
|
|
|
|
break
|
|
|
|
}
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
chValues, err := it.chunkIterator(i + j).rangeValues(in)
|
|
|
|
if err != nil {
|
|
|
|
it.quarantine(err)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
values = append(values, chValues...)
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
return values
|
|
|
|
}
|
2014-10-14 11:52:39 +00:00
|
|
|
|
2015-05-20 17:13:06 +00:00
|
|
|
// chunkIterator returns the chunkIterator for the chunk at position i (and
|
2015-05-04 18:16:01 +00:00
|
|
|
// creates it if needed).
|
2015-05-20 17:13:06 +00:00
|
|
|
func (it *memorySeriesIterator) chunkIterator(i int) chunkIterator {
|
2015-05-04 18:16:01 +00:00
|
|
|
chunkIt := it.chunkIts[i]
|
|
|
|
if chunkIt == nil {
|
|
|
|
chunkIt = it.chunks[i].newIterator()
|
|
|
|
it.chunkIts[i] = chunkIt
|
|
|
|
}
|
|
|
|
return chunkIt
|
|
|
|
}
|
|
|
|
|
2016-02-19 17:35:30 +00:00
|
|
|
// singleSampleSeriesIterator implements Series Iterator. It is a "shortcut
|
|
|
|
// iterator" that returns a single samplee only. The sample is saved in the
|
|
|
|
// iterator itself, so no chunks need to be pinned.
|
|
|
|
type singleSampleSeriesIterator struct {
|
|
|
|
samplePair model.SamplePair
|
|
|
|
}
|
|
|
|
|
|
|
|
// ValueAtTime implements SeriesIterator.
|
|
|
|
func (it *singleSampleSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair {
|
|
|
|
if it.samplePair.Timestamp.After(t) {
|
|
|
|
return ZeroSamplePair
|
|
|
|
}
|
|
|
|
return it.samplePair
|
|
|
|
}
|
|
|
|
|
|
|
|
// RangeValues implements SeriesIterator.
|
|
|
|
func (it *singleSampleSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
|
|
|
|
if it.samplePair.Timestamp.After(in.NewestInclusive) ||
|
|
|
|
it.samplePair.Timestamp.Before(in.OldestInclusive) {
|
|
|
|
return []model.SamplePair{}
|
|
|
|
}
|
|
|
|
return []model.SamplePair{it.samplePair}
|
|
|
|
}
|
|
|
|
|
2014-10-14 11:52:39 +00:00
|
|
|
// nopSeriesIterator implements Series Iterator. It never returns any values.
|
|
|
|
type nopSeriesIterator struct{}
|
|
|
|
|
2015-05-20 17:13:06 +00:00
|
|
|
// ValueAtTime implements SeriesIterator.
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
func (i nopSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair {
|
2016-02-19 15:46:11 +00:00
|
|
|
return ZeroSamplePair
|
2014-10-14 11:52:39 +00:00
|
|
|
}
|
|
|
|
|
2015-05-20 17:13:06 +00:00
|
|
|
// RangeValues implements SeriesIterator.
|
2015-08-24 13:07:27 +00:00
|
|
|
func (i nopSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
|
2015-08-22 12:52:35 +00:00
|
|
|
return []model.SamplePair{}
|
2014-10-14 11:52:39 +00:00
|
|
|
}
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
|
|
|
|
var nopIter nopSeriesIterator // A nopSeriesIterator for convenience. Can be shared.
|