2015-01-21 19:07:45 +00:00
|
|
|
// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
package chunk
|
2014-06-06 09:55:53 +00:00
|
|
|
|
|
|
|
import (
|
2014-11-13 19:50:25 +00:00
|
|
|
"container/list"
|
2016-03-12 20:34:51 +00:00
|
|
|
"errors"
|
2015-03-04 12:40:18 +00:00
|
|
|
"fmt"
|
2014-06-06 09:55:53 +00:00
|
|
|
"io"
|
2016-03-07 19:23:14 +00:00
|
|
|
"sort"
|
2014-10-22 17:21:23 +00:00
|
|
|
"sync"
|
2014-10-23 13:18:32 +00:00
|
|
|
"sync/atomic"
|
2014-06-06 09:55:53 +00:00
|
|
|
|
2015-08-20 15:18:46 +00:00
|
|
|
"github.com/prometheus/common/model"
|
2014-06-06 09:55:53 +00:00
|
|
|
|
|
|
|
"github.com/prometheus/prometheus/storage/metric"
|
|
|
|
)
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// ChunkLen is the length of a chunk in bytes.
const ChunkLen = 1024
|
|
|
|
|
|
|
|
// DefaultEncoding can be changed via a flag.
|
|
|
|
var DefaultEncoding = DoubleDelta
|
2015-03-13 14:49:07 +00:00
|
|
|
|
2016-03-12 20:34:51 +00:00
|
|
|
// errChunkBoundsExceeded is the sentinel error for an attempted access outside
// of chunk boundaries.
var errChunkBoundsExceeded = errors.New("attempted access outside of chunk boundaries")
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// EvictRequest is a request to evict a chunk from memory.
|
|
|
|
type EvictRequest struct {
|
2016-09-28 21:33:34 +00:00
|
|
|
Desc *Desc
|
2016-09-21 21:44:27 +00:00
|
|
|
Evict bool
|
|
|
|
}
|
|
|
|
|
|
|
|
// Encoding defines which encoding we are using: delta, doubledelta, or varbit.
// It implements flag.Value via the String and Set methods.
type Encoding byte

// String implements flag.Value. It returns the numeric value of the encoding
// in decimal notation, i.e. the same representation that Set accepts.
func (e Encoding) String() string {
	return fmt.Sprintf("%d", e)
}

// Set implements flag.Value. It accepts "0" (Delta), "1" (DoubleDelta), and
// "2" (Varbit). For any other input, an error is returned and the receiver is
// left unchanged.
func (e *Encoding) Set(s string) error {
	switch s {
	case "0":
		*e = Delta
	case "1":
		*e = DoubleDelta
	case "2":
		*e = Varbit
	default:
		return fmt.Errorf("invalid chunk encoding: %s", s)
	}
	return nil
}

const (
	// Delta encoding
	Delta Encoding = iota
	// DoubleDelta encoding
	DoubleDelta
	// Varbit encoding
	Varbit
)
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// Desc contains meta-data for a chunk. Pay special attention to the
|
2016-03-02 12:45:17 +00:00
|
|
|
// documented requirements for calling its methods concurrently (WRT pinning and
|
2016-02-19 11:24:29 +00:00
|
|
|
// locking). The doc comments spell out the requirements for each method, but
|
|
|
|
// here is an overview and general explanation:
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
//
|
|
|
|
// Everything that changes the pinning of the underlying chunk or deals with its
|
2016-09-21 21:44:27 +00:00
|
|
|
// eviction is protected by a mutex. This affects the following methods: Pin,
|
|
|
|
// Unpin, RefCount, IsEvicted, MaybeEvict. These methods can be called at any
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// time without further prerequisites.
|
|
|
|
//
|
|
|
|
// Another group of methods acts on (or sets) the underlying chunk. These
|
|
|
|
// methods involve no locking. They may only be called if the caller has pinned
|
|
|
|
// the chunk (to guarantee the chunk is not evicted concurrently). Also, the
|
|
|
|
// caller must make sure nobody else will call these methods concurrently,
|
2016-09-28 21:33:34 +00:00
|
|
|
// either by holding the sole reference to the Desc (usually during loading
|
|
|
|
// or creation) or by locking the fingerprint of the series the Desc
|
2016-09-21 21:44:27 +00:00
|
|
|
// belongs to. The affected methods are: Add, MaybePopulateLastTime, SetChunk.
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
//
|
2016-09-21 21:44:27 +00:00
|
|
|
// Finally, there are the special cases FirstTime and LastTime. LastTime requires
|
2016-02-19 11:24:29 +00:00
|
|
|
// to have locked the fingerprint of the series but the chunk does not need to
|
2016-09-28 21:33:34 +00:00
|
|
|
// be pinned. That's because the ChunkLastTime field in Desc gets populated
|
2016-02-19 11:24:29 +00:00
|
|
|
// upon completion of the chunk (when it is still pinned, and which happens
|
|
|
|
// while the series's fingerprint is locked). Once that has happened, calling
|
2016-09-21 21:44:27 +00:00
|
|
|
// LastTime does not require the chunk to be loaded anymore. Before that has
|
2016-09-28 21:33:34 +00:00
|
|
|
// happened, the chunk is pinned anyway. The ChunkFirstTime field in Desc
|
|
|
|
// is populated upon creation of a Desc, so it is alway safe to call
|
2016-09-21 21:44:27 +00:00
|
|
|
// FirstTime. The FirstTime method is arguably not needed and only there for
|
|
|
|
// consistency with LastTime.
|
|
|
|
type Desc struct {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
sync.Mutex // Protects pinning.
|
2016-09-21 21:44:27 +00:00
|
|
|
C Chunk // nil if chunk is evicted.
|
2015-05-20 17:13:06 +00:00
|
|
|
rCnt int
|
2016-09-21 21:44:27 +00:00
|
|
|
ChunkFirstTime model.Time // Populated at creation. Immutable.
|
|
|
|
ChunkLastTime model.Time // Populated on closing of the chunk, model.Earliest if unset.
|
2014-11-13 19:50:25 +00:00
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// EvictListElement is nil if the chunk is not in the evict list.
|
2016-09-28 21:33:34 +00:00
|
|
|
// EvictListElement is _not_ protected by the Desc mutex.
|
2016-06-29 06:14:23 +00:00
|
|
|
// It must only be touched by the evict list handler in MemorySeriesStorage.
|
2016-09-21 21:44:27 +00:00
|
|
|
EvictListElement *list.Element
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewDesc creates a new Desc pointing to the provided chunk. The provided chunk
|
|
|
|
// is assumed to be not persisted yet. Therefore, the refCount of the new
|
2016-09-28 21:33:34 +00:00
|
|
|
// Desc is 1 (preventing eviction prior to persisting).
|
2016-09-21 21:44:27 +00:00
|
|
|
func NewDesc(c Chunk, firstTime model.Time) *Desc {
|
2016-09-28 21:33:34 +00:00
|
|
|
Ops.WithLabelValues(CreateAndPin).Inc()
|
2016-09-21 21:44:27 +00:00
|
|
|
atomic.AddInt64(&NumMemChunks, 1)
|
2016-09-28 21:33:34 +00:00
|
|
|
NumMemDescs.Inc()
|
2016-09-21 21:44:27 +00:00
|
|
|
return &Desc{
|
|
|
|
C: c,
|
2016-02-11 16:36:13 +00:00
|
|
|
rCnt: 1,
|
2016-09-21 21:44:27 +00:00
|
|
|
ChunkFirstTime: firstTime,
|
|
|
|
ChunkLastTime: model.Earliest,
|
2016-02-11 16:36:13 +00:00
|
|
|
}
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 15:56:55 +00:00
|
|
|
// Add adds a sample pair to the underlying chunk. For safe concurrent access,
|
2016-02-19 11:24:29 +00:00
|
|
|
// The chunk must be pinned, and the caller must have locked the fingerprint of
|
|
|
|
// the series.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) Add(s model.SamplePair) ([]Chunk, error) {
|
|
|
|
return d.C.Add(s)
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// Pin increments the refCount by one. Upon increment from 0 to 1, this
|
2016-09-28 21:33:34 +00:00
|
|
|
// Desc is removed from the evict list. To enable the latter, the
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// evictRequests channel has to be provided. This method can be called
|
|
|
|
// concurrently at any time.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) Pin(evictRequests chan<- EvictRequest) {
|
|
|
|
d.Lock()
|
|
|
|
defer d.Unlock()
|
2014-10-22 17:21:23 +00:00
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
if d.rCnt == 0 {
|
2014-11-13 19:50:25 +00:00
|
|
|
// Remove ourselves from the evict list.
|
2016-09-28 21:33:34 +00:00
|
|
|
evictRequests <- EvictRequest{d, false}
|
2014-11-13 19:50:25 +00:00
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
d.rCnt++
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// Unpin decrements the refCount by one. Upon decrement from 1 to 0, this
|
2016-09-28 21:33:34 +00:00
|
|
|
// Desc is added to the evict list. To enable the latter, the evictRequests
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// channel has to be provided. This method can be called concurrently at any
|
|
|
|
// time.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) Unpin(evictRequests chan<- EvictRequest) {
|
|
|
|
d.Lock()
|
|
|
|
defer d.Unlock()
|
2014-10-22 17:21:23 +00:00
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
if d.rCnt == 0 {
|
2014-10-22 17:21:23 +00:00
|
|
|
panic("cannot unpin already unpinned chunk")
|
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
d.rCnt--
|
|
|
|
if d.rCnt == 0 {
|
2014-11-13 19:50:25 +00:00
|
|
|
// Add ourselves to the back of the evict list.
|
2016-09-28 21:33:34 +00:00
|
|
|
evictRequests <- EvictRequest{d, true}
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// RefCount returns the number of pins. This method can be called concurrently
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// at any time.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) RefCount() int {
|
|
|
|
d.Lock()
|
|
|
|
defer d.Unlock()
|
2014-10-27 14:55:44 +00:00
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
return d.rCnt
|
2014-10-27 14:55:44 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// FirstTime returns the timestamp of the first sample in the chunk. This method
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// can be called concurrently at any time. It only returns the immutable
|
2016-09-28 21:33:34 +00:00
|
|
|
// d.ChunkFirstTime without any locking. Arguably, this method is
|
2016-09-21 21:44:27 +00:00
|
|
|
// useless. However, it provides consistency with the LastTime method.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) FirstTime() model.Time {
|
|
|
|
return d.ChunkFirstTime
|
2016-02-11 16:36:13 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// LastTime returns the timestamp of the last sample in the chunk. For safe
|
2016-02-19 11:24:29 +00:00
|
|
|
// concurrent access, this method requires the fingerprint of the time series to
|
|
|
|
// be locked.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) LastTime() (model.Time, error) {
|
|
|
|
if d.ChunkLastTime != model.Earliest || d.C == nil {
|
|
|
|
return d.ChunkLastTime, nil
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
return d.C.NewIterator().LastTimestamp()
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// MaybePopulateLastTime populates the ChunkLastTime from the underlying chunk
|
2016-02-19 11:24:29 +00:00
|
|
|
// if it has not yet happened. Call this method directly after having added the
|
|
|
|
// last sample to a chunk or after closing a head chunk due to age. For safe
|
|
|
|
// concurrent access, the chunk must be pinned, and the caller must have locked
|
|
|
|
// the fingerprint of the series.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) MaybePopulateLastTime() error {
|
|
|
|
if d.ChunkLastTime == model.Earliest && d.C != nil {
|
|
|
|
t, err := d.C.NewIterator().LastTimestamp()
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
d.ChunkLastTime = t
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return nil
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// IsEvicted returns whether the chunk is evicted. For safe concurrent access,
|
2016-02-19 11:24:29 +00:00
|
|
|
// the caller must have locked the fingerprint of the series.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) IsEvicted() bool {
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// Locking required here because we do not want the caller to force
|
|
|
|
// pinning the chunk first, so it could be evicted while this method is
|
|
|
|
// called.
|
2016-09-28 21:33:34 +00:00
|
|
|
d.Lock()
|
|
|
|
defer d.Unlock()
|
2014-10-22 17:21:23 +00:00
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
return d.C == nil
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// SetChunk sets the underlying chunk. The caller must have locked the
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// fingerprint of the series and must have "pre-pinned" the chunk (i.e. first
|
2016-09-21 21:44:27 +00:00
|
|
|
// call Pin and then set the chunk).
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) SetChunk(c Chunk) {
|
|
|
|
if d.C != nil {
|
2014-10-22 17:21:23 +00:00
|
|
|
panic("chunk already set")
|
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
d.C = c
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// MaybeEvict evicts the chunk if the refCount is 0. It returns whether the chunk
|
2014-11-13 19:50:25 +00:00
|
|
|
// is now evicted, which includes the case that the chunk was evicted even
|
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 17:47:50 +00:00
|
|
|
// before this method was called. It can be called concurrently at any time.
|
2016-09-28 21:33:34 +00:00
|
|
|
func (d *Desc) MaybeEvict() bool {
|
|
|
|
d.Lock()
|
|
|
|
defer d.Unlock()
|
2014-10-22 17:21:23 +00:00
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
if d.C == nil {
|
2014-10-22 17:21:23 +00:00
|
|
|
return true
|
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
if d.rCnt != 0 {
|
2014-11-13 19:50:25 +00:00
|
|
|
return false
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
if d.ChunkLastTime == model.Earliest {
|
2016-02-19 11:24:29 +00:00
|
|
|
// This must never happen.
|
2016-09-21 21:44:27 +00:00
|
|
|
panic("ChunkLastTime not populated for evicted chunk")
|
2016-02-11 16:36:13 +00:00
|
|
|
}
|
2016-09-28 21:33:34 +00:00
|
|
|
d.C = nil
|
2016-10-10 14:30:10 +00:00
|
|
|
Ops.WithLabelValues(Evict).Inc()
|
|
|
|
atomic.AddInt64(&NumMemChunks, -1)
|
2014-11-13 19:50:25 +00:00
|
|
|
return true
|
2014-10-22 17:21:23 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 15:56:55 +00:00
|
|
|
// Chunk is the interface for all chunks. Chunks are generally not
|
2014-09-16 13:47:24 +00:00
|
|
|
// goroutine-safe.
|
2016-09-21 15:56:55 +00:00
|
|
|
type Chunk interface {
|
2016-11-06 15:34:41 +00:00
|
|
|
// Add adds a SamplePair to the chunks, performs any necessary
|
2014-09-16 13:47:24 +00:00
|
|
|
// re-encoding, and adds any necessary overflow chunks. It returns the
|
|
|
|
// new version of the original chunk, followed by overflow chunks, if
|
|
|
|
// any. The first chunk returned might be the same as the original one
|
|
|
|
// or a newly allocated version. In any case, take the returned chunk as
|
2016-02-10 02:47:00 +00:00
|
|
|
// the relevant one and discard the original chunk.
|
2016-09-21 15:56:55 +00:00
|
|
|
Add(sample model.SamplePair) ([]Chunk, error)
|
|
|
|
Clone() Chunk
|
|
|
|
FirstTime() model.Time
|
2016-09-21 21:44:27 +00:00
|
|
|
NewIterator() Iterator
|
2016-09-21 15:56:55 +00:00
|
|
|
Marshal(io.Writer) error
|
|
|
|
MarshalToBuf([]byte) error
|
|
|
|
Unmarshal(io.Reader) error
|
|
|
|
UnmarshalFromBuf([]byte) error
|
2016-09-21 21:44:27 +00:00
|
|
|
Encoding() Encoding
|
2016-10-05 18:32:55 +00:00
|
|
|
Utilization() float64
|
2016-11-06 15:34:41 +00:00
|
|
|
|
|
|
|
// Len returns the number of samples in the chunk. Implementations may be
|
|
|
|
// expensive.
|
|
|
|
Len() int
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// Iterator enables efficient access to the content of a chunk. It is
|
|
|
|
// generally not safe to use an Iterator concurrently with or after chunk
|
2016-03-09 15:20:39 +00:00
|
|
|
// mutation.
|
2016-09-21 21:44:27 +00:00
|
|
|
type Iterator interface {
|
2015-04-14 11:46:38 +00:00
|
|
|
// Gets the last timestamp in the chunk.
|
2016-09-21 15:56:55 +00:00
|
|
|
LastTimestamp() (model.Time, error)
|
2014-09-16 13:47:24 +00:00
|
|
|
// Whether a given timestamp is contained between first and last value
|
|
|
|
// in the chunk.
|
2016-09-21 15:56:55 +00:00
|
|
|
Contains(model.Time) (bool, error)
|
2016-03-09 15:20:39 +00:00
|
|
|
// Scans the next value in the chunk. Directly after the iterator has
|
|
|
|
// been created, the next value is the first value in the
|
|
|
|
// chunk. Otherwise, it is the value following the last value scanned or
|
2016-09-21 21:44:27 +00:00
|
|
|
// found (by one of the Find... methods). Returns false if either the
|
2016-03-09 15:20:39 +00:00
|
|
|
// end of the chunk is reached or an error has occurred.
|
2016-09-21 15:56:55 +00:00
|
|
|
Scan() bool
|
2016-03-09 15:20:39 +00:00
|
|
|
// Finds the most recent value at or before the provided time. Returns
|
|
|
|
// false if either the chunk contains no value at or before the provided
|
|
|
|
// time, or an error has occurred.
|
2016-09-21 15:56:55 +00:00
|
|
|
FindAtOrBefore(model.Time) bool
|
2016-03-09 15:20:39 +00:00
|
|
|
// Finds the oldest value at or after the provided time. Returns false
|
|
|
|
// if either the chunk contains no value at or after the provided time,
|
|
|
|
// or an error has occurred.
|
2016-09-21 15:56:55 +00:00
|
|
|
FindAtOrAfter(model.Time) bool
|
2016-03-09 15:20:39 +00:00
|
|
|
// Returns the last value scanned (by the scan method) or found (by one
|
2016-09-28 21:40:26 +00:00
|
|
|
// of the find... methods). It returns model.ZeroSamplePair before any of
|
2016-03-09 15:20:39 +00:00
|
|
|
// those methods were called.
|
2016-09-21 15:56:55 +00:00
|
|
|
Value() model.SamplePair
|
2016-03-17 13:37:24 +00:00
|
|
|
// Returns the last error encountered. In general, an error signals data
|
2016-03-09 15:20:39 +00:00
|
|
|
// corruption in the chunk and requires quarantining.
|
2016-09-21 15:56:55 +00:00
|
|
|
Err() error
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// RangeValues is a utility function that retrieves all values within the given
|
|
|
|
// range from an Iterator.
|
|
|
|
func RangeValues(it Iterator, in metric.Interval) ([]model.SamplePair, error) {
|
2016-03-09 15:20:39 +00:00
|
|
|
result := []model.SamplePair{}
|
2016-09-21 15:56:55 +00:00
|
|
|
if !it.FindAtOrAfter(in.OldestInclusive) {
|
|
|
|
return result, it.Err()
|
2016-03-09 15:20:39 +00:00
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
for !it.Value().Timestamp.After(in.NewestInclusive) {
|
|
|
|
result = append(result, it.Value())
|
|
|
|
if !it.Scan() {
|
2016-03-09 15:20:39 +00:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
return result, it.Err()
|
2016-03-09 15:20:39 +00:00
|
|
|
}
|
|
|
|
|
2016-03-12 20:34:51 +00:00
|
|
|
// addToOverflowChunk is a utility function that creates a new chunk as overflow
|
2016-03-17 10:58:37 +00:00
|
|
|
// chunk, adds the provided sample to it, and returns a chunk slice containing
|
2016-03-12 20:34:51 +00:00
|
|
|
// the provided old chunk followed by the new overflow chunk.
|
2016-09-21 15:56:55 +00:00
|
|
|
func addToOverflowChunk(c Chunk, s model.SamplePair) ([]Chunk, error) {
|
2016-09-28 21:33:34 +00:00
|
|
|
overflowChunks, err := New().Add(s)
|
2016-03-12 20:34:51 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
return []Chunk{c, overflowChunks[0]}, nil
|
2016-03-12 20:34:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// transcodeAndAdd is a utility function that transcodes the src chunk into the
|
|
|
|
// provided dst chunk (plus the necessary overflow chunks) and then adds the
|
|
|
|
// provided sample. It returns the new chunks (transcoded plus overflow) with
|
|
|
|
// the new sample at the end.
|
2016-09-21 15:56:55 +00:00
|
|
|
func transcodeAndAdd(dst Chunk, src Chunk, s model.SamplePair) ([]Chunk, error) {
|
2016-09-28 21:33:34 +00:00
|
|
|
Ops.WithLabelValues(Transcode).Inc()
|
2014-06-06 09:55:53 +00:00
|
|
|
|
2016-03-07 18:50:13 +00:00
|
|
|
var (
|
|
|
|
head = dst
|
2016-09-21 15:56:55 +00:00
|
|
|
body, NewChunks []Chunk
|
2016-03-07 18:50:13 +00:00
|
|
|
err error
|
|
|
|
)
|
|
|
|
|
2016-09-21 15:56:55 +00:00
|
|
|
it := src.NewIterator()
|
|
|
|
for it.Scan() {
|
|
|
|
if NewChunks, err = head.Add(it.Value()); err != nil {
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
body = append(body, NewChunks[:len(NewChunks)-1]...)
|
|
|
|
head = NewChunks[len(NewChunks)-1]
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
if it.Err() != nil {
|
|
|
|
return nil, it.Err()
|
2016-03-07 18:50:13 +00:00
|
|
|
}
|
|
|
|
|
2016-09-21 15:56:55 +00:00
|
|
|
if NewChunks, err = head.Add(s); err != nil {
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2016-09-21 15:56:55 +00:00
|
|
|
return append(body, NewChunks...), nil
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
// New creates a new chunk according to the encoding set by the
|
2016-09-21 21:44:27 +00:00
|
|
|
// DefaultEncoding flag.
|
2016-09-28 21:33:34 +00:00
|
|
|
func New() Chunk {
|
|
|
|
chunk, err := NewForEncoding(DefaultEncoding)
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return chunk
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
|
2016-09-28 21:33:34 +00:00
|
|
|
// NewForEncoding allows configuring what chunk type you want
|
|
|
|
func NewForEncoding(encoding Encoding) (Chunk, error) {
|
2015-03-13 14:49:07 +00:00
|
|
|
switch encoding {
|
2016-09-21 15:56:55 +00:00
|
|
|
case Delta:
|
2016-09-21 21:44:27 +00:00
|
|
|
return newDeltaEncodedChunk(d1, d0, true, ChunkLen), nil
|
2016-09-21 15:56:55 +00:00
|
|
|
case DoubleDelta:
|
2016-09-21 21:44:27 +00:00
|
|
|
return newDoubleDeltaEncodedChunk(d1, d0, true, ChunkLen), nil
|
2016-09-21 15:56:55 +00:00
|
|
|
case Varbit:
|
2016-03-23 15:30:41 +00:00
|
|
|
return newVarbitChunk(varbitZeroEncoding), nil
|
2014-06-06 09:55:53 +00:00
|
|
|
default:
|
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crashrecovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, whith the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 11:23:42 +00:00
|
|
|
return nil, fmt.Errorf("unknown chunk encoding: %v", encoding)
|
2014-06-06 09:55:53 +00:00
|
|
|
}
|
|
|
|
}
|
2016-03-07 19:23:14 +00:00
|
|
|
|
|
|
|
// indexAccessor allows accesses to samples by index.
|
|
|
|
type indexAccessor interface {
|
|
|
|
timestampAtIndex(int) model.Time
|
|
|
|
sampleValueAtIndex(int) model.SampleValue
|
|
|
|
err() error
|
|
|
|
}
|
|
|
|
|
|
|
|
// indexAccessingChunkIterator is a chunk iterator for chunks for which an
|
|
|
|
// indexAccessor implementation exists.
|
|
|
|
type indexAccessingChunkIterator struct {
|
|
|
|
len int
|
|
|
|
pos int
|
|
|
|
lastValue model.SamplePair
|
|
|
|
acc indexAccessor
|
|
|
|
}
|
|
|
|
|
|
|
|
func newIndexAccessingChunkIterator(len int, acc indexAccessor) *indexAccessingChunkIterator {
|
|
|
|
return &indexAccessingChunkIterator{
|
|
|
|
len: len,
|
|
|
|
pos: -1,
|
2016-09-28 21:40:26 +00:00
|
|
|
lastValue: model.ZeroSamplePair,
|
2016-03-07 19:23:14 +00:00
|
|
|
acc: acc,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// lastTimestamp implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) LastTimestamp() (model.Time, error) {
|
2016-03-07 19:23:14 +00:00
|
|
|
return it.acc.timestampAtIndex(it.len - 1), it.acc.err()
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// contains implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) Contains(t model.Time) (bool, error) {
|
2016-03-07 19:23:14 +00:00
|
|
|
return !t.Before(it.acc.timestampAtIndex(0)) &&
|
|
|
|
!t.After(it.acc.timestampAtIndex(it.len-1)), it.acc.err()
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// scan implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) Scan() bool {
|
2016-03-07 19:23:14 +00:00
|
|
|
it.pos++
|
|
|
|
if it.pos >= it.len {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
it.lastValue = model.SamplePair{
|
|
|
|
Timestamp: it.acc.timestampAtIndex(it.pos),
|
|
|
|
Value: it.acc.sampleValueAtIndex(it.pos),
|
|
|
|
}
|
|
|
|
return it.acc.err() == nil
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// findAtOrBefore implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) FindAtOrBefore(t model.Time) bool {
|
2016-03-09 15:20:39 +00:00
|
|
|
i := sort.Search(it.len, func(i int) bool {
|
|
|
|
return it.acc.timestampAtIndex(i).After(t)
|
|
|
|
})
|
|
|
|
if i == 0 || it.acc.err() != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
it.pos = i - 1
|
|
|
|
it.lastValue = model.SamplePair{
|
|
|
|
Timestamp: it.acc.timestampAtIndex(i - 1),
|
|
|
|
Value: it.acc.sampleValueAtIndex(i - 1),
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// findAtOrAfter implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) FindAtOrAfter(t model.Time) bool {
|
2016-03-09 15:20:39 +00:00
|
|
|
i := sort.Search(it.len, func(i int) bool {
|
|
|
|
return !it.acc.timestampAtIndex(i).Before(t)
|
|
|
|
})
|
|
|
|
if i == it.len || it.acc.err() != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
it.pos = i
|
|
|
|
it.lastValue = model.SamplePair{
|
|
|
|
Timestamp: it.acc.timestampAtIndex(i),
|
|
|
|
Value: it.acc.sampleValueAtIndex(i),
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// value implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) Value() model.SamplePair {
|
2016-03-07 19:23:14 +00:00
|
|
|
return it.lastValue
|
|
|
|
}
|
|
|
|
|
2016-09-21 21:44:27 +00:00
|
|
|
// err implements Iterator.
|
2016-09-21 15:56:55 +00:00
|
|
|
func (it *indexAccessingChunkIterator) Err() error {
|
2016-03-07 19:23:14 +00:00
|
|
|
return it.acc.err()
|
|
|
|
}
|