Optimize queries using regex matchers for set lookups (#602)

* Original version of the set optimization

Signed-off-by: naivewong <867245430@qq.com>

* simple set matcher

Signed-off-by: naivewong <867245430@qq.com>

* simple set matcher

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* add benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* use genSeries from #467

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>
This commit is contained in:
naivewong 2019-05-27 19:24:46 +08:00 committed by Ganesh Vernekar
parent 562e93e8e6
commit 13c80a5979
4 changed files with 324 additions and 36 deletions

View File

@ -21,6 +21,7 @@ import (
"math/rand"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/go-kit/kit/log"
@ -184,6 +185,11 @@ func createBlock(tb testing.TB, dir string, series []Series) string {
return filepath.Join(dir, ulid.String())
}
const (
defaultLabelName = "labelName"
defaultLabelValue = "labelValue"
)
// genSeries generates series with a given number of labels and values.
func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
if totalSeries == 0 || labelCount == 0 {
@ -193,8 +199,9 @@ func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
series := make([]Series, totalSeries)
for i := 0; i < totalSeries; i++ {
lbls := make(map[string]string, labelCount)
for len(lbls) < labelCount {
lbls[randString()] = randString()
lbls[defaultLabelName] = strconv.Itoa(i)
for j := 1; len(lbls) < labelCount; j++ {
lbls[defaultLabelName+strconv.Itoa(j)] = defaultLabelValue + strconv.Itoa(j)
}
samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
for t := mint; t <= maxt; t++ {
@ -224,31 +231,3 @@ func populateSeries(lbls []map[string]string, mint, maxt int64) []Series {
}
return series
}
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
const (
letterIdxBits = 6 // 6 bits to represent a letter index
letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
letterIdxMax = 63 / letterIdxBits // # of letter indices fitting in 63 bits
)
// randString generates random string.
func randString() string {
maxLength := int32(50)
length := rand.Int31n(maxLength)
b := make([]byte, length+1)
// A rand.Int63() generates 63 random bits, enough for letterIdxMax characters!
for i, cache, remain := length, rand.Int63(), letterIdxMax; i >= 0; {
if remain == 0 {
cache, remain = rand.Int63(), letterIdxMax
}
if idx := int(cache & letterIdxMask); idx < len(letterBytes) {
b[i] = letterBytes[idx]
i--
}
cache >>= letterIdxBits
remain--
}
return string(b)
}

View File

@ -63,14 +63,15 @@ func NewEqualMatcher(name, value string) Matcher {
return &EqualMatcher{name: name, value: value}
}
type regexpMatcher struct {
type RegexpMatcher struct {
name string
re *regexp.Regexp
}
func (m regexpMatcher) Name() string { return m.name }
func (m regexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
func (m regexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
func (m RegexpMatcher) Name() string { return m.name }
func (m RegexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
func (m RegexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
func (m RegexpMatcher) Value() string { return m.re.String() }
// NewRegexpMatcher returns a new matcher verifying that a value matches
// the regular expression pattern.
@ -79,7 +80,7 @@ func NewRegexpMatcher(name, pattern string) (Matcher, error) {
if err != nil {
return nil, err
}
return &regexpMatcher{name: name, re: re}, nil
return &RegexpMatcher{name: name, re: re}, nil
}
// NewMustRegexpMatcher returns a new matcher verifying that a value matches
@ -90,7 +91,7 @@ func NewMustRegexpMatcher(name, pattern string) Matcher {
if err != nil {
panic(err)
}
return &regexpMatcher{name: name, re: re}
return &RegexpMatcher{name: name, re: re}
}

View File

@ -17,6 +17,7 @@ import (
"fmt"
"sort"
"strings"
"unicode/utf8"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/chunkenc"
@ -266,6 +267,62 @@ func (q *blockQuerier) Close() error {
return merr.Err()
}
// Bitmap used by func isRegexMetaCharacter to check whether a character needs to be escaped.
var regexMetaCharacterBytes [16]byte
// isRegexMetaCharacter reports whether byte b needs to be escaped.
func isRegexMetaCharacter(b byte) bool {
return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0
}
func init() {
for _, b := range []byte(`.+*?()|[]{}^$`) {
regexMetaCharacterBytes[b%16] |= 1 << (b / 16)
}
}
func findSetMatches(pattern string) []string {
// Return empty matches if the wrapper from Prometheus is missing.
if len(pattern) < 6 || pattern[:4] != "^(?:" || pattern[len(pattern)-2:] != ")$" {
return nil
}
escaped := false
sets := []*strings.Builder{&strings.Builder{}}
for i := 4; i < len(pattern)-2; i++ {
if escaped {
switch {
case isRegexMetaCharacter(pattern[i]):
sets[len(sets)-1].WriteByte(pattern[i])
case pattern[i] == '\\':
sets[len(sets)-1].WriteByte('\\')
default:
return nil
}
escaped = false
} else {
switch {
case isRegexMetaCharacter(pattern[i]):
if pattern[i] == '|' {
sets = append(sets, &strings.Builder{})
} else {
return nil
}
case pattern[i] == '\\':
escaped = true
default:
sets[len(sets)-1].WriteByte(pattern[i])
}
}
}
matches := make([]string, 0, len(sets))
for _, s := range sets {
if s.Len() > 0 {
matches = append(matches, s.String())
}
}
return matches
}
// PostingsForMatchers assembles a single postings iterator against the index reader
// based on the given matchers.
func PostingsForMatchers(ix IndexReader, ms ...labels.Matcher) (index.Postings, error) {
@ -346,6 +403,14 @@ func postingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings, error
return ix.Postings(em.Name(), em.Value())
}
// Fast-path for set matching.
if em, ok := m.(*labels.RegexpMatcher); ok {
setMatches := findSetMatches(em.Value())
if len(setMatches) > 0 {
return postingsForSetMatcher(ix, em.Name(), setMatches)
}
}
tpls, err := ix.LabelValues(m.Name())
if err != nil {
return nil, err
@ -411,6 +476,18 @@ func inversePostingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings
return index.Merge(rit...), nil
}
func postingsForSetMatcher(ix IndexReader, name string, matches []string) (index.Postings, error) {
var its []index.Postings
for _, match := range matches {
if it, err := ix.Postings(name, match); err == nil {
its = append(its, it)
} else {
return nil, err
}
}
return index.Merge(its...), nil
}
func mergeStrings(a, b []string) []string {
maxl := len(a)
if len(b) > len(a) {

View File

@ -1691,6 +1691,192 @@ func BenchmarkQuerySeek(b *testing.B) {
}
}
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func BenchmarkSetMatcher(b *testing.B) {
cases := []struct {
numBlocks int
numSeries int
numSamplesPerSeriesPerBlock int
cardinality int
pattern string
}{
// The first three cases are to find out whether the set
// matcher is always faster than regex matcher.
{
numBlocks: 1,
numSeries: 1,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
{
numBlocks: 1,
numSeries: 15,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
{
numBlocks: 1,
numSeries: 15,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100,
pattern: "^(?:1|2|3)$",
},
// Big data sizes benchmarks.
{
numBlocks: 20,
numSeries: 1000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100,
pattern: "^(?:1|2|3)$",
},
{
numBlocks: 20,
numSeries: 1000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
// Increase cardinality.
{
numBlocks: 1,
numSeries: 100000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 100000,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
{
numBlocks: 1,
numSeries: 500000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 500000,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
{
numBlocks: 10,
numSeries: 500000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 500000,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
{
numBlocks: 1,
numSeries: 1000000,
numSamplesPerSeriesPerBlock: 10,
cardinality: 1000000,
pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$",
},
}
for _, c := range cases {
dir, err := ioutil.TempDir("", "bench_postings_for_matchers")
testutil.Ok(b, err)
defer func() {
testutil.Ok(b, os.RemoveAll(dir))
}()
var (
blocks []*Block
prefilledLabels []map[string]string
generatedSeries []Series
)
for i := int64(0); i < int64(c.numBlocks); i++ {
mint := i * int64(c.numSamplesPerSeriesPerBlock)
maxt := mint + int64(c.numSamplesPerSeriesPerBlock) - 1
if len(prefilledLabels) == 0 {
generatedSeries = genSeries(c.numSeries, 10, mint, maxt)
for _, s := range generatedSeries {
prefilledLabels = append(prefilledLabels, s.Labels().Map())
}
} else {
generatedSeries = populateSeries(prefilledLabels, mint, maxt)
}
block, err := OpenBlock(nil, createBlock(b, dir, generatedSeries), nil)
testutil.Ok(b, err)
blocks = append(blocks, block)
defer block.Close()
}
que := &querier{
blocks: make([]Querier, 0, len(blocks)),
}
for _, blk := range blocks {
q, err := NewBlockQuerier(blk, math.MinInt64, math.MaxInt64)
testutil.Ok(b, err)
que.blocks = append(que.blocks, q)
}
defer que.Close()
benchMsg := fmt.Sprintf("nSeries=%d,nBlocks=%d,cardinality=%d,pattern=\"%s\"", c.numSeries, c.numBlocks, c.cardinality, c.pattern)
b.Run(benchMsg, func(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()
for n := 0; n < b.N; n++ {
_, err := que.Select(labels.NewMustRegexpMatcher("test", c.pattern))
testutil.Ok(b, err)
}
})
}
}
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func TestFindSetMatches(t *testing.T) {
cases := []struct {
pattern string
exp []string
}{
// Simple sets.
{
pattern: "^(?:foo|bar|baz)$",
exp: []string{
"foo",
"bar",
"baz",
},
},
// Simple sets containing escaped characters.
{
pattern: "^(?:fo\\.o|bar\\?|\\^baz)$",
exp: []string{
"fo.o",
"bar?",
"^baz",
},
},
// Simple sets containing special characters without escaping.
{
pattern: "^(?:fo.o|bar?|^baz)$",
exp: nil,
},
// Missing wrapper.
{
pattern: "foo|bar|baz",
exp: nil,
},
}
for _, c := range cases {
matches := findSetMatches(c.pattern)
if len(c.exp) == 0 {
if len(matches) != 0 {
t.Errorf("Evaluating %s, unexpected result %v", c.pattern, matches)
}
} else {
if len(matches) != len(c.exp) {
t.Errorf("Evaluating %s, length of result not equal to exp", c.pattern)
} else {
for i := 0; i < len(c.exp); i++ {
if c.exp[i] != matches[i] {
t.Errorf("Evaluating %s, unexpected result %s", c.pattern, matches[i])
}
}
}
}
}
}
func TestPostingsForMatchers(t *testing.T) {
h, err := NewHead(nil, nil, nil, 1000)
testutil.Ok(t, err)
@ -1703,6 +1889,7 @@ func TestPostingsForMatchers(t *testing.T) {
app.Add(labels.FromStrings("n", "1", "i", "a"), 0, 0)
app.Add(labels.FromStrings("n", "1", "i", "b"), 0, 0)
app.Add(labels.FromStrings("n", "2"), 0, 0)
app.Add(labels.FromStrings("n", "2.5"), 0, 0)
testutil.Ok(t, app.Commit())
cases := []struct {
@ -1735,6 +1922,7 @@ func TestPostingsForMatchers(t *testing.T) {
labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"),
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
// Not equals.
@ -1742,6 +1930,7 @@ func TestPostingsForMatchers(t *testing.T) {
matchers: []labels.Matcher{labels.Not(labels.NewEqualMatcher("n", "1"))},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
{
@ -1796,6 +1985,7 @@ func TestPostingsForMatchers(t *testing.T) {
exp: []labels.Labels{
labels.FromStrings("n", "1"),
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
{
@ -1824,6 +2014,7 @@ func TestPostingsForMatchers(t *testing.T) {
matchers: []labels.Matcher{labels.Not(labels.NewMustRegexpMatcher("n", "^1$"))},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
{
@ -1869,6 +2060,46 @@ func TestPostingsForMatchers(t *testing.T) {
labels.FromStrings("n", "1", "i", "a"),
},
},
// Set optimization for Regex.
// Refer to https://github.com/prometheus/prometheus/issues/2651.
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:1|2)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1"),
labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"),
labels.FromStrings("n", "2"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:a|b)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:x1|2)$")},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:2|2\\.5)$")},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
// Empty value.
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:c||d)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1"),
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
}
ir, err := h.Index()