From 13c80a5979c18b06243f6b388c4787a3d2060111 Mon Sep 17 00:00:00 2001 From: naivewong <867245430@qq.com> Date: Mon, 27 May 2019 19:24:46 +0800 Subject: [PATCH] Optimize queries using regex matchers for set lookups (#602) * Original version of the set optimization Signed-off-by: naivewong <867245430@qq.com> * simple set matcher Signed-off-by: naivewong <867245430@qq.com> * simple set matcher Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * add benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * use genSeries from #467 Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> --- block_test.go | 39 ++------ labels/selector.go | 13 +-- querier.go | 77 +++++++++++++++ querier_test.go | 231 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 324 insertions(+), 36 deletions(-) diff --git a/block_test.go b/block_test.go index bdfd58fbc..3ae2ec243 100644 --- a/block_test.go +++ b/block_test.go @@ -21,6 +21,7 @@ import ( "math/rand" "os" "path/filepath" + "strconv" "testing" "github.com/go-kit/kit/log" @@ -184,6 +185,11 @@ func createBlock(tb testing.TB, dir string, series []Series) string { return filepath.Join(dir, ulid.String()) } +const ( + defaultLabelName = "labelName" + defaultLabelValue = 
"labelValue" +) + // genSeries generates series with a given number of labels and values. func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series { if totalSeries == 0 || labelCount == 0 { @@ -193,8 +199,9 @@ func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series { series := make([]Series, totalSeries) for i := 0; i < totalSeries; i++ { lbls := make(map[string]string, labelCount) - for len(lbls) < labelCount { - lbls[randString()] = randString() + lbls[defaultLabelName] = strconv.Itoa(i) + for j := 1; len(lbls) < labelCount; j++ { + lbls[defaultLabelName+strconv.Itoa(j)] = defaultLabelValue + strconv.Itoa(j) } samples := make([]tsdbutil.Sample, 0, maxt-mint+1) for t := mint; t <= maxt; t++ { @@ -224,31 +231,3 @@ func populateSeries(lbls []map[string]string, mint, maxt int64) []Series { } return series } - -const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -const ( - letterIdxBits = 6 // 6 bits to represent a letter index - letterIdxMask = 1<= 0; { - if remain == 0 { - cache, remain = rand.Int63(), letterIdxMax - } - if idx := int(cache & letterIdxMask); idx < len(letterBytes) { - b[i] = letterBytes[idx] - i-- - } - cache >>= letterIdxBits - remain-- - } - - return string(b) -} diff --git a/labels/selector.go b/labels/selector.go index a0565f57e..c94ebb332 100644 --- a/labels/selector.go +++ b/labels/selector.go @@ -63,14 +63,15 @@ func NewEqualMatcher(name, value string) Matcher { return &EqualMatcher{name: name, value: value} } -type regexpMatcher struct { +type RegexpMatcher struct { name string re *regexp.Regexp } -func (m regexpMatcher) Name() string { return m.name } -func (m regexpMatcher) Matches(v string) bool { return m.re.MatchString(v) } -func (m regexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) } +func (m RegexpMatcher) Name() string { return m.name } +func (m RegexpMatcher) Matches(v string) bool { return m.re.MatchString(v) } +func (m RegexpMatcher) String() 
string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) } +func (m RegexpMatcher) Value() string { return m.re.String() } // NewRegexpMatcher returns a new matcher verifying that a value matches // the regular expression pattern. @@ -79,7 +80,7 @@ func NewRegexpMatcher(name, pattern string) (Matcher, error) { if err != nil { return nil, err } - return &regexpMatcher{name: name, re: re}, nil + return &RegexpMatcher{name: name, re: re}, nil } // NewMustRegexpMatcher returns a new matcher verifying that a value matches @@ -90,7 +91,7 @@ func NewMustRegexpMatcher(name, pattern string) Matcher { if err != nil { panic(err) } - return &regexpMatcher{name: name, re: re} + return &RegexpMatcher{name: name, re: re} } diff --git a/querier.go b/querier.go index 9d99de083..253102b0e 100644 --- a/querier.go +++ b/querier.go @@ -17,6 +17,7 @@ import ( "fmt" "sort" "strings" + "unicode/utf8" "github.com/pkg/errors" "github.com/prometheus/tsdb/chunkenc" @@ -266,6 +267,62 @@ func (q *blockQuerier) Close() error { return merr.Err() } +// Bitmap used by func isRegexMetaCharacter to check whether a character needs to be escaped. +var regexMetaCharacterBytes [16]byte + +// isRegexMetaCharacter reports whether byte b needs to be escaped. +func isRegexMetaCharacter(b byte) bool { + return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0 +} + +func init() { + for _, b := range []byte(`.+*?()|[]{}^$`) { + regexMetaCharacterBytes[b%16] |= 1 << (b / 16) + } +} + +func findSetMatches(pattern string) []string { + // Return empty matches if the wrapper from Prometheus is missing. 
+ if len(pattern) < 6 || pattern[:4] != "^(?:" || pattern[len(pattern)-2:] != ")$" { + return nil + } + escaped := false + sets := []*strings.Builder{&strings.Builder{}} + for i := 4; i < len(pattern)-2; i++ { + if escaped { + switch { + case isRegexMetaCharacter(pattern[i]): + sets[len(sets)-1].WriteByte(pattern[i]) + case pattern[i] == '\\': + sets[len(sets)-1].WriteByte('\\') + default: + return nil + } + escaped = false + } else { + switch { + case isRegexMetaCharacter(pattern[i]): + if pattern[i] == '|' { + sets = append(sets, &strings.Builder{}) + } else { + return nil + } + case pattern[i] == '\\': + escaped = true + default: + sets[len(sets)-1].WriteByte(pattern[i]) + } + } + } + matches := make([]string, 0, len(sets)) + for _, s := range sets { + if s.Len() > 0 { + matches = append(matches, s.String()) + } + } + return matches +} + // PostingsForMatchers assembles a single postings iterator against the index reader // based on the given matchers. func PostingsForMatchers(ix IndexReader, ms ...labels.Matcher) (index.Postings, error) { @@ -346,6 +403,14 @@ func postingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings, error return ix.Postings(em.Name(), em.Value()) } + // Fast-path for set matching. 
+ if em, ok := m.(*labels.RegexpMatcher); ok { + setMatches := findSetMatches(em.Value()) + if len(setMatches) > 0 { + return postingsForSetMatcher(ix, em.Name(), setMatches) + } + } + tpls, err := ix.LabelValues(m.Name()) if err != nil { return nil, err @@ -411,6 +476,18 @@ func inversePostingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings return index.Merge(rit...), nil } +func postingsForSetMatcher(ix IndexReader, name string, matches []string) (index.Postings, error) { + var its []index.Postings + for _, match := range matches { + if it, err := ix.Postings(name, match); err == nil { + its = append(its, it) + } else { + return nil, err + } + } + return index.Merge(its...), nil +} + func mergeStrings(a, b []string) []string { maxl := len(a) if len(b) > len(a) { diff --git a/querier_test.go b/querier_test.go index cb53462a1..dfbc6a75d 100644 --- a/querier_test.go +++ b/querier_test.go @@ -1691,6 +1691,192 @@ func BenchmarkQuerySeek(b *testing.B) { } } +// Refer to https://github.com/prometheus/prometheus/issues/2651. +func BenchmarkSetMatcher(b *testing.B) { + cases := []struct { + numBlocks int + numSeries int + numSamplesPerSeriesPerBlock int + cardinality int + pattern string + }{ + // The first three cases are to find out whether the set + // matcher is always faster than regex matcher. + { + numBlocks: 1, + numSeries: 1, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + { + numBlocks: 1, + numSeries: 15, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + { + numBlocks: 1, + numSeries: 15, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100, + pattern: "^(?:1|2|3)$", + }, + // Big data sizes benchmarks. 
+ { + numBlocks: 20, + numSeries: 1000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100, + pattern: "^(?:1|2|3)$", + }, + { + numBlocks: 20, + numSeries: 1000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + // Increase cardinality. + { + numBlocks: 1, + numSeries: 100000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 100000, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + { + numBlocks: 1, + numSeries: 500000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 500000, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + { + numBlocks: 10, + numSeries: 500000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 500000, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + { + numBlocks: 1, + numSeries: 1000000, + numSamplesPerSeriesPerBlock: 10, + cardinality: 1000000, + pattern: "^(?:1|2|3|4|5|6|7|8|9|10)$", + }, + } + + for _, c := range cases { + dir, err := ioutil.TempDir("", "bench_postings_for_matchers") + testutil.Ok(b, err) + defer func() { + testutil.Ok(b, os.RemoveAll(dir)) + }() + + var ( + blocks []*Block + prefilledLabels []map[string]string + generatedSeries []Series + ) + for i := int64(0); i < int64(c.numBlocks); i++ { + mint := i * int64(c.numSamplesPerSeriesPerBlock) + maxt := mint + int64(c.numSamplesPerSeriesPerBlock) - 1 + if len(prefilledLabels) == 0 { + generatedSeries = genSeries(c.numSeries, 10, mint, maxt) + for _, s := range generatedSeries { + prefilledLabels = append(prefilledLabels, s.Labels().Map()) + } + } else { + generatedSeries = populateSeries(prefilledLabels, mint, maxt) + } + block, err := OpenBlock(nil, createBlock(b, dir, generatedSeries), nil) + testutil.Ok(b, err) + blocks = append(blocks, block) + defer block.Close() + } + + que := &querier{ + blocks: make([]Querier, 0, len(blocks)), + } + for _, blk := range blocks { + q, err := NewBlockQuerier(blk, math.MinInt64, math.MaxInt64) + testutil.Ok(b, err) + que.blocks = append(que.blocks, q) + } + defer que.Close() + + 
benchMsg := fmt.Sprintf("nSeries=%d,nBlocks=%d,cardinality=%d,pattern=\"%s\"", c.numSeries, c.numBlocks, c.cardinality, c.pattern) + b.Run(benchMsg, func(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + _, err := que.Select(labels.NewMustRegexpMatcher("test", c.pattern)) + testutil.Ok(b, err) + + } + }) + } +} + +// Refer to https://github.com/prometheus/prometheus/issues/2651. +func TestFindSetMatches(t *testing.T) { + cases := []struct { + pattern string + exp []string + }{ + // Simple sets. + { + pattern: "^(?:foo|bar|baz)$", + exp: []string{ + "foo", + "bar", + "baz", + }, + }, + // Simple sets containing escaped characters. + { + pattern: "^(?:fo\\.o|bar\\?|\\^baz)$", + exp: []string{ + "fo.o", + "bar?", + "^baz", + }, + }, + // Simple sets containing special characters without escaping. + { + pattern: "^(?:fo.o|bar?|^baz)$", + exp: nil, + }, + // Missing wrapper. + { + pattern: "foo|bar|baz", + exp: nil, + }, + } + + for _, c := range cases { + matches := findSetMatches(c.pattern) + if len(c.exp) == 0 { + if len(matches) != 0 { + t.Errorf("Evaluating %s, unexpected result %v", c.pattern, matches) + } + } else { + if len(matches) != len(c.exp) { + t.Errorf("Evaluating %s, length of result not equal to exp", c.pattern) + } else { + for i := 0; i < len(c.exp); i++ { + if c.exp[i] != matches[i] { + t.Errorf("Evaluating %s, unexpected result %s", c.pattern, matches[i]) + } + } + } + } + } +} + func TestPostingsForMatchers(t *testing.T) { h, err := NewHead(nil, nil, nil, 1000) testutil.Ok(t, err) @@ -1703,6 +1889,7 @@ func TestPostingsForMatchers(t *testing.T) { app.Add(labels.FromStrings("n", "1", "i", "a"), 0, 0) app.Add(labels.FromStrings("n", "1", "i", "b"), 0, 0) app.Add(labels.FromStrings("n", "2"), 0, 0) + app.Add(labels.FromStrings("n", "2.5"), 0, 0) testutil.Ok(t, app.Commit()) cases := []struct { @@ -1735,6 +1922,7 @@ func TestPostingsForMatchers(t *testing.T) { labels.FromStrings("n", "1", "i", "a"), 
labels.FromStrings("n", "1", "i", "b"), labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), }, }, // Not equals. @@ -1742,6 +1930,7 @@ func TestPostingsForMatchers(t *testing.T) { matchers: []labels.Matcher{labels.Not(labels.NewEqualMatcher("n", "1"))}, exp: []labels.Labels{ labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), }, }, { @@ -1796,6 +1985,7 @@ func TestPostingsForMatchers(t *testing.T) { exp: []labels.Labels{ labels.FromStrings("n", "1"), labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), }, }, { @@ -1824,6 +2014,7 @@ func TestPostingsForMatchers(t *testing.T) { matchers: []labels.Matcher{labels.Not(labels.NewMustRegexpMatcher("n", "^1$"))}, exp: []labels.Labels{ labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), }, }, { @@ -1869,6 +2060,46 @@ func TestPostingsForMatchers(t *testing.T) { labels.FromStrings("n", "1", "i", "a"), }, }, + // Set optimization for Regex. + // Refer to https://github.com/prometheus/prometheus/issues/2651. + { + matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:1|2)$")}, + exp: []labels.Labels{ + labels.FromStrings("n", "1"), + labels.FromStrings("n", "1", "i", "a"), + labels.FromStrings("n", "1", "i", "b"), + labels.FromStrings("n", "2"), + }, + }, + { + matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:a|b)$")}, + exp: []labels.Labels{ + labels.FromStrings("n", "1", "i", "a"), + labels.FromStrings("n", "1", "i", "b"), + }, + }, + { + matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:x1|2)$")}, + exp: []labels.Labels{ + labels.FromStrings("n", "2"), + }, + }, + { + matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:2|2\\.5)$")}, + exp: []labels.Labels{ + labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), + }, + }, + // Empty value. 
+ { + matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:c||d)$")}, + exp: []labels.Labels{ + labels.FromStrings("n", "1"), + labels.FromStrings("n", "2"), + labels.FromStrings("n", "2.5"), + }, + }, } ir, err := h.Index()