Optimize queries using regex matchers for set lookups (#602)

* Original version of the set optimization Signed-off-by: naivewong <867245430@qq.com> * simple set matcher Signed-off-by: naivewong <867245430@qq.com> * simple set matcher Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * add benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update benchmark Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com> * use genSeries from #467 Signed-off-by: naivewong <867245430@qq.com> * update Signed-off-by: naivewong <867245430@qq.com>
2019-05-27 19:24:46 +08:00 · 2019-05-27 19:24:46 +08:00 · 13c80a5979
parent 562e93e8e6
commit 13c80a5979
4 changed files with 324 additions and 36 deletions
--- a/block_test.go
+++ b/block_test.go
@ -21,6 +21,7 @@ import (
 	"math/rand"
 	"os"
 	"path/filepath"
 	"strconv"
 	"testing"
 	"github.com/go-kit/kit/log"
@ -184,6 +185,11 @@ func createBlock(tb testing.TB, dir string, series []Series) string {
 	return filepath.Join(dir, ulid.String())
 }
 // Label fixtures shared by the series generators in this test file.
 const (
 	// defaultLabelName is the label genSeries sets to the series index
 	// (as a decimal string); with a numeric suffix appended it also names
 	// the additional labels genSeries adds.
 	defaultLabelName  = "labelName"
 	// defaultLabelValue, with a numeric suffix appended, is the value
 	// genSeries gives to each additional label.
 	defaultLabelValue = "labelValue"
 )
 // genSeries generates series with a given number of labels and values.
 func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
 	if totalSeries == 0 || labelCount == 0 {
@ -193,8 +199,9 @@ func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
 	series := make([]Series, totalSeries)
 	for i := 0; i < totalSeries; i++ {
 		lbls := make(map[string]string, labelCount)
-		for len(lbls) < labelCount {
+		lbls[defaultLabelName] = strconv.Itoa(i)
-			lbls[randString()] = randString()
+		for j := 1; len(lbls) < labelCount; j++ {
 			lbls[defaultLabelName+strconv.Itoa(j)] = defaultLabelValue + strconv.Itoa(j)
 		}
 		samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
 		for t := mint; t <= maxt; t++ {
@ -224,31 +231,3 @@ func populateSeries(lbls []map[string]string, mint, maxt int64) []Series {
 	}
 	return series
 }
 // letterBytes is the alphabet randString draws characters from.
 const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 const (
 	letterIdxBits = 6                    // 6 bits to represent a letter index
 	letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
 	letterIdxMax  = 63 / letterIdxBits   // # of letter indices fitting in 63 bits
 )
 // randString returns a random string of 1 to 50 characters taken from
 // letterBytes.
 func randString() string {
 	const maxLength int32 = 50
 	last := rand.Int31n(maxLength)
 	buf := make([]byte, last+1)
 	// One rand.Int63() call yields 63 random bits, i.e. letterIdxMax
 	// candidate indices of letterIdxBits bits each.
 	bits, left := rand.Int63(), letterIdxMax
 	for pos := last; pos >= 0; {
 		if left == 0 {
 			bits, left = rand.Int63(), letterIdxMax
 		}
 		idx := int(bits & letterIdxMask)
 		bits >>= letterIdxBits
 		left--
 		// A 6-bit index can exceed the alphabet size; discard those draws.
 		if idx >= len(letterBytes) {
 			continue
 		}
 		buf[pos] = letterBytes[idx]
 		pos--
 	}
 	return string(buf)
 }
--- a/labels/selector.go
+++ b/labels/selector.go
@ -63,14 +63,15 @@ func NewEqualMatcher(name, value string) Matcher {
 	return &EqualMatcher{name: name, value: value}
 }
-type regexpMatcher struct {
+type RegexpMatcher struct {
 	name string
 	re   *regexp.Regexp
 }
-func (m regexpMatcher) Name() string          { return m.name }
+func (m RegexpMatcher) Name() string          { return m.name }
-func (m regexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
+func (m RegexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
-func (m regexpMatcher) String() string        { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
+func (m RegexpMatcher) String() string        { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
 func (m RegexpMatcher) Value() string         { return m.re.String() }
 // NewRegexpMatcher returns a new matcher verifying that a value matches
 // the regular expression pattern.
@ -79,7 +80,7 @@ func NewRegexpMatcher(name, pattern string) (Matcher, error) {
 	if err != nil {
 		return nil, err
 	}
-	return &regexpMatcher{name: name, re: re}, nil
+	return &RegexpMatcher{name: name, re: re}, nil
 }
 // NewMustRegexpMatcher returns a new matcher verifying that a value matches
@ -90,7 +91,7 @@ func NewMustRegexpMatcher(name, pattern string) Matcher {
 	if err != nil {
 		panic(err)
 	}
-	return &regexpMatcher{name: name, re: re}
+	return &RegexpMatcher{name: name, re: re}
 }
--- a/querier.go
+++ b/querier.go
@ -17,6 +17,7 @@ import (
 	"fmt"
 	"sort"
 	"strings"
 	"unicode/utf8"
 	"github.com/pkg/errors"
 	"github.com/prometheus/tsdb/chunkenc"
@ -266,6 +267,62 @@ func (q *blockQuerier) Close() error {
 	return merr.Err()
 }
 // regexMetaCharacterBytes is a bitmap over the 128 ASCII byte values used by
 // isRegexMetaCharacter: byte b is a regex metacharacter when bit b/16 of
 // element b%16 is set. It is populated by init.
 var regexMetaCharacterBytes [16]byte
 // isRegexMetaCharacter reports whether byte b needs to be escaped, i.e.
 // whether it is one of `.+*?()|[]{}^$`. Bytes outside the ASCII range never
 // do. The backslash itself is not part of the set; callers handle it
 // separately.
 func isRegexMetaCharacter(b byte) bool {
 	return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0
 }
 func init() {
 	for _, b := range []byte(`.+*?()|[]{}^$`) {
 		regexMetaCharacterBytes[b%16] |= 1 << (b / 16)
 	}
 }
 // findSetMatches extracts the literal alternatives from a pattern of the form
 // "^(?:a|b|c)$" so that the caller can look them up directly instead of
 // evaluating the regular expression.
 //
 // It returns nil when the "^(?:" ... ")$" wrapper is missing, when the body
 // contains an unescaped metacharacter other than '|', when a backslash
 // escapes anything other than a metacharacter or another backslash, or when
 // the body ends in an unterminated escape. Escaped characters are returned
 // unescaped. Empty alternatives are dropped from the result, so callers must
 // deal with patterns that match the empty string on their own.
 func findSetMatches(pattern string) []string {
 	const prefix, suffix = "^(?:", ")$"
 	// Return empty matches if the wrapper from Prometheus is missing.
 	if len(pattern) < len(prefix)+len(suffix) ||
 		!strings.HasPrefix(pattern, prefix) ||
 		!strings.HasSuffix(pattern, suffix) {
 		return nil
 	}
 	body := pattern[len(prefix) : len(pattern)-len(suffix)]
 	var cur strings.Builder
 	matches := make([]string, 0, strings.Count(body, "|")+1)
 	escaped := false
 	for i := 0; i < len(body); i++ {
 		switch c := body[i]; {
 		case escaped:
 			// Only an escaped metacharacter or backslash is a literal;
 			// anything else (e.g. \d) is a genuine regex construct.
 			if c != '\\' && !isRegexMetaCharacter(c) {
 				return nil
 			}
 			cur.WriteByte(c)
 			escaped = false
 		case c == '\\':
 			escaped = true
 		case c == '|':
 			// End of one alternative; empty alternatives are skipped.
 			if cur.Len() > 0 {
 				matches = append(matches, cur.String())
 				cur.Reset()
 			}
 		case isRegexMetaCharacter(c):
 			return nil
 		default:
 			cur.WriteByte(c)
 		}
 	}
 	// A trailing backslash escapes nothing, so the body is not a plain set.
 	if escaped {
 		return nil
 	}
 	if cur.Len() > 0 {
 		matches = append(matches, cur.String())
 	}
 	return matches
 }
 // PostingsForMatchers assembles a single postings iterator against the index reader
 // based on the given matchers.
 func PostingsForMatchers(ix IndexReader, ms ...labels.Matcher) (index.Postings, error) {
@ -346,6 +403,14 @@ func postingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings, error
 		return ix.Postings(em.Name(), em.Value())
 	}
 	// Fast-path for set matching.
 	if em, ok := m.(*labels.RegexpMatcher); ok {
 		setMatches := findSetMatches(em.Value())
 		if len(setMatches) > 0 {
 			return postingsForSetMatcher(ix, em.Name(), setMatches)
 		}
 	}
 	tpls, err := ix.LabelValues(m.Name())
 	if err != nil {
 		return nil, err
@ -411,6 +476,18 @@ func inversePostingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings
 	return index.Merge(rit...), nil
 }
 // postingsForSetMatcher fetches the postings of label name for every value in
 // matches and returns them combined through index.Merge. The first lookup
 // error aborts the operation and is returned as is.
 func postingsForSetMatcher(ix IndexReader, name string, matches []string) (index.Postings, error) {
 	var its []index.Postings
 	for _, value := range matches {
 		p, err := ix.Postings(name, value)
 		if err != nil {
 			return nil, err
 		}
 		its = append(its, p)
 	}
 	return index.Merge(its...), nil
 }
 func mergeStrings(a, b []string) []string {
 	maxl := len(a)
 	if len(b) > len(a) {
--- a/querier_test.go
+++ b/querier_test.go
@ -1691,6 +1691,192 @@ func BenchmarkQuerySeek(b *testing.B) {
 	}
 }
 // BenchmarkSetMatcher measures Select with regex matchers of the set form
 // "^(?:a|b|c)$" over blocks filled with series from genSeries.
 // The cardinality field of a case only appears in the sub-benchmark name.
 // Refer to https://github.com/prometheus/prometheus/issues/2651.
 func BenchmarkSetMatcher(b *testing.B) {
 	cases := []struct {
 		numBlocks                   int
 		numSeries                   int
 		numSamplesPerSeriesPerBlock int
 		cardinality                 int
 		pattern                     string
 	}{
 		// The first three cases are to find out whether the set
 		// matcher is always faster than regex matcher.
 		{
 			numBlocks:                   1,
 			numSeries:                   1,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		{
 			numBlocks:                   1,
 			numSeries:                   15,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		{
 			numBlocks:                   1,
 			numSeries:                   15,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100,
 			pattern:                     "^(?:1|2|3)$",
 		},
 		// Big data sizes benchmarks.
 		{
 			numBlocks:                   20,
 			numSeries:                   1000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100,
 			pattern:                     "^(?:1|2|3)$",
 		},
 		{
 			numBlocks:                   20,
 			numSeries:                   1000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		// Increase cardinality.
 		{
 			numBlocks:                   1,
 			numSeries:                   100000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 100000,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		{
 			numBlocks:                   1,
 			numSeries:                   500000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 500000,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		{
 			numBlocks:                   10,
 			numSeries:                   500000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 500000,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 		{
 			numBlocks:                   1,
 			numSeries:                   1000000,
 			numSamplesPerSeriesPerBlock: 10,
 			cardinality:                 1000000,
 			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
 		},
 	}
 	for _, c := range cases {
 		// Each case runs in its own function so that its deferred cleanup
 		// (closing blocks and queriers, removing the temp dir) happens
 		// before the next case starts instead of piling up until the whole
 		// benchmark returns.
 		func() {
 			dir, err := ioutil.TempDir("", "bench_postings_for_matchers")
 			testutil.Ok(b, err)
 			defer func() {
 				testutil.Ok(b, os.RemoveAll(dir))
 			}()
 			var (
 				blocks          []*Block
 				prefilledLabels []map[string]string
 				generatedSeries []Series
 			)
 			for i := int64(0); i < int64(c.numBlocks); i++ {
 				mint := i * int64(c.numSamplesPerSeriesPerBlock)
 				maxt := mint + int64(c.numSamplesPerSeriesPerBlock) - 1
 				// Generate the label sets once and reuse them for every
 				// further block of this case.
 				if len(prefilledLabels) == 0 {
 					generatedSeries = genSeries(c.numSeries, 10, mint, maxt)
 					for _, s := range generatedSeries {
 						prefilledLabels = append(prefilledLabels, s.Labels().Map())
 					}
 				} else {
 					generatedSeries = populateSeries(prefilledLabels, mint, maxt)
 				}
 				block, err := OpenBlock(nil, createBlock(b, dir, generatedSeries), nil)
 				testutil.Ok(b, err)
 				blocks = append(blocks, block)
 				defer block.Close()
 			}
 			que := &querier{
 				blocks: make([]Querier, 0, len(blocks)),
 			}
 			for _, blk := range blocks {
 				q, err := NewBlockQuerier(blk, math.MinInt64, math.MaxInt64)
 				testutil.Ok(b, err)
 				que.blocks = append(que.blocks, q)
 			}
 			defer que.Close()
 			benchMsg := fmt.Sprintf("nSeries=%d,nBlocks=%d,cardinality=%d,pattern=\"%s\"", c.numSeries, c.numBlocks, c.cardinality, c.pattern)
 			b.Run(benchMsg, func(b *testing.B) {
 				b.ResetTimer()
 				b.ReportAllocs()
 				for n := 0; n < b.N; n++ {
 					// Match on defaultLabelName: it is the label genSeries
 					// fills with the series index, so the numeric patterns
 					// above select existing series.
 					_, err := que.Select(labels.NewMustRegexpMatcher(defaultLabelName, c.pattern))
 					testutil.Ok(b, err)
 				}
 			})
 		}()
 	}
 }
 // TestFindSetMatches checks which patterns findSetMatches accepts as plain
 // sets and which literal values it extracts from them.
 // Refer to https://github.com/prometheus/prometheus/issues/2651.
 func TestFindSetMatches(t *testing.T) {
 	cases := []struct {
 		pattern string
 		exp     []string
 	}{
 		// Simple sets.
 		{
 			pattern: "^(?:foo|bar|baz)$",
 			exp:     []string{"foo", "bar", "baz"},
 		},
 		// Simple sets containing escaped characters.
 		{
 			pattern: "^(?:fo\\.o|bar\\?|\\^baz)$",
 			exp:     []string{"fo.o", "bar?", "^baz"},
 		},
 		// Simple sets containing special characters without escaping.
 		{
 			pattern: "^(?:fo.o|bar?|^baz)$",
 			exp:     nil,
 		},
 		// Missing wrapper.
 		{
 			pattern: "foo|bar|baz",
 			exp:     nil,
 		},
 	}
 	for _, tc := range cases {
 		got := findSetMatches(tc.pattern)
 		switch {
 		case len(tc.exp) == 0:
 			// No set expected: any returned value is an error.
 			if len(got) != 0 {
 				t.Errorf("Evaluating %s, unexpected result %v", tc.pattern, got)
 			}
 		case len(got) != len(tc.exp):
 			t.Errorf("Evaluating %s, length of result not equal to exp", tc.pattern)
 		default:
 			for i, want := range tc.exp {
 				if got[i] != want {
 					t.Errorf("Evaluating %s, unexpected result %s", tc.pattern, got[i])
 				}
 			}
 		}
 	}
 }
 func TestPostingsForMatchers(t *testing.T) {
 	h, err := NewHead(nil, nil, nil, 1000)
 	testutil.Ok(t, err)
@ -1703,6 +1889,7 @@ func TestPostingsForMatchers(t *testing.T) {
 	app.Add(labels.FromStrings("n", "1", "i", "a"), 0, 0)
 	app.Add(labels.FromStrings("n", "1", "i", "b"), 0, 0)
 	app.Add(labels.FromStrings("n", "2"), 0, 0)
 	app.Add(labels.FromStrings("n", "2.5"), 0, 0)
 	testutil.Ok(t, app.Commit())
 	cases := []struct {
@ -1735,6 +1922,7 @@ func TestPostingsForMatchers(t *testing.T) {
 				labels.FromStrings("n", "1", "i", "a"),
 				labels.FromStrings("n", "1", "i", "b"),
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 		// Not equals.
@ -1742,6 +1930,7 @@ func TestPostingsForMatchers(t *testing.T) {
 			matchers: []labels.Matcher{labels.Not(labels.NewEqualMatcher("n", "1"))},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 		{
@ -1796,6 +1985,7 @@ func TestPostingsForMatchers(t *testing.T) {
 			exp: []labels.Labels{
 				labels.FromStrings("n", "1"),
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 		{
@ -1824,6 +2014,7 @@ func TestPostingsForMatchers(t *testing.T) {
 			matchers: []labels.Matcher{labels.Not(labels.NewMustRegexpMatcher("n", "^1$"))},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 		{
@ -1869,6 +2060,46 @@ func TestPostingsForMatchers(t *testing.T) {
 				labels.FromStrings("n", "1", "i", "a"),
 			},
 		},
 		// Set optimization for Regex.
 		// Refer to https://github.com/prometheus/prometheus/issues/2651.
 		{
 			matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:1|2)$")},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "1"),
 				labels.FromStrings("n", "1", "i", "a"),
 				labels.FromStrings("n", "1", "i", "b"),
 				labels.FromStrings("n", "2"),
 			},
 		},
 		{
 			matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:a|b)$")},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "1", "i", "a"),
 				labels.FromStrings("n", "1", "i", "b"),
 			},
 		},
 		{
 			matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:x1|2)$")},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "2"),
 			},
 		},
 		{
 			matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:2|2\\.5)$")},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 		// Empty value.
 		{
 			matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:c||d)$")},
 			exp: []labels.Labels{
 				labels.FromStrings("n", "1"),
 				labels.FromStrings("n", "2"),
 				labels.FromStrings("n", "2.5"),
 			},
 		},
 	}
 	ir, err := h.Index()