[BUGFIX] FastRegexpMatcher: do Unicode normalization as part of case-insensitive comparison (#14170)
* Converted string to standardized form * Added golang.org/x/text in Go dependencies * Added test cases for FastRegexMatcher * Added benchmark for toNormalizedLower Signed-off-by: RA <ranveeravhad777@gmail.com>
This commit is contained in:
parent
64c5cc5134
commit
39902ba694
2
go.mod
2
go.mod
|
@ -77,6 +77,7 @@ require (
|
||||||
golang.org/x/oauth2 v0.21.0
|
golang.org/x/oauth2 v0.21.0
|
||||||
golang.org/x/sync v0.7.0
|
golang.org/x/sync v0.7.0
|
||||||
golang.org/x/sys v0.21.0
|
golang.org/x/sys v0.21.0
|
||||||
|
golang.org/x/text v0.16.0
|
||||||
golang.org/x/time v0.5.0
|
golang.org/x/time v0.5.0
|
||||||
golang.org/x/tools v0.22.0
|
golang.org/x/tools v0.22.0
|
||||||
google.golang.org/api v0.183.0
|
google.golang.org/api v0.183.0
|
||||||
|
@ -188,7 +189,6 @@ require (
|
||||||
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a // indirect
|
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a // indirect
|
||||||
golang.org/x/mod v0.18.0 // indirect
|
golang.org/x/mod v0.18.0 // indirect
|
||||||
golang.org/x/term v0.21.0 // indirect
|
golang.org/x/term v0.21.0 // indirect
|
||||||
golang.org/x/text v0.16.0 // indirect
|
|
||||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect
|
||||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||||
|
|
|
@ -16,10 +16,12 @@ package labels
|
||||||
import (
|
import (
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
"unicode"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/grafana/regexp"
|
"github.com/grafana/regexp"
|
||||||
"github.com/grafana/regexp/syntax"
|
"github.com/grafana/regexp/syntax"
|
||||||
|
"golang.org/x/text/unicode/norm"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -766,7 +768,7 @@ type equalMultiStringMapMatcher struct {
|
||||||
|
|
||||||
func (m *equalMultiStringMapMatcher) add(s string) {
|
func (m *equalMultiStringMapMatcher) add(s string) {
|
||||||
if !m.caseSensitive {
|
if !m.caseSensitive {
|
||||||
s = strings.ToLower(s)
|
s = toNormalisedLower(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.values[s] = struct{}{}
|
m.values[s] = struct{}{}
|
||||||
|
@ -786,13 +788,51 @@ func (m *equalMultiStringMapMatcher) setMatches() []string {
|
||||||
|
|
||||||
func (m *equalMultiStringMapMatcher) Matches(s string) bool {
|
func (m *equalMultiStringMapMatcher) Matches(s string) bool {
|
||||||
if !m.caseSensitive {
|
if !m.caseSensitive {
|
||||||
s = strings.ToLower(s)
|
s = toNormalisedLower(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, ok := m.values[s]
|
_, ok := m.values[s]
|
||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// toNormalisedLower normalises the input string using "Unicode Normalization Form KD" (NFKD) and then converts
|
||||||
|
// it to lower case.
|
||||||
|
func toNormalisedLower(s string) string {
|
||||||
|
// Check if the string is all ASCII chars and convert any upper case character to lower case character.
|
||||||
|
isASCII := true
|
||||||
|
var (
|
||||||
|
b strings.Builder
|
||||||
|
pos int
|
||||||
|
)
|
||||||
|
b.Grow(len(s))
|
||||||
|
for i := 0; i < len(s); i++ {
|
||||||
|
c := s[i]
|
||||||
|
if isASCII && c >= utf8.RuneSelf {
|
||||||
|
isASCII = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if 'A' <= c && c <= 'Z' {
|
||||||
|
c += 'a' - 'A'
|
||||||
|
if pos < i {
|
||||||
|
b.WriteString(s[pos:i])
|
||||||
|
}
|
||||||
|
b.WriteByte(c)
|
||||||
|
pos = i + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if pos < len(s) {
|
||||||
|
b.WriteString(s[pos:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optimize for ASCII-only strings. In this case we don't have to do any normalization.
|
||||||
|
if isASCII {
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalise and convert to lower.
|
||||||
|
return strings.Map(unicode.ToLower, norm.NFKD.String(b.String()))
|
||||||
|
}
|
||||||
|
|
||||||
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
|
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
|
||||||
// (including an empty one) as far as it doesn't contain any newline character.
|
// (including an empty one) as far as it doesn't contain any newline character.
|
||||||
type anyStringWithoutNewlineMatcher struct{}
|
type anyStringWithoutNewlineMatcher struct{}
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue