alertmanager/matcher/parse/lexer.go

// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parse

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

const (
	// eof is the sentinel rune returned by the lexer's next() method once the
	// cursor has reached the end of the input. It is negative so that it can
	// never collide with a rune decoded from the input.
	eof rune = -1
)

// isReserved reports whether r cannot appear in unquoted text. A rune is
// reserved if it is one of the punctuation runes listed below (braces,
// operator runes, comma, backslash and the three quote runes) or if it is
// Unicode white space.
func isReserved(r rune) bool {
	const reserved = "{}!=~,\\\"'`"
	if strings.ContainsRune(reserved, r) {
		return true
	}
	return unicode.IsSpace(r)
}

// expectedError is returned when the next rune does not match what is expected.
//
// The embedded position supplies the byte offsets and columns used in the
// message. The lexer sets input to the complete text being scanned, and
// expected to the set of runes that would have been accepted at that point.
type expectedError struct {
	position
	input    string
	expected string
}

// Error implements the error interface. The message is prefixed with the
// start and end columns. If the recorded span reaches the end of the input
// the message reports an unexpected end of input, otherwise it quotes the
// offending span of the input. Both forms list the runes that were expected.
func (e expectedError) Error() string {
	atEnd := e.offsetEnd >= len(e.input)
	if !atEnd {
		span := e.input[e.offsetStart:e.offsetEnd]
		return fmt.Sprintf("%d:%d: %s: expected one of '%s'", e.columnStart, e.columnEnd, span, e.expected)
	}
	return fmt.Sprintf("%d:%d: unexpected end of input, expected one of '%s'", e.columnStart, e.columnEnd, e.expected)
}

// invalidInputError is returned when the next rune in the input does not match
// the grammar of Prometheus-like matchers.
//
// The embedded position identifies the offending span, and input is the
// complete text being scanned so the span can be quoted in the message.
type invalidInputError struct {
	position
	input string
}

// Error implements the error interface. The message contains the start and
// end columns followed by the span of the input that was rejected.
func (e invalidInputError) Error() string {
	rejected := e.input[e.offsetStart:e.offsetEnd]
	return fmt.Sprintf("%d:%d: %s: invalid input", e.columnStart, e.columnEnd, rejected)
}

// unterminatedError is returned when text in quotes does not have a closing quote.
//
// The embedded position identifies the span scanned so far, input is the
// complete text being scanned, and quote is the closing quote rune that was
// missing.
type unterminatedError struct {
	position
	input string
	quote rune
}

// Error implements the error interface. The message contains the start and
// end columns, the span of the input that was scanned, and the quote rune
// that was expected to terminate it.
func (e unterminatedError) Error() string {
	scanned := e.input[e.offsetStart:e.offsetEnd]
	return fmt.Sprintf("%d:%d: %s: missing end %c", e.columnStart, e.columnEnd, scanned, e.quote)
}

// lexer scans a sequence of tokens that match the grammar of Prometheus-like
// matchers. A token is emitted for each call to scan() which returns the
// next token in the input or an error if the input does not conform to the
// grammar. A token can be one of a number of kinds and corresponds to a
// subslice of the input. Once the input has been consumed successive calls to
// scan() return a tokenEOF token.
//
// input is the complete text being scanned. err holds the error from a failed
// scan; once it is set, every later call to scan() returns it without reading
// any more input. The offsets start and pos are measured in bytes, while
// column and cols are measured in runes.
type lexer struct {
	input  string
	err    error
	start  int // The byte offset in input at which the current token starts.
	pos    int // The byte offset of the cursor, just past the last rune read.
	width  int // The width in bytes of the last rune read, or zero at the end of the input.
	column int // The column (rune count) at which the current token starts.
	cols   int // The number of columns (runes) consumed from the input so far.
}

// scan returns the next token in the input, or an error if the input does
// not conform to the grammar. White space between tokens is skipped. Once an
// error has been returned it is remembered and returned again by every later
// call. When the input has been consumed the zero token is returned with no
// error (presumably the tokenEOF token; confirm against the token kinds).
func (l *lexer) scan() (token, error) {
	// Do not attempt to emit more tokens if the input is invalid.
	if l.err != nil {
		return token{}, l.err
	}
	for {
		r := l.next()
		if r == eof {
			break
		}
		// Single-rune tokens are emitted straight away. Operators and quoted
		// text are handed to a dedicated scanner, which re-reads the rune
		// that was rewound.
		switch r {
		case '{':
			return l.emit(tokenOpenBrace), l.err
		case '}':
			return l.emit(tokenCloseBrace), l.err
		case ',':
			return l.emit(tokenComma), l.err
		case '=', '!':
			l.rewind()
			tok, err := l.scanOperator()
			l.err = err
			return tok, l.err
		case '"':
			l.rewind()
			tok, err := l.scanQuoted()
			l.err = err
			return tok, l.err
		}
		// Anything that is not reserved starts a run of unquoted text.
		if !isReserved(r) {
			l.rewind()
			tok, err := l.scanUnquoted()
			l.err = err
			return tok, l.err
		}
		// White space separates tokens but is not part of any of them.
		if unicode.IsSpace(r) {
			l.skip()
			continue
		}
		// A reserved rune that cannot start a token.
		l.err = invalidInputError{
			position: l.position(),
			input:    l.input,
		}
		return token{}, l.err
	}
	return token{}, l.err
}

// scanOperator scans one of the operators "=", "=~", "!=" or "!~" and emits
// the matching token. It returns an expectedError if the input at the cursor
// does not form one of these operators.
func (l *lexer) scanOperator() (token, error) {
	switch {
	case l.accept("!"):
		// A '!' is only valid as the first rune of "!=" or "!~".
		switch {
		case l.accept("="):
			return l.emit(tokenNotEquals), nil
		case l.accept("~"):
			return l.emit(tokenNotMatches), nil
		}
		return token{}, expectedError{
			position: l.position(),
			input:    l.input,
			expected: "=~",
		}
	case l.accept("="):
		// An '=' stands on its own unless it is followed by a '~', in which
		// case it is a regex match.
		if l.accept("~") {
			return l.emit(tokenMatches), nil
		}
		return l.emit(tokenEquals), nil
	}
	// The input starts with neither '!' nor '='.
	return token{}, expectedError{
		position: l.position(),
		input:    l.input,
		expected: "!=",
	}
}

// scanQuoted scans text enclosed in double quotes and emits it, including
// both quotes, as a tokenQuoted token. A rune that follows a backslash is
// treated as escaped, so an escaped double quote does not end the text. It
// returns an expectedError if the input does not start with a double quote
// and an unterminatedError if the closing double quote is missing.
func (l *lexer) scanQuoted() (token, error) {
	if err := l.expect("\""); err != nil {
		return token{}, err
	}
	escaped := false
body:
	for {
		r := l.next()
		switch {
		case r == eof:
			break body
		case escaped:
			// The rune after a backslash is consumed without inspection.
			escaped = false
		case r == '\\':
			escaped = true
		case r == '"':
			// Leave the closing quote for the expect below.
			l.rewind()
			break body
		}
	}
	if l.expect("\"") != nil {
		return token{}, unterminatedError{
			position: l.position(),
			input:    l.input,
			quote:    '"',
		}
	}
	return l.emit(tokenQuoted), nil
}

// scanUnquoted consumes runes until it reaches a reserved rune or the end of
// the input, and emits what it consumed as a tokenUnquoted token. The
// reserved rune, if any, is left in the input. It never returns an error.
func (l *lexer) scanUnquoted() (token, error) {
	r := l.next()
	for r != eof && !isReserved(r) {
		r = l.next()
	}
	if r != eof {
		// Stopped on a reserved rune, which belongs to the next token.
		l.rewind()
	}
	return l.emit(tokenUnquoted), nil
}

// peek returns what scan() would return next, then puts the cursor back
// where it was so that the same token can be scanned again. The error state
// is deliberately not restored: an error found while peeking is kept and
// returned by the next call to scan().
func (l *lexer) peek() (token, error) {
	saved := *l
	defer func() {
		l.start, l.pos, l.width = saved.start, saved.pos, saved.width
		l.column, l.cols = saved.column, saved.cols
	}()
	return l.scan()
}

// position returns the span between the start of the current token and the
// cursor, as byte offsets into the input and as columns.
func (l *lexer) position() position {
	var p position
	p.offsetStart, p.offsetEnd = l.start, l.pos
	p.columnStart, p.columnEnd = l.column, l.cols
	return p
}

// accept consumes the next rune if it is one of the valid runes, otherwise
// the rune is put back. It returns true if the rune was consumed.
func (l *lexer) accept(valid string) bool {
	ok := strings.ContainsRune(valid, l.next())
	if !ok {
		l.rewind()
	}
	return ok
}

// expect consumes the next rune if it is one of the valid runes and returns
// nil. Otherwise the rune is put back and an expectedError listing the valid
// runes is returned.
func (l *lexer) expect(valid string) error {
	if r := l.next(); !strings.ContainsRune(valid, r) {
		l.rewind()
		return expectedError{
			position: l.position(),
			input:    l.input,
			expected: valid,
		}
	}
	return nil
}

// emit returns the input scanned since the start of the current token as a
// token of the given kind, then starts the next token at the cursor.
func (l *lexer) emit(kind tokenKind) token {
	tok := token{kind: kind, position: l.position()}
	tok.value = l.input[l.start:l.pos]
	// The next token begins where this one ends.
	l.start, l.column = l.pos, l.cols
	return tok
}

// next decodes and consumes the next rune in the input. It records the width
// of the rune in bytes so that it can be put back with rewind. At the end of
// the input it returns eof and records a width of zero.
func (l *lexer) next() rune {
	if l.pos < len(l.input) {
		r, size := utf8.DecodeRuneInString(l.input[l.pos:])
		l.width = size
		l.pos += size
		l.cols++
		return r
	}
	l.width = 0
	return eof
}

// rewind puts back the rune read by the last call to next. It must not be
// called more than once between consecutive calls to next.
func (l *lexer) rewind() {
	w := l.width
	l.pos -= w
	if w <= 0 {
		// The last call to next returned eof, so no column was consumed and
		// there is nothing to give back.
		return
	}
	l.cols--
}

// skip discards the input scanned between start and pos by starting the
// next token at the cursor.
func (l *lexer) skip() {
	l.start, l.column = l.pos, l.cols
}
Support UTF-8 label matchers: Add new parser (#3453) * Add label matchers parser This commit adds the new label matchers parser as proposed in #3353. Included is a number of compliance tests comparing the grammar supported in the new parser with the existing parser in pkg/labels. Signed-off-by: George Robinson <george.robinson@grafana.com> --------- Signed-off-by: George Robinson <george.robinson@grafana.com> 2023-09-05 10:32:58 +00:00			`// Copyright 2023 The Prometheus Authors`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`package parse`

			`import (`
			`"fmt"`
			`"strings"`
			`"unicode"`
			`"unicode/utf8"`
			`)`

			`const (`
			`eof rune = -1`
			`)`

			`func isReserved(r rune) bool {`
Support UTF-8 label matchers: Do not allow unquoted escape sequences (#3571) * Do not allow unquoted escape sequences This commit updates the matchers parser to reject unquoted openmetrics escape sequences. As an example, foo=bar\n will no longer parse, and must instead be written as foo="bar\n". This avoids an issue where the input is valid in both the matchers and classic parsers, but results in two different parsings. --------- Signed-off-by: George Robinson <george.robinson@grafana.com> 2023-10-30 13:56:54 +00:00			return unicode.IsSpace(r) \|\| strings.ContainsRune("{}!=~,\\\"'`", r)
Support UTF-8 label matchers: Add new parser (#3453) * Add label matchers parser This commit adds the new label matchers parser as proposed in #3353. Included is a number of compliance tests comparing the grammar supported in the new parser with the existing parser in pkg/labels. Signed-off-by: George Robinson <george.robinson@grafana.com> --------- Signed-off-by: George Robinson <george.robinson@grafana.com> 2023-09-05 10:32:58 +00:00			`}`

			`// expectedError is returned when the next rune does not match what is expected.`
			`type expectedError struct {`
			`position`
			`input string`
			`expected string`
			`}`

			`func (e expectedError) Error() string {`
			`if e.offsetEnd >= len(e.input) {`
			`return fmt.Sprintf("%d:%d: unexpected end of input, expected one of '%s'",`
			`e.columnStart,`
			`e.columnEnd,`
			`e.expected,`
			`)`
			`}`
			`return fmt.Sprintf("%d:%d: %s: expected one of '%s'",`
			`e.columnStart,`
			`e.columnEnd,`
			`e.input[e.offsetStart:e.offsetEnd],`
			`e.expected,`
			`)`
			`}`

			`// invalidInputError is returned when the next rune in the input does not match`
			`// the grammar of Prometheus-like matchers.`
			`type invalidInputError struct {`
			`position`
			`input string`
			`}`

			`func (e invalidInputError) Error() string {`
			`return fmt.Sprintf("%d:%d: %s: invalid input",`
			`e.columnStart,`
			`e.columnEnd,`
			`e.input[e.offsetStart:e.offsetEnd],`
			`)`
			`}`

			`// unterminatedError is returned when text in quotes does not have a closing quote.`
			`type unterminatedError struct {`
			`position`
			`input string`
			`quote rune`
			`}`

			`func (e unterminatedError) Error() string {`
			`return fmt.Sprintf("%d:%d: %s: missing end %c",`
			`e.columnStart,`
			`e.columnEnd,`
			`e.input[e.offsetStart:e.offsetEnd],`
			`e.quote,`
			`)`
			`}`

			`// lexer scans a sequence of tokens that match the grammar of Prometheus-like`
			`// matchers. A token is emitted for each call to scan() which returns the`
			`// next token in the input or an error if the input does not conform to the`
			`// grammar. A token can be one of a number of kinds and corresponds to a`
			`// subslice of the input. Once the input has been consumed successive calls to`
			`// scan() return a tokenEOF token.`
			`type lexer struct {`
			`input string`
			`err error`
			`start int // The offset of the current token.`
			`pos int // The position of the cursor in the input.`
			`width int // The width of the last rune.`
			`column int // The column offset of the current token.`
			`cols int // The number of columns (runes) decoded from the input.`
			`}`

			`// Scans the next token in the input or an error if the input does not`
			`// conform to the grammar. Once the input has been consumed successive`
			`// calls scan() return a tokenEOF token.`
			`func (l *lexer) scan() (token, error) {`
			`t := token{}`
			`// Do not attempt to emit more tokens if the input is invalid.`
			`if l.err != nil {`
			`return t, l.err`
			`}`
			`// Iterate over each rune in the input and either emit a token or an error.`
			`for r := l.next(); r != eof; r = l.next() {`
			`switch {`
			`case r == '{':`
			`t = l.emit(tokenOpenBrace)`
			`return t, l.err`
			`case r == '}':`
			`t = l.emit(tokenCloseBrace)`
			`return t, l.err`
			`case r == ',':`
			`t = l.emit(tokenComma)`
			`return t, l.err`
			`case r == '=' \|\| r == '!':`
			`l.rewind()`
			`t, l.err = l.scanOperator()`
			`return t, l.err`
			`case r == '"':`
			`l.rewind()`
			`t, l.err = l.scanQuoted()`
			`return t, l.err`
			`case !isReserved(r):`
			`l.rewind()`
			`t, l.err = l.scanUnquoted()`
			`return t, l.err`
			`case unicode.IsSpace(r):`
			`l.skip()`
			`default:`
			`l.err = invalidInputError{`
			`position: l.position(),`
			`input: l.input,`
			`}`
			`return t, l.err`
			`}`
			`}`
			`return t, l.err`
			`}`

			`func (l *lexer) scanOperator() (token, error) {`
			`// If the first rune is an '!' then it must be followed with either an`
			`// '=' or '~' to not match a string or regex.`
			`if l.accept("!") {`
			`if l.accept("=") {`
			`return l.emit(tokenNotEquals), nil`
			`}`
			`if l.accept("~") {`
			`return l.emit(tokenNotMatches), nil`
			`}`
			`return token{}, expectedError{`
			`position: l.position(),`
			`input: l.input,`
			`expected: "=~",`
			`}`
			`}`
			`// If the first rune is an '=' then it can be followed with an optional`
			`// '~' to match a regex.`
			`if l.accept("=") {`
			`if l.accept("~") {`
			`return l.emit(tokenMatches), nil`
			`}`
			`return l.emit(tokenEquals), nil`
			`}`
			`return token{}, expectedError{`
			`position: l.position(),`
			`input: l.input,`
			`expected: "!=",`
			`}`
			`}`

			`func (l *lexer) scanQuoted() (token, error) {`
			`if err := l.expect("\""); err != nil {`
			`return token{}, err`
			`}`
			`var isEscaped bool`
			`for r := l.next(); r != eof; r = l.next() {`
			`if isEscaped {`
			`isEscaped = false`
			`} else if r == '\\' {`
			`isEscaped = true`
			`} else if r == '"' {`
			`l.rewind()`
			`break`
			`}`
			`}`
			`if err := l.expect("\""); err != nil {`
			`return token{}, unterminatedError{`
			`position: l.position(),`
			`input: l.input,`
			`quote: '"',`
			`}`
			`}`
			`return l.emit(tokenQuoted), nil`
			`}`

			`func (l *lexer) scanUnquoted() (token, error) {`
			`for r := l.next(); r != eof; r = l.next() {`
			`if isReserved(r) {`
			`l.rewind()`
			`break`
			`}`
			`}`
			`return l.emit(tokenUnquoted), nil`
			`}`

			`// peek the next token in the input or an error if the input does not`
			`// conform to the grammar. Once the input has been consumed successive`
			`// calls peek() return a tokenEOF token.`
			`func (l *lexer) peek() (token, error) {`
			`start := l.start`
			`pos := l.pos`
			`width := l.width`
			`column := l.column`
			`cols := l.cols`
			`// Do not reset l.err because we can return it on the next call to scan().`
			`defer func() {`
			`l.start = start`
			`l.pos = pos`
			`l.width = width`
			`l.column = column`
			`l.cols = cols`
			`}()`
			`return l.scan()`
			`}`

			`// position returns the position of the last emitted token.`
			`func (l *lexer) position() position {`
			`return position{`
			`offsetStart: l.start,`
			`offsetEnd: l.pos,`
			`columnStart: l.column,`
			`columnEnd: l.cols,`
			`}`
			`}`

			`// accept consumes the next if its one of the valid runes.`
			`// It returns true if the next rune was accepted, otherwise false.`
			`func (l *lexer) accept(valid string) bool {`
			`if strings.ContainsRune(valid, l.next()) {`
			`return true`
			`}`
			`l.rewind()`
			`return false`
			`}`

			`// expect consumes the next rune if its one of the valid runes.`
Add godot linter (#3613) * Add godot linter Signed-off-by: George Robinson <george.robinson@grafana.com> * Remove extra line from LICENSE Signed-off-by: George Robinson <george.robinson@grafana.com> --------- Signed-off-by: George Robinson <george.robinson@grafana.com> 2024-03-21 11:26:46 +00:00			`// It returns nil if the next rune is valid, otherwise an expectedError`
Support UTF-8 label matchers: Add new parser (#3453) * Add label matchers parser This commit adds the new label matchers parser as proposed in #3353. Included is a number of compliance tests comparing the grammar supported in the new parser with the existing parser in pkg/labels. Signed-off-by: George Robinson <george.robinson@grafana.com> --------- Signed-off-by: George Robinson <george.robinson@grafana.com> 2023-09-05 10:32:58 +00:00			`// error.`
			`func (l *lexer) expect(valid string) error {`
			`if strings.ContainsRune(valid, l.next()) {`
			`return nil`
			`}`
			`l.rewind()`
			`return expectedError{`
			`position: l.position(),`
			`input: l.input,`
			`expected: valid,`
			`}`
			`}`

			`// emits returns the scanned input as a token.`
			`func (l *lexer) emit(kind tokenKind) token {`
			`t := token{`
			`kind: kind,`
			`value: l.input[l.start:l.pos],`
			`position: l.position(),`
			`}`
			`l.start = l.pos`
			`l.column = l.cols`
			`return t`
			`}`

			`// next returns the next rune in the input or eof.`
			`func (l *lexer) next() rune {`
			`if l.pos >= len(l.input) {`
			`l.width = 0`
			`return eof`
			`}`
			`r, width := utf8.DecodeRuneInString(l.input[l.pos:])`
			`l.width = width`
			`l.pos += width`
			`l.cols++`
			`return r`
			`}`

			`// rewind the last rune in the input. It should not be called more than once`
			`// between consecutive calls of next.`
			`func (l *lexer) rewind() {`
			`l.pos -= l.width`
			`// When the next rune in the input is eof the width is zero. This check`
			`// prevents cols from being decremented when the next rune being accepted`
			`// is instead eof.`
			`if l.width > 0 {`
			`l.cols--`
			`}`
			`}`

			`// skip the scanned input between start and pos.`
			`func (l *lexer) skip() {`
			`l.start = l.pos`
			`l.column = l.cols`
			`}`