diff --git a/pkg/textparse/lex.l b/pkg/textparse/lex.l index cc711e410..a6b728c71 100644 --- a/pkg/textparse/lex.l +++ b/pkg/textparse/lex.l @@ -15,48 +15,38 @@ package textparse import ( - "fmt" - "math" - "strconv" - "unicode/utf8" - - "github.com/prometheus/prometheus/pkg/value" + "fmt" ) const ( - lstateInit = iota - lstateName - lstateValue - lstateTimestamp - lstateLabels - lstateLName - lstateLEq - lstateLValue - lstateLValueIn + sInit = iota + sComment + sMeta1 + sMeta2 + sLabels + sLValue + sValue + sTimestamp ) // Lex is called by the parser generated by "go tool yacc" to obtain each // token. The method is opened before the matching rules block and closed at // the end of the file. -func (l *lexer) Lex() int { - l.state = lstateInit - +func (l *lexer) Lex() token { if l.i >= len(l.b) { - return eof + return tEOF } c := l.b[l.i] + l.start = l.i - l.ts = nil - l.mstart = l.nextMstart - l.offsets = l.offsets[:0] %} D [0-9] L [a-zA-Z_] M [a-zA-Z_:] +C [^\n] -%x lstateName lstateValue lstateTimestamp lstateLabels lstateLName lstateLEq lstateLValue lstateLValueIn - +%x sComment sMeta1 sMeta2 sLabels sLValue sValue sTimestamp %yyc c %yyn c = l.next() @@ -65,65 +55,46 @@ M [a-zA-Z_:] %% -\0 return eof -#[^\r\n]*\n l.mstart = l.i -[\r\n \t]+ l.mstart = l.i +\0 return tEOF +\n l.state = sInit; return tLinebreak +<*>[ \t]+ return tWhitespace -{M}({M}|{D})* l.state = lstateName - l.offsets = append(l.offsets, l.i) - l.mend = l.i +#[ \t]+ l.state = sComment +# return l.consumeComment() +HELP[\t ]+ l.state = sMeta1; return tHelp +TYPE[\t ]+ l.state = sMeta1; return tType +{M}({M}|{D})* l.state = sMeta2; return tMName +{C}+ l.state = sInit; return tText -([ \t]*)\{ l.state = lstateLabels - -[ \t]+ l.state = lstateValue - l.vstart = l.i - - -[ \t]+ -,?\} l.state = lstateValue - l.mend = l.i -(,?[ \t]*) l.state = lstateLName - l.offsets = append(l.offsets, l.i) - -{L}({L}|{D})* l.state = lstateLEq - l.offsets = append(l.offsets, l.i) - -[ \t]*= l.state = lstateLValue - -[ \t]+ -\" l.state = lstateLValueIn - l.offsets = append(l.offsets, l.i) -(\\.|[^\\"])*\" l.state = lstateLabels - if !utf8.Valid(l.b[l.offsets[len(l.offsets)-1]:l.i-1]) { - l.err = fmt.Errorf("invalid UTF-8 label value") - return -1 - } - l.offsets = append(l.offsets, l.i-1) - -[ \t]+ l.vstart = l.i -(NaN) l.val = math.Float64frombits(value.NormalNaN) - l.state = lstateTimestamp - -[^\n \t\r]+ // We don't parse strictly correct floats as the conversion - // repeats the effort anyway. - l.val, l.err = strconv.ParseFloat(yoloString(l.b[l.vstart:l.i]), 64) - if l.err != nil { - return -1 - } - l.state = lstateTimestamp - -[ \t]+ l.tstart = l.i -{D}+ ts, err := strconv.ParseInt(yoloString(l.b[l.tstart:l.i]), 10, 64) - if err != nil { - l.err = err - return -1 - } - l.ts = &ts -[\r\n]+ l.nextMstart = l.i - return 1 -\0 return 1 +{M}({M}|{D})* l.state = sValue; return tMName +\{ l.state = sLabels; return tBraceOpen +{L}({L}|{D})* return tLName +\} l.state = sValue; return tBraceClose += l.state = sLValue; return tEqual +, return tComma +\"(\\.|[^\\"])*\" l.state = sLabels; return tLValue +[^{ \t\n]+ l.state = sTimestamp; return tValue +{D}+ return tTimestamp +\n l.state = sInit; return tLinebreak %% - l.err = fmt.Errorf("no token found") - return -1 + // Workaround to gobble up comments that started with a HELP or TYPE + // prefix. We just consume all characters until we reach a newline. + // This saves us from adding disproportionate complexity to the parser. + if l.state == sComment { + return l.consumeComment() + } + return tInvalid +} + +func (l *lexer) consumeComment() token { + for c := l.cur(); ; c = l.next() { + switch c { + case 0: + return tEOF + case '\n': + l.state = sInit + return tComment + } + } } diff --git a/pkg/textparse/lex.l.go b/pkg/textparse/lex.l.go index 0a45d070f..33a6a9fce 100644 --- a/pkg/textparse/lex.l.go +++ b/pkg/textparse/lex.l.go @@ -17,39 +17,28 @@ package textparse import ( "fmt" - "math" - "strconv" - "unicode/utf8" - - "github.com/prometheus/prometheus/pkg/value" ) const ( - lstateInit = iota - lstateName - lstateValue - lstateTimestamp - lstateLabels - lstateLName - lstateLEq - lstateLValue - lstateLValueIn + sInit = iota + sComment + sMeta1 + sMeta2 + sLabels + sLValue + sValue + sTimestamp ) // Lex is called by the parser generated by "go tool yacc" to obtain each // token. The method is opened before the matching rules block and closed at // the end of the file. -func (l *lexer) Lex() int { - l.state = lstateInit - +func (l *lexer) Lex() token { if l.i >= len(l.b) { - return eof + return tEOF } c := l.b[l.i] - - l.ts = nil - l.mstart = l.nextMstart - l.offsets = l.offsets[:0] + l.start = l.i yystate0: @@ -58,22 +47,20 @@ yystate0: panic(fmt.Errorf(`invalid start condition %d`, yyt)) case 0: // start condition: INITIAL goto yystart1 - case 1: // start condition: lstateName - goto yystart7 - case 2: // start condition: lstateValue - goto yystart10 - case 3: // start condition: lstateTimestamp - goto yystart16 - case 4: // start condition: lstateLabels + case 1: // start condition: sComment + goto yystart8 + case 2: // start condition: sMeta1 + goto yystart19 + case 3: // start condition: sMeta2 goto yystart21 - case 5: // start condition: lstateLName - goto yystart26 - case 6: // start condition: lstateLEq - goto yystart28 - case 7: // start condition: lstateLValue - goto yystart31 - case 8: // start condition: lstateLValueIn - goto yystart34 + case 4: // start condition: sLabels + goto yystart24 + case 5: // start condition: sLValue + goto yystart29 + case 6: // start condition: sValue + goto yystart33 + case 7: // start condition: sTimestamp + goto yystart36 } goto yystate0 // silence unused label error @@ -85,10 +72,12 @@ yystart1: default: goto yyabort case c == '#': - goto yystate4 + goto yystate5 case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': - goto yystate6 - case c == '\t' || c == '\n' || c == '\r' || c == ' ': + goto yystate7 + case c == '\n': + goto yystate4 + case c == '\t' || c == ' ': goto yystate3 case c == '\x00': goto yystate2 @@ -103,74 +92,71 @@ yystate3: switch { default: goto yyrule3 - case c == '\t' || c == '\n' || c == '\r' || c == ' ': + case c == '\t' || c == ' ': goto yystate3 } yystate4: c = l.next() - switch { - default: - goto yyabort - case c == '\n': - goto yystate5 - case c >= '\x01' && c <= '\t' || c == '\v' || c == '\f' || c >= '\x0e' && c <= 'ÿ': - goto yystate4 - } + goto yyrule2 yystate5: c = l.next() - goto yyrule2 + switch { + default: + goto yyrule5 + case c == '\t' || c == ' ': + goto yystate6 + } yystate6: c = l.next() switch { default: goto yyrule4 - case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': + case c == '\t' || c == ' ': goto yystate6 } - goto yystate7 // silence unused label error yystate7: c = l.next() -yystart7: + switch { + default: + goto yyrule10 + case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': + goto yystate7 + } + + goto yystate8 // silence unused label error +yystate8: + c = l.next() +yystart8: switch { default: goto yyabort - case c == '\t' || c == ' ': - goto yystate8 - case c == '{': + case c == 'H': goto yystate9 - } - -yystate8: - c = l.next() - switch { - default: - goto yyrule6 + case c == 'T': + goto yystate14 case c == '\t' || c == ' ': - goto yystate8 - case c == '{': - goto yystate9 + goto yystate3 } yystate9: c = l.next() - goto yyrule5 - - goto yystate10 // silence unused label error -yystate10: - c = l.next() -yystart10: switch { default: goto yyabort - case c == 'N': - goto yystate13 - case c == '\t' || c == ' ': - goto yystate12 - case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'M' || c >= 'O' && c <= 'ÿ': + case c == 'E': + goto yystate10 + } + +yystate10: + c = l.next() + switch { + default: + goto yyabort + case c == 'L': goto yystate11 } @@ -178,96 +164,93 @@ yystate11: c = l.next() switch { default: - goto yyrule17 - case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'ÿ': - goto yystate11 + goto yyabort + case c == 'P': + goto yystate12 } yystate12: c = l.next() switch { default: - goto yyrule15 + goto yyabort case c == '\t' || c == ' ': - goto yystate12 + goto yystate13 } yystate13: c = l.next() switch { default: - goto yyrule17 - case c == 'a': - goto yystate14 - case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= '`' || c >= 'b' && c <= 'ÿ': - goto yystate11 + goto yyrule6 + case c == '\t' || c == ' ': + goto yystate13 } yystate14: c = l.next() switch { default: - goto yyrule17 - case c == 'N': + goto yyabort + case c == 'Y': goto yystate15 - case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'M' || c >= 'O' && c <= 'ÿ': - goto yystate11 } yystate15: c = l.next() switch { default: - goto yyrule16 - case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'ÿ': - goto yystate11 + goto yyabort + case c == 'P': + goto yystate16 } - goto yystate16 // silence unused label error yystate16: c = l.next() -yystart16: switch { default: goto yyabort - case c == '\n' || c == '\r': - goto yystate19 - case c == '\t' || c == ' ': - goto yystate18 - case c == '\x00': + case c == 'E': goto yystate17 - case c >= '0' && c <= '9': - goto yystate20 } yystate17: c = l.next() - goto yyrule21 + switch { + default: + goto yyabort + case c == '\t' || c == ' ': + goto yystate18 + } yystate18: c = l.next() switch { default: - goto yyrule18 + goto yyrule7 case c == '\t' || c == ' ': goto yystate18 } + goto yystate19 // silence unused label error yystate19: c = l.next() +yystart19: switch { default: - goto yyrule20 - case c == '\n' || c == '\r': - goto yystate19 + goto yyabort + case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': + goto yystate20 + case c == '\t' || c == ' ': + goto yystate3 } yystate20: c = l.next() switch { default: - goto yyrule19 - case c >= '0' && c <= '9': + goto yyrule8 + case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': goto yystate20 } @@ -277,21 +260,19 @@ yystate21: yystart21: switch { default: - goto yyrule9 - case c == ',': - goto yystate23 + goto yyabort case c == '\t' || c == ' ': + goto yystate23 + case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ': goto yystate22 - case c == '}': - goto yystate25 } yystate22: c = l.next() switch { default: - goto yyrule7 - case c == '\t' || c == ' ': + goto yyrule9 + case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ': goto yystate22 } @@ -299,269 +280,271 @@ yystate23: c = l.next() switch { default: - goto yyrule9 + goto yyrule3 case c == '\t' || c == ' ': - goto yystate24 - case c == '}': - goto yystate25 + goto yystate23 + case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ': + goto yystate22 } + goto yystate24 // silence unused label error yystate24: c = l.next() +yystart24: switch { default: - goto yyrule9 + goto yyabort + case c == ',': + goto yystate25 + case c == '=': + goto yystate26 case c == '\t' || c == ' ': - goto yystate24 + goto yystate3 + case c == '}': + goto yystate28 + case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': + goto yystate27 } yystate25: c = l.next() - goto yyrule8 + goto yyrule15 - goto yystate26 // silence unused label error yystate26: c = l.next() -yystart26: - switch { - default: - goto yyabort - case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': - goto yystate27 - } + goto yyrule14 yystate27: c = l.next() switch { default: - goto yyrule10 + goto yyrule12 case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z': goto yystate27 } - goto yystate28 // silence unused label error yystate28: c = l.next() -yystart28: - switch { - default: - goto yyabort - case c == '=': - goto yystate30 - case c == '\t' || c == ' ': - goto yystate29 - } + goto yyrule13 + goto yystate29 // silence unused label error yystate29: c = l.next() - switch { - default: - goto yyabort - case c == '=': - goto yystate30 - case c == '\t' || c == ' ': - goto yystate29 - } - -yystate30: - c = l.next() - goto yyrule11 - - goto yystate31 // silence unused label error -yystate31: - c = l.next() -yystart31: +yystart29: switch { default: goto yyabort case c == '"': - goto yystate33 + goto yystate30 case c == '\t' || c == ' ': - goto yystate32 + goto yystate3 } +yystate30: + c = l.next() + switch { + default: + goto yyabort + case c == '"': + goto yystate31 + case c == '\\': + goto yystate32 + case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ': + goto yystate30 + } + +yystate31: + c = l.next() + goto yyrule16 + yystate32: c = l.next() switch { default: - goto yyrule12 - case c == '\t' || c == ' ': - goto yystate32 + goto yyabort + case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ': + goto yystate30 } + goto yystate33 // silence unused label error yystate33: c = l.next() - goto yyrule13 - - goto yystate34 // silence unused label error -yystate34: - c = l.next() -yystart34: +yystart33: switch { default: goto yyabort - case c == '"': - goto yystate36 - case c == '\\': - goto yystate37 - case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ': + case c == '\t' || c == ' ': + goto yystate3 + case c == '{': goto yystate35 + case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate34 + } + +yystate34: + c = l.next() + switch { + default: + goto yyrule17 + case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate34 } yystate35: c = l.next() + goto yyrule11 + + goto yystate36 // silence unused label error +yystate36: + c = l.next() +yystart36: switch { default: goto yyabort - case c == '"': - goto yystate36 - case c == '\\': + case c == '\n': goto yystate37 - case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ': - goto yystate35 + case c == '\t' || c == ' ': + goto yystate3 + case c >= '0' && c <= '9': + goto yystate38 } -yystate36: - c = l.next() - goto yyrule14 - yystate37: + c = l.next() + goto yyrule19 + +yystate38: c = l.next() switch { default: - goto yyabort - case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ': - goto yystate35 + goto yyrule18 + case c >= '0' && c <= '9': + goto yystate38 } yyrule1: // \0 { - return eof + return tEOF } -yyrule2: // #[^\r\n]*\n +yyrule2: // \n { - l.mstart = l.i + l.state = sInit + return tLinebreak goto yystate0 } -yyrule3: // [\r\n \t]+ +yyrule3: // [ \t]+ { - l.mstart = l.i + return tWhitespace + } +yyrule4: // #[ \t]+ + { + l.state = sComment goto yystate0 } -yyrule4: // {M}({M}|{D})* +yyrule5: // # { - l.state = lstateName - l.offsets = append(l.offsets, l.i) - l.mend = l.i + return l.consumeComment() + } +yyrule6: // HELP[\t ]+ + { + l.state = sMeta1 + return tHelp goto yystate0 } -yyrule5: // ([ \t]*)\{ +yyrule7: // TYPE[\t ]+ { - l.state = lstateLabels + l.state = sMeta1 + return tType goto yystate0 } -yyrule6: // [ \t]+ +yyrule8: // {M}({M}|{D})* { - l.state = lstateValue - l.vstart = l.i + l.state = sMeta2 + return tMName goto yystate0 } -yyrule7: // [ \t]+ - - goto yystate0 -yyrule8: // ,?\} +yyrule9: // {C}+ { - l.state = lstateValue - l.mend = l.i + l.state = sInit + return tText goto yystate0 } -yyrule9: // (,?[ \t]*) +yyrule10: // {M}({M}|{D})* { - l.state = lstateLName - l.offsets = append(l.offsets, l.i) + l.state = sValue + return tMName goto yystate0 } -yyrule10: // {L}({L}|{D})* +yyrule11: // \{ { - l.state = lstateLEq - l.offsets = append(l.offsets, l.i) + l.state = sLabels + return tBraceOpen goto yystate0 } -yyrule11: // [ \t]*= +yyrule12: // {L}({L}|{D})* { - l.state = lstateLValue + return tLName + } +yyrule13: // \} + { + l.state = sValue + return tBraceClose goto yystate0 } -yyrule12: // [ \t]+ - - goto yystate0 -yyrule13: // \" +yyrule14: // = { - l.state = lstateLValueIn - l.offsets = append(l.offsets, l.i) + l.state = sLValue + return tEqual goto yystate0 } -yyrule14: // (\\.|[^\\"])*\" +yyrule15: // , { - l.state = lstateLabels - if !utf8.Valid(l.b[l.offsets[len(l.offsets)-1] : l.i-1]) { - l.err = fmt.Errorf("invalid UTF-8 label value") - return -1 - } - l.offsets = append(l.offsets, l.i-1) + return tComma + } +yyrule16: // \"(\\.|[^\\"])*\" + { + l.state = sLabels + return tLValue goto yystate0 } -yyrule15: // [ \t]+ +yyrule17: // [^{ \t\n]+ { - l.vstart = l.i + l.state = sTimestamp + return tValue goto yystate0 } -yyrule16: // (NaN) +yyrule18: // {D}+ { - l.val = math.Float64frombits(value.NormalNaN) - l.state = lstateTimestamp + return tTimestamp + } +yyrule19: // \n + { + l.state = sInit + return tLinebreak goto yystate0 } -yyrule17: // [^\n \t\r]+ - { - // We don't parse strictly correct floats as the conversion - // repeats the effort anyway. - l.val, l.err = strconv.ParseFloat(yoloString(l.b[l.vstart:l.i]), 64) - if l.err != nil { - return -1 - } - l.state = lstateTimestamp - goto yystate0 - } -yyrule18: // [ \t]+ - { - l.tstart = l.i - goto yystate0 - } -yyrule19: // {D}+ - { - ts, err := strconv.ParseInt(yoloString(l.b[l.tstart:l.i]), 10, 64) - if err != nil { - l.err = err - return -1 - } - l.ts = &ts - goto yystate0 - } -yyrule20: // [\r\n]+ - { - l.nextMstart = l.i - return 1 - } -yyrule21: // \0 - { - return 1 - } panic("unreachable") goto yyabort // silence unused label error yyabort: // no lexem recognized - l.err = fmt.Errorf("no token found") - return -1 + // Workaround to gobble up comments that started with a HELP or TYPE + // prefix. We just consume all characters until we reach a newline. + // This saves us from adding disproportionate complexity to the parser. + if l.state == sComment { + return l.consumeComment() + } + return tInvalid +} + +func (l *lexer) consumeComment() token { + for c := l.cur(); ; c = l.next() { + switch c { + case 0: + return tEOF + case '\n': + l.state = sInit + return tComment + } + } } diff --git a/pkg/textparse/parse.go b/pkg/textparse/parse.go index 4783bc580..ccbca0cae 100644 --- a/pkg/textparse/parse.go +++ b/pkg/textparse/parse.go @@ -19,45 +19,115 @@ package textparse import ( "errors" + "fmt" "io" + "math" "sort" + "strconv" "strings" + "unicode/utf8" "unsafe" + "github.com/prometheus/prometheus/pkg/value" + "github.com/prometheus/prometheus/pkg/labels" ) type lexer struct { - b []byte - i int - vstart int - tstart int - - err error - val float64 - ts *int64 - offsets []int - mstart, mend int - nextMstart int - + b []byte + i int + start int + err error state int } -const eof = 0 +type token int +const ( + tInvalid token = -1 + tEOF token = 0 + tLinebreak token = iota + tWhitespace + tHelp + tType + tText + tComment + tBlank + tMName + tBraceOpen + tBraceClose + tLName + tLValue + tComma + tEqual + tTimestamp + tValue +) + +func (t token) String() string { + switch t { + case tInvalid: + return "INVALID" + case tEOF: + return "EOF" + case tLinebreak: + return "LINEBREAK" + case tWhitespace: + return "WHITESPACE" + case tHelp: + return "HELP" + case tType: + return "TYPE" + case tText: + return "TEXT" + case tComment: + return "COMMENT" + case tBlank: + return "BLANK" + case tMName: + return "MNAME" + case tBraceOpen: + return "BOPEN" + case tBraceClose: + return "BCLOSE" + case tLName: + return "LNAME" + case tLValue: + return "LVALUE" + case tEqual: + return "EQUAL" + case tComma: + return "COMMA" + case tTimestamp: + return "TIMESTAMP" + case tValue: + return "VALUE" + } + return fmt.Sprintf("", t) +} + +// buf returns the buffer of the current token. +func (l *lexer) buf() []byte { + return l.b[l.start:l.i] +} + +func (l *lexer) cur() byte { + return l.b[l.i] +} + +// next advances the lexer to the next character. func (l *lexer) next() byte { l.i++ if l.i >= len(l.b) { l.err = io.EOF - return eof + return byte(tEOF) } - c := l.b[l.i] - - // Consume null byte when encountered in label-value. - if c == eof && (l.state == lstateLValueIn || l.state == lstateLValue) { - return l.next() + // Lex struggles with null bytes. If we are in a label value, where + // they are allowed, consume them here immediately. + for l.b[l.i] == 0 && l.state == sLValue { + l.i++ } - return c + return l.b[l.i] } func (l *lexer) Error(es string) { @@ -67,43 +137,50 @@ func (l *lexer) Error(es string) { // Parser parses samples from a byte slice of samples in the official // Prometheus text exposition format. type Parser struct { - l *lexer - err error - val float64 + l *lexer + series []byte + text []byte + mtype MetricType + val float64 + ts int64 + hasTS bool + start int + offsets []int } // New returns a new parser of the byte slice. func New(b []byte) *Parser { - return &Parser{l: &lexer{b: b}} + return &Parser{l: &lexer{b: append(b, '\n')}} } -// Next advances the parser to the next sample. It returns false if no -// more samples were read or an error occurred. -func (p *Parser) Next() bool { - switch p.l.Lex() { - case -1, eof: - return false - case 1: - return true - } - panic("unexpected") -} - -// At returns the bytes of the metric, the timestamp if set, and the value +// Series returns the bytes of the series, the timestamp if set, and the value // of the current sample. -func (p *Parser) At() ([]byte, *int64, float64) { - return p.l.b[p.l.mstart:p.l.mend], p.l.ts, p.l.val +func (p *Parser) Series() ([]byte, *int64, float64) { + if p.hasTS { + return p.series, &p.ts, p.val + } + return p.series, nil, p.val } -// Err returns the current error. -func (p *Parser) Err() error { - if p.err != nil { - return p.err - } - if p.l.err == io.EOF { - return nil - } - return p.l.err +// Help returns the metric name and help text in the current entry. +// Must only be called after Next returned a help entry. +// The returned byte slices become invalid after the next call to Next. +func (p *Parser) Help() ([]byte, []byte) { + return p.l.b[p.offsets[0]:p.offsets[1]], p.text +} + +// Type returns the metric name and type in the current entry. +// Must only be called after Next returned a type entry. +// The returned byte slices become invalid after the next call to Next. +func (p *Parser) Type() ([]byte, MetricType) { + return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype +} + +// Comment returns the text of the current comment. +// Must only be called after Next returned a comment entry. +// The returned byte slice becomes invalid after the next call to Next. +func (p *Parser) Comment() []byte { + return p.text } // Metric writes the labels of the current sample into the passed labels. @@ -111,33 +188,208 @@ func (p *Parser) Err() error { func (p *Parser) Metric(l *labels.Labels) string { // Allocate the full immutable string immediately, so we just // have to create references on it below. - s := string(p.l.b[p.l.mstart:p.l.mend]) + s := string(p.series) *l = append(*l, labels.Label{ Name: labels.MetricName, - Value: s[:p.l.offsets[0]-p.l.mstart], + Value: s[:p.offsets[0]-p.start], }) - for i := 1; i < len(p.l.offsets); i += 4 { - a := p.l.offsets[i] - p.l.mstart - b := p.l.offsets[i+1] - p.l.mstart - c := p.l.offsets[i+2] - p.l.mstart - d := p.l.offsets[i+3] - p.l.mstart + for i := 1; i < len(p.offsets); i += 4 { + a := p.offsets[i] - p.start + b := p.offsets[i+1] - p.start + c := p.offsets[i+2] - p.start + d := p.offsets[i+3] - p.start // Replacer causes allocations. Replace only when necessary. if strings.IndexByte(s[c:d], byte('\\')) >= 0 { *l = append(*l, labels.Label{Name: s[a:b], Value: replacer.Replace(s[c:d])}) continue } - *l = append(*l, labels.Label{Name: s[a:b], Value: s[c:d]}) } + // Sort labels. We can skip the first entry since the metric name is + // already at the right place. sort.Sort((*l)[1:]) return s } +// nextToken returns the next token from the lexer. It skips over tabs +// and spaces. +func (p *Parser) nextToken() token { + for { + if tok := p.l.Lex(); tok != tWhitespace { + return tok + } + } +} + +// Entry represents the type of a parsed entry. +type Entry int + +const ( + EntryInvalid Entry = -1 + EntryType Entry = 0 + EntryHelp Entry = 1 + EntrySeries Entry = 2 + EntryComment Entry = 3 +) + +// MetricType represents metric type values. +type MetricType string + +const ( + MetricTypeCounter = "counter" + MetricTypeGauge = "gauge" + MetricTypeHistogram = "histogram" + MetricTypeSummary = "summary" + MetricTypeUntyped = "untyped" +) + +func parseError(exp string, got token) error { + return fmt.Errorf("%s, got %q", exp, got) +} + +// Next advances the parser to the next sample. It returns false if no +// more samples were read or an error occurred. +func (p *Parser) Next() (Entry, error) { + var err error + + p.start = p.l.i + p.offsets = p.offsets[:0] + + switch t := p.nextToken(); t { + case tEOF: + return EntryInvalid, io.EOF + case tLinebreak: + // Allow full blank lines. + return p.Next() + + case tHelp, tType: + switch t := p.nextToken(); t { + case tMName: + p.offsets = append(p.offsets, p.l.start, p.l.i) + default: + return EntryInvalid, parseError("expected metric name after HELP", t) + } + switch t := p.nextToken(); t { + case tText: + p.text = p.l.buf()[1:] + default: + return EntryInvalid, parseError("expected text in HELP", t) + } + if t == tType { + switch s := yoloString(p.text); s { + case "counter": + p.mtype = MetricTypeCounter + case "gauge": + p.mtype = MetricTypeGauge + case "histogram": + p.mtype = MetricTypeHistogram + case "summary": + p.mtype = MetricTypeSummary + case "untyped": + p.mtype = MetricTypeUntyped + default: + return EntryInvalid, fmt.Errorf("invalid metric type %q", s) + } + } + if t := p.nextToken(); t != tLinebreak { + return EntryInvalid, parseError("linebreak expected after metadata", t) + } + switch t { + case tHelp: + return EntryHelp, nil + case tType: + return EntryType, nil + } + case tComment: + p.text = p.l.buf() + if t := p.nextToken(); t != tLinebreak { + return EntryInvalid, parseError("linebreak expected after comment", t) + } + return EntryComment, nil + + case tMName: + p.offsets = append(p.offsets, p.l.i) + p.series = p.l.b[p.start:p.l.i] + + t2 := p.nextToken() + if t2 == tBraceOpen { + if err := p.parseLVals(); err != nil { + return EntryInvalid, err + } + p.series = p.l.b[p.start:p.l.i] + t2 = p.nextToken() + } + if t2 != tValue { + return EntryInvalid, parseError("expected value after metric", t) + } + if p.val, err = strconv.ParseFloat(yoloString(p.l.buf()), 64); err != nil { + return EntryInvalid, err + } + // Ensure canonical NaN value. + if math.IsNaN(p.val) { + p.val = math.Float64frombits(value.NormalNaN) + } + p.hasTS = false + switch p.nextToken() { + case tLinebreak: + break + case tTimestamp: + p.hasTS = true + if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil { + return EntryInvalid, err + } + if t2 := p.nextToken(); t2 != tLinebreak { + return EntryInvalid, parseError("expected next entry after timestamp", t) + } + default: + return EntryInvalid, parseError("expected timestamp or new record", t) + } + return EntrySeries, nil + + default: + err = fmt.Errorf("%q is not a valid start token", t) + } + return EntryInvalid, err +} + +func (p *Parser) parseLVals() error { + t := p.nextToken() + for { + switch t { + case tBraceClose: + return nil + case tLName: + default: + return parseError("expected label name", t) + } + p.offsets = append(p.offsets, p.l.start, p.l.i) + + if t := p.nextToken(); t != tEqual { + return parseError("expected equal", t) + } + if t := p.nextToken(); t != tLValue { + return parseError("expected label value", t) + } + if !utf8.Valid(p.l.buf()) { + return fmt.Errorf("invalid UTF-8 label value") + } + + // The lexer ensures the value string is quoted. Strip first + // and last character. + p.offsets = append(p.offsets, p.l.start+1, p.l.i-1) + + // Free trailing commas are allowed. + if t = p.nextToken(); t == tComma { + t = p.nextToken() + } + } +} + var replacer = strings.NewReplacer( `\"`, `"`, `\\`, `\`, diff --git a/pkg/textparse/parse_test.go b/pkg/textparse/parse_test.go index 65354bc93..b22cce7a1 100644 --- a/pkg/textparse/parse_test.go +++ b/pkg/textparse/parse_test.go @@ -29,15 +29,19 @@ import ( func TestParse(t *testing.T) { input := `# HELP go_gc_duration_seconds A summary of the GC invocation durations. -# TYPE go_gc_duration_seconds summary +# TYPE go_gc_duration_seconds summary go_gc_duration_seconds{quantile="0"} 4.9351e-05 go_gc_duration_seconds{quantile="0.25",} 7.424100000000001e-05 go_gc_duration_seconds{quantile="0.5",a="b"} 8.3835e-05 go_gc_duration_seconds{quantile="0.8", a="b"} 8.3835e-05 go_gc_duration_seconds{ quantile="0.9", a="b"} 8.3835e-05 +# Hrandom comment starting with prefix of HELP +# +# comment with escaped \n newline +# comment with escaped \ escape character go_gc_duration_seconds{ quantile="1.0", a="b" } 8.3835e-05 go_gc_duration_seconds { quantile="1.0", a="b" } 8.3835e-05 -go_gc_duration_seconds { quantile= "1.0", a= "b" } 8.3835e-05 +go_gc_duration_seconds { quantile= "1.0", a= "b", } 8.3835e-05 go_gc_duration_seconds { quantile = "1.0", a = "b" } 8.3835e-05 go_gc_duration_seconds_count 99 some:aggregate:rate5m{a_b="c"} 1 @@ -52,12 +56,21 @@ testmetric{label="\"bar\""} 1` int64p := func(x int64) *int64 { return &x } exp := []struct { - lset labels.Labels - m string - t *int64 - v float64 + lset labels.Labels + m string + t *int64 + v float64 + typ MetricType + help string + comment string }{ { + m: "go_gc_duration_seconds", + help: "A summary of the GC invocation durations.", + }, { + m: "go_gc_duration_seconds", + typ: MetricTypeSummary, + }, { m: `go_gc_duration_seconds{quantile="0"}`, v: 4.9351e-05, lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0"), @@ -77,6 +90,14 @@ testmetric{label="\"bar\""} 1` m: `go_gc_duration_seconds{ quantile="0.9", a="b"}`, v: 8.3835e-05, lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.9", "a", "b"), + }, { + comment: "# Hrandom comment starting with prefix of HELP", + }, { + comment: "#", + }, { + comment: "# comment with escaped \\n newline", + }, { + comment: "# comment with escaped \\ escape character", }, { m: `go_gc_duration_seconds{ quantile="1.0", a="b" }`, v: 8.3835e-05, @@ -86,7 +107,7 @@ testmetric{label="\"bar\""} 1` v: 8.3835e-05, lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "1.0", "a", "b"), }, { - m: `go_gc_duration_seconds { quantile= "1.0", a= "b" }`, + m: `go_gc_duration_seconds { quantile= "1.0", a= "b", }`, v: 8.3835e-05, lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "1.0", "a", "b"), }, { @@ -101,6 +122,12 @@ testmetric{label="\"bar\""} 1` m: `some:aggregate:rate5m{a_b="c"}`, v: 1, lset: labels.FromStrings("__name__", "some:aggregate:rate5m", "a_b", "c"), + }, { + m: "go_goroutines", + help: "Number of goroutines that currently exist.", + }, { + m: "go_goroutines", + typ: MetricTypeGauge, }, { m: `go_goroutines`, v: 33, @@ -130,23 +157,42 @@ testmetric{label="\"bar\""} 1` var res labels.Labels - for p.Next() { - m, ts, v := p.At() + for { + et, err := p.Next() + if err == io.EOF { + break + } + require.NoError(t, err) - p.Metric(&res) + switch et { + case EntrySeries: + m, ts, v := p.Series() - require.Equal(t, exp[i].m, string(m)) - require.Equal(t, exp[i].t, ts) - require.Equal(t, exp[i].v, v) - require.Equal(t, exp[i].lset, res) + p.Metric(&res) + + require.Equal(t, exp[i].m, string(m)) + require.Equal(t, exp[i].t, ts) + require.Equal(t, exp[i].v, v) + require.Equal(t, exp[i].lset, res) + res = res[:0] + + case EntryType: + m, typ := p.Type() + require.Equal(t, exp[i].m, string(m)) + require.Equal(t, exp[i].typ, typ) + + case EntryHelp: + m, h := p.Help() + require.Equal(t, exp[i].m, string(m)) + require.Equal(t, exp[i].help, string(h)) + + case EntryComment: + require.Equal(t, exp[i].comment, string(p.Comment())) + } i++ - res = res[:0] } - - require.NoError(t, p.Err()) require.Equal(t, len(exp), i) - } func TestParseErrors(t *testing.T) { @@ -156,19 +202,19 @@ func TestParseErrors(t *testing.T) { }{ { input: "a", - err: "no token found", + err: "expected value after metric, got \"MNAME\"", }, { input: "a{b='c'} 1\n", - err: "no token found", + err: "expected label value, got \"INVALID\"", }, { input: "a{b=\n", - err: "no token found", + err: "expected label value, got \"INVALID\"", }, { input: "a{\xff=\"foo\"} 1\n", - err: "no token found", + err: "expected label name, got \"INVALID\"", }, { input: "a{b=\"\xff\"} 1\n", @@ -180,20 +226,22 @@ func TestParseErrors(t *testing.T) { }, { input: "something_weird{problem=\"", - err: "no token found", + err: "expected label value, got \"INVALID\"", }, { input: "empty_label_name{=\"\"} 0", - err: "no token found", + err: "expected label name, got \"EQUAL\"", }, } - for _, c := range cases { + for i, c := range cases { p := New([]byte(c.input)) - for p.Next() { + var err error + for err == nil { + _, err = p.Next() } - require.NotNil(t, p.Err()) - require.Equal(t, c.err, p.Err().Error()) + require.NotNil(t, err) + require.Equal(t, c.err, err.Error(), "test %d", i) } } @@ -220,34 +268,36 @@ func TestNullByteHandling(t *testing.T) { }, { input: "a{b=\x00\"ssss\"} 1\n", - err: "no token found", + err: "expected label value, got \"INVALID\"", }, { input: "a{b=\"\x00", - err: "no token found", + err: "expected label value, got \"INVALID\"", }, { input: "a{b\x00=\"hiih\"} 1", - err: "no token found", + err: "expected equal, got \"INVALID\"", }, { input: "a\x00{b=\"ddd\"} 1", - err: "no token found", + err: "expected value after metric, got \"MNAME\"", }, } - for _, c := range cases { + for i, c := range cases { p := New([]byte(c.input)) - for p.Next() { + var err error + for err == nil { + _, err = p.Next() } if c.err == "" { - require.NoError(t, p.Err()) + require.Equal(t, io.EOF, err, "test %d", i) continue } - require.Error(t, p.Err()) - require.Equal(t, c.err, p.Err().Error()) + require.Error(t, err) + require.Equal(t, c.err, err.Error(), "test %d", i) } } @@ -274,13 +324,21 @@ func BenchmarkParse(b *testing.B) { for i := 0; i < b.N; i += testdataSampleCount { p := New(buf) - for p.Next() && i < b.N { - m, _, _ := p.At() - - total += len(m) - i++ + Outer: + for i < b.N { + t, err := p.Next() + switch t { + case EntryInvalid: + if err == io.EOF { + break Outer + } + b.Fatal(err) + case EntrySeries: + m, _, _ := p.Series() + total += len(m) + i++ + } } - require.NoError(b, p.Err()) } _ = total }) @@ -294,16 +352,25 @@ func BenchmarkParse(b *testing.B) { for i := 0; i < b.N; i += testdataSampleCount { p := New(buf) - for p.Next() && i < b.N { - m, _, _ := p.At() + Outer: + for i < b.N { + t, err := p.Next() + switch t { + case EntryInvalid: + if err == io.EOF { + break Outer + } + b.Fatal(err) + case EntrySeries: + m, _, _ := p.Series() - res := make(labels.Labels, 0, 5) - p.Metric(&res) + res := make(labels.Labels, 0, 5) + p.Metric(&res) - total += len(m) - i++ + total += len(m) + i++ + } } - require.NoError(b, p.Err()) } _ = total }) @@ -318,16 +385,25 @@ func BenchmarkParse(b *testing.B) { for i := 0; i < b.N; i += testdataSampleCount { p := New(buf) - for p.Next() && i < b.N { - m, _, _ := p.At() + Outer: + for i < b.N { + t, err := p.Next() + switch t { + case EntryInvalid: + if err == io.EOF { + break Outer + } + b.Fatal(err) + case EntrySeries: + m, _, _ := p.Series() - p.Metric(&res) + p.Metric(&res) - total += len(m) - i++ - res = res[:0] + total += len(m) + i++ + res = res[:0] + } } - require.NoError(b, p.Err()) } _ = total }) @@ -361,7 +437,6 @@ func BenchmarkParse(b *testing.B) { }) } } - func BenchmarkGzip(b *testing.B) { for _, fn := range []string{"testdata.txt", "testdata.nometa.txt"} { b.Run(fn, func(b *testing.B) { diff --git a/scrape/scrape.go b/scrape/scrape.go index ed421b792..d98e81c5f 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -830,11 +830,21 @@ func (sl *scrapeLoop) append(b []byte, ts time.Time) (total, added int, err erro var sampleLimitErr error loop: - for p.Next() { + for { + var et textparse.Entry + if et, err = p.Next(); err != nil { + if err == io.EOF { + err = nil + } + break + } + if et != textparse.EntrySeries { + continue + } total++ t := defTime - met, tp, v := p.At() + met, tp, v := p.Series() if tp != nil { t = *tp } @@ -931,10 +941,10 @@ loop: } added++ } - if err == nil { - err = p.Err() - } if sampleLimitErr != nil { + if err == nil { + err = sampleLimitErr + } // We only want to increment this once per scrape, so this is Inc'd outside the loop. targetScrapeSampleLimit.Inc() }