MEDIUM: sample: Extend functionality for field/word converters

Extend functionality of field/word converters, so it's possible to extract
field(s)/word(s) counting from the beginning/end and/or extract multiple
fields/words (including separators), e.g.:

  str(f1_f2_f3__f5),field(2,_,2)   # f2_f3
  str(f1_f2_f3__f5),field(2,_,0)   # f2_f3__f5
  str(f1_f2_f3__f5),field(-2,_,3)  # f2_f3_
  str(f1_f2_f3__f5),field(-3,_,0)  # f1_f2_f3

  str(w1_w2_w3___w4),word(3,_,2)   # w3___w4
  str(w1_w2_w3___w4),word(2,_,0)   # w2_w3___w4
  str(w1_w2_w3___w4),word(-2,_,3)  # w1_w2_w3
  str(w1_w2_w3___w4),word(-3,_,0)  # w1_w2

The change is backward compatible.
2025-05-01 23:39:32 +00:00 · 2018-04-16 14:30:46 +02:00 · 2018-04-16 14:30:46 +02:00 · 9631a28275
commit 9631a28275
parent 9a4da683a6
2 changed files with 125 additions and 41 deletions
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -12907,10 +12907,20 @@ even
  Returns a boolean TRUE if the input value of type signed integer is even
  otherwise returns FALSE. It is functionally equivalent to "not,and(1),bool".

-field(<index>,<delimiters>)
-  Extracts the substring at the given index considering given delimiters from
-  an input string. Indexes start at 1 and delimiters are a string formatted
-  list of chars.
+field(<index>,<delimiters>[,<count>])
+  Extracts the substring at the given index from an input string, counting
+  from the beginning (positive index) or from the end (negative index) and
+  splitting on the given delimiters. Indexes start at 1 or -1 and delimiters
+  are a string formatted list of chars. An optional <count> sets how many
+  fields to extract (default: 1), continuing in the counting direction; 0
+  extracts all remaining fields.
+
+  Example :
+      str(f1_f2_f3__f5),field(5,_)    # f5
+      str(f1_f2_f3__f5),field(2,_,0)  # f2_f3__f5
+      str(f1_f2_f3__f5),field(2,_,2)  # f2_f3
+      str(f1_f2_f3__f5),field(-2,_,3) # f2_f3_
+      str(f1_f2_f3__f5),field(-3,_,0) # f1_f2_f3

 hex
  Converts a binary input sample to a hex string containing two hex digits per
@@ -13440,9 +13450,19 @@ utime(<format>[,<offset>])
      # e.g.  20140710162350 127.0.0.1:57325
      log-format %[date,utime(%Y%m%d%H%M%S)]\ %ci:%cp

-word(<index>,<delimiters>)
-  Extracts the nth word considering given delimiters from an input string.
-  Indexes start at 1 and delimiters are a string formatted list of chars.
+word(<index>,<delimiters>[,<count>])
+  Extracts the nth word from an input string, counting from the beginning
+  (positive index) or from the end (negative index) and splitting on the given
+  delimiters. Indexes start at 1 or -1 and delimiters are a string formatted
+  list of chars. An optional <count> sets how many words to extract (default:
+  1), continuing in the counting direction; 0 extracts all remaining words.
+
+  Example :
+      str(f1_f2_f3__f5),word(4,_)    # f5
+      str(f1_f2_f3__f5),word(2,_,0)  # f2_f3__f5
+      str(f1_f2_f3__f5),word(3,_,2)  # f3__f5
+      str(f1_f2_f3__f5),word(-2,_,3) # f1_f2_f3
+      str(f1_f2_f3__f5),word(-3,_,0) # f1_f2

 wt6([<avalanche>])
  Hashes a binary input sample into an unsigned 32-bit quantity using the WT6
--- a/src/sample.c
+++ b/src/sample.c
@@ -1997,27 +1997,54 @@ static int sample_conv_field_check(struct arg *args, struct sample_conv *conv,
 */
 static int sample_conv_field(const struct arg *arg_p, struct sample *smp, void *private)
 {
-	unsigned int field;
+	int field;
 	char *start, *end;
 	int i;
+	int count = (arg_p[2].type == ARGT_SINT) ? arg_p[2].data.sint : 1;

 	if (!arg_p[0].data.sint)
 		return 0;

-	field = 1;
-	end = start = smp->data.u.str.str;
-	while (end - smp->data.u.str.str < smp->data.u.str.len) {
-
-		for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
-			if (*end == arg_p[1].data.str.str[i]) {
-				if (field == arg_p[0].data.sint)
-					goto found;
-				start = end+1;
-				field++;
-				break;
+	if (arg_p[0].data.sint < 0) {
+		field = -1;
+		end = start = smp->data.u.str.str + smp->data.u.str.len;
+		while (start > smp->data.u.str.str) {
+			for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
+				if (*(start-1) == arg_p[1].data.str.str[i]) {
+					if (field == arg_p[0].data.sint) {
+						if (count == 1)
+							goto found;
+						else if (count > 1)
+							count--;
+					} else {
+						end = start-1;
+						field--;
+					}
+					break;
+				}
 			}
+			start--;
+		}
+	} else {
+		field = 1;
+		end = start = smp->data.u.str.str;
+		while (end - smp->data.u.str.str < smp->data.u.str.len) {
+			for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
+				if (*end == arg_p[1].data.str.str[i]) {
+					if (field == arg_p[0].data.sint) {
+						if (count == 1)
+							goto found;
+						else if (count > 1)
+							count--;
+					} else {
+						start = end+1;
+						field++;
+					}
+					break;
+				}
+			}
+			end++;
 		}
-		end++;
 	}

 	/* Field not found */
@@ -2048,37 +2075,74 @@ found:
 */
 static int sample_conv_word(const struct arg *arg_p, struct sample *smp, void *private)
 {
-	unsigned int word;
+	int word;
 	char *start, *end;
 	int i, issep, inword;
+	int count = (arg_p[2].type == ARGT_SINT) ? arg_p[2].data.sint : 1;

 	if (!arg_p[0].data.sint)
 		return 0;

 	word = 0;
 	inword = 0;
-	end = start = smp->data.u.str.str;
-	while (end - smp->data.u.str.str < smp->data.u.str.len) {
-		issep = 0;
-		for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
-			if (*end == arg_p[1].data.str.str[i]) {
-				issep = 1;
-				break;
+	if (arg_p[0].data.sint < 0) {
+		end = start = smp->data.u.str.str + smp->data.u.str.len;
+		while (start > smp->data.u.str.str) {
+			issep = 0;
+			for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
+				if (*(start-1) == arg_p[1].data.str.str[i]) {
+					issep = 1;
+					break;
+				}
 			}
-		}
-		if (!inword) {
-			if (!issep) {
-				word++;
-				start = end;
-				inword = 1;
+			if (!inword) {
+				if (!issep) {
+					if (word != arg_p[0].data.sint) {
+						word--;
+						end = start;
+					}
+					inword = 1;
+				}
 			}
+			else if (issep) {
+				if (word == arg_p[0].data.sint)
+					if (count == 1)
+						goto found;
+					else if (count > 1)
+						count--;
+				inword = 0;
+			}
+			start--;
 		}
-		else if (issep) {
-			if (word == arg_p[0].data.sint)
-				goto found;
-			inword = 0;
+	} else {
+		end = start = smp->data.u.str.str;
+		while (end - smp->data.u.str.str < smp->data.u.str.len) {
+			issep = 0;
+			for (i = 0 ; i < arg_p[1].data.str.len ; i++) {
+				if (*end == arg_p[1].data.str.str[i]) {
+					issep = 1;
+					break;
+				}
+			}
+			if (!inword) {
+				if (!issep) {
+					if (word != arg_p[0].data.sint) {
+						word++;
+						start = end;
+					}
+					inword = 1;
+				}
+			}
+			else if (issep) {
+				if (word == arg_p[0].data.sint)
+					if (count == 1)
+						goto found;
+					else if (count > 1)
+						count--;
+				inword = 0;
+			}
+			end++;
 		}
-		end++;
 	}

 	/* Field not found */
@@ -2928,8 +2992,8 @@ static struct sample_conv_kw_list sample_conv_kws = {ILH, {
 	{ "xxh64",  sample_conv_xxh64,     ARG1(0,SINT), NULL, SMP_T_BIN,  SMP_T_SINT  },
 	{ "json",   sample_conv_json,      ARG1(1,STR),  sample_conv_json_check, SMP_T_STR,  SMP_T_STR },
 	{ "bytes",  sample_conv_bytes,     ARG2(1,SINT,SINT), NULL, SMP_T_BIN,  SMP_T_BIN },
-	{ "field",  sample_conv_field,     ARG2(2,SINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
-	{ "word",   sample_conv_word,      ARG2(2,SINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+	{ "field",  sample_conv_field,     ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+	{ "word",   sample_conv_word,      ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
 	{ "regsub", sample_conv_regsub,    ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
 	{ "sha1",   sample_conv_sha1,      0,            NULL, SMP_T_BIN,  SMP_T_BIN  },
 	{ "concat", sample_conv_concat,    ARG3(1,STR,STR,STR), smp_check_concat, SMP_T_STR,  SMP_T_STR },