MINOR: tools: add simple word fingerprinting to find similar-looking words

This introduces two functions, one which creates a fingerprint of a word,
and one which computes a distance between two word fingerprints. The
fingerprint is made by counting the transitions between one character and
another one. Here we consider the 26 alphabetic letters regardless of
their case, then any digit as a digit, and anything else as "other". We
also consider the first and last locations as transitions from begin to
first char, and last char to end. The distance is simply the sum of the
squares of the differences between two fingerprints. This way, doubling/
missing a letter has the same cost, however some repeated transitions
such as "e"->"r" like in "server" are very unlikely to match against
situations where they do not exist. This is a naive approach but it seems
to work sufficiently well for now. It may be refined in the future if
needed.
This commit is contained in:
Willy Tarreau 2021-03-12 09:01:52 +01:00
parent 133c8c412e
commit ba2c4459a5
2 changed files with 45 additions and 0 deletions

View File

@ -865,6 +865,8 @@ int my_unsetenv(const char *name);
char *env_expand(char *in);
uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, char **errptr);
size_t sanitize_for_printing(char *line, size_t pos, size_t width);
void make_word_fingerprint(uint8_t *fp, const char *word);
int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2);
/* debugging macro to emit messages using write() on fd #-1 so that strace sees
* them.

View File

@ -5369,6 +5369,49 @@ size_t sanitize_for_printing(char *line, size_t pos, size_t width)
return pos - shift;
}
/* Initialize array <fp> with the fingerprint of word <word> by counting the
 * transitions between consecutive characters. <fp> must point to a writable
 * 1024-entries array, which is entirely overwritten; it is indexed as
 * 32*from+to. Classes used for 'from' and 'to' are:
 *   0..25=letter (case folded using tolower()), 26=digit, 27=other,
 *   28=word boundary (begin when used as 'from', end when used as 'to'),
 *   others unused.
 * An empty word thus only counts the single begin->end transition. Each
 * counter saturates at 255 instead of wrapping, so that a very long word
 * cannot bring a heavily repeated transition back to zero.
 */
void make_word_fingerprint(uint8_t *fp, const char *word)
{
	const char *p;
	uint8_t *slot;
	int from, to;
	int c;

	memset(fp, 0, 1024);

	from = 28; // begin
	for (p = word; *p; p++) {
		/* <ctype.h> functions require a value representable as an
		 * unsigned char; a plain char may be negative for bytes >= 0x80.
		 */
		c = tolower((unsigned char)*p);

		if (c >= 'a' && c <= 'z')
			to = c - 'a';
		else if (c >= '0' && c <= '9')
			to = 26;
		else
			to = 27;

		slot = &fp[32 * from + to];
		if (*slot < UINT8_MAX)
			(*slot)++;
		from = to;
	}

	to = 28; // end
	slot = &fp[32 * from + to];
	if (*slot < UINT8_MAX)
		(*slot)++;
}
/* Compute how far apart two word fingerprints are. Both <fp1> and <fp2> are
 * 1024-entries arrays such as those filled by make_word_fingerprint(). The
 * result is the sum, over all 1024 positions, of the squared difference
 * between the two counters found there, hence it is never negative and is
 * zero for identical fingerprints.
 */
int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2)
{
	const uint8_t *stop = fp1 + 1024;
	int total = 0;

	while (fp1 < stop) {
		int delta = (int)*fp1++ - (int)*fp2++;

		total += delta * delta;
	}
	return total;
}
static int init_tools_per_thread()
{
/* Let's make each thread start from a different position */