mirror of
http://git.haproxy.org/git/haproxy.git/
synced 2025-02-22 13:46:52 +00:00
MINOR: tools: add simple word fingerprinting to find similar-looking words
This introduces two functions, one which creates a fingerprint of a word, and one which computes a distance between two word fingerprints. The fingerprint is made by counting the transitions between one character and the next one. Here we consider the 26 alphabetic letters regardless of their case, then any digit as a digit, and anything else as "other". We also consider the first and last locations as transitions from begin to first char, and last char to end. The distance is simply the sum of the squares of the differences between two fingerprints. This way, doubling/missing a letter has the same cost, however some repeated transitions such as "e"->"r" like in "server" are very unlikely to match against situations where they do not exist. This is a naive approach but it seems to work sufficiently well for now. It may be refined in the future if needed.
This commit is contained in:
parent
133c8c412e
commit
ba2c4459a5
@ -865,6 +865,8 @@ int my_unsetenv(const char *name);
|
||||
char *env_expand(char *in);
|
||||
uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, char **errptr);
|
||||
size_t sanitize_for_printing(char *line, size_t pos, size_t width);
|
||||
void make_word_fingerprint(uint8_t *fp, const char *word);
|
||||
int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2);
|
||||
|
||||
/* debugging macro to emit messages using write() on fd #-1 so that strace sees
|
||||
* them.
|
||||
|
43
src/tools.c
43
src/tools.c
@ -5369,6 +5369,49 @@ size_t sanitize_for_printing(char *line, size_t pos, size_t width)
|
||||
return pos - shift;
|
||||
}
|
||||
|
||||
/* Initialize array <fp> with the fingerprint of word <word> by counting the
 * transitions between consecutive characters. <fp> must point to a writable
 * 1024-entries array, which is fully overwritten; it is indexed as
 * 32*from+to. Positions for 'from' and 'to' are:
 *   0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
 * Letters are folded to lower case so that the fingerprint does not depend on
 * the case. Two extra transitions are always recorded: begin->first char and
 * last char->end (an empty word only yields begin->end). <word> must be a
 * NUL-terminated string.
 */
void make_word_fingerprint(uint8_t *fp, const char *word)
{
	const char *p;
	int from, to;
	int c;

	memset(fp, 0, 1024);
	from = 28; // begin
	for (p = word; *p; p++) {
		/* <ctype.h> functions are only defined for EOF and values
		 * representable as unsigned char, so high-bit bytes must not
		 * be passed as negative plain chars.
		 */
		c = tolower((unsigned char)*p);
		if (c >= 'a' && c <= 'z')
			to = c - 'a';
		else if (c >= '0' && c <= '9')
			to = 26; // any digit
		else
			to = 27; // anything else
		fp[32 * from + to]++;
		from = to;
	}
	to = 29; // end, distinct from the "begin" slot as documented above
	fp[32 * from + to]++;
}
|
||||
|
||||
/* Compute how far apart two word fingerprints are. Both <fp1> and <fp2> must
 * be 1024-entries arrays such as those filled by make_word_fingerprint(). The
 * result is the sum, over all 1024 locations, of the squared difference
 * between the two counters; it is zero when both fingerprints are identical
 * and grows as they diverge.
 */
int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2)
{
	const uint8_t *stop = fp1 + 1024;
	int total = 0;

	while (fp1 < stop) {
		/* widen to int before subtracting so the difference may be negative */
		const int delta = (int)*fp1++ - (int)*fp2++;

		total += delta * delta;
	}
	return total;
}
|
||||
|
||||
static int init_tools_per_thread()
|
||||
{
|
||||
/* Let's make each thread start from a different position */
|
||||
|
Loading…
Reference in New Issue
Block a user