crush: implement weight and id overrides for straw2

bucket_straw2_choose needs to use weights that may be different from
item_weights, for instance to compensate for an uneven distribution
caused by a low number of values, or to fix the probability bias
introduced by conditional probabilities (see
http://tracker.ceph.com/issues/15653 for more information).

We introduce a weight_set for each straw2 bucket to set the desired
weight for a given item at a given position. The weight of a given item
when picking the first replica (first position) may be different from
its weight when picking the second replica (second position). For
instance the weight matrix for a given bucket containing items 3, 7 and
13 could be as follows:

          position 0   position 1

item 3     0x10000      0x100000
item 7     0x40000       0x10000
item 13    0x40000       0x10000

When crush_do_rule picks the first of two replicas (position 0), items 7
and 13 are each four times more likely to be chosen by
bucket_straw2_choose than item 3. When choosing the second replica
(position 1), item 3 is sixteen times more likely to be chosen than
item 7 or item 13.

By default the weight_set of each bucket exactly matches the content of
item_weights for each position to ensure backward compatibility.

bucket_straw2_choose compares items by using their id. The same ids are
also used to index buckets and they must be unique. For each item in a
bucket an array of ids can be provided for placement purposes and they
are used instead of the ids. If no replacement ids are provided, the
legacy behavior is preserved.

Signed-off-by: Loic Dachary <loic@dachary.org>
This commit is contained in:
Loic Dachary 2017-04-13 18:14:44 +02:00 committed by Loic Dachary
parent 18245ecd78
commit 19537a450f
5 changed files with 172 additions and 21 deletions

View File

@ -1400,6 +1400,67 @@ int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *b)
} }
} }
/**
 * Build a default set of choose arguments for every bucket of __map__.
 *
 * A single block of memory is allocated and carved into four
 * consecutive regions:
 *
 *   1. map->max_buckets struct crush_choose_arg, indexed like
 *      map->buckets
 *   2. num_positions struct crush_weight_set per non-empty bucket slot
 *   3. the weights: one copy of item_weights per position and bucket
 *   4. the ids: one copy of the bucket items per bucket
 *
 * Entries matching an empty slot of map->buckets are zeroed. All other
 * entries start as an exact copy of the bucket content, so that using
 * them yields the same weights and ids as the bucket itself.
 *
 * NOTE(review): every non-empty bucket is read through the
 * struct crush_bucket_straw2 layout, whatever its algorithm is;
 * confirm callers only rely on this for straw2 buckets.
 *
 * @param map the crush map to read the buckets from
 * @param num_positions the number of weight sets to create per bucket
 *
 * @returns the array of map->max_buckets choose arguments, to be
 *          released with crush_destroy_choose_args(), or NULL if the
 *          memory could not be allocated
 */
struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions)
{
	int b;
	int sum_bucket_size = 0;
	int bucket_count = 0;

	/* First pass: count what has to be stored. */
	for (b = 0; b < map->max_buckets; b++) {
		if (map->buckets[b] == 0)
			continue;
		sum_bucket_size += map->buckets[b]->size;
		bucket_count++;
	}
	dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n",
		sum_bucket_size, map->max_buckets, bucket_count);

	/*
	 * Keep the byte count in a size_t: the sizeof arithmetic is
	 * already done in size_t and must not be truncated to an int.
	 * The ids are int but are sized as __u32 (assumes both have the
	 * same size, which the BUG_ON below verifies).
	 */
	size_t size = (sizeof(struct crush_choose_arg) * map->max_buckets +
		       sizeof(struct crush_weight_set) * bucket_count * num_positions +
		       sizeof(__u32) * sum_bucket_size * num_positions + // weights
		       sizeof(__u32) * sum_bucket_size); // ids
	char *space = malloc(size);

	/* Report allocation failure instead of writing through NULL. */
	if (space == NULL)
		return NULL;

	/* Carve the block; the *_end pointers are used as sanity checks. */
	struct crush_choose_arg *arg = (struct crush_choose_arg *)space;
	struct crush_weight_set *weight_set = (struct crush_weight_set *)(arg + map->max_buckets);
	__u32 *weights = (__u32 *)(weight_set + bucket_count * num_positions);
	char *weight_set_ends = (char*)weights;
	int *ids = (int *)(weights + sum_bucket_size * num_positions);
	char *weights_end = (char *)ids;
	char *ids_end = (char *)(ids + sum_bucket_size);
	BUG_ON(space + size != ids_end);

	/* Second pass: fill each region, advancing the cursors. */
	for (b = 0; b < map->max_buckets; b++) {
		if (map->buckets[b] == 0) {
			memset(&arg[b], '\0', sizeof(struct crush_choose_arg));
			continue;
		}
		struct crush_bucket_straw2 *bucket = (struct crush_bucket_straw2 *)map->buckets[b];
		int position;

		/* Each position starts with the weights of the bucket. */
		for (position = 0; position < num_positions; position++) {
			memcpy(weights, bucket->item_weights, sizeof(__u32) * bucket->h.size);
			weight_set[position].weights = weights;
			weight_set[position].size = bucket->h.size;
			dprintk("moving weight %d bytes forward\n", (int)((weights + bucket->h.size) - weights));
			weights += bucket->h.size;
		}
		arg[b].weight_set = weight_set;
		arg[b].weight_set_size = num_positions;
		/* position == num_positions once the loop is done */
		weight_set += position;

		/* The replacement ids start as the items of the bucket. */
		memcpy(ids, bucket->h.items, sizeof(int) * bucket->h.size);
		arg[b].ids = ids;
		arg[b].ids_size = bucket->h.size;
		ids += bucket->h.size;
	}

	/* Every cursor must have reached the end of its own region. */
	BUG_ON((char*)weight_set_ends != (char*)weight_set);
	BUG_ON((char*)weights_end != (char*)weights);
	BUG_ON((char*)ids != (char*)ids_end);
	return arg;
}
/**
 * Release choose arguments.
 *
 * The pointer is handed to free() as is, so __args__ must be the
 * start of a single heap allocation (or NULL).
 *
 * @param args the choose arguments to release
 */
void crush_destroy_choose_args(struct crush_choose_arg *args)
{
	free(args);
}
/***************************/ /***************************/
/* methods to check for safe arithmetic operations */ /* methods to check for safe arithmetic operations */

View File

@ -208,6 +208,8 @@ extern int crush_add_bucket(struct crush_map *map,
* @returns a pointer to the newly created bucket or NULL * @returns a pointer to the newly created bucket or NULL
*/ */
struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights); struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights);
/** @ingroup API
 *
 * Allocate the choose arguments for all the buckets of __map__, with
 * __num_positions__ weight sets per bucket. The weights of each
 * position and the ids are initialized with a copy of the weights and
 * of the items of the corresponding bucket. The returned array has
 * __map->max_buckets__ entries and must be released with
 * crush_destroy_choose_args().
 *
 * @param map the crush map the buckets are read from
 * @param num_positions the number of weight sets per bucket
 *
 * @returns a pointer to the newly allocated array of choose arguments
 */
extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions);
/** @ingroup API
 *
 * Free the memory allocated by crush_make_choose_args().
 *
 * @param args the choose arguments to release
 */
extern void crush_destroy_choose_args(struct crush_choose_arg *args);
/** @ingroup API /** @ingroup API
* *
* Add __item__ to __bucket__ with __weight__. The weight of the new * Add __item__ to __bucket__ with __weight__. The weight of the new

View File

@ -238,6 +238,61 @@ struct crush_bucket {
__s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */ __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */
}; };
/** @ingroup API
 *
 * Replacement weights for each item in a bucket, for one position.
 * The size of the array must be exactly the size of the straw2
 * bucket, just as the item_weights array: the weight at index i
 * replaces the weight of the item at index i in the bucket.
 *
 */
struct crush_weight_set {
__u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
__u32 size; /*!< number of entries in the __weights__ array */
};
/** @ingroup API
 *
 * Replacement weights and ids for a given straw2 bucket, for
 * placement purposes.
 *
 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
 * replacement weights found at __weight_set[N]__ are used instead of
 * the weights from __item_weights__. If __N__ is greater than or
 * equal to __weight_set_size__, the weights found at
 * __weight_set[weight_set_size-1]__ are used instead. For instance if
 * __weight_set__ is:
 *
 *    [ [ 0x10000, 0x20000 ],   // position 0
 *      [ 0x20000, 0x40000 ] ]  // position 1
 *
 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
 * choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ]
 * choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ]
 * etc.
 *
 * If __weight_set__ is NULL or __weight_set_size__ is zero, the
 * weights from __item_weights__ are used. If __ids__ is NULL, the
 * items of the bucket are used.
 *
 */
struct crush_choose_arg {
int *ids; /*!< values to use instead of items */
__u32 ids_size; /*!< size of the __ids__ array */
struct crush_weight_set *weight_set; /*!< weight replacements, one entry per position */
__u32 weight_set_size; /*!< size of the __weight_set__ array */
};
/** @ingroup API
 *
 * Replacement weights and ids for each bucket in the crushmap. The
 * __size__ of the __args__ array must be exactly the same as
 * __map->max_buckets__.
 *
 * The __crush_choose_arg__ at index N will be used when choosing
 * an item from the bucket __map->buckets[N]__ (that is, the bucket
 * whose id is -1-N), provided it is a straw2 bucket.
 *
 */
struct crush_choose_arg_map {
struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
__u32 size; /*!< size of the __args__ array */
};
/** @ingroup API /** @ingroup API
* The weight of each item in the bucket when * The weight of each item in the bucket when
* __h.alg__ == ::CRUSH_BUCKET_UNIFORM. * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.

View File

@ -299,19 +299,40 @@ static __u64 crush_ln(unsigned int xin)
* *
*/ */
/*
 * Select the weights to use when choosing the item at __position__
 * from __bucket__.
 *
 * The weights of the bucket are returned unless __arg__ provides at
 * least one weight set. A position past the last weight set uses the
 * last weight set.
 */
static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
					    const struct crush_choose_arg *arg,
					    int position)
{
	int has_weight_set = arg != NULL &&
			     arg->weight_set != NULL &&
			     arg->weight_set_size != 0;
	int index;

	if (!has_weight_set)
		return bucket->item_weights;

	/* clamp to the last available position */
	index = (position >= arg->weight_set_size) ?
		(int)(arg->weight_set_size - 1) : position;
	return arg->weight_set[index].weights;
}
/*
 * Select the ids to hash when choosing an item from __bucket__: the
 * replacement ids of __arg__ when there are some, the items of the
 * bucket otherwise.
 */
static inline int *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
				      const struct crush_choose_arg *arg)
{
	if (arg != NULL && arg->ids != NULL)
		return arg->ids;
	return bucket->h.items;
}
static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
int x, int r) int x, int r, const struct crush_choose_arg *arg,
int position)
{ {
unsigned int i, high = 0; unsigned int i, high = 0;
unsigned int u; unsigned int u;
unsigned int w;
__s64 ln, draw, high_draw = 0; __s64 ln, draw, high_draw = 0;
__u32 *weights = get_choose_arg_weights(bucket, arg, position);
int *ids = get_choose_arg_ids(bucket, arg);
for (i = 0; i < bucket->h.size; i++) { for (i = 0; i < bucket->h.size; i++) {
w = bucket->item_weights[i]; dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
if (w) { if (weights[i]) {
u = crush_hash32_3(bucket->h.hash, x, u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
bucket->h.items[i], r);
u &= 0xffff; u &= 0xffff;
/* /*
@ -332,7 +353,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
* weight means a larger (less negative) value * weight means a larger (less negative) value
* for draw. * for draw.
*/ */
draw = div64_s64(ln, w); draw = div64_s64(ln, weights[i]);
} else { } else {
draw = S64_MIN; draw = S64_MIN;
} }
@ -349,7 +370,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
static int crush_bucket_choose(const struct crush_bucket *in, static int crush_bucket_choose(const struct crush_bucket *in,
struct crush_work_bucket *work, struct crush_work_bucket *work,
int x, int r) int x, int r,
const struct crush_choose_arg *arg,
int position)
{ {
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0); BUG_ON(in->size == 0);
@ -371,7 +394,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
case CRUSH_BUCKET_STRAW2: case CRUSH_BUCKET_STRAW2:
return bucket_straw2_choose( return bucket_straw2_choose(
(const struct crush_bucket_straw2 *)in, (const struct crush_bucket_straw2 *)in,
x, r); x, r, arg, position);
default: default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg); dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0]; return in->items[0];
@ -433,7 +456,8 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int vary_r, unsigned int vary_r,
unsigned int stable, unsigned int stable,
int *out2, int *out2,
int parent_r) int parent_r,
const struct crush_choose_arg *choose_args)
{ {
int rep; int rep;
unsigned int ftotal, flocal; unsigned int ftotal, flocal;
@ -485,7 +509,9 @@ parent_r %d stable %d\n",
else else
item = crush_bucket_choose( item = crush_bucket_choose(
in, work->work[-1-in->id], in, work->work[-1-in->id],
x, r); x, r,
(choose_args ? &choose_args[-1-in->id] : 0),
outpos);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
skip_rep = 1; skip_rep = 1;
@ -542,7 +568,8 @@ parent_r %d stable %d\n",
vary_r, vary_r,
stable, stable,
NULL, NULL,
sub_r) <= outpos) sub_r,
choose_args) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
reject = 1; reject = 1;
} else { } else {
@ -619,7 +646,8 @@ static void crush_choose_indep(const struct crush_map *map,
unsigned int recurse_tries, unsigned int recurse_tries,
int recurse_to_leaf, int recurse_to_leaf,
int *out2, int *out2,
int parent_r) int parent_r,
const struct crush_choose_arg *choose_args)
{ {
const struct crush_bucket *in = bucket; const struct crush_bucket *in = bucket;
int endpos = outpos + left; int endpos = outpos + left;
@ -691,7 +719,9 @@ static void crush_choose_indep(const struct crush_map *map,
item = crush_bucket_choose( item = crush_bucket_choose(
in, work->work[-1-in->id], in, work->work[-1-in->id],
x, r); x, r,
(choose_args ? &choose_args[-1-in->id] : 0),
outpos);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE; out[rep] = CRUSH_ITEM_NONE;
@ -745,7 +775,7 @@ static void crush_choose_indep(const struct crush_map *map,
x, 1, numrep, 0, x, 1, numrep, 0,
out2, rep, out2, rep,
recurse_tries, 0, recurse_tries, 0,
0, NULL, r); 0, NULL, r, choose_args);
if (out2[rep] == CRUSH_ITEM_NONE) { if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */ /* placed nothing; no leaf */
break; break;
@ -854,7 +884,7 @@ void crush_init_workspace(const struct crush_map *m, void *v) {
int crush_do_rule(const struct crush_map *map, int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max, int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max, const __u32 *weight, int weight_max,
void *cwin) void *cwin, const struct crush_choose_arg *choose_args)
{ {
int result_len; int result_len;
struct crush_work *cw = cwin; struct crush_work *cw = cwin;
@ -1009,7 +1039,8 @@ int crush_do_rule(const struct crush_map *map,
vary_r, vary_r,
stable, stable,
c+osize, c+osize,
0); 0,
choose_args);
} else { } else {
out_size = ((numrep < (result_max-osize)) ? out_size = ((numrep < (result_max-osize)) ?
numrep : (result_max-osize)); numrep : (result_max-osize));
@ -1026,7 +1057,8 @@ int crush_do_rule(const struct crush_map *map,
choose_leaf_tries : 1, choose_leaf_tries : 1,
recurse_to_leaf, recurse_to_leaf,
c+osize, c+osize,
0); 0,
choose_args);
osize += out_size; osize += out_size;
} }
} }

View File

@ -67,7 +67,8 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i
* @param result_max the size of the __result__ array * @param result_max the size of the __result__ array
* @param weights an array of weights of size __weight_max__ * @param weights an array of weights of size __weight_max__
* @param weight_max the size of the __weights__ array * @param weight_max the size of the __weights__ array
* @param cwin must be the value of crush_work_size(__map__, __result_max__) * @param cwin must be a char array initialized by crush_init_workspace
* @param choose_args weights and ids for each known bucket
* *
* @return 0 on error or the size of __result__ on success * @return 0 on error or the size of __result__ on success
*/ */
@ -75,7 +76,7 @@ extern int crush_do_rule(const struct crush_map *map,
int ruleno, int ruleno,
int x, int *result, int result_max, int x, int *result, int result_max,
const __u32 *weights, int weight_max, const __u32 *weights, int weight_max,
void *cwin); void *cwin, const struct crush_choose_arg *choose_args);
/* Returns the exact amount of workspace that will need to be used /* Returns the exact amount of workspace that will need to be used
for a given combination of crush_map and result_max. The caller can for a given combination of crush_map and result_max. The caller can