crush: comments, cleanup

This commit is contained in:
Sage Weil 2008-10-22 15:26:04 -07:00
parent 44c9462b2d
commit e077a45d86
10 changed files with 383 additions and 276 deletions

View File

@ -1,4 +1,4 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef __CRUSH_WRAPPER_H
@ -66,12 +66,12 @@ private:
for (std::map<int, string>::iterator p = f.begin(); p != f.end(); p++)
r[p->second] = p->first;
}
public:
CrushWrapper() : crush(0), have_rmaps(false) {}
~CrushWrapper() {
if (crush) crush_destroy(crush);
}
}
/* building */
void create() {
@ -109,7 +109,7 @@ public:
if (name_rmap.count(name))
return name_rmap[name];
return 0; /* hrm */
}
}
const char *get_item_name(int t) {
if (name_map.count(t))
return name_map[t].c_str();
@ -129,7 +129,7 @@ public:
if (rule_name_rmap.count(name))
return rule_name_rmap[name];
return 0; /* hrm */
}
}
const char *get_rule_name(int t) {
if (rule_name_map.count(t))
return rule_name_map[t].c_str();
@ -152,13 +152,13 @@ public:
if (d >= crush->max_devices) return -1;
return crush->device_offload[d];
}
/*** rules ***/
private:
/*
 * Look up a rule by number.
 *
 * Returns an ENOENT-encoded error pointer when no crush map is loaded,
 * 0 when ruleno is past the end of the rule table, and otherwise the
 * entry stored in that slot of the table.
 */
crush_rule *get_rule(unsigned ruleno) {
  if (!crush)
    return (crush_rule *)(-ENOENT);
  if (ruleno >= crush->max_rules)
    return 0;
  return crush->rules[ruleno];
}
@ -255,15 +255,15 @@ public:
/*
 * Make step `step` of rule `ruleno` a CRUSH_RULE_EMIT step.  Both step
 * arguments are passed as 0; the result of set_rule_step() is returned
 * unchanged.
 */
int set_rule_step_emit(unsigned ruleno, unsigned step) {
  const int zero_arg = 0;
  return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, zero_arg, zero_arg);
}
/** buckets **/
private:
/*
 * Look up a bucket by id.  Bucket ids map to table slots as
 * pos = -1 - id, so id -1 is slot 0, id -2 is slot 1, and so on.
 *
 * Returns an ENOENT-encoded error pointer when no crush map is loaded,
 * 0 when the id does not map to a slot inside the bucket table, and
 * otherwise the entry stored in that slot.
 */
crush_bucket *get_bucket(int id) {
  if (!crush)
    return (crush_bucket *)(-ENOENT);
  int pos = -1 - id;
  /*
   * Check both ends explicitly: a non-negative id yields a negative
   * pos, which a signed comparison against max_buckets alone would
   * let through to the array index below.
   */
  if (pos < 0 || pos >= crush->max_buckets)
    return 0;
  return crush->buckets[pos];
}
@ -318,7 +318,7 @@ public:
crush_bucket *b = crush_make_bucket(alg, type, size, items, weights);
return crush_add_bucket(crush, bucketno, b);
}
void finalize() {
assert(crush);
crush_finalize(crush);
@ -342,7 +342,7 @@ public:
}
void do_rule(int rule, int x, vector<int>& out, int maxout, int forcefeed) {
int rawout[maxout];
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, forcefeed);
out.resize(numrep);
@ -356,13 +356,13 @@ public:
for (map<int,double>::iterator p = weights.begin(); p != weights.end(); p++)
if (p->second > max)
max = p->second;
for (map<int,double>::iterator p = weights.begin(); p != weights.end(); p++) {
unsigned w = 0x10000 - (unsigned)(p->second / max * 0x10000);
set_offload(p->first, w);
}
}
int read_from_file(const char *fn) {
@ -390,7 +390,7 @@ public:
::encode(crush->device_offload[i], bl);
// buckets
for (unsigned i=0; i<crush->max_buckets; i++) {
for (int i=0; i<crush->max_buckets; i++) {
__u32 alg = 0;
if (crush->buckets[i]) alg = crush->buckets[i]->alg;
::encode(alg, bl);
@ -403,7 +403,7 @@ public:
::encode(crush->buckets[i]->size, bl);
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::encode(crush->buckets[i]->items[j], bl);
switch (crush->buckets[i]->alg) {
case CRUSH_BUCKET_UNIFORM:
for (unsigned j=0; j<crush->buckets[i]->size; j++)
@ -419,7 +419,7 @@ public:
break;
case CRUSH_BUCKET_TREE:
for (unsigned j=0; j<crush->buckets[i]->size; j++)
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::encode(((crush_bucket_tree*)crush->buckets[i])->node_weights[j], bl);
break;
@ -459,10 +459,10 @@ public:
crush->device_offload = (__u32*)malloc(sizeof(crush->device_offload[0])*crush->max_devices);
for (int i=0; i < crush->max_devices; i++)
::decode(crush->device_offload[i], blp);
// buckets
crush->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*crush->max_buckets);
for (unsigned i=0; i<crush->max_buckets; i++) {
for (int i=0; i<crush->max_buckets; i++) {
__u32 alg;
::decode(alg, blp);
if (!alg) {
@ -489,7 +489,7 @@ public:
}
crush->buckets[i] = (crush_bucket*)malloc(size);
memset(crush->buckets[i], 0, size);
::decode(crush->buckets[i]->id, blp);
::decode(crush->buckets[i]->type, blp);
::decode(crush->buckets[i]->alg, blp);
@ -502,7 +502,7 @@ public:
switch (crush->buckets[i]->alg) {
case CRUSH_BUCKET_UNIFORM:
((crush_bucket_uniform*)crush->buckets[i])->primes =
((crush_bucket_uniform*)crush->buckets[i])->primes =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::decode(((crush_bucket_uniform*)crush->buckets[i])->primes[j], blp);
@ -510,9 +510,9 @@ public:
break;
case CRUSH_BUCKET_LIST:
((crush_bucket_list*)crush->buckets[i])->item_weights =
((crush_bucket_list*)crush->buckets[i])->item_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
((crush_bucket_list*)crush->buckets[i])->sum_weights =
((crush_bucket_list*)crush->buckets[i])->sum_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++) {
@ -522,16 +522,16 @@ public:
break;
case CRUSH_BUCKET_TREE:
((crush_bucket_tree*)crush->buckets[i])->node_weights =
((crush_bucket_tree*)crush->buckets[i])->node_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++)
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::decode(((crush_bucket_tree*)crush->buckets[i])->node_weights[j], blp);
break;
case CRUSH_BUCKET_STRAW:
((crush_bucket_straw*)crush->buckets[i])->straws =
((crush_bucket_straw*)crush->buckets[i])->straws =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
((crush_bucket_straw*)crush->buckets[i])->item_weights =
((crush_bucket_straw*)crush->buckets[i])->item_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++) {
::decode(((crush_bucket_straw*)crush->buckets[i])->item_weights[j], blp);

View File

@ -2,5 +2,5 @@
#include "crush.h"
#include "hash.h"
int
int

View File

@ -24,21 +24,21 @@ struct crush_map *crush_create()
void crush_finalize(struct crush_map *map)
{
int b, i;
/* calc max_devices */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
for (i=0; i<map->buckets[b]->size; i++)
for (i=0; i<map->buckets[b]->size; i++)
if (map->buckets[b]->items[i] >= map->max_devices)
map->max_devices = map->buckets[b]->items[i] + 1;
}
/* allocate arrays */
map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
/* build parent maps */
crush_calc_parents(map);
@ -70,7 +70,7 @@ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0]));
memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
}
/* add it */
map->rules[ruleno] = rule;
return ruleno;
@ -105,7 +105,7 @@ int crush_get_next_bucket_id(struct crush_map *map)
{
int pos;
for (pos=0; pos < map->max_buckets; pos++)
if (map->buckets[pos] == 0)
if (map->buckets[pos] == 0)
break;
return -1 - pos;
}
@ -119,7 +119,7 @@ int crush_add_bucket(struct crush_map *map,
int pos;
/* find a bucket id */
if (id == 0)
if (id == 0)
id = crush_get_next_bucket_id(map);
pos = -1 - id;
@ -153,20 +153,20 @@ crush_make_uniform_bucket(int type, int size,
{
int i, j, x;
struct crush_bucket_uniform *bucket;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_UNIFORM;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.weight = size * item_weight;
bucket->item_weight = item_weight;
bucket->h.items = malloc(sizeof(__u32)*size);
for (i=0; i<size; i++)
bucket->h.items[i] = items[i];
/* generate some primes */
bucket->primes = malloc(sizeof(__u32)*size);
@ -177,12 +177,12 @@ crush_make_uniform_bucket(int type, int size,
x = size + 1;
x += crush_hash32(size) % (3*size); /* make it big */
x |= 1; /* and odd */
i=0;
while (i < size) {
for (j=2; j*j <= x; j++)
for (j=2; j*j <= x; j++)
if (x % j == 0) break;
if (j*j > x)
if (j*j > x)
bucket->primes[i++] = x;
x += 2;
}
@ -207,7 +207,7 @@ crush_make_list_bucket(int type, int size,
bucket->h.alg = CRUSH_BUCKET_LIST;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
bucket->item_weights = malloc(sizeof(__u32)*size);
bucket->sum_weights = malloc(sizeof(__u32)*size);
@ -215,7 +215,7 @@ crush_make_list_bucket(int type, int size,
/*
* caller will place new items at end. so, we reverse things,
* since we put new items at the beginning.
*/
*/
for (i=0; i<size; i++) {
int pos = size - i - 1;
bucket->h.items[pos] = items[i];
@ -225,7 +225,7 @@ crush_make_list_bucket(int type, int size,
/*printf("%d item %d weight %d sum %d\n",
i, items[i], weights[i], bucket->sum_weights[i]);*/
}
bucket->h.weight = w;
return bucket;
@ -237,15 +237,15 @@ crush_make_list_bucket(int type, int size,
/*
 * Count the trailing zero bits of n; parent() uses the result as the
 * height of node n.
 *
 * n must be non-zero: with n == 0 no set bit is ever found and the
 * loop does not terminate.
 */
static int height(int n)
{
	int h = 0;

	while ((n & 1) == 0) {
		h++;		/* exactly one level per bit shifted out */
		n = n >> 1;
	}
	return h;
}
/*
 * Non-zero when bit (h + 1) of n is set, zero otherwise.  parent()
 * calls this with h = height(n); presumably a set bit means n hangs
 * off the right side of its parent -- confirm against parent().
 */
static int on_right(int n, int h)
{
	return n & (1 << (h + 1));
}
static int parent(int n)
static int parent(int n)
{
int h = height(n);
if (on_right(n, h))
@ -263,7 +263,7 @@ crush_make_tree_bucket(int type, int size,
int depth;
int node;
int t, i, j;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_TREE;
@ -283,7 +283,7 @@ crush_make_tree_bucket(int type, int size,
memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size);
memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size);
for (i=0; i<size; i++) {
node = ((i+1) << 1)-1;
bucket->h.items[node] = items[i];
@ -304,7 +304,7 @@ crush_make_tree_bucket(int type, int size,
/* straw bucket */
struct crush_bucket_straw *
crush_make_straw_bucket(int type,
crush_make_straw_bucket(int type,
int size,
int *items,
int *weights)
@ -312,27 +312,27 @@ crush_make_straw_bucket(int type,
struct crush_bucket_straw *bucket;
int *reverse;
int i, j, k;
double straw, wbelow, lastw, wnext, pbelow;
int numleft;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_STRAW;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
bucket->item_weights = malloc(sizeof(__u32)*size);
bucket->straws = malloc(sizeof(__u32)*size);
bucket->h.weight = 0;
for (i=0; i<size; i++) {
bucket->h.items[i] = items[i];
bucket->h.weight += weights[i];
bucket->item_weights[i] = weights[i];
}
/* reverse sort by weight (simple insertion sort) */
reverse = malloc(sizeof(int) * size);
reverse[0] = 0;
@ -349,28 +349,28 @@ crush_make_straw_bucket(int type,
if (j == i)
reverse[i] = i;
}
numleft = size;
straw = 1.0;
wbelow = 0;
lastw = 0;
i=0;
while (i < size) {
/* set this item's straw */
bucket->straws[reverse[i]] = straw * 0x10000;
/*printf("item %d at %d weight %d straw %d (%lf)\n",
/*printf("item %d at %d weight %d straw %d (%lf)\n",
items[reverse[i]],
reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/
i++;
if (i == size) break;
/* same weight as previous? */
if (weights[reverse[i]] == weights[reverse[i-1]]) {
/*printf("same as previous\n");*/
continue;
}
/* adjust straw for next guy */
wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft;
for (j=i; j<size; j++)
@ -381,14 +381,14 @@ crush_make_straw_bucket(int type,
wnext = numleft * (weights[reverse[i]] - weights[reverse[i-1]]);
pbelow = wbelow / (wbelow + wnext);
/*printf("wbelow %lf wnext %lf pbelow %lf\n", wbelow, wnext, pbelow);*/
straw *= pow((double)1.0 / pbelow, (double)1.0 / (double)numleft);
lastw = weights[reverse[i-1]];
}
free(reverse);
return bucket;
}
@ -408,15 +408,15 @@ crush_make_bucket(int alg, int type, int size,
else
item_weight = 0;
return (struct crush_bucket *)crush_make_uniform_bucket(type, size, items, item_weight);
case CRUSH_BUCKET_LIST:
return (struct crush_bucket *)crush_make_list_bucket(type, size, items, weights);
case CRUSH_BUCKET_TREE:
return (struct crush_bucket *)crush_make_tree_bucket(type, size, items, weights);
case CRUSH_BUCKET_STRAW:
return (struct crush_bucket *)crush_make_straw_bucket(type, size, items, weights);
}
}
return 0;
}

View File

@ -5,40 +5,52 @@
#else
# include <stdlib.h>
# include <assert.h>
# define kfree(x) free(x)
# define kfree(x) do { if (x) free(x); } while (0)
# define BUG_ON(x) assert(!(x))
#endif
#include "crush.h"
int crush_get_bucket_item_weight(struct crush_bucket *b, int pos)
/**
* crush_get_bucket_item_weight - Get weight of an item in given bucket
* @b: bucket pointer
* @p: item index in bucket
*/
int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
{
if (pos >= b->size)
if (p >= b->size)
return 0;
switch (b->alg) {
switch (b->alg) {
case CRUSH_BUCKET_UNIFORM:
return ((struct crush_bucket_uniform*)b)->item_weight;
case CRUSH_BUCKET_LIST:
return ((struct crush_bucket_list*)b)->item_weights[pos];
case CRUSH_BUCKET_TREE:
if (pos & 1)
return ((struct crush_bucket_tree*)b)->node_weights[pos];
return ((struct crush_bucket_list*)b)->item_weights[p];
case CRUSH_BUCKET_TREE:
if (p & 1)
return ((struct crush_bucket_tree*)b)->node_weights[p];
return 0;
case CRUSH_BUCKET_STRAW:
return ((struct crush_bucket_straw*)b)->item_weights[pos];
return ((struct crush_bucket_straw*)b)->item_weights[p];
}
return 0;
}
/**
* crush_calc_parents - Calculate parent vectors for the given crush map.
* @map: crush_map pointer
*/
void crush_calc_parents(struct crush_map *map)
{
int i, b, c;
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == NULL) continue;
for (i=0; i<map->buckets[b]->size; i++) {
for (b = 0; b < map->max_buckets; b++) {
if (map->buckets[b] == NULL)
continue;
for (i = 0; i < map->buckets[b]->size; i++) {
c = map->buckets[b]->items[i];
BUG_ON(c >= map->max_devices);
BUG_ON(c >= map->max_devices ||
c < -map->max_buckets);
if (c >= 0)
map->device_parents[c] = map->buckets[b]->id;
else
@ -76,51 +88,52 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
kfree(b);
}
/**
 * crush_destroy_bucket - Destroy a single bucket
 * @b: bucket pointer
 *
 * Hands @b to the destroy routine that matches its algorithm.  A
 * bucket whose algorithm is none of the four known values is left
 * untouched.
 */
void crush_destroy_bucket(struct crush_bucket *b)
{
	const int alg = b->alg;

	if (alg == CRUSH_BUCKET_UNIFORM)
		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
	else if (alg == CRUSH_BUCKET_LIST)
		crush_destroy_bucket_list((struct crush_bucket_list *)b);
	else if (alg == CRUSH_BUCKET_TREE)
		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
	else if (alg == CRUSH_BUCKET_STRAW)
		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
}
/**
 * crush_destroy - Destroy a crush_map
 * @map: crush_map pointer
 *
 * Releases every bucket and rule held by @map, then the bucket and
 * rule tables, the parent and offload arrays, and finally @map itself.
 * @map must not be NULL; it is dereferenced unconditionally.
 */
void crush_destroy(struct crush_map *map)
{
	int b;

	/* buckets: each non-empty slot is destroyed exactly once */
	if (map->buckets) {
		for (b = 0; b < map->max_buckets; b++) {
			if (map->buckets[b] == NULL)
				continue;
			crush_destroy_bucket(map->buckets[b]);
		}
		kfree(map->buckets);
	}

	/* rules: empty slots are handed to kfree() as NULL */
	if (map->rules) {
		for (b = 0; b < map->max_rules; b++)
			kfree(map->rules[b]);
		kfree(map->rules);
	}

	/*
	 * These arrays may never have been allocated; they are passed to
	 * kfree() without a check, so kfree() has to tolerate NULL (the
	 * userspace fallback macro at the top of this file does).
	 */
	kfree(map->bucket_parents);
	kfree(map->device_parents);
	kfree(map->device_offload);
	kfree(map);
}

View File

@ -3,7 +3,34 @@
#include <linux/types.h>
/*** RULES ***/
/*
* CRUSH is a pseudo-random data distribution algorithm that
* efficiently distributes input values (typically, data objects)
* across a heterogeneous, structured storage cluster.
*
* The algorithm was originally described in detail in this paper
* (although the algorithm has evolved somewhat since then):
*
* http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
*/
#define CRUSH_MAX_DEPTH 10
#define CRUSH_MAX_SET 10
/*
* CRUSH uses user-defined "rules" to describe how inputs should be
* mapped to devices. A rule consists of a sequence of steps to perform
* to generate the set of output devices.
*/
struct crush_rule_step {
__u32 op;
__s32 arg1;
__s32 arg2;
};
/* step op codes */
enum {
CRUSH_RULE_NOOP = 0,
CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
@ -15,22 +42,18 @@ enum {
CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
};
#define CRUSH_MAX_DEPTH 10
#define CRUSH_MAX_SET 10
/*
* for specifying choose numrep relative to the max
* parameter passed to do_rule
* for specifying choose num (arg1) relative to the max parameter
* passed to do_rule
*/
#define CRUSH_CHOOSE_N 0
#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
struct crush_rule_step {
__u32 op;
__s32 arg1;
__s32 arg2;
};
/*
* The rule mask is used to describe what the rule is intended for.
* Given a storage pool and size of output set, we search through the
* rule list for a matching rule_mask.
*/
struct crush_rule_mask {
__u8 pool;
__u8 type;
@ -49,9 +72,20 @@ struct crush_rule {
/*** BUCKETS ***/
/* bucket algorithms */
/*
* A bucket is a named container of other items (either devices or
* other buckets). Items within a bucket are chosen using one of a
* few different algorithms. The table summarizes how the speed of
* each option measures up against mapping stability when items are
* added or removed.
*
* Bucket Alg Speed Additions Removals
* ------------------------------------------------
* uniform O(1) poor poor
* list O(n) optimal poor
* tree O(log n) good good
* straw O(n) optimal optimal
*/
enum {
CRUSH_BUCKET_UNIFORM = 1,
CRUSH_BUCKET_LIST = 2,
@ -70,7 +104,7 @@ static inline const char *crush_bucket_alg_name(int alg) {
struct crush_bucket {
__s32 id; /* this'll be negative */
__u16 type; /* non-zero; 0 is reserved for devices */
__u16 type; /* non-zero; type=0 is reserved for devices */
__u16 alg; /* one of CRUSH_BUCKET_* */
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
@ -80,58 +114,69 @@ struct crush_bucket {
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 *primes;
__u32 item_weight; /* 16-bit fixed point */
__u32 item_weight; /* 16-bit fixed point; all items equally weighted */
};
struct crush_bucket_list {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
__u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
__u32 *sum_weights; /* 16-bit fixed point. element i is sum
of weights 0..i, inclusive */
};
struct crush_bucket_tree {
struct crush_bucket h; /* note: h.size is tree size, not number of actual items */
struct crush_bucket h; /* note: h.size is _tree_ size, not number of
actual items */
__u32 *node_weights;
};
struct crush_bucket_straw {
struct crush_bucket h;
__u32 *item_weights;
__u32 *straws; /* 16-bit fixed point */
__u32 *item_weights; /* 16-bit fixed point */
__u32 *straws; /* 16-bit fixed point */
};
/*** CRUSH ***/
/*
* CRUSH map includes all buckets, rules, etc.
*/
struct crush_map {
struct crush_bucket **buckets;
struct crush_rule **rules;
/* parent pointers */
/*
* Parent pointers to identify the parent bucket of a device or
* bucket in the hierarchy. If an item appears more than
* once, this is the _last_ time it appeared (where buckets
* are processed in bucket id order, from -1 on down to
* -max_buckets).
*/
__u32 *bucket_parents;
__u32 *device_parents;
/* offload
* size max_devices, values 0...0xffff
/*
* device offload.
* size max_devices, values 0..0x10000
* 0 == normal
* 0x10000 == 100% offload (i.e. failed)
*/
__u32 *device_offload;
__u32 max_buckets;
__u32 *device_offload;
__s32 max_buckets;
__u32 max_rules;
__s32 max_devices;
};
/* common */
/* crush.c */
extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
extern void crush_calc_parents(struct crush_map *m);
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
extern void crush_destroy_bucket_list(struct crush_bucket_list *);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
extern void crush_calc_parents(struct crush_map *map);
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
extern void crush_destroy_bucket(struct crush_bucket *b);
extern void crush_destroy(struct crush_map *map);
#endif

View File

@ -1,4 +1,4 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
@ -7,9 +7,9 @@
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*
*/
#ifndef __CRUSH_GRAMMAR
@ -70,7 +70,7 @@ struct crush_grammar : public grammar<crush_grammar>
rule<ScannerT, parser_context<>, parser_tag<_crushrule> > crushrule;
rule<ScannerT, parser_context<>, parser_tag<_crushmap> > crushmap;
definition(crush_grammar const& /*self*/)
{
// base types
@ -86,21 +86,21 @@ struct crush_grammar : public grammar<crush_grammar>
>> !( ( str_p("offload") >> real_p ) |
( str_p("load") >> real_p ) |
str_p("down"));
// bucket types
bucket_type = str_p("type") >> posint >> name;
// buckets
bucket_id = str_p("id") >> negint;
bucket_alg = str_p("alg") >> ( str_p("uniform") |
str_p("list") |
str_p("tree") |
bucket_alg = str_p("alg") >> ( str_p("uniform") |
str_p("list") |
str_p("tree") |
str_p("straw") );
bucket_item = str_p("item") >> name
>> !( str_p("weight") >> real_p )
>> !( str_p("pos") >> posint );
bucket = name >> name >> '{' >> !bucket_id >> bucket_alg >> *bucket_item >> '}';
// rules
step_take = str_p("take") >> name;
step_choose = str_p("choose")
@ -112,11 +112,11 @@ struct crush_grammar : public grammar<crush_grammar>
>> integer
>> str_p("type") >> name;
step_emit = str_p("emit");
step = str_p("step") >> ( step_take |
step_choose |
step_chooseleaf |
step = str_p("step") >> ( step_take |
step_choose |
step_chooseleaf |
step_emit );
crushrule = str_p("rule") >> !name >> '{'
crushrule = str_p("rule") >> !name >> '{'
>> str_p("pool") >> posint
>> str_p("type") >> ( str_p("replicated") | str_p("raid4") )
>> str_p("min_size") >> posint
@ -127,8 +127,8 @@ struct crush_grammar : public grammar<crush_grammar>
// the whole crush map
crushmap = *(device | bucket_type) >> *bucket >> *crushrule;
}
rule<ScannerT, parser_context<>, parser_tag<_crushmap> > const&
rule<ScannerT, parser_context<>, parser_tag<_crushmap> > const&
start() const { return crushmap; }
};
};

View File

@ -5,15 +5,15 @@
// http://burtleburtle.net/bob/hash/evahash.html
// a, b = random bits, c = input and output
#define hashmix(a,b,c) \
a=a-b; a=a-c; a=a^(c>>13); \
b=b-c; b=b-a; b=b^(a<<8); \
c=c-a; c=c-b; c=c^(b>>13); \
a=a-b; a=a-c; a=a^(c>>12); \
b=b-c; b=b-a; b=b^(a<<16); \
c=c-a; c=c-b; c=c^(b>>5); \
a=a-b; a=a-c; a=a^(c>>3); \
b=b-c; b=b-a; b=b^(a<<10); \
c=c-a; c=c-b; c=c^(b>>15);
a=a-b; a=a-c; a=a^(c>>13); \
b=b-c; b=b-a; b=b^(a<<8); \
c=c-a; c=c-b; c=c^(b>>13); \
a=a-b; a=a-c; a=a^(c>>12); \
b=b-c; b=b-a; b=b^(a<<16); \
c=c-a; c=c-b; c=c^(b>>5); \
a=a-b; a=a-c; a=a^(c>>3); \
b=b-c; b=b-a; b=b^(a<<10); \
c=c-a; c=c-b; c=c^(b>>15);
#define crush_hash_seed 1315423911

View File

@ -15,10 +15,17 @@
#include "hash.h"
/**
* crush_find_rule - find a crush_rule id for a given pool, type, and size.
* @map: the crush_map
* @pool: the storage pool id (user defined)
* @type: storage pool type (user defined)
* @size: output set size
*/
int crush_find_rule(struct crush_map *map, int pool, int type, int size)
{
int i;
for (i = 0; i < map->max_rules; i++) {
if (map->rules[i] &&
map->rules[i]->mask.pool == pool &&
@ -31,34 +38,37 @@ int crush_find_rule(struct crush_map *map, int pool, int type, int size)
}
/** bucket choose methods **/
/*
* bucket choose methods
*
* For each bucket algorithm, we have a "choose" method that, given a
* crush input @x and replica position (usually, position in output set) @r,
* will produce an item in the bucket.
*/
/* uniform */
static int
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
int x, int r)
{
unsigned o, p, s;
o = crush_hash32_2(x, bucket->h.id) & 0xffff;
p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
s = (x + o + (r+1)*p) % bucket->h.size;
unsigned o = crush_hash32_2(x, bucket->h.id) & 0xffff;
unsigned p = bucket->primes[crush_hash32_2(bucket->h.id, x) %
bucket->h.size];
unsigned s = (x + o + (r+1)*p) % bucket->h.size;
/*printf("%d %d %d %d\n", x, o, r, p);*/
return bucket->h.items[s];
}
/* list */
static int
crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
static int bucket_list_choose(struct crush_bucket_list *bucket,
int x, int r)
{
int i;
__u64 w;
for (i=0; i<bucket->h.size; i++) {
w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id);
for (i = 0; i < bucket->h.size; i++) {
__u64 w = crush_hash32_4(x, bucket->h.items[i], r,
bucket->h.id);
w &= 0xffff;
/*printf("%d x %d item %d weight %d sum_weight %d r %lld",
/*printf("%d x %d item %d weight %d sum_weight %d r %lld",
i, x, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/
w *= bucket->sum_weights[i];
w = w >> 16;
@ -66,36 +76,38 @@ crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
if (w < bucket->item_weights[i])
return bucket->h.items[i];
}
BUG_ON(1);
return 0;
}
/* tree */
/*
 * Count the trailing zero bits of n; left() and right() use the
 * result as the height of node n.
 *
 * n must be non-zero: with n == 0 no set bit is ever found and the
 * loop does not terminate.
 */
static int height(int n)
{
	int h = 0;

	while ((n & 1) == 0) {
		h++;		/* exactly one level per bit shifted out */
		n = n >> 1;
	}
	return h;
}
/*
 * Node one step down and to the left of x: x - 2^(height(x) - 1).
 * Only meaningful when height(x) >= 1; for a height of 0 the shift
 * count would be negative.
 */
static int left(int x)
{
	const int step = 1 << (height(x) - 1);

	return x - step;
}
/*
 * Node one step down and to the right of x: x + 2^(height(x) - 1).
 * Only meaningful when height(x) >= 1; for a height of 0 the shift
 * count would be negative.
 */
static int right(int x)
{
	const int step = 1 << (height(x) - 1);

	return x + step;
}
/* Returns the low bit of x: 1 for an odd node number, 0 for an even one. */
static int terminal(int x)
{
	const int low_bit = x & 1;

	return low_bit;
}
static int
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
int x, int r)
{
int n, l;
__u32 w;
@ -109,8 +121,8 @@ crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
t = t >> 32;
/* left or right? */
/* descend to the left or right? */
l = left(n);
if (t < bucket->node_weights[l])
n = l;
@ -124,15 +136,15 @@ crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
/* straw */
static int
crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
static int bucket_straw_choose(struct crush_bucket_straw *bucket,
int x, int r)
{
int i;
int high = 0;
__u64 high_draw = 0;
__u64 draw;
for (i=0; i<bucket->h.size; i++) {
for (i = 0; i < bucket->h.size; i++) {
draw = crush_hash32_3(x, bucket->h.items[i], r);
draw &= 0xffff;
draw *= bucket->straws[i];
@ -141,34 +153,62 @@ crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
high_draw = draw;
}
}
return bucket->h.items[high];
}
/*
 * Dispatch to the choose method that matches the bucket's algorithm
 * and return the item it picks for input x and replica position r.
 * An unrecognized algorithm trips BUG_ON and, should that return,
 * falls back to the bucket's first item.
 */
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
	const int alg = in->alg;

	if (alg == CRUSH_BUCKET_UNIFORM)
		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
					     x, r);
	if (alg == CRUSH_BUCKET_LIST)
		return bucket_list_choose((struct crush_bucket_list *)in, x, r);
	if (alg == CRUSH_BUCKET_TREE)
		return bucket_tree_choose((struct crush_bucket_tree *)in, x, r);
	if (alg == CRUSH_BUCKET_STRAW)
		return bucket_straw_choose((struct crush_bucket_straw *)in,
					   x, r);

	/* no such algorithm */
	BUG_ON(1);
	return in->items[0];
}
/** crush proper **/
/*
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
*/
static int is_out(struct crush_map *map, int item, int x)
{
	/* an offload of 0 means nothing is shed, so skip the hash */
	if (map->device_offload[item]) {
		/* 0x10000 or more: always rejected */
		if (map->device_offload[item] >= 0x10000)
			return 1;
		/*
		 * partial offload: reject when the low 16 bits of a
		 * hash of (x, item) fall below the offload value
		 */
		else if ((crush_hash32_2(x, item) & 0xffff) <
			 map->device_offload[item])
			return 1;
	}
	return 0;
}
/*
* choose numrep distinct items of given type
/**
* crush_choose - choose numrep distinct items of given type
* @map: the crush_map
* @bucket: the bucket we are choosing an item from
* @x: crush input value
* @numrep: the number of items to choose
* @type: the type of item to choose
* @out: pointer to output vector
* @outpos: our position in that vector
* @firstn: true if choosing "first n" items, false if choosing "indep"
* @recurse_to_leaf: true if we want one device under each item of given type
* @out2: second output vector for leaf items (if @recurse_to_leaf)
*/
static int crush_choose(struct crush_map *map,
struct crush_bucket *bucket,
int x, int numrep, int type,
int *out, int outpos,
int *out, int outpos,
int firstn, int recurse_to_leaf,
int *out2)
{
@ -181,7 +221,7 @@ static int crush_choose(struct crush_map *map,
int item;
int itemtype;
int collide, reject;
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
@ -189,7 +229,7 @@ static int crush_choose(struct crush_map *map,
do {
retry_descent = 0;
in = bucket; /* initial bucket */
/* choose through intervening buckets */
flocal = 0;
do {
@ -197,65 +237,55 @@ static int crush_choose(struct crush_map *map,
r = rep;
if (in->alg == CRUSH_BUCKET_UNIFORM) {
/* be careful */
if (firstn || numrep >= in->size)
r += ftotal; /* r' = r + f_total */
if (firstn || numrep >= in->size)
/* r' = r + f_total */
r += ftotal;
else if (in->size % numrep == 0)
r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */
/* r'=r+(n+1)*f_local */
r += (numrep+1) * flocal;
else
r += numrep * flocal; /* r' = r + n*f_local */
/* r' = r + n*f_local */
r += numrep * flocal;
} else {
if (firstn)
r += ftotal; /* r' = r + f_total */
else
r += numrep * flocal; /* r' = r + n*f_local */
if (firstn)
/* r' = r + f_total */
r += ftotal;
else
/* r' = r + n*f_local */
r += numrep * flocal;
}
/* bucket choose */
switch (in->alg) {
case CRUSH_BUCKET_UNIFORM:
item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r);
break;
case CRUSH_BUCKET_LIST:
item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r);
break;
case CRUSH_BUCKET_TREE:
item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r);
break;
case CRUSH_BUCKET_STRAW:
item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r);
break;
default:
BUG_ON(1);
item = in->items[0];
}
item = crush_bucket_choose(in, x, r);
BUG_ON(item >= map->max_devices);
/* desired type? */
if (item < 0)
if (item < 0)
itemtype = map->buckets[-1-item]->type;
else
else
itemtype = 0;
/* keep going? */
if (itemtype != type) {
BUG_ON(item >= 0 || (-1-item) >= map->max_buckets);
BUG_ON(item >= 0 ||
(-1-item) >= map->max_buckets);
in = map->buckets[-1-item];
continue;
}
/* collision? */
collide = 0;
for (i=0; i<outpos; i++) {
for (i = 0; i < outpos; i++) {
if (out[i] == item) {
collide = 1;
break;
}
}
/* out? */
if (itemtype == 0)
if (itemtype == 0)
reject = is_out(map, item, x);
else
else
reject = 0;
if (recurse_to_leaf &&
@ -264,34 +294,46 @@ static int crush_choose(struct crush_map *map,
out2+outpos, 0,
firstn, 0, NULL))
reject = 1;
if (reject || collide) {
ftotal++;
flocal++;
if (collide && flocal < 3)
retry_bucket = 1; /* retry locally a few times */
if (collide && flocal < 3)
/* retry locally a few times */
retry_bucket = 1;
else if (ftotal < 10)
retry_descent = 1; /* then retry descent */
/* then retry descent */
retry_descent = 1;
else
skip_rep = 1; /* else give up */
/* else give up */
skip_rep = 1;
}
} while (retry_bucket);
} while (retry_descent);
if (skip_rep) continue;
out[outpos] = item;
outpos++;
}
return outpos;
}
/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
* @ruleno: the rule id
* @x: hash input
* @result: pointer to result vector
* @result_max: maximum result size
* @force: force initial replica choice; -1 for none
*/
int crush_do_rule(struct crush_map *map,
int ruleno, int x, int *result, int result_max,
int force) /* -1 for none */
int force)
{
int result_len;
int force_context[CRUSH_MAX_DEPTH];
@ -309,7 +351,7 @@ int crush_do_rule(struct crush_map *map,
int step;
int i,j;
int numrep;
BUG_ON(ruleno >= map->max_rules);
rule = map->rules[ruleno];
result_len = 0;
@ -340,7 +382,7 @@ int crush_do_rule(struct crush_map *map,
}
}
}
for (step = 0; step < rule->len; step++) {
switch (rule->steps[step].op) {
case CRUSH_RULE_TAKE:
@ -351,23 +393,23 @@ int crush_do_rule(struct crush_map *map,
}
wsize = 1;
break;
case CRUSH_RULE_CHOOSE_FIRSTN:
case CRUSH_RULE_CHOOSE_INDEP:
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
BUG_ON(wsize == 0);
/* reset output */
osize = 0;
recurse_to_leaf = rule->steps[step].op >=
CRUSH_RULE_CHOOSE_LEAF_FIRSTN;
for (i = 0; i < wsize; i++) {
/*
* see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
* basically, numrep <= 0 means relative to
* the provided result_max
* the provided result_max
*/
numrep = rule->steps[step].arg1;
if (numrep <= 0) {
@ -391,8 +433,11 @@ int crush_do_rule(struct crush_map *map,
}
osize += crush_choose(map,
map->buckets[-1-w[i]],
x, numrep, rule->steps[step].arg2,
o+osize, j, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN,
x, numrep,
rule->steps[step].arg2,
o+osize, j,
rule->steps[step].op ==
CRUSH_RULE_CHOOSE_FIRSTN,
recurse_to_leaf, c+osize);
}
@ -405,9 +450,9 @@ int crush_do_rule(struct crush_map *map,
o = w;
w = tmp;
wsize = osize;
break;
break;
case CRUSH_RULE_EMIT:
for (i=0; i<wsize && result_len < result_max; i++) {
result[result_len] = w[i];
@ -415,12 +460,12 @@ int crush_do_rule(struct crush_map *map,
}
wsize = 0;
break;
default:
BUG_ON(1);
}
}
return result_len;
}

View File

@ -3,6 +3,10 @@
#include "crush.h"
/*
* CRUSH functions for finding rules and then mapping an input to an
* output set.
*/
extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
extern int crush_do_rule(struct crush_map *map,
int ruleno,

View File

@ -19,12 +19,12 @@ int main()
int root;
int ruleno;
int r[10];
int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 };
struct crush_bucket *b;
struct crush_rule *rule;
struct crush_map *map = crush_create();
d = 0;
@ -59,7 +59,7 @@ int main()
}
for (i=0; i<100; i += 10)
printf("%2d : %d\n", i, o[i]);
printf("%2d : %d\n", i, o[i]);
return 0;
}