merged r1961:1988 from ceph/branches/sage/crush back into trunk

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1989 29311d96-e01e-0410-9327-a35deaab8ce9
This commit is contained in:
sageweil 2007-10-23 21:53:24 +00:00
parent 6820be6c24
commit dae8bec9a1
46 changed files with 1511 additions and 912 deletions

View File

@ -16,7 +16,7 @@
EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib
EXTRA_CFLAGS += -g
EXTRA_CFLAGS += -pg
EXTRA_CFLAGS += -O3
#EXTRA_CFLAGS += -O3
# base
CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
@ -145,22 +145,22 @@ mkmonmap: mkmonmap.cc common.o
extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmon: cmon.o mon.o msg/SimpleMessenger.o common.o
cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o
cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o
cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o
csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
@ -191,15 +191,15 @@ ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o
# fake*
fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
# mpi startup
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
@ -243,6 +243,14 @@ gprof-helper.so: test/gprof-helper.c
test_disk_bw: test/test_disk_bw.cc common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
# crush
crush/libcrush.o: force_look
cd crush ; make
force_look:
true
# bits
common.o: ${COMMON_OBJS}
${LDINC} $@ $^
@ -268,7 +276,6 @@ mon.o: ${MON_OBJS}
osbdb.o: ${OSBDB_OBJS}
${LDINC} $@ $^
# generic rules
%.so: %.cc
${CC} -shared -fPIC ${CFLAGS} $< -o $@

View File

@ -0,0 +1,543 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef __crush_CRUSH_H
#define __crush_CRUSH_H
#include <iostream>
#include <list>
#include <vector>
#include <set>
#include <map>
using std::set;
using std::map;
using std::vector;
using std::list;
#include <ext/hash_map>
#include <ext/hash_set>
using namespace __gnu_cxx;
#include "Bucket.h"
#include "include/buffer.h"
namespace crush {
// *** RULES ***
class RuleStep {
public:
int cmd;
vector<int> args;
RuleStep(int c) : cmd(c) {}
RuleStep(int c, int a) : cmd(c) {
args.push_back(a);
}
RuleStep(int c, int a, int b) : cmd(c) {
args.push_back(a);
args.push_back(b);
}
RuleStep(int o, int a, int b, int c) : cmd(o) {
args.push_back(a);
args.push_back(b);
args.push_back(c);
}
void _encode(bufferlist& bl) {
bl.append((char*)&cmd, sizeof(cmd));
::_encode(args, bl);
}
void _decode(bufferlist& bl, int& off) {
bl.copy(off, sizeof(cmd), (char*)&cmd);
off += sizeof(cmd);
::_decode(args, bl, off);
}
};
// Rule operations
const int CRUSH_RULE_TAKE = 0;
const int CRUSH_RULE_CHOOSE = 1; // first n by default
const int CRUSH_RULE_CHOOSE_FIRSTN = 1;
const int CRUSH_RULE_CHOOSE_INDEP = 2;
const int CRUSH_RULE_EMIT = 3;
class Rule {
public:
vector< RuleStep > steps;
void _encode(bufferlist& bl) {
int n = steps.size();
bl.append((char*)&n, sizeof(n));
for (int i=0; i<n; i++)
steps[i]._encode(bl);
}
void _decode(bufferlist& bl, int& off) {
steps.clear();
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
steps.push_back(RuleStep(0));
steps[i]._decode(bl, off);
}
}
};
// *** CRUSH ***
class Crush {
protected:
map<int, Bucket*> buckets;
int bucketno;
Hash h;
hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
public:
map<int, Rule> rules;
//map<int,int> collisions;
//map<int,int> bumps;
void _encode(bufferlist& bl) {
// buckets
int n = buckets.size();
bl.append((char*)&n, sizeof(n));
for (map<int, Bucket*>::const_iterator it = buckets.begin();
it != buckets.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second->_encode(bl);
}
bl.append((char*)&bucketno, sizeof(bucketno));
// hash
int s = h.get_seed();
bl.append((char*)&s, sizeof(s));
//::_encode(out, bl);
//::_encode(overload, bl);
// rules
n = rules.size();
bl.append((char*)&n, sizeof(n));
for(map<int, Rule>::iterator it = rules.begin();
it != rules.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second._encode(bl);
}
}
void _decode(bufferlist& bl, int& off) {
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int bid;
bl.copy(off, sizeof(bid), (char*)&bid);
off += sizeof(bid);
Bucket *b = decode_bucket(bl, off);
buckets[bid] = b;
}
bl.copy(off, sizeof(bucketno), (char*)&bucketno);
off += sizeof(bucketno);
int s;
bl.copy(off, sizeof(s), (char*)&s);
off += sizeof(s);
h.set_seed(s);
//::_decode(out, bl, off);
//::_decode(overload, bl, off);
// rules
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int r;
bl.copy(off, sizeof(r), (char*)&r);
off += sizeof(r);
rules[r]._decode(bl,off);
}
// index
build_parent_map();
}
void build_parent_map() {
parent_map.clear();
// index every bucket
for (map<int, Bucket*>::iterator bp = buckets.begin();
bp != buckets.end();
++bp) {
// index bucket items
vector<int> items;
bp->second->get_items(items);
for (vector<int>::iterator ip = items.begin();
ip != items.end();
++ip)
parent_map[*ip] = bp->first;
}
}
public:
Crush(int seed=123) : bucketno(-1), h(seed) {}
~Crush() {
// hose buckets
for (map<int, Bucket*>::iterator it = buckets.begin();
it != buckets.end();
it++) {
delete it->second;
}
}
int print(ostream& out, int root, int indent=0) {
for (int i=0; i<indent; i++) out << " ";
Bucket *b = buckets[root];
assert(b);
out << b->get_weight() << "\t" << b->get_id() << "\t";
for (int i=0; i<indent; i++) out << " ";
out << b->get_bucket_type() << ": ";
vector<int> items;
b->get_items(items);
if (buckets.count(items[0])) {
out << std::endl;
for (unsigned i=0; i<items.size(); i++)
print(out, items[i], indent+1);
} else {
out << "[";
for (unsigned i=0; i<items.size(); i++) {
if (i) out << " ";
out << items[i];
}
out << "]";
}
return 0;
}
int add_bucket( Bucket *b ) {
int n = bucketno;
bucketno--;
b->set_id(n);
buckets[n] = b;
return n;
}
void add_item(int parent, int item, float w, bool back=false) {
// add item
assert(!buckets[parent]->is_uniform());
Bucket *p = buckets[parent];
p->add_item(item, w, back);
// set item's parent
Bucket *n = buckets[item];
if (n)
n->set_parent(parent);
// update weights
while (buckets.count(p->get_parent())) {
int child = p->get_id();
p = buckets[p->get_parent()];
p->adjust_item_weight(child, w);
}
}
/*
this is a hack, fix me! weights should be consistent throughout hierarchy!
*/
void set_bucket_weight(int item, float w) {
Bucket *b = buckets[item];
float adj = w - b->get_weight();
while (buckets.count(b->get_parent())) {
Bucket *p = buckets[b->get_parent()];
p->adjust_item_weight(b->get_id(), adj);
b = p;
}
}
/*
* choose numrep distinct items of type type
*/
void choose(int x,
int numrep,
int type,
Bucket *inbucket,
vector<int>& outvec,
bool firstn,
set<int>& outset, map<int,float>& overloadmap,
bool forcefeed=false,
int forcefeedval=-1) {
int off = outvec.size();
// for each replica
for (int rep=0; rep<numrep; rep++) {
int outv = -1; // my result
// forcefeed?
if (forcefeed) {
forcefeed = false;
outvec.push_back(forcefeedval);
continue;
}
// keep trying until we get a non-out, non-colliding item
int ftotal = 0;
bool skip_rep = false;
while (1) {
// start with the input bucket
Bucket *in = inbucket;
// choose through intervening buckets
int flocal = 0;
bool retry_rep = false;
while (1) {
// r may be twiddled to (try to) avoid past collisions
int r = rep;
if (in->is_uniform()) {
// uniform bucket; be careful!
if (firstn || numrep >= in->get_size()) {
// uniform bucket is too small; just walk thru elements
r += ftotal; // r' = r + f_total (first n)
} else {
// make sure numrep is not a multple of bucket size
int add = numrep*flocal; // r' = r + n*f_local
if (in->get_size() % numrep == 0) {
add += add/in->get_size(); // shift seq once per pass through the bucket
}
r += add;
}
} else {
// mixed bucket; just make a distinct-ish r sequence
if (firstn)
r += ftotal; // r' = r + f_total
else
r += numrep * flocal; // r' = r + n*f_local
}
// choose
outv = in->choose_r(x, r, h);
// did we get the type we want?
int itemtype = 0; // 0 is terminal type
Bucket *newin = 0; // remember bucket we hit
if (in->is_uniform()) {
itemtype = ((UniformBucket*)in)->get_item_type();
} else {
if (buckets.count(outv)) { // another bucket
newin = buckets[outv];
itemtype = newin->get_type();
}
}
if (itemtype == type) { // this is what we want!
// collision?
bool collide = false;
for (int prep=0; prep<rep; prep++) {
if (outvec[off+prep] == outv) {
collide = true;
break;
}
}
// ok choice?
bool bad = false;
if (type == 0 && outset.count(outv))
bad = true;
if (overloadmap.count(outv)) {
float f = (float)(h(x, outv) % 1000) / 1000.0;
if (f > overloadmap[outv])
bad = true;
}
if (collide || bad) {
ftotal++;
flocal++;
if (collide && flocal < 3)
continue; // try locally a few times!
if (ftotal >= 10) {
// ok fine, just ignore dup. FIXME.
skip_rep = true;
break;
}
retry_rep = true;
}
break; // ok then!
}
// next
in = newin;
}
if (retry_rep) continue; // try again
break;
}
// skip this rep? (e.g. too many collisions, we give up)
if (skip_rep) continue;
// output this value
outvec.push_back(outv);
} // for rep
// double check!
if (0) {
for (unsigned i=1; i<outvec.size(); i++)
for (unsigned j=0; j<i; j++)
assert(outvec[i] != outvec[j]);
}
}
void do_rule(Rule& rule, int x, vector<int>& result,
set<int>& outset, map<int,float>& overloadmap,
int forcefeed=-1) {
//int numresult = 0;
result.clear();
// determine hierarchical context for forcefeed (if any)
list<int> force_stack;
if (forcefeed >= 0 && parent_map.count(forcefeed)) {
int t = forcefeed;
while (1) {
force_stack.push_front(t);
//cout << "push " << t << " onto force_stack" << std::endl;
if (parent_map.count(t) == 0) break; // reached root, presumably.
//cout << " " << t << " parent is " << parent_map[t] << std::endl;
t = parent_map[t];
}
}
// working vector
vector<int> w; // working variable
// go through each statement
for (vector<RuleStep>::iterator pc = rule.steps.begin();
pc != rule.steps.end();
pc++) {
// move input?
// do it
switch (pc->cmd) {
case CRUSH_RULE_TAKE:
{
const int arg = pc->args[0];
//cout << "take " << arg << std::endl;
if (!force_stack.empty()) {
assert(force_stack.front() == arg);
force_stack.pop_front();
}
w.clear();
w.push_back(arg);
}
break;
case CRUSH_RULE_CHOOSE_FIRSTN:
case CRUSH_RULE_CHOOSE_INDEP:
{
const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
const int numrep = pc->args[0];
const int type = pc->args[1];
//cout << "choose " << numrep << " of type " << type << std::endl;
assert(!w.empty());
// reset output
vector<int> out;
// forcefeeding?
bool forcing = false;
int forceval = -1;
if (!force_stack.empty()) {
forceval = force_stack.front();
force_stack.pop_front();
//cout << "priming out with " << forceval << std::endl;
forcing = true;
} else if (forcefeed >= 0 && type == 0) {
//cout << "forcing context-less " << forcefeed << std::endl;
forceval = forcefeed;
forcefeed = -1;
forcing = true;
}
// do each row independently
for (vector<int>::iterator i = w.begin();
i != w.end();
i++) {
assert(buckets.count(*i));
Bucket *b = buckets[*i];
choose(x, numrep, type, b, out, firstn,
outset, overloadmap,
forcing,
forceval);
forcing = false; // only once
} // for inrow
// put back into w
w.swap(out);
out.clear();
}
break;
case CRUSH_RULE_EMIT:
{
for (unsigned i=0; i<w.size(); i++)
result.push_back(w[i]);
//result[numresult++] = w[i];
w.clear();
}
break;
default:
assert(0);
}
}
}
};
}
#endif

View File

@ -0,0 +1,227 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef __CRUSH_WRAPPER_H
#define __CRUSH_WRAPPER_H
#include "crush.h"
#include "hash.h"
#include "mapper.h"
#include "builder.h"
#include "include/encodable.h"
#include <map>
#include <set>
class CrushWrapper {
public:
struct crush_map *map;
CrushWrapper() : map(0) {}
~CrushWrapper() {
if (map) crush_destroy(map);
}
void create() {
if (map) crush_destroy(map);
map = crush_create();
}
void finalize() {
assert(map);
crush_finalize(map);
}
void update_offload_map(std::set<int32_t>& out_osds,
std::map<int32_t,float>& overload_osds) {
for (int i=0; i<map->max_devices; i++) {
if (out_osds.count(i))
map->device_offload[i] = 0x10000;
else if (overload_osds.count(i))
map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse?
else
map->device_offload[i] = 0; // normal.
}
}
void do_rule(int rule, int x, vector<int>& out, int maxout, int forcefeed) {
int rawout[maxout];
int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed);
out.resize(numrep);
for (int i=0; i<numrep; i++)
out[i] = rawout[i];
}
void _encode(bufferlist &bl) {
::_encode_simple(map->max_buckets, bl);
::_encode_simple(map->max_rules, bl);
::_encode_simple(map->max_devices, bl);
// simple arrays
bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices);
// buckets
for (unsigned i=0; i<map->max_buckets; i++) {
__u32 type = 0;
if (map->buckets[i]) type = map->buckets[i]->bucket_type;
::_encode_simple(type, bl);
if (!type) continue;
::_encode_simple(map->buckets[i]->id, bl);
::_encode_simple(map->buckets[i]->type, bl);
::_encode_simple(map->buckets[i]->bucket_type, bl);
::_encode_simple(map->buckets[i]->weight, bl);
::_encode_simple(map->buckets[i]->size, bl);
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_encode_simple(map->buckets[i]->items[j], bl);
switch (map->buckets[i]->type) {
case CRUSH_BUCKET_UNIFORM:
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl);
::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl);
break;
case CRUSH_BUCKET_LIST:
for (unsigned j=0; j<map->buckets[i]->size; j++) {
::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl);
::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl);
}
break;
case CRUSH_BUCKET_TREE:
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl);
break;
case CRUSH_BUCKET_STRAW:
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl);
break;
}
}
// rules
for (unsigned i=0; i<map->max_rules; i++) {
__u32 yes = map->rules[i] ? 1:0;
::_encode_simple(yes, bl);
if (!yes) continue;
::_encode_simple(map->rules[i]->len, bl);
for (unsigned j=0; j<map->rules[i]->len; j++)
::_encode_simple(map->rules[i]->steps[j], bl);
}
}
void _decode(bufferlist::iterator &blp) {
create();
::_decode_simple(map->max_buckets, blp);
::_decode_simple(map->max_rules, blp);
::_decode_simple(map->max_devices, blp);
map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices);
blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload);
// buckets
map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets);
for (unsigned i=0; i<map->max_buckets; i++) {
__u32 type;
::_decode_simple(type, blp);
if (!type) {
map->buckets[i] = 0;
continue;
}
int size = 0;
switch (type) {
case CRUSH_BUCKET_UNIFORM:
size = sizeof(crush_bucket_uniform);
break;
case CRUSH_BUCKET_LIST:
size = sizeof(crush_bucket_list);
break;
case CRUSH_BUCKET_TREE:
size = sizeof(crush_bucket_tree);
break;
case CRUSH_BUCKET_STRAW:
size = sizeof(crush_bucket_straw);
break;
default:
assert(0);
}
map->buckets[i] = (crush_bucket*)malloc(size);
memset(map->buckets[i], 0, size);
::_decode_simple(map->buckets[i]->id, blp);
::_decode_simple(map->buckets[i]->type, blp);
::_decode_simple(map->buckets[i]->bucket_type, blp);
::_decode_simple(map->buckets[i]->weight, blp);
::_decode_simple(map->buckets[i]->size, blp);
map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size);
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_decode_simple(map->buckets[i]->items[j], blp);
switch (map->buckets[i]->type) {
case CRUSH_BUCKET_UNIFORM:
((crush_bucket_uniform*)map->buckets[i])->primes =
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp);
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp);
break;
case CRUSH_BUCKET_LIST:
((crush_bucket_list*)map->buckets[i])->item_weights =
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
((crush_bucket_list*)map->buckets[i])->sum_weights =
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<map->buckets[i]->size; j++) {
::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp);
::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp);
}
break;
case CRUSH_BUCKET_TREE:
((crush_bucket_tree*)map->buckets[i])->node_weights =
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp);
break;
case CRUSH_BUCKET_STRAW:
((crush_bucket_straw*)map->buckets[i])->straws =
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<map->buckets[i]->size; j++)
::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp);
break;
}
}
// rules
map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules);
for (unsigned i=0; i<map->max_rules; i++) {
__u32 yes;
::_decode_simple(yes, blp);
if (!yes) {
map->rules[i] = 0;
continue;
}
map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule));
memset(map->rules[i], 0, sizeof(crush_rule));
::_decode_simple(map->rules[i]->len, blp);
map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len);
for (unsigned j=0; j<map->rules[i]->len; j++)
::_decode_simple(map->rules[i]->steps[j], blp);
}
finalize();
}
};
#endif

View File

@ -1,11 +1,12 @@
CC = gcc
CFLAGS = -Wall
CFLAGS += -O3 -g
CFLAGS += -g
CFLAGS += -O3
LD = ld
RM = rm
all: depend builder.o libcrush.o
all: depend libcrush.o test
clean:
rm -f *.o libcrush.o
@ -13,9 +14,12 @@ clean:
%.o: %.c
${CC} ${CFLAGS} -c $< -o $@
libcrush.o: crush.o buckets.o
libcrush.o: builder.o crush.o mapper.o
$(LD) -i -o $@ $^
test: test.c libcrush.o
$(CC) ${CFLAGS} -lm $^ -o $@
.depend:
touch .depend

View File

@ -0,0 +1,6 @@
#include "crush.h"
#include "hash.h"
int

View File

@ -2,16 +2,16 @@
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "builder.h"
#include "hash.h"
struct crush_map *crush_new()
struct crush_map *crush_create()
{
struct crush_map *m;
m = malloc(sizeof(*m));
memset(&m, 0, sizeof(m));
memset(m, 0, sizeof(*m));
return m;
}
@ -26,73 +26,36 @@ void crush_finalize(struct crush_map *map)
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
for (i=0; i<map->buckets[b]->size; i++)
if (map->buckets[b]->items[i] > map->max_devices)
map->max_devices = map->buckets[b]->items[i];
if (map->buckets[b]->items[i] >= map->max_devices)
map->max_devices = map->buckets[b]->items[i] + 1;
}
/* allocate arrays */
map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
/* build reverse map */
/* build parent maps */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
for (i=0; i<map->buckets[b]->size; i++) {
c = map->buckets[b]->items[i];
BUG_ON(c >= map->max_devices);
if (c >= 0)
map->device_parents[c] = map->buckets[b]->id;
else
map->bucket_parents[-c] = map->buckets[b]->id;
map->bucket_parents[-1-c] = map->buckets[b]->id;
}
}
/* new device offload map? */
if (!map->device_offload) {
map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
}
}
/*
* deallocate
*/
void crush_destroy(struct crush_map *map)
{
int b;
/* buckets */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
switch (map->buckets[b]->type) {
case CRUSH_BUCKET_UNIFORM:
crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]);
break;
case CRUSH_BUCKET_LIST:
crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]);
break;
case CRUSH_BUCKET_TREE:
crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]);
break;
case CRUSH_BUCKET_STRAW:
crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]);
break;
}
}
free(map->buckets);
/* rules */
for (b=0; b<map->max_rules; b++) {
if (map->rules[b] == 0) continue;
if (map->rules[b]->steps)
free(map->rules[b]->steps);
free(map->rules[b]);
}
free(map->rules);
free(map->bucket_parents);
free(map->device_parents);
free(map->device_offload);
memset(map, 0, sizeof(*map));
}
@ -100,25 +63,26 @@ void crush_destroy(struct crush_map *map)
/** rules **/
int crush_add_rule(struct crush_map *map,
int ruleno,
struct crush_rule *rule)
{
int id;
/* find a rule id */
for (id=0; id < map->max_rules; id++)
if (map->rules[id] == 0) break;
if (id == map->max_rules) {
int oldsize;
if (ruleno < 0) {
for (ruleno=0; ruleno < map->max_rules; ruleno++)
if (map->rules[ruleno] == 0) break;
}
if (ruleno >= map->max_rules) {
/* expand array */
if (map->max_rules)
map->max_rules *= 2;
else
map->max_rules = 8;
oldsize = map->max_rules;
map->max_rules = ruleno+1;
map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0]));
memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
}
/* add it */
map->rules[id] = rule;
return id;
map->rules[ruleno] = rule;
return ruleno;
}
struct crush_rule *crush_make_rule()
@ -126,7 +90,7 @@ struct crush_rule *crush_make_rule()
struct crush_rule *rule;
rule = malloc(sizeof(struct crush_rule));
memset(&rule, 0, sizeof(rule));
memset(rule, 0, sizeof(*rule));
return rule;
}
@ -134,9 +98,9 @@ void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2)
{
rule->len++;
if (rule->steps)
rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
else
rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len);
else
rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
rule->steps[rule->len-1].op = op;
rule->steps[rule->len-1].arg1 = arg1;
rule->steps[rule->len-1].arg2 = arg2;
@ -149,35 +113,43 @@ int crush_add_bucket(struct crush_map *map,
struct crush_bucket *bucket)
{
int id;
int oldsize;
/* find a bucket id */
for (id=0; id < map->max_buckets; id++)
if (map->buckets[id] == 0) break;
if (id == map->max_buckets) {
/* expand array */
oldsize = map->max_buckets;
if (map->max_buckets)
map->max_buckets *= 2;
else
map->max_buckets = 8;
map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]));
memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));
}
/* add it */
bucket->id = id;
bucket->id = -1 - id;
map->buckets[id] = bucket;
return id;
return -1 - id;
}
void crush_make_uniform_bucket(struct crush_map *map,
struct crush_bucket_uniform *bucket,
int size,
int *items,
int item_weight)
/* uniform bucket */
struct crush_bucket_uniform *
crush_make_uniform_bucket(int type, int size,
int *items,
int item_weight)
{
int i, j, x;
struct crush_bucket_uniform *bucket;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.weight = size * item_weight;
@ -202,48 +174,135 @@ void crush_make_uniform_bucket(struct crush_map *map,
bucket->primes[i++] = x;
x += 2;
}
return bucket;
}
void crush_make_list_bucket(struct crush_map *map,
struct crush_bucket_list *bucket,
int size,
int *items,
int *weights)
/* list bucket */
struct crush_bucket_list*
crush_make_list_bucket(int type, int size,
int *items,
int *weights)
{
int i;
int w;
struct crush_bucket_list *bucket;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.bucket_type = CRUSH_BUCKET_LIST;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
bucket->item_weights = malloc(sizeof(__u32)*size);
bucket->sum_weights = malloc(sizeof(__u32)*size);
w = 0;
for (i=0; i<size; i++) {
for (i=size-1; i>=0; i--) {
bucket->h.items[i] = items[i];
bucket->item_weights[i] = weights[i];
w += weights[i];
bucket->sum_weights[i] = w;
/*printf("%d item %d weight %d sum %d\n",
i, items[i], weights[i], bucket->sum_weights[i]);*/
}
bucket->h.weight = w;
return bucket;
}
void crush_make_straw_bucket(struct crush_map *map,
struct crush_bucket_straw *bucket,
int size,
int *items,
int *weights)
/* tree bucket */
static int height(int n) {
int h = 0;
while ((n & 1) == 0) {
h++;
n = n >> 1;
}
return h;
}
static int on_right(int n, int h) {
return n & (1 << (h+1));
}
static int parent(int n)
{
int h = height(n);
if (on_right(n, h))
return n - (1<<h);
else
return n + (1<<h);
}
struct crush_bucket_tree*
crush_make_tree_bucket(int type, int size,
int *items, /* in leaf order */
int *weights)
{
struct crush_bucket_tree *bucket;
int depth;
int node;
int t, i, j;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.bucket_type = CRUSH_BUCKET_TREE;
bucket->h.type = type;
bucket->h.size = size;
/* calc tree depth */
depth = 1;
t = size - 1;
while (t) {
t = t >> 1;
depth++;
}
bucket->h.size = 1 << depth;
bucket->h.items = malloc(sizeof(__u32)*bucket->h.size);
bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size);
memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size);
memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size);
for (i=0; i<size; i++) {
node = ((i+1) << 1)-1;
bucket->h.items[node] = items[i];
bucket->node_weights[node] = weights[i];
bucket->h.weight += weights[i];
for (j=1; j<depth; j++) {
node = parent(node);
bucket->node_weights[node] += weights[i];
}
}
BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight);
return bucket;
}
/* straw bucket */
struct crush_bucket_straw *
crush_make_straw_bucket(int type,
int size,
int *items,
int *weights)
{
struct crush_bucket_straw *bucket;
int *reverse;
int i, j, k;
double straw, wbelow, lastw, wnext, pbelow;
int numleft;
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.bucket_type = CRUSH_BUCKET_STRAW;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
@ -255,20 +314,21 @@ void crush_make_straw_bucket(struct crush_map *map,
bucket->h.weight += weights[i];
}
/* reverse sort by weight */
/* reverse sort by weight (simple insertion sort) */
reverse = malloc(sizeof(int) * size);
reverse[0] = items[0];
reverse[0] = 0;
for (i=1; i<size; i++) {
for (j=0; j<i; j++) {
if (weights[i] < weights[reverse[j]]) {
/* insert here */
for (k=i; k>j; k--)
reverse[k] = reverse[k-1];
reverse[j] = items[i];
reverse[j] = i;
break;
}
}
if (j == i)
reverse[i] = items[i];
reverse[i] = i;
}
numleft = size;
@ -280,25 +340,36 @@ void crush_make_straw_bucket(struct crush_map *map,
while (i < size) {
/* set this item's straw */
bucket->straws[reverse[i]] = straw * 0x10000;
/*printf("item %d at %d weight %d straw %d (%lf)\n",
items[reverse[i]],
reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/
i++;
if (i == size) break;
/* same weight as previous? */
if (weights[reverse[i]] == weights[reverse[i-1]])
if (weights[reverse[i]] == weights[reverse[i-1]]) {
/*printf("same as previous\n");*/
continue;
}
/* adjust straw for next guy */
wbelow += (((double)weights[reverse[i-1]] / (double)0x10000) - lastw) * numleft;
numleft--;
wnext = numleft * ((double)(weights[reverse[i]] - weights[reverse[i-1]]) / (double)0x10000);
wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft;
for (j=i; j<size; j++)
if (weights[reverse[j]] == weights[reverse[i]])
numleft--;
else
break;
wnext = numleft * (weights[reverse[i]] - weights[reverse[i-1]]);
pbelow = wbelow / (wbelow + wnext);
/*printf("wbelow %lf wnext %lf pbelow %lf\n", wbelow, wnext, pbelow);*/
straw *= pow((double)1.0 / pbelow, (double)1.0 / numleft);
straw *= pow((double)1.0 / pbelow, (double)1.0 / (double)numleft);
lastw = weights[reverse[i-1]];
}
free(reverse);
return bucket;
}

View File

@ -0,0 +1,45 @@
#ifndef _CRUSH_BUILDER_H
#define _CRUSH_BUILDER_H
#include "crush.h"
#ifdef __cplusplus
extern "C" {
#endif
extern struct crush_map *crush_create();
extern void crush_finalize(struct crush_map *map);
/* rules */
extern struct crush_rule *crush_make_rule();
extern int crush_add_rule(struct crush_map *map,
int ruleno,
struct crush_rule *rule);
extern void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2);
/* buckets */
extern int crush_add_bucket(struct crush_map *map,
struct crush_bucket *bucket);
struct crush_bucket_uniform *
crush_make_uniform_bucket(int type, int size,
int *items,
int item_weight);
struct crush_bucket_list*
crush_make_list_bucket(int type, int size,
int *items,
int *weights);
struct crush_bucket_tree*
crush_make_tree_bucket(int type, int size,
int *items, /* in leaf order */
int *weights);
struct crush_bucket_straw *
crush_make_straw_bucket(int type, int size,
int *items,
int *weights);
#ifdef __cplusplus
}
#endif
#endif

81
trunk/ceph/crush/crush.c Normal file
View File

@ -0,0 +1,81 @@
#ifdef __KERNEL__
# define free(x) kfree(x)
#else
# include <stdlib.h>
#endif
#include "crush.h"
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
free(b->primes);
free(b->h.items);
free(b);
}
void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
free(b->item_weights);
free(b->sum_weights);
free(b->h.items);
free(b);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
free(b->node_weights);
free(b);
}
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
free(b->straws);
free(b->h.items);
free(b);
}
/*
* deallocate
*/
void crush_destroy(struct crush_map *map)
{
int b;
/* buckets */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
switch (map->buckets[b]->type) {
case CRUSH_BUCKET_UNIFORM:
crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]);
break;
case CRUSH_BUCKET_LIST:
crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]);
break;
case CRUSH_BUCKET_TREE:
crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]);
break;
case CRUSH_BUCKET_STRAW:
crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]);
break;
}
}
free(map->buckets);
/* rules */
for (b=0; b<map->max_rules; b++) {
if (map->rules[b] == 0) continue;
if (map->rules[b]->steps)
free(map->rules[b]->steps);
free(map->rules[b]);
}
free(map->rules);
free(map->bucket_parents);
free(map->device_parents);
free(map->device_offload);
free(map);
}

View File

@ -1,543 +1,117 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef _CRUSH_CRUSH_H
#define _CRUSH_CRUSH_H
#ifndef __crush_CRUSH_H
#define __crush_CRUSH_H
#ifdef __cplusplus
extern "C" {
#endif
#include <iostream>
#include <list>
#include <vector>
#include <set>
#include <map>
using std::set;
using std::map;
using std::vector;
using std::list;
#include <ext/hash_map>
#include <ext/hash_set>
using namespace __gnu_cxx;
#include <linux/types.h> /* just for int types */
#ifndef BUG_ON
# include <assert.h>
# define BUG_ON(x) assert(!(x))
#endif
#include "Bucket.h"
/*** RULES ***/
enum {
CRUSH_RULE_TAKE,
CRUSH_RULE_CHOOSE_FIRSTN,
CRUSH_RULE_CHOOSE_INDEP,
CRUSH_RULE_EMIT
};
#include "include/buffer.h"
#define CRUSH_MAX_DEPTH 10
#define CRUSH_MAX_SET 10
struct crush_rule_step {
__u32 op;
__s32 arg1;
__s32 arg2;
};
namespace crush {
// *** RULES ***
class RuleStep {
public:
int cmd;
vector<int> args;
RuleStep(int c) : cmd(c) {}
RuleStep(int c, int a) : cmd(c) {
args.push_back(a);
}
RuleStep(int c, int a, int b) : cmd(c) {
args.push_back(a);
args.push_back(b);
}
RuleStep(int o, int a, int b, int c) : cmd(o) {
args.push_back(a);
args.push_back(b);
args.push_back(c);
}
void _encode(bufferlist& bl) {
bl.append((char*)&cmd, sizeof(cmd));
::_encode(args, bl);
}
void _decode(bufferlist& bl, int& off) {
bl.copy(off, sizeof(cmd), (char*)&cmd);
off += sizeof(cmd);
::_decode(args, bl, off);
}
};
// Rule operations
const int CRUSH_RULE_TAKE = 0;
const int CRUSH_RULE_CHOOSE = 1; // first n by default
const int CRUSH_RULE_CHOOSE_FIRSTN = 1;
const int CRUSH_RULE_CHOOSE_INDEP = 2;
const int CRUSH_RULE_EMIT = 3;
class Rule {
public:
vector< RuleStep > steps;
void _encode(bufferlist& bl) {
int n = steps.size();
bl.append((char*)&n, sizeof(n));
for (int i=0; i<n; i++)
steps[i]._encode(bl);
}
void _decode(bufferlist& bl, int& off) {
steps.clear();
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
steps.push_back(RuleStep(0));
steps[i]._decode(bl, off);
}
}
};
struct crush_rule {
__u32 len;
struct crush_rule_step *steps;
};
/*** BUCKETS ***/
// *** CRUSH ***
enum {
CRUSH_BUCKET_UNIFORM = 1,
CRUSH_BUCKET_LIST = 2,
CRUSH_BUCKET_TREE = 3,
CRUSH_BUCKET_STRAW = 4
};
class Crush {
protected:
map<int, Bucket*> buckets;
int bucketno;
Hash h;
struct crush_bucket {
__s32 id; /* this'll be negative */
__u16 type;
__u16 bucket_type;
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
__s32 *items;
};
hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 *primes;
__u32 item_weight; /* 16-bit fixed point */
};
public:
map<int, Rule> rules;
struct crush_bucket_list {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
__u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
};
//map<int,int> collisions;
//map<int,int> bumps;
struct crush_bucket_tree {
struct crush_bucket h; /* note: h.size is tree size, not number of actual items */
__u32 *node_weights;
};
void _encode(bufferlist& bl) {
// buckets
int n = buckets.size();
bl.append((char*)&n, sizeof(n));
for (map<int, Bucket*>::const_iterator it = buckets.begin();
it != buckets.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second->_encode(bl);
}
bl.append((char*)&bucketno, sizeof(bucketno));
// hash
int s = h.get_seed();
bl.append((char*)&s, sizeof(s));
//::_encode(out, bl);
//::_encode(overload, bl);
// rules
n = rules.size();
bl.append((char*)&n, sizeof(n));
for(map<int, Rule>::iterator it = rules.begin();
it != rules.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second._encode(bl);
}
}
void _decode(bufferlist& bl, int& off) {
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int bid;
bl.copy(off, sizeof(bid), (char*)&bid);
off += sizeof(bid);
Bucket *b = decode_bucket(bl, off);
buckets[bid] = b;
}
bl.copy(off, sizeof(bucketno), (char*)&bucketno);
off += sizeof(bucketno);
int s;
bl.copy(off, sizeof(s), (char*)&s);
off += sizeof(s);
h.set_seed(s);
//::_decode(out, bl, off);
//::_decode(overload, bl, off);
// rules
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int r;
bl.copy(off, sizeof(r), (char*)&r);
off += sizeof(r);
rules[r]._decode(bl,off);
}
// index
build_parent_map();
}
void build_parent_map() {
parent_map.clear();
// index every bucket
for (map<int, Bucket*>::iterator bp = buckets.begin();
bp != buckets.end();
++bp) {
// index bucket items
vector<int> items;
bp->second->get_items(items);
for (vector<int>::iterator ip = items.begin();
ip != items.end();
++ip)
parent_map[*ip] = bp->first;
}
}
struct crush_bucket_straw {
struct crush_bucket h;
__u32 *straws; /* 16-bit fixed point */
};
public:
Crush(int seed=123) : bucketno(-1), h(seed) {}
~Crush() {
// hose buckets
for (map<int, Bucket*>::iterator it = buckets.begin();
it != buckets.end();
it++) {
delete it->second;
}
}
int print(ostream& out, int root, int indent=0) {
for (int i=0; i<indent; i++) out << " ";
Bucket *b = buckets[root];
assert(b);
out << b->get_weight() << "\t" << b->get_id() << "\t";
for (int i=0; i<indent; i++) out << " ";
out << b->get_bucket_type() << ": ";
/*** CRUSH ***/
vector<int> items;
b->get_items(items);
if (buckets.count(items[0])) {
out << std::endl;
for (unsigned i=0; i<items.size(); i++)
print(out, items[i], indent+1);
} else {
out << "[";
for (unsigned i=0; i<items.size(); i++) {
if (i) out << " ";
out << items[i];
}
out << "]";
}
return 0;
}
int add_bucket( Bucket *b ) {
int n = bucketno;
bucketno--;
b->set_id(n);
buckets[n] = b;
return n;
}
void add_item(int parent, int item, float w, bool back=false) {
// add item
assert(!buckets[parent]->is_uniform());
Bucket *p = buckets[parent];
p->add_item(item, w, back);
// set item's parent
Bucket *n = buckets[item];
if (n)
n->set_parent(parent);
// update weights
while (buckets.count(p->get_parent())) {
int child = p->get_id();
p = buckets[p->get_parent()];
p->adjust_item_weight(child, w);
}
}
/*
this is a hack, fix me! weights should be consistent throughout hierarchy!
*/
void set_bucket_weight(int item, float w) {
Bucket *b = buckets[item];
float adj = w - b->get_weight();
while (buckets.count(b->get_parent())) {
Bucket *p = buckets[b->get_parent()];
p->adjust_item_weight(b->get_id(), adj);
b = p;
}
}
/*
* choose numrep distinct items of type type
*/
void choose(int x,
int numrep,
int type,
Bucket *inbucket,
vector<int>& outvec,
bool firstn,
set<int>& outset, map<int,float>& overloadmap,
bool forcefeed=false,
int forcefeedval=-1) {
int off = outvec.size();
// for each replica
for (int rep=0; rep<numrep; rep++) {
int outv = -1; // my result
// forcefeed?
if (forcefeed) {
forcefeed = false;
outvec.push_back(forcefeedval);
continue;
}
struct crush_map {
struct crush_bucket **buckets;
struct crush_rule **rules;
// keep trying until we get a non-out, non-colliding item
int ftotal = 0;
bool skip_rep = false;
/* parent pointers */
__u32 *bucket_parents;
__u32 *device_parents;
while (1) {
// start with the input bucket
Bucket *in = inbucket;
// choose through intervening buckets
int flocal = 0;
bool retry_rep = false;
while (1) {
// r may be twiddled to (try to) avoid past collisions
int r = rep;
if (in->is_uniform()) {
// uniform bucket; be careful!
if (firstn || numrep >= in->get_size()) {
// uniform bucket is too small; just walk thru elements
r += ftotal; // r' = r + f_total (first n)
} else {
// make sure numrep is not a multple of bucket size
int add = numrep*flocal; // r' = r + n*f_local
if (in->get_size() % numrep == 0) {
add += add/in->get_size(); // shift seq once per pass through the bucket
}
r += add;
}
} else {
// mixed bucket; just make a distinct-ish r sequence
if (firstn)
r += ftotal; // r' = r + f_total
else
r += numrep * flocal; // r' = r + n*f_local
}
// choose
outv = in->choose_r(x, r, h);
// did we get the type we want?
int itemtype = 0; // 0 is terminal type
Bucket *newin = 0; // remember bucket we hit
if (in->is_uniform()) {
itemtype = ((UniformBucket*)in)->get_item_type();
} else {
if (buckets.count(outv)) { // another bucket
newin = buckets[outv];
itemtype = newin->get_type();
}
}
if (itemtype == type) { // this is what we want!
// collision?
bool collide = false;
for (int prep=0; prep<rep; prep++) {
if (outvec[off+prep] == outv) {
collide = true;
break;
}
}
// ok choice?
bool bad = false;
if (type == 0 && outset.count(outv))
bad = true;
if (overloadmap.count(outv)) {
float f = (float)(h(x, outv) % 1000) / 1000.0;
if (f > overloadmap[outv])
bad = true;
}
if (collide || bad) {
ftotal++;
flocal++;
if (collide && flocal < 3)
continue; // try locally a few times!
if (ftotal >= 10) {
// ok fine, just ignore dup. FIXME.
skip_rep = true;
break;
}
retry_rep = true;
}
break; // ok then!
}
// next
in = newin;
}
if (retry_rep) continue; // try again
break;
}
// skip this rep? (e.g. too many collisions, we give up)
if (skip_rep) continue;
// output this value
outvec.push_back(outv);
} // for rep
// double check!
if (0) {
for (unsigned i=1; i<outvec.size(); i++)
for (unsigned j=0; j<i; j++)
assert(outvec[i] != outvec[j]);
}
}
/* offload
* size max_devices, values 0...0xffff
* 0 == normal
* 0x10000 == 100% offload (i.e. failed)
*/
__u32 *device_offload;
__u32 max_buckets;
__u32 max_rules;
__s32 max_devices;
};
void do_rule(Rule& rule, int x, vector<int>& result,
set<int>& outset, map<int,float>& overloadmap,
int forcefeed=-1) {
//int numresult = 0;
result.clear();
// determine hierarchical context for forcefeed (if any)
list<int> force_stack;
if (forcefeed >= 0 && parent_map.count(forcefeed)) {
int t = forcefeed;
while (1) {
force_stack.push_front(t);
//cout << "push " << t << " onto force_stack" << std::endl;
if (parent_map.count(t) == 0) break; // reached root, presumably.
//cout << " " << t << " parent is " << parent_map[t] << std::endl;
t = parent_map[t];
}
}
// working vector
vector<int> w; // working variable
// go through each statement
for (vector<RuleStep>::iterator pc = rule.steps.begin();
pc != rule.steps.end();
pc++) {
// move input?
// do it
switch (pc->cmd) {
case CRUSH_RULE_TAKE:
{
const int arg = pc->args[0];
//cout << "take " << arg << std::endl;
if (!force_stack.empty()) {
assert(force_stack.front() == arg);
force_stack.pop_front();
}
w.clear();
w.push_back(arg);
}
break;
case CRUSH_RULE_CHOOSE_FIRSTN:
case CRUSH_RULE_CHOOSE_INDEP:
{
const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
const int numrep = pc->args[0];
const int type = pc->args[1];
//cout << "choose " << numrep << " of type " << type << std::endl;
assert(!w.empty());
// reset output
vector<int> out;
// forcefeeding?
bool forcing = false;
int forceval = -1;
if (!force_stack.empty()) {
forceval = force_stack.front();
force_stack.pop_front();
//cout << "priming out with " << forceval << std::endl;
forcing = true;
} else if (forcefeed >= 0 && type == 0) {
//cout << "forcing context-less " << forcefeed << std::endl;
forceval = forcefeed;
forcefeed = -1;
forcing = true;
}
// do each row independently
for (vector<int>::iterator i = w.begin();
i != w.end();
i++) {
assert(buckets.count(*i));
Bucket *b = buckets[*i];
choose(x, numrep, type, b, out, firstn,
outset, overloadmap,
forcing,
forceval);
forcing = false; // only once
} // for inrow
// put back into w
w.swap(out);
out.clear();
}
break;
case CRUSH_RULE_EMIT:
{
for (unsigned i=0; i<w.size(); i++)
result.push_back(w[i]);
//result[numresult++] = w[i];
w.clear();
}
break;
default:
assert(0);
}
}
}
/* common destructors */
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
extern void crush_destroy_bucket_list(struct crush_bucket_list *);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
extern void crush_destroy(struct crush_map *map);
};
#ifdef __cplusplus
}
#endif
#endif

View File

@ -2,6 +2,129 @@
#include "crush.h"
#include "hash.h"
#include <string.h>
#include <stdio.h>
/** bucket choose methods **/
/* uniform */
static int
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
{
unsigned o, p, s;
o = crush_hash32_2(x, bucket->h.id) & 0xffff;
p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
s = (x + o + (r+1)*p) % bucket->h.size;
/*printf("%d %d %d %d\n", x, o, r, p);*/
return bucket->h.items[s];
}
/* list */
static int
crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
{
int i;
__u64 w;
for (i=0; i<bucket->h.size; i++) {
w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id);
w &= 0xffff;
/*printf("%d item %d weight %d sum_weight %d r %lld",
i, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/
w *= bucket->sum_weights[i];
w = w >> 16;
/*printf(" scaled %lld\n", w);*/
if (w < bucket->item_weights[i])
return bucket->h.items[i];
}
BUG_ON(1);
return 0;
}
/* tree */
static int height(int n) {
int h = 0;
while ((n & 1) == 0) {
h++;
n = n >> 1;
}
return h;
}
static int left(int x) {
int h = height(x);
return x - (1 << (h-1));
}
static int right(int x) {
int h = height(x);
return x + (1 << (h-1));
}
static int terminal(int x) {
return x & 1;
}
static int
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
{
int n, l;
__u32 w;
__u64 t;
/* start at root */
n = bucket->h.size >> 1;
while (!terminal(n)) {
/* pick point in [0, w) */
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
t = t >> 32;
/* left or right? */
l = left(n);
if (t < bucket->node_weights[l])
n = l;
else
n = right(n);
}
return bucket->h.items[n];
}
/* straw */
static int
crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
{
int i;
int high = 0;
__u64 high_draw = 0;
__u64 draw;
for (i=0; i<bucket->h.size; i++) {
draw = crush_hash32_3(x, bucket->h.items[i], r);
draw &= 0xffff;
draw *= bucket->straws[i];
if (i == 0 || draw > high_draw) {
high = i;
high_draw = draw;
}
}
return bucket->h.items[high];
}
/** crush proper **/
/*
* choose numrep distinct items of given type
*/
@ -12,7 +135,7 @@ static int crush_choose(struct crush_map *map,
{
int rep;
int ftotal, flocal;
int retry_rep, skip_rep;
int retry_descent, retry_bucket, skip_rep;
struct crush_bucket *in = bucket;
int r;
int i;
@ -22,32 +145,28 @@ static int crush_choose(struct crush_map *map,
int collide, bad;
outpos = 0;
for (rep = 0; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
skip_rep = 0;
while (1) {
do {
retry_descent = 0;
in = bucket; /* initial bucket */
/* choose through intervening buckets */
flocal = 0;
retry_rep = 0;
while (1) {
do {
retry_bucket = 0;
r = rep;
if (in->type == CRUSH_BUCKET_UNIFORM) {
if (in->bucket_type == CRUSH_BUCKET_UNIFORM) {
/* be careful */
if (firstn || numrep >= in->size) {
if (firstn || numrep >= in->size)
r += ftotal; /* r' = r + f_total */
} else {
r += numrep * flocal; /* r' = r + n*f_local */
/* make sure numrep is not a multiple of bucket size */
if (in->size % numrep == 0)
/* shift seq once per pass through the bucket */
r += numrep * flocal / in->size;
}
else if (in->size % numrep == 0)
r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */
else
r += numrep * flocal; /* r' = r + n*f_local */
} else {
if (firstn)
r += ftotal; /* r' = r + f_total */
@ -56,7 +175,7 @@ static int crush_choose(struct crush_map *map,
}
/* bucket choose */
switch (in->type) {
switch (in->bucket_type) {
case CRUSH_BUCKET_UNIFORM:
item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r);
break;
@ -75,13 +194,14 @@ static int crush_choose(struct crush_map *map,
/* desired type? */
if (item < 0)
itemtype = map->buckets[-item]->type;
itemtype = map->buckets[-1-item]->type;
else
itemtype = 0;
/* keep going? */
if (itemtype != type) {
in = map->buckets[-item];
BUG_ON(item >= 0 || (-1-item) >= map->max_buckets);
in = map->buckets[-1-item];
continue;
}
@ -108,22 +228,17 @@ static int crush_choose(struct crush_map *map,
flocal++;
if (collide && flocal < 3)
continue; /* locally a few times */
if (ftotal >= 10) {
/* give up, ignore dup, fixme */
skip_rep = 1;
break;
}
retry_rep = 1;
retry_bucket = 1; /* retry locally a few times */
else if (ftotal < 10)
retry_descent = 1; /* then retry descent */
else
skip_rep = 1; /* else give up */
}
break;
}
if (retry_rep) continue;
}
} while (retry_bucket);
} while (retry_descent);
if (skip_rep) continue;
out[outpos] = item;
outpos++;
}
@ -159,12 +274,17 @@ int crush_do_rule(struct crush_map *map,
/* determine hierarchical context of forcefeed, if any */
if (forcefeed >= 0) {
if (map->device_parents[forcefeed] == 0) {
/*printf("CRUSH: forcefed device dne\n");*/
return -1; /* force fed device dne */
}
while (1) {
force_stack[++force_pos] = forcefeed;
/*printf("force_stack[%d] = %d\n", force_pos, forcefeed);*/
if (forcefeed >= 0)
forcefeed = map->device_parents[forcefeed];
else
forcefeed = map->bucket_parents[-forcefeed];
forcefeed = map->bucket_parents[-1-forcefeed];
if (forcefeed == 0) break;
}
}
@ -191,23 +311,22 @@ int crush_do_rule(struct crush_map *map,
for (i = 0; i < wsize; i++) {
numrep = rule->steps[step].arg1;
if (force_pos >= 0) {
o[osize++] = force_stack[force_pos];
force_pos--;
numrep--;
}
if (numrep)
crush_choose(map,
map->buckets[-w[i]],
x, numrep, rule->steps[step].arg2,
o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
if (!numrep) continue;
osize += crush_choose(map,
map->buckets[-1-w[i]],
x, numrep, rule->steps[step].arg2,
o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
}
/* swap t and w arrays */
tmp = o;
o = w;
w = o;
w = tmp;
wsize = osize;
break;
@ -229,3 +348,4 @@ int crush_do_rule(struct crush_map *map,
return result_len;
}

19
trunk/ceph/crush/mapper.h Normal file
View File

@ -0,0 +1,19 @@
#ifndef _CRUSH_MAPPER_H
#define _CRUSH_MAPPER_H
#ifdef __cplusplus
extern "C" {
#endif
#include "crush.h"
extern int crush_do_rule(struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
int forcefeed); /* -1 for none */
#ifdef __cplusplus
}
#endif
#endif

65
trunk/ceph/crush/test.c Normal file
View File

@ -0,0 +1,65 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "crush.h"
#include "mapper.h"
#include "builder.h"
int main()
{
int sub[10];
int subw[10];
int i, j;
int d;
int o[100];
int root;
int ruleno;
int r[10];
int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 };
struct crush_bucket *b;
struct crush_rule *rule;
struct crush_map *map = crush_create();
d = 0;
for (i=0; i<10; i++) {
for (j=0; j<10; j++)
o[j] = d++;
b = (struct crush_bucket*)crush_make_uniform_bucket(1, 10, o, uw[i]);
sub[i] = crush_add_bucket(map, b);
subw[i] = b->weight;
printf("make bucket %d weight %d\n", sub[i], subw[i]);
}
root = crush_add_bucket(map, (struct crush_bucket*)crush_make_tree_bucket(2, 10, sub, subw));
rule = crush_make_rule();
crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0);
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 3, 1);
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
ruleno = crush_add_rule(map, -1, rule);
crush_finalize(map);
printf("built\n");
/* test */
memset(o, 0, 100*sizeof(o[0]));
for (i=0; i<1000000; i++) {
crush_do_rule(map, ruleno, i, r, 3, -1);
/*printf("%d %d %d\n", r[0], r[1], r[2]);*/
for (j=0; j<3; j++)
o[r[j]]++;
}
for (i=0; i<100; i += 10)
printf("%2d : %d\n", i, o[i]);
return 0;
}

View File

@ -1,82 +0,0 @@
#include "hash.h"
#include "buckets.h"
int
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
{
unsigned o, p, s;
o = crush_hash32_2(x, bucket->h.id);
p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
s = (x + o + (r+1)*p) % bucket->h.size;
return bucket->h.items[s];
}
int
crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
{
int i;
__u64 w;
for (i=0; i<bucket->h.size; i++) {
w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id) & 0xffff;
w = (w * bucket->sum_weights[i]) >> 32;
if (w < bucket->item_weights[i])
return bucket->h.items[i];
}
BUG_ON(1);
return 0;
}
int
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
{
return 0;
}
int
crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
{
int i;
int high = 0;
unsigned high_draw = 0;
__u64 draw;
for (i=0; i<bucket->h.size; i++) {
draw = (crush_hash32_3(x, bucket->h.items[i], r) & 0xffff) * bucket->straws[i];
draw = draw >> 32;
if (i == 0 || draw > high_draw) {
high = i;
high_draw = draw;
}
}
return high;
}
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
free(b->primes);
free(b->h.items);
}
void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
free(b->item_weights);
free(b->sum_weights);
free(b->h.items);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
free(b->h.items);
}
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
free(b->straws);
free(b->h.items);
}

View File

@ -1,53 +0,0 @@
#ifndef _CRUSH_BUCKETS_H
#define _CRUSH_BUCKETS_H
#include "types.h"
enum {
CRUSH_BUCKET_UNIFORM = 1,
CRUSH_BUCKET_LIST = 2,
CRUSH_BUCKET_TREE = 3,
CRUSH_BUCKET_STRAW = 4
};
struct crush_bucket {
__u32 id;
__u32 type;
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
__s32 *items;
};
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 item_weight; /* 16-bit fixed point */
__u32 *primes;
};
struct crush_bucket_list {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
__u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
};
struct crush_bucket_tree {
struct crush_bucket h;
};
struct crush_bucket_straw {
struct crush_bucket h;
__u32 *straws; /* 16-bit fixed point */
};
extern int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r);
extern int crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r);
extern int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r);
extern int crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r);
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
extern void crush_destroy_bucket_list(struct crush_bucket_list *);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
#endif

View File

@ -1,55 +0,0 @@
#ifndef _CRUSH_CRUSH_H
#define _CRUSH_CRUSH_H
#include "types.h"
#include "buckets.h"
enum {
CRUSH_RULE_TAKE,
CRUSH_RULE_CHOOSE_FIRSTN,
CRUSH_RULE_CHOOSE_INDEP,
CRUSH_RULE_EMIT
};
#define CRUSH_MAX_DEPTH 10
#define CRUSH_MAX_SET 10
struct crush_rule_step {
__u32 op;
__s32 arg1;
__s32 arg2;
};
struct crush_rule {
__u32 len;
struct crush_rule_step *steps;
};
struct crush_map {
struct crush_bucket **buckets;
struct crush_rule **rules;
/* parent pointers */
__u32 *bucket_parents;
__u32 *device_parents;
/* offload
* size max_devices, values 0...0xffff
* 0 == normal
* 0x10000 == 100% offload (i.e. failed)
*/
__u32 *device_offload;
__u32 max_buckets;
__u32 max_rules;
__u32 max_devices;
};
extern int crush_do_rule(struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
int forcefeed); /* -1 for none */
/*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/
#endif

View File

@ -18,6 +18,8 @@
#include "MonitorStore.h"
#include "crush/CrushWrapper.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMap.h"
#include "messages/MOSDGetMap.h"
@ -158,49 +160,55 @@ void OSDMonitor::create_initial()
}
void OSDMonitor::build_crush_map(Crush& crush,
void OSDMonitor::build_crush_map(CrushWrapper& crush,
map<int,double>& weights)
{
// new
crush.create();
if (g_conf.num_osd >= 12) {
int ndom = g_conf.osd_max_rep;
UniformBucket *domain[ndom];
int domid[ndom];
for (int i=0; i<ndom; i++) {
domain[i] = new UniformBucket(1, 0);
domid[i] = crush.add_bucket(domain[i]);
}
// add osds
int ritems[ndom];
int rweights[ndom];
int nper = ((g_conf.num_osd - 1) / ndom) + 1;
derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
int i = 0;
for (int dom=0; dom<ndom; dom++) {
for (int j=0; j<nper; j++) {
domain[dom]->add_item(i, weights[i] ? weights[i]:1.0);
//derr(0) << "osd" << i << " in domain " << dom << dendl;
i++;
if (i == g_conf.num_osd) break;
int o = 0;
for (int i=0; i<ndom; i++) {
int items[nper];
//int w[nper];
int j;
rweights[i] = 0;
for (j=0; j<nper; j++, o++) {
if (o == g_conf.num_osd) break;
dout(20) << "added osd" << o << dendl;
items[j] = o;
//w[j] = weights[o] ? (0x10000 - (int)(weights[o] * 0x10000)):0x10000;
//rweights[i] += w[j];
rweights[i] += 0x10000;
}
crush_bucket_uniform *domain = crush_make_uniform_bucket(1, j, items, 0x10000);
ritems[i] = crush_add_bucket(crush.map, (crush_bucket*)domain);
dout(20) << "added domain bucket i " << ritems[i] << " of size " << j << dendl;
}
// root
Bucket *root = new ListBucket(2);
for (int i=0; i<ndom; i++) {
//derr(0) << "dom " << i << " w " << domain[i]->get_weight() << dendl;
root->add_item(domid[i], domain[i]->get_weight());
}
int nroot = crush.add_bucket(root);
crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights);
int rootid = crush_add_bucket(crush.map, (crush_bucket*)root);
// rules
// replication
for (int i=1; i<=ndom; i++) {
int r = CRUSH_REP_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
crush_rule *rule = crush_make_rule();
crush_rule_add_step(rule, CRUSH_RULE_TAKE, rootid, 0);
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
}
/*
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
@ -215,6 +223,7 @@ void OSDMonitor::build_crush_map(Crush& crush,
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
}
*/
// test
//vector<int> out;
@ -222,20 +231,24 @@ void OSDMonitor::build_crush_map(Crush& crush,
} else {
// one bucket
Bucket *b = new UniformBucket(1, 0);
int root = crush.add_bucket(b);
for (int i=0; i<g_conf.num_osd; i++) {
b->add_item(i, weights[i] ? weights[i]:1.0);
}
int items[g_conf.num_osd];
for (int i=0; i<g_conf.num_osd; i++)
items[i] = i;
crush_bucket_uniform *b = crush_make_uniform_bucket(1, g_conf.num_osd, items, 0x10000);
int root = crush_add_bucket(crush.map, (crush_bucket*)b);
// rules
// replication
for (int i=1; i<=g_conf.osd_max_rep; i++) {
int r = CRUSH_REP_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
crush_rule *rule = crush_make_rule();
crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0);
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
}
/*
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
@ -243,7 +256,12 @@ void OSDMonitor::build_crush_map(Crush& crush,
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
*/
}
crush.finalize();
dout(20) << "crush max_devices " << crush.map->max_devices << dendl;
//vector<int> t;
//crush.do_rule(2, 132, t, 4, -1);
}
@ -420,7 +438,7 @@ bool OSDMonitor::should_propose(double& delay)
if (pending_inc.new_up.size() == osdmap.get_osds().size()) {
delay = 0.0;
if (g_conf.osd_auto_weight) {
Crush crush;
CrushWrapper crush;
build_crush_map(crush, osd_weight);
crush._encode(pending_inc.crush);
}

View File

@ -43,7 +43,7 @@ private:
map<int,double> osd_weight;
void build_crush_map(Crush& crush,
void build_crush_map(CrushWrapper& crush,
map<int,double>& weights);
// svc

View File

@ -28,8 +28,7 @@
#include "common/Mutex.h"
#include "common/Clock.h"
#include "crush/crush.h"
using namespace crush;
#include "crush/CrushWrapper.h"
#include <vector>
#include <list>
@ -144,7 +143,7 @@ private:
map<int32_t,entity_inst_t> osd_inst;
public:
Crush crush; // hierarchical map
CrushWrapper crush; // hierarchical map
friend class OSDMonitor;
friend class MDS;
@ -207,9 +206,14 @@ private:
void mark_down(int o, bool clean) { down_osds[o] = clean; }
void mark_up(int o) { down_osds.erase(o); }
void mark_out(int o) { out_osds.insert(o); }
void mark_in(int o) { out_osds.erase(o); }
void mark_out(int o) {
out_osds.insert(o);
crush.update_offload_map(out_osds, overload_osds);
}
void mark_in(int o) {
out_osds.erase(o);
crush.update_offload_map(out_osds, overload_osds);
}
void apply_incremental(Incremental &inc) {
assert(inc.epoch == epoch+1);
@ -223,8 +227,8 @@ private:
return;
}
if (inc.crush.length()) {
int off = 0;
crush._decode(inc.crush, off);
bufferlist::iterator blp = inc.crush.begin();
crush._decode(blp);
}
// nope, incremental.
@ -272,6 +276,8 @@ private:
i++) {
overload_osds[i->first] = i->second;
}
crush.update_offload_map(out_osds, overload_osds);
}
// serialize, unserialize
@ -288,7 +294,9 @@ private:
::_encode(overload_osds, blist);
::_encode(osd_inst, blist);
crush._encode(blist);
bufferlist cbl;
crush._encode(cbl);
::_encode(cbl, blist);
}
void decode(bufferlist& blist) {
@ -306,7 +314,12 @@ private:
::_decode(overload_osds, blist, off);
::_decode(osd_inst, blist, off);
crush._decode(blist, off);
bufferlist cbl;
::_decode(cbl, blist, off);
bufferlist::iterator cblp = cbl.begin();
crush._decode(cblp);
crush.update_offload_map(out_osds, overload_osds);
}
@ -320,8 +333,6 @@ private:
}
ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
static crush::Hash H(777);
int num = preferred >= 0 ? localized_pg_num:pg_num;
int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask;
@ -334,14 +345,14 @@ private:
case CEPH_OBJECT_LAYOUT_HASHINO:
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask);
ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
break;
case CEPH_OBJECT_LAYOUT_HASH:
//ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask);
//ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask);
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask);
ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
break;
default:
@ -374,10 +385,9 @@ private:
if (pg.preferred() >= 0 &&
out_osds.count(pg.preferred()) == 0)
forcefeed = pg.preferred();
crush.do_rule(crush.rules[rule],
crush.do_rule(rule,
pg.ps(),
osds,
out_osds, overload_osds,
osds, pg.size(),
forcefeed);
}
break;
@ -389,8 +399,7 @@ private:
case CEPH_PG_LAYOUT_HYBRID:
{
static crush::Hash H(777);
int h = H(pg.ps());
int h = crush_hash32(pg.ps());
for (int i=0; i<pg.size(); i++)
osds.push_back( (h+i) % g_conf.num_osd );
}
@ -398,12 +407,11 @@ private:
case CEPH_PG_LAYOUT_HASH:
{
static crush::Hash H(777);
for (int i=0; i<pg.size(); i++) {
int t = 1;
int osd = 0;
while (t++) {
osd = H(i, pg.ps(), t) % g_conf.num_osd;
osd = crush_hash32_3(i, pg.ps(), t) % g_conf.num_osd;
int j = 0;
for (; j<i; j++)
if (osds[j] == osd) break;

View File

@ -31,6 +31,7 @@
using namespace std;
#include <ext/hash_map>
#include <ext/hash_set>
using namespace __gnu_cxx;

View File

@ -74,8 +74,8 @@ typedef uint8_t pruleset_t;
// crush rule ids
#define CRUSH_REP_RULE(nrep) (100+nrep) // replication
#define CRUSH_RAID_RULE(num) (200+num) // raid
#define CRUSH_REP_RULE(nrep) (nrep) // replication
#define CRUSH_RAID_RULE(num) (10+num) // raid