mirror of
https://github.com/ceph/ceph
synced 2025-03-11 02:39:05 +00:00
merged r1961:1988 from ceph/branches/sage/crush back into trunk
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1989 29311d96-e01e-0410-9327-a35deaab8ce9
This commit is contained in:
parent
6820be6c24
commit
dae8bec9a1
@ -16,7 +16,7 @@
|
||||
EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib
|
||||
EXTRA_CFLAGS += -g
|
||||
EXTRA_CFLAGS += -pg
|
||||
EXTRA_CFLAGS += -O3
|
||||
#EXTRA_CFLAGS += -O3
|
||||
|
||||
# base
|
||||
CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
|
||||
@ -145,22 +145,22 @@ mkmonmap: mkmonmap.cc common.o
|
||||
extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
cmon: cmon.o mon.o msg/SimpleMessenger.o common.o
|
||||
cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o
|
||||
cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o
|
||||
cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o
|
||||
csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
|
||||
cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
|
||||
|
||||
|
||||
@ -191,15 +191,15 @@ ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o
|
||||
|
||||
|
||||
# fake*
|
||||
fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
|
||||
fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
|
||||
|
||||
fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
|
||||
fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush/libcrush.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
|
||||
# mpi startup
|
||||
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
|
||||
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o
|
||||
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
|
||||
|
||||
|
||||
@ -243,6 +243,14 @@ gprof-helper.so: test/gprof-helper.c
|
||||
test_disk_bw: test/test_disk_bw.cc common.o
|
||||
${CC} ${CFLAGS} ${LIBS} $^ -o $@
|
||||
|
||||
# crush
|
||||
|
||||
crush/libcrush.o: force_look
|
||||
cd crush ; make
|
||||
|
||||
force_look:
|
||||
true
|
||||
|
||||
# bits
|
||||
common.o: ${COMMON_OBJS}
|
||||
${LDINC} $@ $^
|
||||
@ -268,7 +276,6 @@ mon.o: ${MON_OBJS}
|
||||
osbdb.o: ${OSBDB_OBJS}
|
||||
${LDINC} $@ $^
|
||||
|
||||
|
||||
# generic rules
|
||||
%.so: %.cc
|
||||
${CC} -shared -fPIC ${CFLAGS} $< -o $@
|
||||
|
543
trunk/ceph/crush.old/crush.h
Normal file
543
trunk/ceph/crush.old/crush.h
Normal file
@ -0,0 +1,543 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __crush_CRUSH_H
|
||||
#define __crush_CRUSH_H
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
using std::set;
|
||||
using std::map;
|
||||
using std::vector;
|
||||
using std::list;
|
||||
#include <ext/hash_map>
|
||||
#include <ext/hash_set>
|
||||
using namespace __gnu_cxx;
|
||||
|
||||
|
||||
#include "Bucket.h"
|
||||
|
||||
#include "include/buffer.h"
|
||||
|
||||
|
||||
namespace crush {
|
||||
|
||||
|
||||
// *** RULES ***
|
||||
|
||||
class RuleStep {
|
||||
public:
|
||||
int cmd;
|
||||
vector<int> args;
|
||||
|
||||
RuleStep(int c) : cmd(c) {}
|
||||
RuleStep(int c, int a) : cmd(c) {
|
||||
args.push_back(a);
|
||||
}
|
||||
RuleStep(int c, int a, int b) : cmd(c) {
|
||||
args.push_back(a);
|
||||
args.push_back(b);
|
||||
}
|
||||
RuleStep(int o, int a, int b, int c) : cmd(o) {
|
||||
args.push_back(a);
|
||||
args.push_back(b);
|
||||
args.push_back(c);
|
||||
}
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
bl.append((char*)&cmd, sizeof(cmd));
|
||||
::_encode(args, bl);
|
||||
}
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
bl.copy(off, sizeof(cmd), (char*)&cmd);
|
||||
off += sizeof(cmd);
|
||||
::_decode(args, bl, off);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Rule operations
|
||||
const int CRUSH_RULE_TAKE = 0;
|
||||
const int CRUSH_RULE_CHOOSE = 1; // first n by default
|
||||
const int CRUSH_RULE_CHOOSE_FIRSTN = 1;
|
||||
const int CRUSH_RULE_CHOOSE_INDEP = 2;
|
||||
const int CRUSH_RULE_EMIT = 3;
|
||||
|
||||
class Rule {
|
||||
public:
|
||||
vector< RuleStep > steps;
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
int n = steps.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for (int i=0; i<n; i++)
|
||||
steps[i]._encode(bl);
|
||||
}
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
steps.clear();
|
||||
int n;
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
steps.push_back(RuleStep(0));
|
||||
steps[i]._decode(bl, off);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
// *** CRUSH ***
|
||||
|
||||
class Crush {
|
||||
protected:
|
||||
map<int, Bucket*> buckets;
|
||||
int bucketno;
|
||||
Hash h;
|
||||
|
||||
hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
|
||||
|
||||
public:
|
||||
map<int, Rule> rules;
|
||||
|
||||
//map<int,int> collisions;
|
||||
//map<int,int> bumps;
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
// buckets
|
||||
int n = buckets.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for (map<int, Bucket*>::const_iterator it = buckets.begin();
|
||||
it != buckets.end();
|
||||
it++) {
|
||||
bl.append((char*)&it->first, sizeof(it->first));
|
||||
it->second->_encode(bl);
|
||||
}
|
||||
bl.append((char*)&bucketno, sizeof(bucketno));
|
||||
|
||||
// hash
|
||||
int s = h.get_seed();
|
||||
bl.append((char*)&s, sizeof(s));
|
||||
|
||||
//::_encode(out, bl);
|
||||
//::_encode(overload, bl);
|
||||
|
||||
// rules
|
||||
n = rules.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for(map<int, Rule>::iterator it = rules.begin();
|
||||
it != rules.end();
|
||||
it++) {
|
||||
bl.append((char*)&it->first, sizeof(it->first));
|
||||
it->second._encode(bl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
int n;
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
int bid;
|
||||
bl.copy(off, sizeof(bid), (char*)&bid);
|
||||
off += sizeof(bid);
|
||||
Bucket *b = decode_bucket(bl, off);
|
||||
buckets[bid] = b;
|
||||
}
|
||||
bl.copy(off, sizeof(bucketno), (char*)&bucketno);
|
||||
off += sizeof(bucketno);
|
||||
|
||||
int s;
|
||||
bl.copy(off, sizeof(s), (char*)&s);
|
||||
off += sizeof(s);
|
||||
h.set_seed(s);
|
||||
|
||||
//::_decode(out, bl, off);
|
||||
//::_decode(overload, bl, off);
|
||||
|
||||
// rules
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
int r;
|
||||
bl.copy(off, sizeof(r), (char*)&r);
|
||||
off += sizeof(r);
|
||||
rules[r]._decode(bl,off);
|
||||
}
|
||||
|
||||
// index
|
||||
build_parent_map();
|
||||
}
|
||||
|
||||
void build_parent_map() {
|
||||
parent_map.clear();
|
||||
|
||||
// index every bucket
|
||||
for (map<int, Bucket*>::iterator bp = buckets.begin();
|
||||
bp != buckets.end();
|
||||
++bp) {
|
||||
// index bucket items
|
||||
vector<int> items;
|
||||
bp->second->get_items(items);
|
||||
for (vector<int>::iterator ip = items.begin();
|
||||
ip != items.end();
|
||||
++ip)
|
||||
parent_map[*ip] = bp->first;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public:
|
||||
Crush(int seed=123) : bucketno(-1), h(seed) {}
|
||||
~Crush() {
|
||||
// hose buckets
|
||||
for (map<int, Bucket*>::iterator it = buckets.begin();
|
||||
it != buckets.end();
|
||||
it++) {
|
||||
delete it->second;
|
||||
}
|
||||
}
|
||||
|
||||
int print(ostream& out, int root, int indent=0) {
|
||||
for (int i=0; i<indent; i++) out << " ";
|
||||
Bucket *b = buckets[root];
|
||||
assert(b);
|
||||
out << b->get_weight() << "\t" << b->get_id() << "\t";
|
||||
for (int i=0; i<indent; i++) out << " ";
|
||||
out << b->get_bucket_type() << ": ";
|
||||
|
||||
vector<int> items;
|
||||
b->get_items(items);
|
||||
|
||||
if (buckets.count(items[0])) {
|
||||
out << std::endl;
|
||||
for (unsigned i=0; i<items.size(); i++)
|
||||
print(out, items[i], indent+1);
|
||||
} else {
|
||||
out << "[";
|
||||
for (unsigned i=0; i<items.size(); i++) {
|
||||
if (i) out << " ";
|
||||
out << items[i];
|
||||
}
|
||||
out << "]";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int add_bucket( Bucket *b ) {
|
||||
int n = bucketno;
|
||||
bucketno--;
|
||||
b->set_id(n);
|
||||
buckets[n] = b;
|
||||
return n;
|
||||
}
|
||||
|
||||
void add_item(int parent, int item, float w, bool back=false) {
|
||||
// add item
|
||||
assert(!buckets[parent]->is_uniform());
|
||||
Bucket *p = buckets[parent];
|
||||
|
||||
p->add_item(item, w, back);
|
||||
|
||||
// set item's parent
|
||||
Bucket *n = buckets[item];
|
||||
if (n)
|
||||
n->set_parent(parent);
|
||||
|
||||
// update weights
|
||||
while (buckets.count(p->get_parent())) {
|
||||
int child = p->get_id();
|
||||
p = buckets[p->get_parent()];
|
||||
p->adjust_item_weight(child, w);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this is a hack, fix me! weights should be consistent throughout hierarchy!
|
||||
|
||||
*/
|
||||
void set_bucket_weight(int item, float w) {
|
||||
Bucket *b = buckets[item];
|
||||
float adj = w - b->get_weight();
|
||||
|
||||
while (buckets.count(b->get_parent())) {
|
||||
Bucket *p = buckets[b->get_parent()];
|
||||
p->adjust_item_weight(b->get_id(), adj);
|
||||
b = p;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* choose numrep distinct items of type type
|
||||
*/
|
||||
void choose(int x,
|
||||
int numrep,
|
||||
int type,
|
||||
Bucket *inbucket,
|
||||
vector<int>& outvec,
|
||||
bool firstn,
|
||||
set<int>& outset, map<int,float>& overloadmap,
|
||||
bool forcefeed=false,
|
||||
int forcefeedval=-1) {
|
||||
int off = outvec.size();
|
||||
|
||||
// for each replica
|
||||
for (int rep=0; rep<numrep; rep++) {
|
||||
int outv = -1; // my result
|
||||
|
||||
// forcefeed?
|
||||
if (forcefeed) {
|
||||
forcefeed = false;
|
||||
outvec.push_back(forcefeedval);
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep trying until we get a non-out, non-colliding item
|
||||
int ftotal = 0;
|
||||
bool skip_rep = false;
|
||||
|
||||
while (1) {
|
||||
// start with the input bucket
|
||||
Bucket *in = inbucket;
|
||||
|
||||
// choose through intervening buckets
|
||||
int flocal = 0;
|
||||
bool retry_rep = false;
|
||||
|
||||
while (1) {
|
||||
// r may be twiddled to (try to) avoid past collisions
|
||||
int r = rep;
|
||||
if (in->is_uniform()) {
|
||||
// uniform bucket; be careful!
|
||||
if (firstn || numrep >= in->get_size()) {
|
||||
// uniform bucket is too small; just walk thru elements
|
||||
r += ftotal; // r' = r + f_total (first n)
|
||||
} else {
|
||||
// make sure numrep is not a multple of bucket size
|
||||
int add = numrep*flocal; // r' = r + n*f_local
|
||||
if (in->get_size() % numrep == 0) {
|
||||
add += add/in->get_size(); // shift seq once per pass through the bucket
|
||||
}
|
||||
r += add;
|
||||
}
|
||||
} else {
|
||||
// mixed bucket; just make a distinct-ish r sequence
|
||||
if (firstn)
|
||||
r += ftotal; // r' = r + f_total
|
||||
else
|
||||
r += numrep * flocal; // r' = r + n*f_local
|
||||
}
|
||||
|
||||
// choose
|
||||
outv = in->choose_r(x, r, h);
|
||||
|
||||
// did we get the type we want?
|
||||
int itemtype = 0; // 0 is terminal type
|
||||
Bucket *newin = 0; // remember bucket we hit
|
||||
if (in->is_uniform()) {
|
||||
itemtype = ((UniformBucket*)in)->get_item_type();
|
||||
} else {
|
||||
if (buckets.count(outv)) { // another bucket
|
||||
newin = buckets[outv];
|
||||
itemtype = newin->get_type();
|
||||
}
|
||||
}
|
||||
if (itemtype == type) { // this is what we want!
|
||||
// collision?
|
||||
bool collide = false;
|
||||
for (int prep=0; prep<rep; prep++) {
|
||||
if (outvec[off+prep] == outv) {
|
||||
collide = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ok choice?
|
||||
bool bad = false;
|
||||
if (type == 0 && outset.count(outv))
|
||||
bad = true;
|
||||
if (overloadmap.count(outv)) {
|
||||
float f = (float)(h(x, outv) % 1000) / 1000.0;
|
||||
if (f > overloadmap[outv])
|
||||
bad = true;
|
||||
}
|
||||
|
||||
if (collide || bad) {
|
||||
ftotal++;
|
||||
flocal++;
|
||||
|
||||
if (collide && flocal < 3)
|
||||
continue; // try locally a few times!
|
||||
|
||||
if (ftotal >= 10) {
|
||||
// ok fine, just ignore dup. FIXME.
|
||||
skip_rep = true;
|
||||
break;
|
||||
}
|
||||
|
||||
retry_rep = true;
|
||||
}
|
||||
|
||||
break; // ok then!
|
||||
}
|
||||
|
||||
// next
|
||||
in = newin;
|
||||
}
|
||||
|
||||
if (retry_rep) continue; // try again
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// skip this rep? (e.g. too many collisions, we give up)
|
||||
if (skip_rep) continue;
|
||||
|
||||
// output this value
|
||||
outvec.push_back(outv);
|
||||
} // for rep
|
||||
|
||||
// double check!
|
||||
if (0) {
|
||||
for (unsigned i=1; i<outvec.size(); i++)
|
||||
for (unsigned j=0; j<i; j++)
|
||||
assert(outvec[i] != outvec[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void do_rule(Rule& rule, int x, vector<int>& result,
|
||||
set<int>& outset, map<int,float>& overloadmap,
|
||||
int forcefeed=-1) {
|
||||
//int numresult = 0;
|
||||
result.clear();
|
||||
|
||||
// determine hierarchical context for forcefeed (if any)
|
||||
list<int> force_stack;
|
||||
if (forcefeed >= 0 && parent_map.count(forcefeed)) {
|
||||
int t = forcefeed;
|
||||
while (1) {
|
||||
force_stack.push_front(t);
|
||||
//cout << "push " << t << " onto force_stack" << std::endl;
|
||||
if (parent_map.count(t) == 0) break; // reached root, presumably.
|
||||
//cout << " " << t << " parent is " << parent_map[t] << std::endl;
|
||||
t = parent_map[t];
|
||||
}
|
||||
}
|
||||
|
||||
// working vector
|
||||
vector<int> w; // working variable
|
||||
|
||||
// go through each statement
|
||||
for (vector<RuleStep>::iterator pc = rule.steps.begin();
|
||||
pc != rule.steps.end();
|
||||
pc++) {
|
||||
// move input?
|
||||
|
||||
// do it
|
||||
switch (pc->cmd) {
|
||||
case CRUSH_RULE_TAKE:
|
||||
{
|
||||
const int arg = pc->args[0];
|
||||
//cout << "take " << arg << std::endl;
|
||||
|
||||
if (!force_stack.empty()) {
|
||||
assert(force_stack.front() == arg);
|
||||
force_stack.pop_front();
|
||||
}
|
||||
|
||||
w.clear();
|
||||
w.push_back(arg);
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_RULE_CHOOSE_FIRSTN:
|
||||
case CRUSH_RULE_CHOOSE_INDEP:
|
||||
{
|
||||
const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
|
||||
const int numrep = pc->args[0];
|
||||
const int type = pc->args[1];
|
||||
|
||||
//cout << "choose " << numrep << " of type " << type << std::endl;
|
||||
|
||||
assert(!w.empty());
|
||||
|
||||
// reset output
|
||||
vector<int> out;
|
||||
|
||||
// forcefeeding?
|
||||
bool forcing = false;
|
||||
int forceval = -1;
|
||||
if (!force_stack.empty()) {
|
||||
forceval = force_stack.front();
|
||||
force_stack.pop_front();
|
||||
//cout << "priming out with " << forceval << std::endl;
|
||||
forcing = true;
|
||||
} else if (forcefeed >= 0 && type == 0) {
|
||||
//cout << "forcing context-less " << forcefeed << std::endl;
|
||||
forceval = forcefeed;
|
||||
forcefeed = -1;
|
||||
forcing = true;
|
||||
}
|
||||
|
||||
// do each row independently
|
||||
for (vector<int>::iterator i = w.begin();
|
||||
i != w.end();
|
||||
i++) {
|
||||
assert(buckets.count(*i));
|
||||
Bucket *b = buckets[*i];
|
||||
choose(x, numrep, type, b, out, firstn,
|
||||
outset, overloadmap,
|
||||
forcing,
|
||||
forceval);
|
||||
forcing = false; // only once
|
||||
} // for inrow
|
||||
|
||||
// put back into w
|
||||
w.swap(out);
|
||||
out.clear();
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_RULE_EMIT:
|
||||
{
|
||||
for (unsigned i=0; i<w.size(); i++)
|
||||
result.push_back(w[i]);
|
||||
//result[numresult++] = w[i];
|
||||
w.clear();
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
227
trunk/ceph/crush/CrushWrapper.h
Normal file
227
trunk/ceph/crush/CrushWrapper.h
Normal file
@ -0,0 +1,227 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
|
||||
#ifndef __CRUSH_WRAPPER_H
|
||||
#define __CRUSH_WRAPPER_H
|
||||
|
||||
#include "crush.h"
|
||||
#include "hash.h"
|
||||
#include "mapper.h"
|
||||
#include "builder.h"
|
||||
|
||||
#include "include/encodable.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
class CrushWrapper {
|
||||
public:
|
||||
struct crush_map *map;
|
||||
|
||||
CrushWrapper() : map(0) {}
|
||||
~CrushWrapper() {
|
||||
if (map) crush_destroy(map);
|
||||
}
|
||||
|
||||
void create() {
|
||||
if (map) crush_destroy(map);
|
||||
map = crush_create();
|
||||
}
|
||||
void finalize() {
|
||||
assert(map);
|
||||
crush_finalize(map);
|
||||
}
|
||||
|
||||
void update_offload_map(std::set<int32_t>& out_osds,
|
||||
std::map<int32_t,float>& overload_osds) {
|
||||
for (int i=0; i<map->max_devices; i++) {
|
||||
if (out_osds.count(i))
|
||||
map->device_offload[i] = 0x10000;
|
||||
else if (overload_osds.count(i))
|
||||
map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse?
|
||||
else
|
||||
map->device_offload[i] = 0; // normal.
|
||||
}
|
||||
}
|
||||
|
||||
void do_rule(int rule, int x, vector<int>& out, int maxout, int forcefeed) {
|
||||
int rawout[maxout];
|
||||
|
||||
int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed);
|
||||
|
||||
out.resize(numrep);
|
||||
for (int i=0; i<numrep; i++)
|
||||
out[i] = rawout[i];
|
||||
}
|
||||
|
||||
void _encode(bufferlist &bl) {
|
||||
::_encode_simple(map->max_buckets, bl);
|
||||
::_encode_simple(map->max_rules, bl);
|
||||
::_encode_simple(map->max_devices, bl);
|
||||
|
||||
// simple arrays
|
||||
bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices);
|
||||
|
||||
// buckets
|
||||
for (unsigned i=0; i<map->max_buckets; i++) {
|
||||
__u32 type = 0;
|
||||
if (map->buckets[i]) type = map->buckets[i]->bucket_type;
|
||||
::_encode_simple(type, bl);
|
||||
if (!type) continue;
|
||||
|
||||
::_encode_simple(map->buckets[i]->id, bl);
|
||||
::_encode_simple(map->buckets[i]->type, bl);
|
||||
::_encode_simple(map->buckets[i]->bucket_type, bl);
|
||||
::_encode_simple(map->buckets[i]->weight, bl);
|
||||
::_encode_simple(map->buckets[i]->size, bl);
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_encode_simple(map->buckets[i]->items[j], bl);
|
||||
|
||||
switch (map->buckets[i]->type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl);
|
||||
::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl);
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_LIST:
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++) {
|
||||
::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl);
|
||||
::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl);
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_TREE:
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl);
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_STRAW:
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// rules
|
||||
for (unsigned i=0; i<map->max_rules; i++) {
|
||||
__u32 yes = map->rules[i] ? 1:0;
|
||||
::_encode_simple(yes, bl);
|
||||
if (!yes) continue;
|
||||
|
||||
::_encode_simple(map->rules[i]->len, bl);
|
||||
for (unsigned j=0; j<map->rules[i]->len; j++)
|
||||
::_encode_simple(map->rules[i]->steps[j], bl);
|
||||
}
|
||||
}
|
||||
|
||||
void _decode(bufferlist::iterator &blp) {
|
||||
create();
|
||||
::_decode_simple(map->max_buckets, blp);
|
||||
::_decode_simple(map->max_rules, blp);
|
||||
::_decode_simple(map->max_devices, blp);
|
||||
|
||||
map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices);
|
||||
blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload);
|
||||
|
||||
// buckets
|
||||
map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets);
|
||||
for (unsigned i=0; i<map->max_buckets; i++) {
|
||||
__u32 type;
|
||||
::_decode_simple(type, blp);
|
||||
if (!type) {
|
||||
map->buckets[i] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
int size = 0;
|
||||
switch (type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
size = sizeof(crush_bucket_uniform);
|
||||
break;
|
||||
case CRUSH_BUCKET_LIST:
|
||||
size = sizeof(crush_bucket_list);
|
||||
break;
|
||||
case CRUSH_BUCKET_TREE:
|
||||
size = sizeof(crush_bucket_tree);
|
||||
break;
|
||||
case CRUSH_BUCKET_STRAW:
|
||||
size = sizeof(crush_bucket_straw);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
map->buckets[i] = (crush_bucket*)malloc(size);
|
||||
memset(map->buckets[i], 0, size);
|
||||
|
||||
::_decode_simple(map->buckets[i]->id, blp);
|
||||
::_decode_simple(map->buckets[i]->type, blp);
|
||||
::_decode_simple(map->buckets[i]->bucket_type, blp);
|
||||
::_decode_simple(map->buckets[i]->weight, blp);
|
||||
::_decode_simple(map->buckets[i]->size, blp);
|
||||
|
||||
map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size);
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_decode_simple(map->buckets[i]->items[j], blp);
|
||||
|
||||
switch (map->buckets[i]->type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
((crush_bucket_uniform*)map->buckets[i])->primes =
|
||||
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp);
|
||||
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp);
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_LIST:
|
||||
((crush_bucket_list*)map->buckets[i])->item_weights =
|
||||
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
|
||||
((crush_bucket_list*)map->buckets[i])->sum_weights =
|
||||
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
|
||||
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++) {
|
||||
::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp);
|
||||
::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp);
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_TREE:
|
||||
((crush_bucket_tree*)map->buckets[i])->node_weights =
|
||||
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp);
|
||||
break;
|
||||
|
||||
case CRUSH_BUCKET_STRAW:
|
||||
((crush_bucket_straw*)map->buckets[i])->straws =
|
||||
(__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
|
||||
for (unsigned j=0; j<map->buckets[i]->size; j++)
|
||||
::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// rules
|
||||
map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules);
|
||||
for (unsigned i=0; i<map->max_rules; i++) {
|
||||
__u32 yes;
|
||||
::_decode_simple(yes, blp);
|
||||
if (!yes) {
|
||||
map->rules[i] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule));
|
||||
memset(map->rules[i], 0, sizeof(crush_rule));
|
||||
|
||||
::_decode_simple(map->rules[i]->len, blp);
|
||||
map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len);
|
||||
for (unsigned j=0; j<map->rules[i]->len; j++)
|
||||
::_decode_simple(map->rules[i]->steps[j], blp);
|
||||
}
|
||||
|
||||
finalize();
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
@ -1,11 +1,12 @@
|
||||
|
||||
CC = gcc
|
||||
CFLAGS = -Wall
|
||||
CFLAGS += -O3 -g
|
||||
CFLAGS += -g
|
||||
CFLAGS += -O3
|
||||
LD = ld
|
||||
RM = rm
|
||||
|
||||
all: depend builder.o libcrush.o
|
||||
all: depend libcrush.o test
|
||||
|
||||
clean:
|
||||
rm -f *.o libcrush.o
|
||||
@ -13,9 +14,12 @@ clean:
|
||||
%.o: %.c
|
||||
${CC} ${CFLAGS} -c $< -o $@
|
||||
|
||||
libcrush.o: crush.o buckets.o
|
||||
libcrush.o: builder.o crush.o mapper.o
|
||||
$(LD) -i -o $@ $^
|
||||
|
||||
test: test.c libcrush.o
|
||||
$(CC) ${CFLAGS} -lm $^ -o $@
|
||||
|
||||
.depend:
|
||||
touch .depend
|
||||
|
6
trunk/ceph/crush/buckets.c
Normal file
6
trunk/ceph/crush/buckets.c
Normal file
@ -0,0 +1,6 @@
|
||||
|
||||
#include "crush.h"
|
||||
#include "hash.h"
|
||||
|
||||
int
|
||||
|
@ -2,16 +2,16 @@
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "builder.h"
|
||||
#include "hash.h"
|
||||
|
||||
|
||||
struct crush_map *crush_new()
|
||||
struct crush_map *crush_create()
|
||||
{
|
||||
struct crush_map *m;
|
||||
m = malloc(sizeof(*m));
|
||||
memset(&m, 0, sizeof(m));
|
||||
memset(m, 0, sizeof(*m));
|
||||
return m;
|
||||
}
|
||||
|
||||
@ -26,73 +26,36 @@ void crush_finalize(struct crush_map *map)
|
||||
for (b=0; b<map->max_buckets; b++) {
|
||||
if (map->buckets[b] == 0) continue;
|
||||
for (i=0; i<map->buckets[b]->size; i++)
|
||||
if (map->buckets[b]->items[i] > map->max_devices)
|
||||
map->max_devices = map->buckets[b]->items[i];
|
||||
if (map->buckets[b]->items[i] >= map->max_devices)
|
||||
map->max_devices = map->buckets[b]->items[i] + 1;
|
||||
}
|
||||
|
||||
/* allocate arrays */
|
||||
map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
|
||||
map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
|
||||
memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
|
||||
memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
|
||||
map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
|
||||
memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
|
||||
|
||||
/* build reverse map */
|
||||
/* build parent maps */
|
||||
for (b=0; b<map->max_buckets; b++) {
|
||||
if (map->buckets[b] == 0) continue;
|
||||
for (i=0; i<map->buckets[b]->size; i++) {
|
||||
c = map->buckets[b]->items[i];
|
||||
BUG_ON(c >= map->max_devices);
|
||||
if (c >= 0)
|
||||
map->device_parents[c] = map->buckets[b]->id;
|
||||
else
|
||||
map->bucket_parents[-c] = map->buckets[b]->id;
|
||||
map->bucket_parents[-1-c] = map->buckets[b]->id;
|
||||
}
|
||||
}
|
||||
|
||||
/* new device offload map? */
|
||||
if (!map->device_offload) {
|
||||
map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
|
||||
memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* deallocate
|
||||
*/
|
||||
void crush_destroy(struct crush_map *map)
|
||||
{
|
||||
int b;
|
||||
|
||||
/* buckets */
|
||||
for (b=0; b<map->max_buckets; b++) {
|
||||
if (map->buckets[b] == 0) continue;
|
||||
switch (map->buckets[b]->type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_LIST:
|
||||
crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_TREE:
|
||||
crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_STRAW:
|
||||
crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(map->buckets);
|
||||
|
||||
/* rules */
|
||||
for (b=0; b<map->max_rules; b++) {
|
||||
if (map->rules[b] == 0) continue;
|
||||
if (map->rules[b]->steps)
|
||||
free(map->rules[b]->steps);
|
||||
free(map->rules[b]);
|
||||
}
|
||||
free(map->rules);
|
||||
|
||||
free(map->bucket_parents);
|
||||
free(map->device_parents);
|
||||
free(map->device_offload);
|
||||
|
||||
memset(map, 0, sizeof(*map));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -100,25 +63,26 @@ void crush_destroy(struct crush_map *map)
|
||||
/** rules **/
|
||||
|
||||
int crush_add_rule(struct crush_map *map,
|
||||
int ruleno,
|
||||
struct crush_rule *rule)
|
||||
{
|
||||
int id;
|
||||
|
||||
/* find a rule id */
|
||||
for (id=0; id < map->max_rules; id++)
|
||||
if (map->rules[id] == 0) break;
|
||||
if (id == map->max_rules) {
|
||||
int oldsize;
|
||||
|
||||
if (ruleno < 0) {
|
||||
for (ruleno=0; ruleno < map->max_rules; ruleno++)
|
||||
if (map->rules[ruleno] == 0) break;
|
||||
}
|
||||
if (ruleno >= map->max_rules) {
|
||||
/* expand array */
|
||||
if (map->max_rules)
|
||||
map->max_rules *= 2;
|
||||
else
|
||||
map->max_rules = 8;
|
||||
oldsize = map->max_rules;
|
||||
map->max_rules = ruleno+1;
|
||||
map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0]));
|
||||
memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
|
||||
}
|
||||
|
||||
/* add it */
|
||||
map->rules[id] = rule;
|
||||
return id;
|
||||
map->rules[ruleno] = rule;
|
||||
return ruleno;
|
||||
}
|
||||
|
||||
struct crush_rule *crush_make_rule()
|
||||
@ -126,7 +90,7 @@ struct crush_rule *crush_make_rule()
|
||||
struct crush_rule *rule;
|
||||
|
||||
rule = malloc(sizeof(struct crush_rule));
|
||||
memset(&rule, 0, sizeof(rule));
|
||||
memset(rule, 0, sizeof(*rule));
|
||||
return rule;
|
||||
}
|
||||
|
||||
@ -134,9 +98,9 @@ void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2)
|
||||
{
|
||||
rule->len++;
|
||||
if (rule->steps)
|
||||
rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
|
||||
else
|
||||
rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len);
|
||||
else
|
||||
rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
|
||||
rule->steps[rule->len-1].op = op;
|
||||
rule->steps[rule->len-1].arg1 = arg1;
|
||||
rule->steps[rule->len-1].arg2 = arg2;
|
||||
@ -149,35 +113,43 @@ int crush_add_bucket(struct crush_map *map,
|
||||
struct crush_bucket *bucket)
|
||||
{
|
||||
int id;
|
||||
int oldsize;
|
||||
|
||||
/* find a bucket id */
|
||||
for (id=0; id < map->max_buckets; id++)
|
||||
if (map->buckets[id] == 0) break;
|
||||
if (id == map->max_buckets) {
|
||||
/* expand array */
|
||||
oldsize = map->max_buckets;
|
||||
if (map->max_buckets)
|
||||
map->max_buckets *= 2;
|
||||
else
|
||||
map->max_buckets = 8;
|
||||
map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]));
|
||||
memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));
|
||||
}
|
||||
|
||||
/* add it */
|
||||
bucket->id = id;
|
||||
bucket->id = -1 - id;
|
||||
map->buckets[id] = bucket;
|
||||
return id;
|
||||
return -1 - id;
|
||||
}
|
||||
|
||||
|
||||
void crush_make_uniform_bucket(struct crush_map *map,
|
||||
struct crush_bucket_uniform *bucket,
|
||||
int size,
|
||||
int *items,
|
||||
int item_weight)
|
||||
/* uniform bucket */
|
||||
|
||||
struct crush_bucket_uniform *
|
||||
crush_make_uniform_bucket(int type, int size,
|
||||
int *items,
|
||||
int item_weight)
|
||||
{
|
||||
int i, j, x;
|
||||
struct crush_bucket_uniform *bucket;
|
||||
|
||||
bucket = malloc(sizeof(*bucket));
|
||||
memset(bucket, 0, sizeof(*bucket));
|
||||
bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM;
|
||||
bucket->h.type = type;
|
||||
bucket->h.size = size;
|
||||
bucket->h.weight = size * item_weight;
|
||||
|
||||
@ -202,48 +174,135 @@ void crush_make_uniform_bucket(struct crush_map *map,
|
||||
bucket->primes[i++] = x;
|
||||
x += 2;
|
||||
}
|
||||
|
||||
return bucket;
|
||||
}
|
||||
|
||||
void crush_make_list_bucket(struct crush_map *map,
|
||||
struct crush_bucket_list *bucket,
|
||||
int size,
|
||||
int *items,
|
||||
int *weights)
|
||||
|
||||
/* list bucket */
|
||||
|
||||
struct crush_bucket_list*
|
||||
crush_make_list_bucket(int type, int size,
|
||||
int *items,
|
||||
int *weights)
|
||||
{
|
||||
int i;
|
||||
int w;
|
||||
|
||||
struct crush_bucket_list *bucket;
|
||||
|
||||
bucket = malloc(sizeof(*bucket));
|
||||
memset(bucket, 0, sizeof(*bucket));
|
||||
bucket->h.bucket_type = CRUSH_BUCKET_LIST;
|
||||
bucket->h.type = type;
|
||||
bucket->h.size = size;
|
||||
|
||||
bucket->h.items = malloc(sizeof(__u32)*size);
|
||||
bucket->item_weights = malloc(sizeof(__u32)*size);
|
||||
bucket->sum_weights = malloc(sizeof(__u32)*size);
|
||||
w = 0;
|
||||
for (i=0; i<size; i++) {
|
||||
for (i=size-1; i>=0; i--) {
|
||||
bucket->h.items[i] = items[i];
|
||||
bucket->item_weights[i] = weights[i];
|
||||
w += weights[i];
|
||||
bucket->sum_weights[i] = w;
|
||||
/*printf("%d item %d weight %d sum %d\n",
|
||||
i, items[i], weights[i], bucket->sum_weights[i]);*/
|
||||
}
|
||||
|
||||
bucket->h.weight = w;
|
||||
|
||||
return bucket;
|
||||
}
|
||||
|
||||
|
||||
void crush_make_straw_bucket(struct crush_map *map,
|
||||
struct crush_bucket_straw *bucket,
|
||||
int size,
|
||||
int *items,
|
||||
int *weights)
|
||||
/* tree bucket */
|
||||
|
||||
static int height(int n) {
|
||||
int h = 0;
|
||||
while ((n & 1) == 0) {
|
||||
h++;
|
||||
n = n >> 1;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
static int on_right(int n, int h) {
|
||||
return n & (1 << (h+1));
|
||||
}
|
||||
static int parent(int n)
|
||||
{
|
||||
int h = height(n);
|
||||
if (on_right(n, h))
|
||||
return n - (1<<h);
|
||||
else
|
||||
return n + (1<<h);
|
||||
}
|
||||
|
||||
struct crush_bucket_tree*
|
||||
crush_make_tree_bucket(int type, int size,
|
||||
int *items, /* in leaf order */
|
||||
int *weights)
|
||||
{
|
||||
struct crush_bucket_tree *bucket;
|
||||
int depth;
|
||||
int node;
|
||||
int t, i, j;
|
||||
|
||||
bucket = malloc(sizeof(*bucket));
|
||||
memset(bucket, 0, sizeof(*bucket));
|
||||
bucket->h.bucket_type = CRUSH_BUCKET_TREE;
|
||||
bucket->h.type = type;
|
||||
bucket->h.size = size;
|
||||
|
||||
/* calc tree depth */
|
||||
depth = 1;
|
||||
t = size - 1;
|
||||
while (t) {
|
||||
t = t >> 1;
|
||||
depth++;
|
||||
}
|
||||
bucket->h.size = 1 << depth;
|
||||
|
||||
bucket->h.items = malloc(sizeof(__u32)*bucket->h.size);
|
||||
bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size);
|
||||
|
||||
memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size);
|
||||
memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size);
|
||||
|
||||
for (i=0; i<size; i++) {
|
||||
node = ((i+1) << 1)-1;
|
||||
bucket->h.items[node] = items[i];
|
||||
bucket->node_weights[node] = weights[i];
|
||||
bucket->h.weight += weights[i];
|
||||
for (j=1; j<depth; j++) {
|
||||
node = parent(node);
|
||||
bucket->node_weights[node] += weights[i];
|
||||
}
|
||||
}
|
||||
BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight);
|
||||
|
||||
return bucket;
|
||||
}
|
||||
|
||||
|
||||
/* straw bucket */
|
||||
|
||||
struct crush_bucket_straw *
|
||||
crush_make_straw_bucket(int type,
|
||||
int size,
|
||||
int *items,
|
||||
int *weights)
|
||||
{
|
||||
struct crush_bucket_straw *bucket;
|
||||
int *reverse;
|
||||
int i, j, k;
|
||||
|
||||
double straw, wbelow, lastw, wnext, pbelow;
|
||||
int numleft;
|
||||
|
||||
bucket = malloc(sizeof(*bucket));
|
||||
memset(bucket, 0, sizeof(*bucket));
|
||||
bucket->h.bucket_type = CRUSH_BUCKET_STRAW;
|
||||
bucket->h.type = type;
|
||||
bucket->h.size = size;
|
||||
|
||||
bucket->h.items = malloc(sizeof(__u32)*size);
|
||||
@ -255,20 +314,21 @@ void crush_make_straw_bucket(struct crush_map *map,
|
||||
bucket->h.weight += weights[i];
|
||||
}
|
||||
|
||||
/* reverse sort by weight */
|
||||
/* reverse sort by weight (simple insertion sort) */
|
||||
reverse = malloc(sizeof(int) * size);
|
||||
reverse[0] = items[0];
|
||||
reverse[0] = 0;
|
||||
for (i=1; i<size; i++) {
|
||||
for (j=0; j<i; j++) {
|
||||
if (weights[i] < weights[reverse[j]]) {
|
||||
/* insert here */
|
||||
for (k=i; k>j; k--)
|
||||
reverse[k] = reverse[k-1];
|
||||
reverse[j] = items[i];
|
||||
reverse[j] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == i)
|
||||
reverse[i] = items[i];
|
||||
reverse[i] = i;
|
||||
}
|
||||
|
||||
numleft = size;
|
||||
@ -280,25 +340,36 @@ void crush_make_straw_bucket(struct crush_map *map,
|
||||
while (i < size) {
|
||||
/* set this item's straw */
|
||||
bucket->straws[reverse[i]] = straw * 0x10000;
|
||||
/*printf("item %d at %d weight %d straw %d (%lf)\n",
|
||||
items[reverse[i]],
|
||||
reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/
|
||||
i++;
|
||||
if (i == size) break;
|
||||
|
||||
/* same weight as previous? */
|
||||
if (weights[reverse[i]] == weights[reverse[i-1]])
|
||||
if (weights[reverse[i]] == weights[reverse[i-1]]) {
|
||||
/*printf("same as previous\n");*/
|
||||
continue;
|
||||
}
|
||||
|
||||
/* adjust straw for next guy */
|
||||
wbelow += (((double)weights[reverse[i-1]] / (double)0x10000) - lastw) * numleft;
|
||||
numleft--;
|
||||
wnext = numleft * ((double)(weights[reverse[i]] - weights[reverse[i-1]]) / (double)0x10000);
|
||||
wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft;
|
||||
for (j=i; j<size; j++)
|
||||
if (weights[reverse[j]] == weights[reverse[i]])
|
||||
numleft--;
|
||||
else
|
||||
break;
|
||||
wnext = numleft * (weights[reverse[i]] - weights[reverse[i-1]]);
|
||||
pbelow = wbelow / (wbelow + wnext);
|
||||
/*printf("wbelow %lf wnext %lf pbelow %lf\n", wbelow, wnext, pbelow);*/
|
||||
|
||||
straw *= pow((double)1.0 / pbelow, (double)1.0 / numleft);
|
||||
straw *= pow((double)1.0 / pbelow, (double)1.0 / (double)numleft);
|
||||
|
||||
lastw = weights[reverse[i-1]];
|
||||
}
|
||||
|
||||
free(reverse);
|
||||
|
||||
return bucket;
|
||||
}
|
||||
|
45
trunk/ceph/crush/builder.h
Normal file
45
trunk/ceph/crush/builder.h
Normal file
@ -0,0 +1,45 @@
|
||||
#ifndef _CRUSH_BUILDER_H
|
||||
#define _CRUSH_BUILDER_H
|
||||
|
||||
#include "crush.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern struct crush_map *crush_create();
|
||||
extern void crush_finalize(struct crush_map *map);
|
||||
|
||||
/* rules */
|
||||
extern struct crush_rule *crush_make_rule();
|
||||
extern int crush_add_rule(struct crush_map *map,
|
||||
int ruleno,
|
||||
struct crush_rule *rule);
|
||||
extern void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2);
|
||||
|
||||
/* buckets */
|
||||
extern int crush_add_bucket(struct crush_map *map,
|
||||
struct crush_bucket *bucket);
|
||||
|
||||
struct crush_bucket_uniform *
|
||||
crush_make_uniform_bucket(int type, int size,
|
||||
int *items,
|
||||
int item_weight);
|
||||
struct crush_bucket_list*
|
||||
crush_make_list_bucket(int type, int size,
|
||||
int *items,
|
||||
int *weights);
|
||||
struct crush_bucket_tree*
|
||||
crush_make_tree_bucket(int type, int size,
|
||||
int *items, /* in leaf order */
|
||||
int *weights);
|
||||
struct crush_bucket_straw *
|
||||
crush_make_straw_bucket(int type, int size,
|
||||
int *items,
|
||||
int *weights);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
81
trunk/ceph/crush/crush.c
Normal file
81
trunk/ceph/crush/crush.c
Normal file
@ -0,0 +1,81 @@
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# define free(x) kfree(x)
|
||||
#else
|
||||
# include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#include "crush.h"
|
||||
|
||||
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
|
||||
{
|
||||
free(b->primes);
|
||||
free(b->h.items);
|
||||
free(b);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_list(struct crush_bucket_list *b)
|
||||
{
|
||||
free(b->item_weights);
|
||||
free(b->sum_weights);
|
||||
free(b->h.items);
|
||||
free(b);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
|
||||
{
|
||||
free(b->node_weights);
|
||||
free(b);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
|
||||
{
|
||||
free(b->straws);
|
||||
free(b->h.items);
|
||||
free(b);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* deallocate
|
||||
*/
|
||||
void crush_destroy(struct crush_map *map)
|
||||
{
|
||||
int b;
|
||||
|
||||
/* buckets */
|
||||
for (b=0; b<map->max_buckets; b++) {
|
||||
if (map->buckets[b] == 0) continue;
|
||||
switch (map->buckets[b]->type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_LIST:
|
||||
crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_TREE:
|
||||
crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]);
|
||||
break;
|
||||
case CRUSH_BUCKET_STRAW:
|
||||
crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(map->buckets);
|
||||
|
||||
/* rules */
|
||||
for (b=0; b<map->max_rules; b++) {
|
||||
if (map->rules[b] == 0) continue;
|
||||
if (map->rules[b]->steps)
|
||||
free(map->rules[b]->steps);
|
||||
free(map->rules[b]);
|
||||
}
|
||||
free(map->rules);
|
||||
|
||||
free(map->bucket_parents);
|
||||
free(map->device_parents);
|
||||
free(map->device_offload);
|
||||
free(map);
|
||||
}
|
||||
|
||||
|
@ -1,543 +1,117 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
#ifndef _CRUSH_CRUSH_H
|
||||
#define _CRUSH_CRUSH_H
|
||||
|
||||
#ifndef __crush_CRUSH_H
|
||||
#define __crush_CRUSH_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
using std::set;
|
||||
using std::map;
|
||||
using std::vector;
|
||||
using std::list;
|
||||
#include <ext/hash_map>
|
||||
#include <ext/hash_set>
|
||||
using namespace __gnu_cxx;
|
||||
#include <linux/types.h> /* just for int types */
|
||||
|
||||
#ifndef BUG_ON
|
||||
# include <assert.h>
|
||||
# define BUG_ON(x) assert(!(x))
|
||||
#endif
|
||||
|
||||
|
||||
#include "Bucket.h"
|
||||
/*** RULES ***/
|
||||
enum {
|
||||
CRUSH_RULE_TAKE,
|
||||
CRUSH_RULE_CHOOSE_FIRSTN,
|
||||
CRUSH_RULE_CHOOSE_INDEP,
|
||||
CRUSH_RULE_EMIT
|
||||
};
|
||||
|
||||
#include "include/buffer.h"
|
||||
#define CRUSH_MAX_DEPTH 10
|
||||
#define CRUSH_MAX_SET 10
|
||||
|
||||
struct crush_rule_step {
|
||||
__u32 op;
|
||||
__s32 arg1;
|
||||
__s32 arg2;
|
||||
};
|
||||
|
||||
namespace crush {
|
||||
|
||||
|
||||
// *** RULES ***
|
||||
|
||||
class RuleStep {
|
||||
public:
|
||||
int cmd;
|
||||
vector<int> args;
|
||||
|
||||
RuleStep(int c) : cmd(c) {}
|
||||
RuleStep(int c, int a) : cmd(c) {
|
||||
args.push_back(a);
|
||||
}
|
||||
RuleStep(int c, int a, int b) : cmd(c) {
|
||||
args.push_back(a);
|
||||
args.push_back(b);
|
||||
}
|
||||
RuleStep(int o, int a, int b, int c) : cmd(o) {
|
||||
args.push_back(a);
|
||||
args.push_back(b);
|
||||
args.push_back(c);
|
||||
}
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
bl.append((char*)&cmd, sizeof(cmd));
|
||||
::_encode(args, bl);
|
||||
}
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
bl.copy(off, sizeof(cmd), (char*)&cmd);
|
||||
off += sizeof(cmd);
|
||||
::_decode(args, bl, off);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Rule operations
|
||||
const int CRUSH_RULE_TAKE = 0;
|
||||
const int CRUSH_RULE_CHOOSE = 1; // first n by default
|
||||
const int CRUSH_RULE_CHOOSE_FIRSTN = 1;
|
||||
const int CRUSH_RULE_CHOOSE_INDEP = 2;
|
||||
const int CRUSH_RULE_EMIT = 3;
|
||||
|
||||
class Rule {
|
||||
public:
|
||||
vector< RuleStep > steps;
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
int n = steps.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for (int i=0; i<n; i++)
|
||||
steps[i]._encode(bl);
|
||||
}
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
steps.clear();
|
||||
int n;
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
steps.push_back(RuleStep(0));
|
||||
steps[i]._decode(bl, off);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct crush_rule {
|
||||
__u32 len;
|
||||
struct crush_rule_step *steps;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*** BUCKETS ***/
|
||||
|
||||
// *** CRUSH ***
|
||||
enum {
|
||||
CRUSH_BUCKET_UNIFORM = 1,
|
||||
CRUSH_BUCKET_LIST = 2,
|
||||
CRUSH_BUCKET_TREE = 3,
|
||||
CRUSH_BUCKET_STRAW = 4
|
||||
};
|
||||
|
||||
class Crush {
|
||||
protected:
|
||||
map<int, Bucket*> buckets;
|
||||
int bucketno;
|
||||
Hash h;
|
||||
struct crush_bucket {
|
||||
__s32 id; /* this'll be negative */
|
||||
__u16 type;
|
||||
__u16 bucket_type;
|
||||
__u32 weight; /* 16-bit fixed point */
|
||||
__u32 size; /* num items */
|
||||
__s32 *items;
|
||||
};
|
||||
|
||||
hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
|
||||
struct crush_bucket_uniform {
|
||||
struct crush_bucket h;
|
||||
__u32 *primes;
|
||||
__u32 item_weight; /* 16-bit fixed point */
|
||||
};
|
||||
|
||||
public:
|
||||
map<int, Rule> rules;
|
||||
struct crush_bucket_list {
|
||||
struct crush_bucket h;
|
||||
__u32 *item_weights; /* 16-bit fixed point */
|
||||
__u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
|
||||
};
|
||||
|
||||
//map<int,int> collisions;
|
||||
//map<int,int> bumps;
|
||||
struct crush_bucket_tree {
|
||||
struct crush_bucket h; /* note: h.size is tree size, not number of actual items */
|
||||
__u32 *node_weights;
|
||||
};
|
||||
|
||||
void _encode(bufferlist& bl) {
|
||||
// buckets
|
||||
int n = buckets.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for (map<int, Bucket*>::const_iterator it = buckets.begin();
|
||||
it != buckets.end();
|
||||
it++) {
|
||||
bl.append((char*)&it->first, sizeof(it->first));
|
||||
it->second->_encode(bl);
|
||||
}
|
||||
bl.append((char*)&bucketno, sizeof(bucketno));
|
||||
|
||||
// hash
|
||||
int s = h.get_seed();
|
||||
bl.append((char*)&s, sizeof(s));
|
||||
|
||||
//::_encode(out, bl);
|
||||
//::_encode(overload, bl);
|
||||
|
||||
// rules
|
||||
n = rules.size();
|
||||
bl.append((char*)&n, sizeof(n));
|
||||
for(map<int, Rule>::iterator it = rules.begin();
|
||||
it != rules.end();
|
||||
it++) {
|
||||
bl.append((char*)&it->first, sizeof(it->first));
|
||||
it->second._encode(bl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void _decode(bufferlist& bl, int& off) {
|
||||
int n;
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
int bid;
|
||||
bl.copy(off, sizeof(bid), (char*)&bid);
|
||||
off += sizeof(bid);
|
||||
Bucket *b = decode_bucket(bl, off);
|
||||
buckets[bid] = b;
|
||||
}
|
||||
bl.copy(off, sizeof(bucketno), (char*)&bucketno);
|
||||
off += sizeof(bucketno);
|
||||
|
||||
int s;
|
||||
bl.copy(off, sizeof(s), (char*)&s);
|
||||
off += sizeof(s);
|
||||
h.set_seed(s);
|
||||
|
||||
//::_decode(out, bl, off);
|
||||
//::_decode(overload, bl, off);
|
||||
|
||||
// rules
|
||||
bl.copy(off, sizeof(n), (char*)&n);
|
||||
off += sizeof(n);
|
||||
for (int i=0; i<n; i++) {
|
||||
int r;
|
||||
bl.copy(off, sizeof(r), (char*)&r);
|
||||
off += sizeof(r);
|
||||
rules[r]._decode(bl,off);
|
||||
}
|
||||
|
||||
// index
|
||||
build_parent_map();
|
||||
}
|
||||
|
||||
void build_parent_map() {
|
||||
parent_map.clear();
|
||||
|
||||
// index every bucket
|
||||
for (map<int, Bucket*>::iterator bp = buckets.begin();
|
||||
bp != buckets.end();
|
||||
++bp) {
|
||||
// index bucket items
|
||||
vector<int> items;
|
||||
bp->second->get_items(items);
|
||||
for (vector<int>::iterator ip = items.begin();
|
||||
ip != items.end();
|
||||
++ip)
|
||||
parent_map[*ip] = bp->first;
|
||||
}
|
||||
}
|
||||
|
||||
struct crush_bucket_straw {
|
||||
struct crush_bucket h;
|
||||
__u32 *straws; /* 16-bit fixed point */
|
||||
};
|
||||
|
||||
|
||||
public:
|
||||
Crush(int seed=123) : bucketno(-1), h(seed) {}
|
||||
~Crush() {
|
||||
// hose buckets
|
||||
for (map<int, Bucket*>::iterator it = buckets.begin();
|
||||
it != buckets.end();
|
||||
it++) {
|
||||
delete it->second;
|
||||
}
|
||||
}
|
||||
|
||||
int print(ostream& out, int root, int indent=0) {
|
||||
for (int i=0; i<indent; i++) out << " ";
|
||||
Bucket *b = buckets[root];
|
||||
assert(b);
|
||||
out << b->get_weight() << "\t" << b->get_id() << "\t";
|
||||
for (int i=0; i<indent; i++) out << " ";
|
||||
out << b->get_bucket_type() << ": ";
|
||||
/*** CRUSH ***/
|
||||
|
||||
vector<int> items;
|
||||
b->get_items(items);
|
||||
|
||||
if (buckets.count(items[0])) {
|
||||
out << std::endl;
|
||||
for (unsigned i=0; i<items.size(); i++)
|
||||
print(out, items[i], indent+1);
|
||||
} else {
|
||||
out << "[";
|
||||
for (unsigned i=0; i<items.size(); i++) {
|
||||
if (i) out << " ";
|
||||
out << items[i];
|
||||
}
|
||||
out << "]";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int add_bucket( Bucket *b ) {
|
||||
int n = bucketno;
|
||||
bucketno--;
|
||||
b->set_id(n);
|
||||
buckets[n] = b;
|
||||
return n;
|
||||
}
|
||||
|
||||
void add_item(int parent, int item, float w, bool back=false) {
|
||||
// add item
|
||||
assert(!buckets[parent]->is_uniform());
|
||||
Bucket *p = buckets[parent];
|
||||
|
||||
p->add_item(item, w, back);
|
||||
|
||||
// set item's parent
|
||||
Bucket *n = buckets[item];
|
||||
if (n)
|
||||
n->set_parent(parent);
|
||||
|
||||
// update weights
|
||||
while (buckets.count(p->get_parent())) {
|
||||
int child = p->get_id();
|
||||
p = buckets[p->get_parent()];
|
||||
p->adjust_item_weight(child, w);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this is a hack, fix me! weights should be consistent throughout hierarchy!
|
||||
|
||||
*/
|
||||
void set_bucket_weight(int item, float w) {
|
||||
Bucket *b = buckets[item];
|
||||
float adj = w - b->get_weight();
|
||||
|
||||
while (buckets.count(b->get_parent())) {
|
||||
Bucket *p = buckets[b->get_parent()];
|
||||
p->adjust_item_weight(b->get_id(), adj);
|
||||
b = p;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* choose numrep distinct items of type type
|
||||
*/
|
||||
void choose(int x,
|
||||
int numrep,
|
||||
int type,
|
||||
Bucket *inbucket,
|
||||
vector<int>& outvec,
|
||||
bool firstn,
|
||||
set<int>& outset, map<int,float>& overloadmap,
|
||||
bool forcefeed=false,
|
||||
int forcefeedval=-1) {
|
||||
int off = outvec.size();
|
||||
|
||||
// for each replica
|
||||
for (int rep=0; rep<numrep; rep++) {
|
||||
int outv = -1; // my result
|
||||
|
||||
// forcefeed?
|
||||
if (forcefeed) {
|
||||
forcefeed = false;
|
||||
outvec.push_back(forcefeedval);
|
||||
continue;
|
||||
}
|
||||
struct crush_map {
|
||||
struct crush_bucket **buckets;
|
||||
struct crush_rule **rules;
|
||||
|
||||
// keep trying until we get a non-out, non-colliding item
|
||||
int ftotal = 0;
|
||||
bool skip_rep = false;
|
||||
/* parent pointers */
|
||||
__u32 *bucket_parents;
|
||||
__u32 *device_parents;
|
||||
|
||||
while (1) {
|
||||
// start with the input bucket
|
||||
Bucket *in = inbucket;
|
||||
|
||||
// choose through intervening buckets
|
||||
int flocal = 0;
|
||||
bool retry_rep = false;
|
||||
|
||||
while (1) {
|
||||
// r may be twiddled to (try to) avoid past collisions
|
||||
int r = rep;
|
||||
if (in->is_uniform()) {
|
||||
// uniform bucket; be careful!
|
||||
if (firstn || numrep >= in->get_size()) {
|
||||
// uniform bucket is too small; just walk thru elements
|
||||
r += ftotal; // r' = r + f_total (first n)
|
||||
} else {
|
||||
// make sure numrep is not a multple of bucket size
|
||||
int add = numrep*flocal; // r' = r + n*f_local
|
||||
if (in->get_size() % numrep == 0) {
|
||||
add += add/in->get_size(); // shift seq once per pass through the bucket
|
||||
}
|
||||
r += add;
|
||||
}
|
||||
} else {
|
||||
// mixed bucket; just make a distinct-ish r sequence
|
||||
if (firstn)
|
||||
r += ftotal; // r' = r + f_total
|
||||
else
|
||||
r += numrep * flocal; // r' = r + n*f_local
|
||||
}
|
||||
|
||||
// choose
|
||||
outv = in->choose_r(x, r, h);
|
||||
|
||||
// did we get the type we want?
|
||||
int itemtype = 0; // 0 is terminal type
|
||||
Bucket *newin = 0; // remember bucket we hit
|
||||
if (in->is_uniform()) {
|
||||
itemtype = ((UniformBucket*)in)->get_item_type();
|
||||
} else {
|
||||
if (buckets.count(outv)) { // another bucket
|
||||
newin = buckets[outv];
|
||||
itemtype = newin->get_type();
|
||||
}
|
||||
}
|
||||
if (itemtype == type) { // this is what we want!
|
||||
// collision?
|
||||
bool collide = false;
|
||||
for (int prep=0; prep<rep; prep++) {
|
||||
if (outvec[off+prep] == outv) {
|
||||
collide = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ok choice?
|
||||
bool bad = false;
|
||||
if (type == 0 && outset.count(outv))
|
||||
bad = true;
|
||||
if (overloadmap.count(outv)) {
|
||||
float f = (float)(h(x, outv) % 1000) / 1000.0;
|
||||
if (f > overloadmap[outv])
|
||||
bad = true;
|
||||
}
|
||||
|
||||
if (collide || bad) {
|
||||
ftotal++;
|
||||
flocal++;
|
||||
|
||||
if (collide && flocal < 3)
|
||||
continue; // try locally a few times!
|
||||
|
||||
if (ftotal >= 10) {
|
||||
// ok fine, just ignore dup. FIXME.
|
||||
skip_rep = true;
|
||||
break;
|
||||
}
|
||||
|
||||
retry_rep = true;
|
||||
}
|
||||
|
||||
break; // ok then!
|
||||
}
|
||||
|
||||
// next
|
||||
in = newin;
|
||||
}
|
||||
|
||||
if (retry_rep) continue; // try again
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// skip this rep? (e.g. too many collisions, we give up)
|
||||
if (skip_rep) continue;
|
||||
|
||||
// output this value
|
||||
outvec.push_back(outv);
|
||||
} // for rep
|
||||
|
||||
// double check!
|
||||
if (0) {
|
||||
for (unsigned i=1; i<outvec.size(); i++)
|
||||
for (unsigned j=0; j<i; j++)
|
||||
assert(outvec[i] != outvec[j]);
|
||||
}
|
||||
}
|
||||
/* offload
|
||||
* size max_devices, values 0...0xffff
|
||||
* 0 == normal
|
||||
* 0x10000 == 100% offload (i.e. failed)
|
||||
*/
|
||||
__u32 *device_offload;
|
||||
|
||||
__u32 max_buckets;
|
||||
__u32 max_rules;
|
||||
__s32 max_devices;
|
||||
};
|
||||
|
||||
|
||||
void do_rule(Rule& rule, int x, vector<int>& result,
|
||||
set<int>& outset, map<int,float>& overloadmap,
|
||||
int forcefeed=-1) {
|
||||
//int numresult = 0;
|
||||
result.clear();
|
||||
|
||||
// determine hierarchical context for forcefeed (if any)
|
||||
list<int> force_stack;
|
||||
if (forcefeed >= 0 && parent_map.count(forcefeed)) {
|
||||
int t = forcefeed;
|
||||
while (1) {
|
||||
force_stack.push_front(t);
|
||||
//cout << "push " << t << " onto force_stack" << std::endl;
|
||||
if (parent_map.count(t) == 0) break; // reached root, presumably.
|
||||
//cout << " " << t << " parent is " << parent_map[t] << std::endl;
|
||||
t = parent_map[t];
|
||||
}
|
||||
}
|
||||
|
||||
// working vector
|
||||
vector<int> w; // working variable
|
||||
|
||||
// go through each statement
|
||||
for (vector<RuleStep>::iterator pc = rule.steps.begin();
|
||||
pc != rule.steps.end();
|
||||
pc++) {
|
||||
// move input?
|
||||
|
||||
// do it
|
||||
switch (pc->cmd) {
|
||||
case CRUSH_RULE_TAKE:
|
||||
{
|
||||
const int arg = pc->args[0];
|
||||
//cout << "take " << arg << std::endl;
|
||||
|
||||
if (!force_stack.empty()) {
|
||||
assert(force_stack.front() == arg);
|
||||
force_stack.pop_front();
|
||||
}
|
||||
|
||||
w.clear();
|
||||
w.push_back(arg);
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_RULE_CHOOSE_FIRSTN:
|
||||
case CRUSH_RULE_CHOOSE_INDEP:
|
||||
{
|
||||
const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
|
||||
const int numrep = pc->args[0];
|
||||
const int type = pc->args[1];
|
||||
|
||||
//cout << "choose " << numrep << " of type " << type << std::endl;
|
||||
|
||||
assert(!w.empty());
|
||||
|
||||
// reset output
|
||||
vector<int> out;
|
||||
|
||||
// forcefeeding?
|
||||
bool forcing = false;
|
||||
int forceval = -1;
|
||||
if (!force_stack.empty()) {
|
||||
forceval = force_stack.front();
|
||||
force_stack.pop_front();
|
||||
//cout << "priming out with " << forceval << std::endl;
|
||||
forcing = true;
|
||||
} else if (forcefeed >= 0 && type == 0) {
|
||||
//cout << "forcing context-less " << forcefeed << std::endl;
|
||||
forceval = forcefeed;
|
||||
forcefeed = -1;
|
||||
forcing = true;
|
||||
}
|
||||
|
||||
// do each row independently
|
||||
for (vector<int>::iterator i = w.begin();
|
||||
i != w.end();
|
||||
i++) {
|
||||
assert(buckets.count(*i));
|
||||
Bucket *b = buckets[*i];
|
||||
choose(x, numrep, type, b, out, firstn,
|
||||
outset, overloadmap,
|
||||
forcing,
|
||||
forceval);
|
||||
forcing = false; // only once
|
||||
} // for inrow
|
||||
|
||||
// put back into w
|
||||
w.swap(out);
|
||||
out.clear();
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_RULE_EMIT:
|
||||
{
|
||||
for (unsigned i=0; i<w.size(); i++)
|
||||
result.push_back(w[i]);
|
||||
//result[numresult++] = w[i];
|
||||
w.clear();
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
/* common destructors */
|
||||
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
|
||||
extern void crush_destroy_bucket_list(struct crush_bucket_list *);
|
||||
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
|
||||
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
|
||||
extern void crush_destroy(struct crush_map *map);
|
||||
|
||||
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -2,6 +2,129 @@
|
||||
#include "crush.h"
|
||||
#include "hash.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/** bucket choose methods **/
|
||||
|
||||
/* uniform */
|
||||
|
||||
static int
|
||||
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
|
||||
{
|
||||
unsigned o, p, s;
|
||||
o = crush_hash32_2(x, bucket->h.id) & 0xffff;
|
||||
p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
|
||||
s = (x + o + (r+1)*p) % bucket->h.size;
|
||||
/*printf("%d %d %d %d\n", x, o, r, p);*/
|
||||
return bucket->h.items[s];
|
||||
}
|
||||
|
||||
|
||||
/* list */
|
||||
|
||||
static int
|
||||
crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
|
||||
{
|
||||
int i;
|
||||
__u64 w;
|
||||
|
||||
for (i=0; i<bucket->h.size; i++) {
|
||||
w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id);
|
||||
w &= 0xffff;
|
||||
/*printf("%d item %d weight %d sum_weight %d r %lld",
|
||||
i, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/
|
||||
w *= bucket->sum_weights[i];
|
||||
w = w >> 16;
|
||||
/*printf(" scaled %lld\n", w);*/
|
||||
if (w < bucket->item_weights[i])
|
||||
return bucket->h.items[i];
|
||||
}
|
||||
|
||||
BUG_ON(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* tree */
|
||||
|
||||
static int height(int n) {
|
||||
int h = 0;
|
||||
while ((n & 1) == 0) {
|
||||
h++;
|
||||
n = n >> 1;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
static int left(int x) {
|
||||
int h = height(x);
|
||||
return x - (1 << (h-1));
|
||||
}
|
||||
static int right(int x) {
|
||||
int h = height(x);
|
||||
return x + (1 << (h-1));
|
||||
}
|
||||
static int terminal(int x) {
|
||||
return x & 1;
|
||||
}
|
||||
|
||||
static int
|
||||
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
|
||||
{
|
||||
int n, l;
|
||||
__u32 w;
|
||||
__u64 t;
|
||||
|
||||
/* start at root */
|
||||
n = bucket->h.size >> 1;
|
||||
|
||||
while (!terminal(n)) {
|
||||
/* pick point in [0, w) */
|
||||
w = bucket->node_weights[n];
|
||||
t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
|
||||
t = t >> 32;
|
||||
|
||||
/* left or right? */
|
||||
l = left(n);
|
||||
if (t < bucket->node_weights[l])
|
||||
n = l;
|
||||
else
|
||||
n = right(n);
|
||||
}
|
||||
|
||||
return bucket->h.items[n];
|
||||
}
|
||||
|
||||
|
||||
/* straw */
|
||||
|
||||
static int
|
||||
crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
|
||||
{
|
||||
int i;
|
||||
int high = 0;
|
||||
__u64 high_draw = 0;
|
||||
__u64 draw;
|
||||
|
||||
for (i=0; i<bucket->h.size; i++) {
|
||||
draw = crush_hash32_3(x, bucket->h.items[i], r);
|
||||
draw &= 0xffff;
|
||||
draw *= bucket->straws[i];
|
||||
if (i == 0 || draw > high_draw) {
|
||||
high = i;
|
||||
high_draw = draw;
|
||||
}
|
||||
}
|
||||
|
||||
return bucket->h.items[high];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/** crush proper **/
|
||||
|
||||
|
||||
/*
|
||||
* choose numrep distinct items of given type
|
||||
*/
|
||||
@ -12,7 +135,7 @@ static int crush_choose(struct crush_map *map,
|
||||
{
|
||||
int rep;
|
||||
int ftotal, flocal;
|
||||
int retry_rep, skip_rep;
|
||||
int retry_descent, retry_bucket, skip_rep;
|
||||
struct crush_bucket *in = bucket;
|
||||
int r;
|
||||
int i;
|
||||
@ -22,32 +145,28 @@ static int crush_choose(struct crush_map *map,
|
||||
int collide, bad;
|
||||
|
||||
outpos = 0;
|
||||
|
||||
|
||||
for (rep = 0; rep < numrep; rep++) {
|
||||
/* keep trying until we get a non-out, non-colliding item */
|
||||
ftotal = 0;
|
||||
skip_rep = 0;
|
||||
|
||||
while (1) {
|
||||
do {
|
||||
retry_descent = 0;
|
||||
in = bucket; /* initial bucket */
|
||||
|
||||
/* choose through intervening buckets */
|
||||
flocal = 0;
|
||||
retry_rep = 0;
|
||||
|
||||
while (1) {
|
||||
do {
|
||||
retry_bucket = 0;
|
||||
r = rep;
|
||||
if (in->type == CRUSH_BUCKET_UNIFORM) {
|
||||
if (in->bucket_type == CRUSH_BUCKET_UNIFORM) {
|
||||
/* be careful */
|
||||
if (firstn || numrep >= in->size) {
|
||||
if (firstn || numrep >= in->size)
|
||||
r += ftotal; /* r' = r + f_total */
|
||||
} else {
|
||||
r += numrep * flocal; /* r' = r + n*f_local */
|
||||
/* make sure numrep is not a multiple of bucket size */
|
||||
if (in->size % numrep == 0)
|
||||
/* shift seq once per pass through the bucket */
|
||||
r += numrep * flocal / in->size;
|
||||
}
|
||||
else if (in->size % numrep == 0)
|
||||
r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */
|
||||
else
|
||||
r += numrep * flocal; /* r' = r + n*f_local */
|
||||
} else {
|
||||
if (firstn)
|
||||
r += ftotal; /* r' = r + f_total */
|
||||
@ -56,7 +175,7 @@ static int crush_choose(struct crush_map *map,
|
||||
}
|
||||
|
||||
/* bucket choose */
|
||||
switch (in->type) {
|
||||
switch (in->bucket_type) {
|
||||
case CRUSH_BUCKET_UNIFORM:
|
||||
item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r);
|
||||
break;
|
||||
@ -75,13 +194,14 @@ static int crush_choose(struct crush_map *map,
|
||||
|
||||
/* desired type? */
|
||||
if (item < 0)
|
||||
itemtype = map->buckets[-item]->type;
|
||||
itemtype = map->buckets[-1-item]->type;
|
||||
else
|
||||
itemtype = 0;
|
||||
|
||||
/* keep going? */
|
||||
if (itemtype != type) {
|
||||
in = map->buckets[-item];
|
||||
BUG_ON(item >= 0 || (-1-item) >= map->max_buckets);
|
||||
in = map->buckets[-1-item];
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -108,22 +228,17 @@ static int crush_choose(struct crush_map *map,
|
||||
flocal++;
|
||||
|
||||
if (collide && flocal < 3)
|
||||
continue; /* locally a few times */
|
||||
if (ftotal >= 10) {
|
||||
/* give up, ignore dup, fixme */
|
||||
skip_rep = 1;
|
||||
break;
|
||||
}
|
||||
retry_rep = 1;
|
||||
retry_bucket = 1; /* retry locally a few times */
|
||||
else if (ftotal < 10)
|
||||
retry_descent = 1; /* then retry descent */
|
||||
else
|
||||
skip_rep = 1; /* else give up */
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (retry_rep) continue;
|
||||
}
|
||||
} while (retry_bucket);
|
||||
} while (retry_descent);
|
||||
|
||||
if (skip_rep) continue;
|
||||
|
||||
|
||||
out[outpos] = item;
|
||||
outpos++;
|
||||
}
|
||||
@ -159,12 +274,17 @@ int crush_do_rule(struct crush_map *map,
|
||||
|
||||
/* determine hierarchical context of forcefeed, if any */
|
||||
if (forcefeed >= 0) {
|
||||
if (map->device_parents[forcefeed] == 0) {
|
||||
/*printf("CRUSH: forcefed device dne\n");*/
|
||||
return -1; /* force fed device dne */
|
||||
}
|
||||
while (1) {
|
||||
force_stack[++force_pos] = forcefeed;
|
||||
/*printf("force_stack[%d] = %d\n", force_pos, forcefeed);*/
|
||||
if (forcefeed >= 0)
|
||||
forcefeed = map->device_parents[forcefeed];
|
||||
else
|
||||
forcefeed = map->bucket_parents[-forcefeed];
|
||||
forcefeed = map->bucket_parents[-1-forcefeed];
|
||||
if (forcefeed == 0) break;
|
||||
}
|
||||
}
|
||||
@ -191,23 +311,22 @@ int crush_do_rule(struct crush_map *map,
|
||||
|
||||
for (i = 0; i < wsize; i++) {
|
||||
numrep = rule->steps[step].arg1;
|
||||
|
||||
if (force_pos >= 0) {
|
||||
o[osize++] = force_stack[force_pos];
|
||||
force_pos--;
|
||||
numrep--;
|
||||
}
|
||||
if (numrep)
|
||||
crush_choose(map,
|
||||
map->buckets[-w[i]],
|
||||
x, numrep, rule->steps[step].arg2,
|
||||
o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
|
||||
if (!numrep) continue;
|
||||
osize += crush_choose(map,
|
||||
map->buckets[-1-w[i]],
|
||||
x, numrep, rule->steps[step].arg2,
|
||||
o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
|
||||
}
|
||||
|
||||
/* swap t and w arrays */
|
||||
tmp = o;
|
||||
o = w;
|
||||
w = o;
|
||||
w = tmp;
|
||||
wsize = osize;
|
||||
break;
|
||||
|
||||
@ -229,3 +348,4 @@ int crush_do_rule(struct crush_map *map,
|
||||
return result_len;
|
||||
}
|
||||
|
||||
|
19
trunk/ceph/crush/mapper.h
Normal file
19
trunk/ceph/crush/mapper.h
Normal file
@ -0,0 +1,19 @@
|
||||
#ifndef _CRUSH_MAPPER_H
|
||||
#define _CRUSH_MAPPER_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "crush.h"
|
||||
|
||||
extern int crush_do_rule(struct crush_map *map,
|
||||
int ruleno,
|
||||
int x, int *result, int result_max,
|
||||
int forcefeed); /* -1 for none */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
65
trunk/ceph/crush/test.c
Normal file
65
trunk/ceph/crush/test.c
Normal file
@ -0,0 +1,65 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "crush.h"
|
||||
#include "mapper.h"
|
||||
#include "builder.h"
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
int sub[10];
|
||||
int subw[10];
|
||||
int i, j;
|
||||
int d;
|
||||
int o[100];
|
||||
int root;
|
||||
int ruleno;
|
||||
int r[10];
|
||||
|
||||
int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 };
|
||||
|
||||
struct crush_bucket *b;
|
||||
struct crush_rule *rule;
|
||||
|
||||
struct crush_map *map = crush_create();
|
||||
|
||||
d = 0;
|
||||
for (i=0; i<10; i++) {
|
||||
for (j=0; j<10; j++)
|
||||
o[j] = d++;
|
||||
b = (struct crush_bucket*)crush_make_uniform_bucket(1, 10, o, uw[i]);
|
||||
sub[i] = crush_add_bucket(map, b);
|
||||
subw[i] = b->weight;
|
||||
printf("make bucket %d weight %d\n", sub[i], subw[i]);
|
||||
}
|
||||
|
||||
root = crush_add_bucket(map, (struct crush_bucket*)crush_make_tree_bucket(2, 10, sub, subw));
|
||||
|
||||
rule = crush_make_rule();
|
||||
crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 3, 1);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
|
||||
ruleno = crush_add_rule(map, -1, rule);
|
||||
|
||||
crush_finalize(map);
|
||||
printf("built\n");
|
||||
|
||||
/* test */
|
||||
memset(o, 0, 100*sizeof(o[0]));
|
||||
for (i=0; i<1000000; i++) {
|
||||
crush_do_rule(map, ruleno, i, r, 3, -1);
|
||||
/*printf("%d %d %d\n", r[0], r[1], r[2]);*/
|
||||
for (j=0; j<3; j++)
|
||||
o[r[j]]++;
|
||||
}
|
||||
|
||||
for (i=0; i<100; i += 10)
|
||||
printf("%2d : %d\n", i, o[i]);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,82 +0,0 @@
|
||||
|
||||
#include "hash.h"
|
||||
#include "buckets.h"
|
||||
|
||||
int
|
||||
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
|
||||
{
|
||||
unsigned o, p, s;
|
||||
o = crush_hash32_2(x, bucket->h.id);
|
||||
p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
|
||||
s = (x + o + (r+1)*p) % bucket->h.size;
|
||||
return bucket->h.items[s];
|
||||
}
|
||||
|
||||
int
|
||||
crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
|
||||
{
|
||||
int i;
|
||||
__u64 w;
|
||||
|
||||
for (i=0; i<bucket->h.size; i++) {
|
||||
w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id) & 0xffff;
|
||||
w = (w * bucket->sum_weights[i]) >> 32;
|
||||
if (w < bucket->item_weights[i])
|
||||
return bucket->h.items[i];
|
||||
}
|
||||
|
||||
BUG_ON(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
|
||||
{
|
||||
int i;
|
||||
int high = 0;
|
||||
unsigned high_draw = 0;
|
||||
__u64 draw;
|
||||
|
||||
for (i=0; i<bucket->h.size; i++) {
|
||||
draw = (crush_hash32_3(x, bucket->h.items[i], r) & 0xffff) * bucket->straws[i];
|
||||
draw = draw >> 32;
|
||||
if (i == 0 || draw > high_draw) {
|
||||
high = i;
|
||||
high_draw = draw;
|
||||
}
|
||||
}
|
||||
|
||||
return high;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
|
||||
{
|
||||
free(b->primes);
|
||||
free(b->h.items);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_list(struct crush_bucket_list *b)
|
||||
{
|
||||
free(b->item_weights);
|
||||
free(b->sum_weights);
|
||||
free(b->h.items);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
|
||||
{
|
||||
free(b->h.items);
|
||||
}
|
||||
|
||||
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
|
||||
{
|
||||
free(b->straws);
|
||||
free(b->h.items);
|
||||
}
|
@ -1,53 +0,0 @@
|
||||
#ifndef _CRUSH_BUCKETS_H
|
||||
#define _CRUSH_BUCKETS_H
|
||||
|
||||
#include "types.h"
|
||||
|
||||
enum {
|
||||
CRUSH_BUCKET_UNIFORM = 1,
|
||||
CRUSH_BUCKET_LIST = 2,
|
||||
CRUSH_BUCKET_TREE = 3,
|
||||
CRUSH_BUCKET_STRAW = 4
|
||||
};
|
||||
|
||||
struct crush_bucket {
|
||||
__u32 id;
|
||||
__u32 type;
|
||||
__u32 weight; /* 16-bit fixed point */
|
||||
__u32 size; /* num items */
|
||||
__s32 *items;
|
||||
};
|
||||
|
||||
struct crush_bucket_uniform {
|
||||
struct crush_bucket h;
|
||||
__u32 item_weight; /* 16-bit fixed point */
|
||||
__u32 *primes;
|
||||
};
|
||||
|
||||
struct crush_bucket_list {
|
||||
struct crush_bucket h;
|
||||
__u32 *item_weights; /* 16-bit fixed point */
|
||||
__u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
|
||||
};
|
||||
|
||||
struct crush_bucket_tree {
|
||||
struct crush_bucket h;
|
||||
|
||||
};
|
||||
|
||||
struct crush_bucket_straw {
|
||||
struct crush_bucket h;
|
||||
__u32 *straws; /* 16-bit fixed point */
|
||||
};
|
||||
|
||||
extern int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r);
|
||||
extern int crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r);
|
||||
extern int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r);
|
||||
extern int crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r);
|
||||
|
||||
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
|
||||
extern void crush_destroy_bucket_list(struct crush_bucket_list *);
|
||||
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
|
||||
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
|
||||
|
||||
#endif
|
@ -1,55 +0,0 @@
|
||||
#ifndef _CRUSH_CRUSH_H
|
||||
#define _CRUSH_CRUSH_H
|
||||
|
||||
#include "types.h"
|
||||
#include "buckets.h"
|
||||
|
||||
enum {
|
||||
CRUSH_RULE_TAKE,
|
||||
CRUSH_RULE_CHOOSE_FIRSTN,
|
||||
CRUSH_RULE_CHOOSE_INDEP,
|
||||
CRUSH_RULE_EMIT
|
||||
};
|
||||
|
||||
#define CRUSH_MAX_DEPTH 10
|
||||
#define CRUSH_MAX_SET 10
|
||||
|
||||
struct crush_rule_step {
|
||||
__u32 op;
|
||||
__s32 arg1;
|
||||
__s32 arg2;
|
||||
};
|
||||
|
||||
struct crush_rule {
|
||||
__u32 len;
|
||||
struct crush_rule_step *steps;
|
||||
};
|
||||
|
||||
struct crush_map {
|
||||
struct crush_bucket **buckets;
|
||||
struct crush_rule **rules;
|
||||
|
||||
/* parent pointers */
|
||||
__u32 *bucket_parents;
|
||||
__u32 *device_parents;
|
||||
|
||||
/* offload
|
||||
* size max_devices, values 0...0xffff
|
||||
* 0 == normal
|
||||
* 0x10000 == 100% offload (i.e. failed)
|
||||
*/
|
||||
__u32 *device_offload;
|
||||
|
||||
__u32 max_buckets;
|
||||
__u32 max_rules;
|
||||
__u32 max_devices;
|
||||
};
|
||||
|
||||
extern int crush_do_rule(struct crush_map *map,
|
||||
int ruleno,
|
||||
int x, int *result, int result_max,
|
||||
int forcefeed); /* -1 for none */
|
||||
|
||||
/*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/
|
||||
|
||||
#endif
|
@ -18,6 +18,8 @@
|
||||
|
||||
#include "MonitorStore.h"
|
||||
|
||||
#include "crush/CrushWrapper.h"
|
||||
|
||||
#include "messages/MOSDFailure.h"
|
||||
#include "messages/MOSDMap.h"
|
||||
#include "messages/MOSDGetMap.h"
|
||||
@ -158,49 +160,55 @@ void OSDMonitor::create_initial()
|
||||
}
|
||||
|
||||
|
||||
void OSDMonitor::build_crush_map(Crush& crush,
|
||||
void OSDMonitor::build_crush_map(CrushWrapper& crush,
|
||||
map<int,double>& weights)
|
||||
{
|
||||
// new
|
||||
crush.create();
|
||||
|
||||
if (g_conf.num_osd >= 12) {
|
||||
int ndom = g_conf.osd_max_rep;
|
||||
UniformBucket *domain[ndom];
|
||||
int domid[ndom];
|
||||
for (int i=0; i<ndom; i++) {
|
||||
domain[i] = new UniformBucket(1, 0);
|
||||
domid[i] = crush.add_bucket(domain[i]);
|
||||
}
|
||||
|
||||
// add osds
|
||||
int ritems[ndom];
|
||||
int rweights[ndom];
|
||||
|
||||
int nper = ((g_conf.num_osd - 1) / ndom) + 1;
|
||||
derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
|
||||
int i = 0;
|
||||
for (int dom=0; dom<ndom; dom++) {
|
||||
for (int j=0; j<nper; j++) {
|
||||
domain[dom]->add_item(i, weights[i] ? weights[i]:1.0);
|
||||
//derr(0) << "osd" << i << " in domain " << dom << dendl;
|
||||
i++;
|
||||
if (i == g_conf.num_osd) break;
|
||||
|
||||
int o = 0;
|
||||
for (int i=0; i<ndom; i++) {
|
||||
int items[nper];
|
||||
//int w[nper];
|
||||
int j;
|
||||
rweights[i] = 0;
|
||||
for (j=0; j<nper; j++, o++) {
|
||||
if (o == g_conf.num_osd) break;
|
||||
dout(20) << "added osd" << o << dendl;
|
||||
items[j] = o;
|
||||
//w[j] = weights[o] ? (0x10000 - (int)(weights[o] * 0x10000)):0x10000;
|
||||
//rweights[i] += w[j];
|
||||
rweights[i] += 0x10000;
|
||||
}
|
||||
|
||||
crush_bucket_uniform *domain = crush_make_uniform_bucket(1, j, items, 0x10000);
|
||||
ritems[i] = crush_add_bucket(crush.map, (crush_bucket*)domain);
|
||||
dout(20) << "added domain bucket i " << ritems[i] << " of size " << j << dendl;
|
||||
}
|
||||
|
||||
// root
|
||||
Bucket *root = new ListBucket(2);
|
||||
for (int i=0; i<ndom; i++) {
|
||||
//derr(0) << "dom " << i << " w " << domain[i]->get_weight() << dendl;
|
||||
root->add_item(domid[i], domain[i]->get_weight());
|
||||
}
|
||||
int nroot = crush.add_bucket(root);
|
||||
crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights);
|
||||
int rootid = crush_add_bucket(crush.map, (crush_bucket*)root);
|
||||
|
||||
// rules
|
||||
// replication
|
||||
for (int i=1; i<=ndom; i++) {
|
||||
int r = CRUSH_REP_RULE(i);
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
|
||||
crush_rule *rule = crush_make_rule();
|
||||
crush_rule_add_step(rule, CRUSH_RULE_TAKE, rootid, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
|
||||
crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
|
||||
}
|
||||
/*
|
||||
// raid
|
||||
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
|
||||
int r = CRUSH_RAID_RULE(i);
|
||||
@ -215,6 +223,7 @@ void OSDMonitor::build_crush_map(Crush& crush,
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// test
|
||||
//vector<int> out;
|
||||
@ -222,20 +231,24 @@ void OSDMonitor::build_crush_map(Crush& crush,
|
||||
|
||||
} else {
|
||||
// one bucket
|
||||
Bucket *b = new UniformBucket(1, 0);
|
||||
int root = crush.add_bucket(b);
|
||||
for (int i=0; i<g_conf.num_osd; i++) {
|
||||
b->add_item(i, weights[i] ? weights[i]:1.0);
|
||||
}
|
||||
|
||||
int items[g_conf.num_osd];
|
||||
for (int i=0; i<g_conf.num_osd; i++)
|
||||
items[i] = i;
|
||||
|
||||
crush_bucket_uniform *b = crush_make_uniform_bucket(1, g_conf.num_osd, items, 0x10000);
|
||||
int root = crush_add_bucket(crush.map, (crush_bucket*)b);
|
||||
|
||||
// rules
|
||||
// replication
|
||||
for (int i=1; i<=g_conf.osd_max_rep; i++) {
|
||||
int r = CRUSH_REP_RULE(i);
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
|
||||
crush_rule *rule = crush_make_rule();
|
||||
crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
|
||||
crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
|
||||
crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
|
||||
}
|
||||
/*
|
||||
// raid
|
||||
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
|
||||
int r = CRUSH_RAID_RULE(i);
|
||||
@ -243,7 +256,12 @@ void OSDMonitor::build_crush_map(Crush& crush,
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
|
||||
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
|
||||
}
|
||||
*/
|
||||
}
|
||||
crush.finalize();
|
||||
dout(20) << "crush max_devices " << crush.map->max_devices << dendl;
|
||||
//vector<int> t;
|
||||
//crush.do_rule(2, 132, t, 4, -1);
|
||||
}
|
||||
|
||||
|
||||
@ -420,7 +438,7 @@ bool OSDMonitor::should_propose(double& delay)
|
||||
if (pending_inc.new_up.size() == osdmap.get_osds().size()) {
|
||||
delay = 0.0;
|
||||
if (g_conf.osd_auto_weight) {
|
||||
Crush crush;
|
||||
CrushWrapper crush;
|
||||
build_crush_map(crush, osd_weight);
|
||||
crush._encode(pending_inc.crush);
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ private:
|
||||
|
||||
map<int,double> osd_weight;
|
||||
|
||||
void build_crush_map(Crush& crush,
|
||||
void build_crush_map(CrushWrapper& crush,
|
||||
map<int,double>& weights);
|
||||
|
||||
// svc
|
||||
|
@ -28,8 +28,7 @@
|
||||
#include "common/Mutex.h"
|
||||
#include "common/Clock.h"
|
||||
|
||||
#include "crush/crush.h"
|
||||
using namespace crush;
|
||||
#include "crush/CrushWrapper.h"
|
||||
|
||||
#include <vector>
|
||||
#include <list>
|
||||
@ -144,7 +143,7 @@ private:
|
||||
map<int32_t,entity_inst_t> osd_inst;
|
||||
|
||||
public:
|
||||
Crush crush; // hierarchical map
|
||||
CrushWrapper crush; // hierarchical map
|
||||
|
||||
friend class OSDMonitor;
|
||||
friend class MDS;
|
||||
@ -207,9 +206,14 @@ private:
|
||||
|
||||
void mark_down(int o, bool clean) { down_osds[o] = clean; }
|
||||
void mark_up(int o) { down_osds.erase(o); }
|
||||
void mark_out(int o) { out_osds.insert(o); }
|
||||
void mark_in(int o) { out_osds.erase(o); }
|
||||
|
||||
void mark_out(int o) {
|
||||
out_osds.insert(o);
|
||||
crush.update_offload_map(out_osds, overload_osds);
|
||||
}
|
||||
void mark_in(int o) {
|
||||
out_osds.erase(o);
|
||||
crush.update_offload_map(out_osds, overload_osds);
|
||||
}
|
||||
|
||||
void apply_incremental(Incremental &inc) {
|
||||
assert(inc.epoch == epoch+1);
|
||||
@ -223,8 +227,8 @@ private:
|
||||
return;
|
||||
}
|
||||
if (inc.crush.length()) {
|
||||
int off = 0;
|
||||
crush._decode(inc.crush, off);
|
||||
bufferlist::iterator blp = inc.crush.begin();
|
||||
crush._decode(blp);
|
||||
}
|
||||
|
||||
// nope, incremental.
|
||||
@ -272,6 +276,8 @@ private:
|
||||
i++) {
|
||||
overload_osds[i->first] = i->second;
|
||||
}
|
||||
|
||||
crush.update_offload_map(out_osds, overload_osds);
|
||||
}
|
||||
|
||||
// serialize, unserialize
|
||||
@ -288,7 +294,9 @@ private:
|
||||
::_encode(overload_osds, blist);
|
||||
::_encode(osd_inst, blist);
|
||||
|
||||
crush._encode(blist);
|
||||
bufferlist cbl;
|
||||
crush._encode(cbl);
|
||||
::_encode(cbl, blist);
|
||||
}
|
||||
|
||||
void decode(bufferlist& blist) {
|
||||
@ -306,7 +314,12 @@ private:
|
||||
::_decode(overload_osds, blist, off);
|
||||
::_decode(osd_inst, blist, off);
|
||||
|
||||
crush._decode(blist, off);
|
||||
bufferlist cbl;
|
||||
::_decode(cbl, blist, off);
|
||||
bufferlist::iterator cblp = cbl.begin();
|
||||
crush._decode(cblp);
|
||||
|
||||
crush.update_offload_map(out_osds, overload_osds);
|
||||
}
|
||||
|
||||
|
||||
@ -320,8 +333,6 @@ private:
|
||||
}
|
||||
|
||||
ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
|
||||
static crush::Hash H(777);
|
||||
|
||||
int num = preferred >= 0 ? localized_pg_num:pg_num;
|
||||
int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask;
|
||||
|
||||
@ -334,14 +345,14 @@ private:
|
||||
|
||||
case CEPH_OBJECT_LAYOUT_HASHINO:
|
||||
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask);
|
||||
ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
|
||||
ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
|
||||
break;
|
||||
|
||||
case CEPH_OBJECT_LAYOUT_HASH:
|
||||
//ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask);
|
||||
//ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask);
|
||||
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask);
|
||||
ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
|
||||
ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -374,10 +385,9 @@ private:
|
||||
if (pg.preferred() >= 0 &&
|
||||
out_osds.count(pg.preferred()) == 0)
|
||||
forcefeed = pg.preferred();
|
||||
crush.do_rule(crush.rules[rule],
|
||||
crush.do_rule(rule,
|
||||
pg.ps(),
|
||||
osds,
|
||||
out_osds, overload_osds,
|
||||
osds, pg.size(),
|
||||
forcefeed);
|
||||
}
|
||||
break;
|
||||
@ -389,8 +399,7 @@ private:
|
||||
|
||||
case CEPH_PG_LAYOUT_HYBRID:
|
||||
{
|
||||
static crush::Hash H(777);
|
||||
int h = H(pg.ps());
|
||||
int h = crush_hash32(pg.ps());
|
||||
for (int i=0; i<pg.size(); i++)
|
||||
osds.push_back( (h+i) % g_conf.num_osd );
|
||||
}
|
||||
@ -398,12 +407,11 @@ private:
|
||||
|
||||
case CEPH_PG_LAYOUT_HASH:
|
||||
{
|
||||
static crush::Hash H(777);
|
||||
for (int i=0; i<pg.size(); i++) {
|
||||
int t = 1;
|
||||
int osd = 0;
|
||||
while (t++) {
|
||||
osd = H(i, pg.ps(), t) % g_conf.num_osd;
|
||||
osd = crush_hash32_3(i, pg.ps(), t) % g_conf.num_osd;
|
||||
int j = 0;
|
||||
for (; j<i; j++)
|
||||
if (osds[j] == osd) break;
|
||||
|
@ -31,6 +31,7 @@
|
||||
using namespace std;
|
||||
|
||||
#include <ext/hash_map>
|
||||
#include <ext/hash_set>
|
||||
using namespace __gnu_cxx;
|
||||
|
||||
|
||||
|
@ -74,8 +74,8 @@ typedef uint8_t pruleset_t;
|
||||
|
||||
|
||||
// crush rule ids
|
||||
#define CRUSH_REP_RULE(nrep) (100+nrep) // replication
|
||||
#define CRUSH_RAID_RULE(num) (200+num) // raid
|
||||
#define CRUSH_REP_RULE(nrep) (nrep) // replication
|
||||
#define CRUSH_RAID_RULE(num) (10+num) // raid
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user