ceph/branches/sage/pgs/crush/crush.h

522 lines
13 KiB
C
Raw Normal View History

#ifndef __crush_CRUSH_H
#define __crush_CRUSH_H
#include <iostream>
#include <list>
#include <vector>
#include <set>
#include <map>
using namespace std;
#include <ext/hash_map>
#include <ext/hash_set>
using namespace __gnu_cxx;
#include "Bucket.h"
#include "include/buffer.h"
namespace crush {
// *** RULES ***
class RuleStep {
public:
int cmd;
vector<int> args;
RuleStep(int c) : cmd(c) {}
RuleStep(int c, int a) : cmd(c) {
args.push_back(a);
}
RuleStep(int c, int a, int b) : cmd(c) {
args.push_back(a);
args.push_back(b);
}
RuleStep(int o, int a, int b, int c) : cmd(o) {
args.push_back(a);
args.push_back(b);
args.push_back(c);
}
void _encode(bufferlist& bl) {
bl.append((char*)&cmd, sizeof(cmd));
::_encode(args, bl);
}
void _decode(bufferlist& bl, int& off) {
bl.copy(off, sizeof(cmd), (char*)&cmd);
off += sizeof(cmd);
::_decode(args, bl, off);
}
};
// Rule operations
const int CRUSH_RULE_TAKE = 0;
const int CRUSH_RULE_CHOOSE = 1; // first n by default
const int CRUSH_RULE_CHOOSE_FIRSTN = 1;
const int CRUSH_RULE_CHOOSE_INDEP = 2;
const int CRUSH_RULE_EMIT = 3;
class Rule {
public:
vector< RuleStep > steps;
void _encode(bufferlist& bl) {
int n = steps.size();
bl.append((char*)&n, sizeof(n));
for (int i=0; i<n; i++)
steps[i]._encode(bl);
}
void _decode(bufferlist& bl, int& off) {
steps.clear();
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
steps.push_back(RuleStep(0));
steps[i]._decode(bl, off);
}
}
};
// *** CRUSH ***
class Crush {
protected:
map<int, Bucket*> buckets;
int bucketno;
Hash h;
hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
public:
map<int, Rule> rules;
//map<int,int> collisions;
//map<int,int> bumps;
void _encode(bufferlist& bl) {
// buckets
int n = buckets.size();
bl.append((char*)&n, sizeof(n));
for (map<int, Bucket*>::const_iterator it = buckets.begin();
it != buckets.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second->_encode(bl);
}
bl.append((char*)&bucketno, sizeof(bucketno));
// hash
int s = h.get_seed();
bl.append((char*)&s, sizeof(s));
//::_encode(out, bl);
//::_encode(overload, bl);
// rules
n = rules.size();
bl.append((char*)&n, sizeof(n));
for(map<int, Rule>::iterator it = rules.begin();
it != rules.end();
it++) {
bl.append((char*)&it->first, sizeof(it->first));
it->second._encode(bl);
}
}
void _decode(bufferlist& bl, int& off) {
int n;
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int bid;
bl.copy(off, sizeof(bid), (char*)&bid);
off += sizeof(bid);
Bucket *b = decode_bucket(bl, off);
buckets[bid] = b;
}
bl.copy(off, sizeof(bucketno), (char*)&bucketno);
off += sizeof(bucketno);
int s;
bl.copy(off, sizeof(s), (char*)&s);
off += sizeof(s);
h.set_seed(s);
//::_decode(out, bl, off);
//::_decode(overload, bl, off);
// rules
bl.copy(off, sizeof(n), (char*)&n);
off += sizeof(n);
for (int i=0; i<n; i++) {
int r;
bl.copy(off, sizeof(r), (char*)&r);
off += sizeof(r);
rules[r]._decode(bl,off);
}
// index
build_parent_map();
}
void build_parent_map() {
parent_map.clear();
// index every bucket
for (map<int, Bucket*>::iterator bp = buckets.begin();
bp != buckets.end();
++bp) {
// index bucket items
vector<int> items;
bp->second->get_items(items);
for (vector<int>::iterator ip = items.begin();
ip != items.end();
++ip)
parent_map[*ip] = bp->first;
}
}
public:
Crush(int seed=123) : bucketno(-1), h(seed) {}
~Crush() {
// hose buckets
for (map<int, Bucket*>::iterator it = buckets.begin();
it != buckets.end();
it++) {
delete it->second;
}
}
int print(ostream& out, int root, int indent=0) {
for (int i=0; i<indent; i++) out << " ";
Bucket *b = buckets[root];
assert(b);
out << b->get_weight() << "\t" << b->get_id() << "\t";
for (int i=0; i<indent; i++) out << " ";
out << b->get_bucket_type() << ": ";
vector<int> items;
b->get_items(items);
if (buckets.count(items[0])) {
out << endl;
for (unsigned i=0; i<items.size(); i++)
print(out, items[i], indent+1);
} else {
out << "[";
for (unsigned i=0; i<items.size(); i++) {
if (i) out << " ";
out << items[i];
}
out << "]";
}
return 0;
}
int add_bucket( Bucket *b ) {
int n = bucketno;
bucketno--;
b->set_id(n);
buckets[n] = b;
return n;
}
void add_item(int parent, int item, float w, bool back=false) {
// add item
assert(!buckets[parent]->is_uniform());
Bucket *p = buckets[parent];
p->add_item(item, w, back);
// set item's parent
Bucket *n = buckets[item];
if (n)
n->set_parent(parent);
// update weights
while (buckets.count(p->get_parent())) {
int child = p->get_id();
p = buckets[p->get_parent()];
p->adjust_item_weight(child, w);
}
}
/*
this is a hack, fix me! weights should be consistent throughout hierarchy!
*/
void set_bucket_weight(int item, float w) {
Bucket *b = buckets[item];
float adj = w - b->get_weight();
while (buckets.count(b->get_parent())) {
Bucket *p = buckets[b->get_parent()];
p->adjust_item_weight(b->get_id(), adj);
b = p;
}
}
/*
* choose numrep distinct items of type type
*/
void choose(int x,
int numrep,
int type,
Bucket *inbucket,
vector<int>& outvec,
bool firstn,
set<int>& outset, map<int,float>& overloadmap,
bool forcefeed=false,
int forcefeedval=-1) {
int off = outvec.size();
// for each replica
for (int rep=0; rep<numrep; rep++) {
int outv = -1; // my result
// forcefeed?
if (forcefeed) {
forcefeed = false;
outvec.push_back(forcefeedval);
continue;
}
// keep trying until we get a non-out, non-colliding item
int ftotal = 0;
bool skip_rep = false;
while (1) {
// start with the input bucket
Bucket *in = inbucket;
// choose through intervening buckets
int flocal = 0;
bool retry_rep = false;
while (1) {
// r may be twiddled to (try to) avoid past collisions
int r = rep;
if (in->is_uniform()) {
// uniform bucket; be careful!
if (firstn || numrep >= in->get_size()) {
// uniform bucket is too small; just walk thru elements
r += ftotal; // r' = r + f_total (first n)
} else {
// make sure numrep is not a multple of bucket size
int add = numrep*flocal; // r' = r + n*f_local
if (in->get_size() % numrep == 0) {
add += add/in->get_size(); // shift seq once per pass through the bucket
}
r += add;
}
} else {
// mixed bucket; just make a distinct-ish r sequence
if (firstn)
r += ftotal; // r' = r + f_total
else
r += numrep * flocal; // r' = r + n*f_local
}
// choose
outv = in->choose_r(x, r, h);
// did we get the type we want?
int itemtype = 0; // 0 is terminal type
Bucket *newin = 0; // remember bucket we hit
if (in->is_uniform()) {
itemtype = ((UniformBucket*)in)->get_item_type();
} else {
if (buckets.count(outv)) { // another bucket
newin = buckets[outv];
itemtype = newin->get_type();
}
}
if (itemtype == type) { // this is what we want!
// collision?
bool collide = false;
for (int prep=0; prep<rep; prep++) {
if (outvec[off+prep] == outv) {
collide = true;
break;
}
}
// ok choice?
bool bad = false;
if (type == 0 && outset.count(outv))
bad = true;
if (overloadmap.count(outv)) {
float f = (float)(h(x, outv) % 1000) / 1000.0;
if (f > overloadmap[outv])
bad = true;
}
if (collide || bad) {
ftotal++;
flocal++;
if (collide && flocal < 3)
continue; // try locally a few times!
if (ftotal >= 10) {
// ok fine, just ignore dup. FIXME.
skip_rep = true;
break;
}
retry_rep = true;
}
break; // ok then!
}
// next
in = newin;
}
if (retry_rep) continue; // try again
break;
}
// skip this rep? (e.g. too many collisions, we give up)
if (skip_rep) continue;
// output this value
outvec.push_back(outv);
} // for rep
// double check!
if (0) {
for (unsigned i=1; i<outvec.size(); i++)
for (unsigned j=0; j<i; j++)
assert(outvec[i] != outvec[j]);
}
}
void do_rule(Rule& rule, int x, vector<int>& result,
set<int>& outset, map<int,float>& overloadmap,
int forcefeed=-1) {
//int numresult = 0;
result.clear();
// determine hierarchical context for first.
list<int> force_stack;
if (forcefeed >= 0) {
int t = forcefeed;
while (1) {
force_stack.push_front(t);
if (parent_map.count(t) == 0) break; // reached root, presumably.
//cout << " " << t << " parent is " << parent_map[t] << endl;
t = parent_map[t];
}
}
// working vector
vector<int> w; // working variable
// go through each statement
for (vector<RuleStep>::iterator pc = rule.steps.begin();
pc != rule.steps.end();
pc++) {
// move input?
// do it
switch (pc->cmd) {
case CRUSH_RULE_TAKE:
{
const int arg = pc->args[0];
//cout << "take " << arg << endl;
if (!force_stack.empty()) {
int forceval = force_stack.front();
force_stack.pop_front();
assert(arg == forceval);
}
w.clear();
w.push_back(arg);
}
break;
case CRUSH_RULE_CHOOSE_FIRSTN:
case CRUSH_RULE_CHOOSE_INDEP:
{
const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
const int numrep = pc->args[0];
const int type = pc->args[1];
//cout << "choose " << numrep << " of type " << type << endl;
assert(!w.empty());
// reset output
vector<int> out;
// forcefeeding?
bool forcing = false;
int forceval;
if (!force_stack.empty()) {
forceval = force_stack.front();
force_stack.pop_front();
//cout << "priming out with " << forceval << endl;
forcing = true;
}
// do each row independently
for (vector<int>::iterator i = w.begin();
i != w.end();
i++) {
assert(buckets.count(*i));
Bucket *b = buckets[*i];
choose(x, numrep, type, b, out, firstn,
outset, overloadmap,
forcing,
forceval);
forcing = false; // only once
} // for inrow
// put back into w
w.swap(out);
out.clear();
}
break;
case CRUSH_RULE_EMIT:
{
for (unsigned i=0; i<w.size(); i++)
result.push_back(w[i]);
//result[numresult++] = w[i];
w.clear();
}
break;
default:
assert(0);
}
}
}
};
}
#endif