osds may benchmark themeslves on startup and supply crush weights config.cc config.h

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1858 29311d96-e01e-0410-9327-a35deaab8ce9
This commit is contained in:
sageweil 2007-09-28 21:07:08 +00:00
parent 2011b3b16b
commit db19ba582d
5 changed files with 139 additions and 88 deletions

View File

@ -119,92 +119,13 @@ void OSDMonitor::create_initial()
// start at epoch 1 until all osds boot
newmap.inc_epoch(); // = 1
assert(newmap.get_epoch() == 1);
if (g_conf.num_osd >= 12) {
int ndom = g_conf.osd_max_rep;
UniformBucket *domain[ndom];
int domid[ndom];
for (int i=0; i<ndom; i++) {
domain[i] = new UniformBucket(1, 0);
domid[i] = newmap.crush.add_bucket(domain[i]);
}
// add osds
int nper = ((g_conf.num_osd - 1) / ndom) + 1;
derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
int i = 0;
for (int dom=0; dom<ndom; dom++) {
for (int j=0; j<nper; j++) {
newmap.osds.insert(i);
newmap.down_osds[i] = true; // initially DOWN
domain[dom]->add_item(i, 1.0);
//derr(0) << "osd" << i << " in domain " << dom << dendl;
i++;
if (i == g_conf.num_osd) break;
}
}
// root
Bucket *root = new ListBucket(2);
for (int i=0; i<ndom; i++) {
//derr(0) << "dom " << i << " w " << domain[i]->get_weight() << dendl;
root->add_item(domid[i], domain[i]->get_weight());
}
int nroot = newmap.crush.add_bucket(root);
// rules
// replication
for (int i=1; i<=ndom; i++) {
int r = CRUSH_REP_RULE(i);
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
if (ndom >= i) {
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
} else {
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
}
// test
//vector<int> out;
//newmap.pg_to_osds(0x40200000110ULL, out);
} else {
// one bucket
Bucket *b = new UniformBucket(1, 0);
int root = newmap.crush.add_bucket(b);
for (int i=0; i<g_conf.num_osd; i++) {
newmap.osds.insert(i);
newmap.down_osds[i] = true;
b->add_item(i, 1.0);
}
// rules
// replication
for (int i=1; i<=g_conf.osd_max_rep; i++) {
int r = CRUSH_REP_RULE(i);
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
newmap.crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
map<int,double> weights;
build_crush_map(newmap.crush, weights);
for (int i=0; i<g_conf.num_osd; i++) {
newmap.osds.insert(i);
newmap.down_osds[i] = true;
}
if (g_conf.mds_local_osd) {
@ -237,6 +158,94 @@ void OSDMonitor::create_initial()
}
void OSDMonitor::build_crush_map(Crush& crush,
map<int,double>& weights)
{
if (g_conf.num_osd >= 12) {
int ndom = g_conf.osd_max_rep;
UniformBucket *domain[ndom];
int domid[ndom];
for (int i=0; i<ndom; i++) {
domain[i] = new UniformBucket(1, 0);
domid[i] = crush.add_bucket(domain[i]);
}
// add osds
int nper = ((g_conf.num_osd - 1) / ndom) + 1;
derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
int i = 0;
for (int dom=0; dom<ndom; dom++) {
for (int j=0; j<nper; j++) {
domain[dom]->add_item(i, weights[i] ? weights[i]:1.0);
//derr(0) << "osd" << i << " in domain " << dom << dendl;
i++;
if (i == g_conf.num_osd) break;
}
}
// root
Bucket *root = new ListBucket(2);
for (int i=0; i<ndom; i++) {
//derr(0) << "dom " << i << " w " << domain[i]->get_weight() << dendl;
root->add_item(domid[i], domain[i]->get_weight());
}
int nroot = crush.add_bucket(root);
// rules
// replication
for (int i=1; i<=ndom; i++) {
int r = CRUSH_REP_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
if (ndom >= i) {
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
} else {
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
}
// test
//vector<int> out;
//pg_to_osds(0x40200000110ULL, out);
} else {
// one bucket
Bucket *b = new UniformBucket(1, 0);
int root = crush.add_bucket(b);
for (int i=0; i<g_conf.num_osd; i++) {
b->add_item(i, weights[i] ? weights[i]:1.0);
}
// rules
// replication
for (int i=1; i<=g_conf.osd_max_rep; i++) {
int r = CRUSH_REP_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
}
}
bool OSDMonitor::update_from_paxos()
{
@ -410,6 +419,11 @@ bool OSDMonitor::should_propose(double& delay)
if (osdmap.epoch == 1) {
if (pending_inc.new_up.size() == osdmap.get_osds().size()) {
delay = 0.0;
if (g_conf.osd_auto_weight) {
Crush crush;
build_crush_map(crush, osd_weight);
crush._encode(pending_inc.crush);
}
return true;
} else
return false;
@ -552,6 +566,8 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
if (osdmap.out_osds.count(from))
pending_inc.new_in.push_back(from);
osd_weight[from] = m->sb.weight;
// wait
paxos->wait_for_commit(new C_Booted(this, m));
}
@ -560,7 +576,7 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
void OSDMonitor::_booted(MOSDBoot *m)
{
dout(7) << "_booted " << m->inst << dendl;
dout(7) << "_booted " << m->inst << " w " << m->sb.weight << dendl;
send_latest(m->inst, m->sb.current_epoch);
delete m;
}

View File

@ -41,6 +41,11 @@ private:
OSDMap::Incremental pending_inc;
map<int,utime_t> down_pending_out; // osd down -> out
map<int,double> osd_weight;
void build_crush_map(Crush& crush,
map<int,double>& weights);
// svc
void create_initial();
bool update_from_paxos();

View File

@ -221,6 +221,26 @@ int OSD::init()
50000,
g_conf.osd_age - .05);
}
if (g_conf.osd_auto_weight) {
// benchmark
bufferlist bl;
bufferptr bp(1048576);
bp.zero();
bl.push_back(bp);
utime_t start = g_clock.now();
for (int i=0; i<1000; i++)
store->write(object_t(999,i), 0, bl.length(), bl, 0);
store->sync();
utime_t end = g_clock.now();
end -= start;
dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl;
for (int i=0; i<1000; i++)
store->remove(object_t(999,i), 0);
// set osd weight
superblock.weight = (1000.0 / (double)end);
}
}
else {
dout(2) << "boot" << dendl;
@ -235,6 +255,8 @@ int OSD::init()
assert(whoami == superblock.whoami);
}
// log
char name[80];

View File

@ -90,6 +90,7 @@ public:
// full (rare)
bufferlist fullmap; // in leiu of below.
bufferlist crush;
// incremental
map<int32_t,entity_inst_t> new_up;
@ -109,6 +110,7 @@ public:
::_encode(new_out, bl);
::_encode(new_overload, bl);
::_encode(fullmap, bl);
::_encode(crush, bl);
}
void decode(bufferlist& bl, int& off) {
::_decode(epoch, bl, off);
@ -120,6 +122,7 @@ public:
::_decode(new_out, bl, off);
::_decode(new_overload, bl, off);
::_decode(fullmap, bl, off);
::_decode(crush, bl, off);
}
Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {}
@ -219,6 +222,10 @@ private:
decode(inc.fullmap);
return;
}
if (inc.crush.length()) {
int off = 0;
crush._decode(inc.crush, off);
}
// nope, incremental.
for (map<int32_t,pair<entity_inst_t,bool> >::iterator i = inc.new_down.begin();

View File

@ -312,9 +312,10 @@ public:
int32_t whoami; // my role in this fs.
epoch_t current_epoch; // most recent epoch
epoch_t oldest_map, newest_map; // oldest/newest maps we have.
double weight;
OSDSuperblock(uint64_t f=0, int w=0) :
magic(MAGIC), fsid(f), whoami(w),
current_epoch(0), oldest_map(0), newest_map(0) {}
current_epoch(0), oldest_map(0), newest_map(0), weight(0) {}
};
inline ostream& operator<<(ostream& out, OSDSuperblock& sb)