mirror of
https://github.com/ceph/ceph
synced 2025-02-19 00:47:49 +00:00
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1592 29311d96-e01e-0410-9327-a35deaab8ce9
516 lines
13 KiB
C++
516 lines
13 KiB
C++
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
|
// vim: ts=8 sw=2 smarttab
|
|
/*
|
|
* Ceph - scalable distributed file system
|
|
*
|
|
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
|
*
|
|
* This is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License version 2.1, as published by the Free Software
|
|
* Foundation. See file COPYING.
|
|
*
|
|
*/
|
|
|
|
|
|
#ifndef __OSDMAP_H
|
|
#define __OSDMAP_H
|
|
|
|
/*
 * Describes the properties of the OSD cluster:
 * disks, disk groups, and the total number of OSDs.
 */
|
|
#include "config.h"
|
|
#include "include/types.h"
|
|
#include "osd_types.h"
|
|
#include "msg/Message.h"
|
|
#include "common/Mutex.h"
|
|
#include "common/Clock.h"
|
|
|
|
#include "crush/crush.h"
|
|
using namespace crush;
|
|
|
|
#include <vector>
|
|
#include <list>
|
|
#include <set>
|
|
#include <map>
|
|
using namespace std;
|
|
|
|
|
|
/*
|
|
* some system constants
|
|
*/
|
|
|
|
// pg id bit-field widths, listed from LSB to MSB
#define PG_PS_BITS         16  // max bits for placement seed/group portion of PG
#define PG_REP_BITS        6   // up to 64 replicas
#define PG_TYPE_BITS       2
#define PG_PS_MASK         ((1LL<<PG_PS_BITS)-1)  // low PG_PS_BITS bits set

// pg types
#define PG_TYPE_RAND     1   // default: distribution randomly
#define PG_TYPE_STARTOSD 2   // place primary on a specific OSD

// pg roles (as returned by OSDMap::calc_pg_role)
#define PG_ROLE_STRAY   -1   // osd is not in the pg's osd list
#define PG_ROLE_HEAD     0   // rank 0
#define PG_ROLE_ACKER    1   // rank 1
#define PG_ROLE_MIDDLE   2   // any rank > 1 (the name is a misnomer)
//#define PG_ROLE_TAIL     2
|
|
|
|
|
|
/*
 * stable_mod - reduce x using a bitmask, falling back to a narrower mask.
 *
 * If (x & bmask) is below b, that value is returned; otherwise x is masked
 * with (bmask >> 1) instead.
 *
 * NOTE(review): callers in this file pass bmask = 2^n - 1 derived from b-1
 * (see OSDMap::calc_pg_masks); behavior for other masks follows the two
 * expressions above literally.
 */
inline int stable_mod(int x, int b, int bmask) {
  const int masked = x & bmask;
  return (masked < b) ? masked : (x & (bmask >> 1));
}
|
|
|
|
/*
 * calc_bits_of - number of bits needed to represent t.
 *
 * Returns 0 for t == 0, 1 for t == 1, 2 for t in [2,3], and so on.
 * Non-positive inputs yield 0: with an arithmetic right shift a negative
 * value never reaches zero, so looping on `t != 0` would never terminate
 * (e.g. when called with pg_num - 1 and pg_num == 0).
 */
inline int calc_bits_of(int t) {
  int b = 0;
  while (t > 0) {
    t = t >> 1;
    b++;
  }
  return b;
}
|
|
|
|
|
|
|
|
/** OSDMap
|
|
*/
|
|
class OSDMap {
|
|
|
|
public:
|
|
class Incremental {
|
|
public:
|
|
epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
|
|
epoch_t mon_epoch; // monitor epoch (election iteration)
|
|
utime_t ctime;
|
|
|
|
// full (rare)
|
|
bufferlist fullmap; // in leiu of below.
|
|
|
|
// incremental
|
|
map<int32_t,entity_inst_t> new_up;
|
|
map<int32_t,entity_inst_t> new_down;
|
|
list<int32_t> new_in;
|
|
list<int32_t> new_out;
|
|
map<int32_t,float> new_overload; // updated overload value
|
|
list<int32_t> old_overload; // no longer overload
|
|
|
|
void encode(bufferlist& bl) {
|
|
::_encode(epoch, bl);
|
|
::_encode(mon_epoch, bl);
|
|
::_encode(ctime, bl);
|
|
::_encode(new_up, bl);
|
|
::_encode(new_down, bl);
|
|
::_encode(new_in, bl);
|
|
::_encode(new_out, bl);
|
|
::_encode(new_overload, bl);
|
|
::_encode(fullmap, bl);
|
|
}
|
|
void decode(bufferlist& bl, int& off) {
|
|
::_decode(epoch, bl, off);
|
|
::_decode(mon_epoch, bl, off);
|
|
::_decode(ctime, bl, off);
|
|
::_decode(new_up, bl, off);
|
|
::_decode(new_down, bl, off);
|
|
::_decode(new_in, bl, off);
|
|
::_decode(new_out, bl, off);
|
|
::_decode(new_overload, bl, off);
|
|
::_decode(fullmap, bl, off);
|
|
}
|
|
|
|
Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {}
|
|
};
|
|
|
|
private:
|
|
epoch_t epoch; // what epoch of the osd cluster descriptor is this
|
|
epoch_t mon_epoch; // monitor epoch (election iteration)
|
|
utime_t ctime; // epoch start time
|
|
int32_t pg_num; // placement group count
|
|
int32_t pg_num_mask; // bitmask for above
|
|
int32_t localized_pg_num; // localized place group count
|
|
int32_t localized_pg_num_mask; // ditto
|
|
|
|
set<int32_t> osds; // all osds
|
|
set<int32_t> down_osds; // list of down disks
|
|
set<int32_t> out_osds; // list of unmapped disks
|
|
map<int32_t,float> overload_osds;
|
|
map<int32_t,entity_inst_t> osd_inst;
|
|
|
|
public:
|
|
Crush crush; // hierarchical map
|
|
|
|
friend class OSDMonitor;
|
|
friend class MDS;
|
|
|
|
public:
|
|
OSDMap() : epoch(0), mon_epoch(0),
|
|
pg_num(1<<5),
|
|
localized_pg_num(1<<3) {
|
|
calc_pg_masks();
|
|
}
|
|
|
|
// map info
|
|
epoch_t get_epoch() const { return epoch; }
|
|
void inc_epoch() { epoch++; }
|
|
|
|
void calc_pg_masks() {
|
|
pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1;
|
|
localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1;
|
|
}
|
|
|
|
int get_pg_num() const { return pg_num; }
|
|
void set_pg_num(int m) { pg_num = m; calc_pg_masks(); }
|
|
int get_localized_pg_num() const { return localized_pg_num; }
|
|
|
|
const utime_t& get_ctime() const { return ctime; }
|
|
|
|
bool is_mkfs() const { return epoch == 2; }
|
|
bool post_mkfs() const { return epoch > 2; }
|
|
|
|
/***** cluster state *****/
|
|
int num_osds() { return osds.size(); }
|
|
void get_all_osds(set<int>& ls) { ls = osds; }
|
|
|
|
const set<int>& get_osds() { return osds; }
|
|
const set<int>& get_down_osds() { return down_osds; }
|
|
const set<int>& get_out_osds() { return out_osds; }
|
|
const map<int,float>& get_overload_osds() { return overload_osds; }
|
|
|
|
bool exists(int osd) { return osds.count(osd); }
|
|
bool is_down(int osd) { return down_osds.count(osd); }
|
|
bool is_up(int osd) { return exists(osd) && !is_down(osd); }
|
|
bool is_out(int osd) { return out_osds.count(osd); }
|
|
bool is_in(int osd) { return exists(osd) && !is_out(osd); }
|
|
|
|
bool have_inst(int osd) {
|
|
return osd_inst.count(osd);
|
|
}
|
|
const entity_inst_t& get_inst(int osd) {
|
|
assert(osd_inst.count(osd));
|
|
return osd_inst[osd];
|
|
}
|
|
bool get_inst(int osd, entity_inst_t& inst) {
|
|
if (osd_inst.count(osd)) {
|
|
inst = osd_inst[osd];
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void mark_down(int o) { down_osds.insert(o); }
|
|
void mark_up(int o) { down_osds.erase(o); }
|
|
void mark_out(int o) { out_osds.insert(o); }
|
|
void mark_in(int o) { out_osds.erase(o); }
|
|
|
|
|
|
// Advance this map by one epoch using inc.
//
// inc.epoch must be exactly epoch+1 (asserted). If inc carries a full map,
// the whole state is replaced by decoding it; otherwise the individual
// changes are applied: first down / out / old_overload, then up / in /
// new_overload. Each change is asserted to be consistent with the current
// state (e.g. an osd marked up must currently be down).
void apply_incremental(Incremental &inc) {
  typedef map<int32_t,entity_inst_t>::iterator inst_iter;
  typedef list<int32_t>::iterator              osd_iter;
  typedef map<int32_t,float>::iterator         load_iter;

  assert(inc.epoch == epoch+1);
  epoch++;
  mon_epoch = inc.mon_epoch;
  ctime = inc.ctime;

  // full map?
  if (inc.fullmap.length()) {
    decode(inc.fullmap);
    return;
  }

  // nope, incremental.

  // osds going down: record them and forget their instance
  for (inst_iter p = inc.new_down.begin(); p != inc.new_down.end(); ++p) {
    const int32_t osd = p->first;
    assert(down_osds.count(osd) == 0);
    down_osds.insert(osd);
    assert(osd_inst.count(osd) == 0 ||
           osd_inst[osd] == p->second);
    osd_inst.erase(osd);
  }

  // osds going out
  for (osd_iter p = inc.new_out.begin(); p != inc.new_out.end(); ++p) {
    assert(out_osds.count(*p) == 0);
    out_osds.insert(*p);
  }

  // osds that are no longer overloaded
  for (osd_iter p = inc.old_overload.begin(); p != inc.old_overload.end(); ++p) {
    assert(overload_osds.count(*p));
    overload_osds.erase(*p);
  }

  // osds coming up: must currently be down and have no instance recorded
  for (inst_iter p = inc.new_up.begin(); p != inc.new_up.end(); ++p) {
    const int32_t osd = p->first;
    assert(down_osds.count(osd));
    down_osds.erase(osd);
    assert(osd_inst.count(osd) == 0);
    osd_inst[osd] = p->second;
  }

  // osds coming in
  for (osd_iter p = inc.new_in.begin(); p != inc.new_in.end(); ++p) {
    assert(out_osds.count(*p));
    out_osds.erase(*p);
  }

  // new or updated overload values
  for (load_iter p = inc.new_overload.begin(); p != inc.new_overload.end(); ++p) {
    overload_osds[p->first] = p->second;
  }
}
|
|
|
|
// serialize, unserialize
|
|
void encode(bufferlist& blist) {
|
|
::_encode(epoch, blist);
|
|
::_encode(mon_epoch, blist);
|
|
::_encode(ctime, blist);
|
|
::_encode(pg_num, blist);
|
|
::_encode(localized_pg_num, blist);
|
|
|
|
::_encode(osds, blist);
|
|
::_encode(down_osds, blist);
|
|
::_encode(out_osds, blist);
|
|
::_encode(overload_osds, blist);
|
|
::_encode(osd_inst, blist);
|
|
|
|
crush._encode(blist);
|
|
}
|
|
|
|
// unserialize
//
// Replace this map's state with the contents of blist, reading from offset
// 0 in the same field order encode() writes.
void decode(bufferlist& blist) {
  int pos = 0;

  // identity + pg sizing
  ::_decode(epoch, blist, pos);
  ::_decode(mon_epoch, blist, pos);
  ::_decode(ctime, blist, pos);
  ::_decode(pg_num, blist, pos);
  ::_decode(localized_pg_num, blist, pos);
  calc_pg_masks();   // masks are derived, not stored

  // osd state
  ::_decode(osds, blist, pos);
  ::_decode(down_osds, blist, pos);
  ::_decode(out_osds, blist, pos);
  ::_decode(overload_osds, blist, pos);
  ::_decode(osd_inst, blist, pos);

  // crush map
  crush._decode(blist, pos);
}
|
|
|
|
|
|
|
|
|
|
/**** mapping facilities ****/
|
|
|
|
// oid -> pg
|
|
ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) {
|
|
return make_object_layout(oid, layout.pg_type, layout.pg_size, layout.preferred, layout.object_stripe_unit);
|
|
}
|
|
|
|
// Build an ObjectLayout for oid.
//
// The placement seed (ps) is derived from oid.bno / oid.ino according to
// g_conf.osd_object_layout and reduced with stable_mod() against
// pg_num / pg_num_mask. The result wraps pg_t(pg_type, pg_size, ps,
// preferred) together with object_stripe_unit.
//
// An unknown g_conf.osd_object_layout value trips an assert.
ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
  static crush::Hash H(777);

  // calculate ps (placement seed)
  // Initialized so that ps is never read uninitialized if the default
  // branch is reached with assertions compiled out.
  ps_t ps = 0;
  switch (g_conf.osd_object_layout) {
  case OBJECT_LAYOUT_LINEAR:
    ps = stable_mod(oid.bno + oid.ino, pg_num, pg_num_mask);
    break;

  case OBJECT_LAYOUT_HASHINO:
    ps = stable_mod(oid.bno + H(oid.ino), pg_num, pg_num_mask);
    break;

  case OBJECT_LAYOUT_HASH:
    ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), pg_num, pg_num_mask);
    break;

  default:
    assert(0);
  }

  // construct object layout
  return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred),
                      object_stripe_unit);
}
|
|
|
|
|
|
// pg -> (osd list)
|
|
int pg_to_osds(pg_t pg,
|
|
vector<int>& osds) { // list of osd addr's
|
|
// map to osds[]
|
|
switch (g_conf.osd_pg_layout) {
|
|
case PG_LAYOUT_CRUSH:
|
|
{
|
|
// what crush rule?
|
|
int rule;
|
|
if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size());
|
|
else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size());
|
|
else assert(0);
|
|
|
|
// forcefeed?
|
|
int forcefeed = -1;
|
|
if (pg.preferred() >= 0 &&
|
|
out_osds.count(pg.preferred()) == 0)
|
|
forcefeed = pg.preferred();
|
|
crush.do_rule(crush.rules[rule],
|
|
pg.ps(),
|
|
osds,
|
|
out_osds, overload_osds,
|
|
forcefeed);
|
|
}
|
|
break;
|
|
|
|
case PG_LAYOUT_LINEAR:
|
|
for (int i=0; i<pg.size(); i++)
|
|
osds.push_back( (i + pg.ps()*pg.size()) % g_conf.num_osd );
|
|
break;
|
|
|
|
case PG_LAYOUT_HYBRID:
|
|
{
|
|
static crush::Hash H(777);
|
|
int h = H(pg.ps());
|
|
for (int i=0; i<pg.size(); i++)
|
|
osds.push_back( (h+i) % g_conf.num_osd );
|
|
}
|
|
break;
|
|
|
|
case PG_LAYOUT_HASH:
|
|
{
|
|
static crush::Hash H(777);
|
|
for (int i=0; i<pg.size(); i++) {
|
|
int t = 1;
|
|
int osd = 0;
|
|
while (t++) {
|
|
osd = H(i, pg.ps(), t) % g_conf.num_osd;
|
|
int j = 0;
|
|
for (; j<i; j++)
|
|
if (osds[j] == osd) break;
|
|
if (j == i) break;
|
|
}
|
|
osds.push_back(osd);
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
|
|
// no crush, but forcefeeding?
|
|
if (pg.preferred() >= 0 &&
|
|
g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) {
|
|
int osd = pg.preferred();
|
|
|
|
// already in there?
|
|
if (osds.empty()) {
|
|
osds.push_back(osd);
|
|
} else {
|
|
assert(pg.size() > 0);
|
|
for (int i=1; i<pg.size(); i++)
|
|
if (osds[i] == osd) {
|
|
// swap with position 0
|
|
osds[i] = osds[0];
|
|
}
|
|
osds[0] = osd;
|
|
}
|
|
|
|
if (is_out(osd))
|
|
osds.erase(osds.begin()); // oops, but it's down!
|
|
}
|
|
|
|
return osds.size();
|
|
}
|
|
|
|
// pg -> (up osd list)
|
|
int pg_to_acting_osds(pg_t pg,
|
|
vector<int>& osds) { // list of osd addr's
|
|
// get rush list
|
|
vector<int> raw;
|
|
pg_to_osds(pg, raw);
|
|
|
|
osds.clear();
|
|
for (unsigned i=0; i<raw.size(); i++) {
|
|
if (is_down(raw[i])) continue;
|
|
osds.push_back( raw[i] );
|
|
}
|
|
return osds.size();
|
|
}
|
|
|
|
|
|
|
|
// pg -> primary osd
|
|
int get_pg_primary(pg_t pg) {
|
|
vector<int> group;
|
|
int nrep = pg_to_osds(pg, group);
|
|
if (nrep)
|
|
return group[0];
|
|
return -1; // we fail!
|
|
}
|
|
|
|
// pg -> acting primary osd
|
|
int get_pg_acting_primary(pg_t pg) {
|
|
vector<int> group;
|
|
int nrep = pg_to_acting_osds(pg, group);
|
|
if (nrep > 0)
|
|
return group[0];
|
|
return -1; // we fail!
|
|
}
|
|
int get_pg_acting_tail(pg_t pg) {
|
|
vector<int> group;
|
|
int nrep = pg_to_acting_osds(pg, group);
|
|
if (nrep > 0)
|
|
return group[group.size()-1];
|
|
return -1; // we fail!
|
|
}
|
|
|
|
|
|
/*
 * what replica # is a given osd? 0 primary, -1 for none.
 *
 * Searches the first nrep entries of acting for osd and returns its index.
 * nrep == 0 means "search all of acting". nrep is clamped to acting.size()
 * so an oversized count can never index past the end of the vector; a
 * negative nrep searches nothing and yields -1.
 */
int calc_pg_rank(int osd, std::vector<int>& acting, int nrep=0) {
  const int have = static_cast<int>(acting.size());
  if (nrep == 0 || nrep > have)
    nrep = have;
  for (int i=0; i<nrep; i++)
    if (acting[i] == osd) return i;
  return -1;
}
|
|
int calc_pg_role(int osd, vector<int>& acting, int nrep=0) {
|
|
if (!nrep) nrep = acting.size();
|
|
int rank = calc_pg_rank(osd, acting, nrep);
|
|
|
|
if (rank < 0) return PG_ROLE_STRAY;
|
|
else if (rank == 0) return PG_ROLE_HEAD;
|
|
else if (rank == 1) return PG_ROLE_ACKER;
|
|
else return PG_ROLE_MIDDLE;
|
|
}
|
|
|
|
/* role of osd within the pg's pg_to_osds() list (a PG_ROLE_* value) */
int get_pg_role(pg_t pg, int osd) {
  vector<int> members;
  const int count = pg_to_osds(pg, members);
  return calc_pg_role(osd, members, count);
}

/* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
int get_pg_acting_rank(pg_t pg, int osd) {
  vector<int> members;
  const int count = pg_to_acting_osds(pg, members);
  return calc_pg_rank(osd, members, count);
}

/* role within the acting list: -1 (stray), 0 (head), 1 (acker), 2 (middle) */
int get_pg_acting_role(pg_t pg, int osd) {
  vector<int> members;
  const int count = pg_to_acting_osds(pg, members);
  return calc_pg_role(osd, members, count);
}
|
|
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
#endif
|