mirror of
https://github.com/ceph/ceph
synced 2025-01-03 09:32:43 +00:00
0d081ba016
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1107 29311d96-e01e-0410-9327-a35deaab8ce9
517 lines
13 KiB
C++
517 lines
13 KiB
C++
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
|
/*
|
|
* Ceph - scalable distributed file system
|
|
*
|
|
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
|
*
|
|
* This is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License version 2.1, as published by the Free Software
|
|
* Foundation. See file COPYING.
|
|
*
|
|
*/
|
|
|
|
|
|
#ifndef __OSDMAP_H
|
|
#define __OSDMAP_H
|
|
|
|
/*
|
|
* describe properties of the OSD cluster.
|
|
* disks, disk groups, total # osds,
|
|
*
|
|
*/
|
|
#include "config.h"
|
|
#include "include/types.h"
|
|
#include "osd_types.h"
|
|
#include "msg/Message.h"
|
|
#include "common/Mutex.h"
|
|
#include "common/Clock.h"
|
|
|
|
#include "crush/crush.h"
|
|
using namespace crush;
|
|
|
|
#include <vector>
|
|
#include <list>
|
|
#include <set>
|
|
#include <map>
|
|
using namespace std;
|
|
|
|
|
|
/*
|
|
* some system constants
|
|
*/
|
|
|
|
// from LSB to MSB,
|
|
#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG
|
|
#define PG_REP_BITS 6 // up to 64 replicas
|
|
#define PG_TYPE_BITS 2
|
|
#define PG_PS_MASK ((1LL<<PG_PS_BITS)-1)
|
|
|
|
#define PG_TYPE_RAND 1 // default: distribution randomly
|
|
#define PG_TYPE_STARTOSD 2 // place primary on a specific OSD (named by the pg_bits)
|
|
|
|
// pg roles
|
|
#define PG_ROLE_STRAY -1
|
|
#define PG_ROLE_HEAD 0
|
|
#define PG_ROLE_ACKER 1
|
|
#define PG_ROLE_MIDDLE 2 // der.. misnomer
|
|
//#define PG_ROLE_TAIL 2
|
|
|
|
|
|
|
|
/** OSDMap
|
|
*/
|
|
class OSDMap {
|
|
|
|
public:
|
|
class Incremental {
|
|
public:
|
|
epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
|
|
epoch_t mon_epoch; // monitor epoch (election iteration)
|
|
utime_t ctime;
|
|
map<int,entity_inst_t> new_up;
|
|
map<int,entity_inst_t> new_down;
|
|
list<int> new_in;
|
|
list<int> new_out;
|
|
map<int,float> new_overload; // updated overload value
|
|
list<int> old_overload; // no longer overload
|
|
|
|
void encode(bufferlist& bl) {
|
|
bl.append((char*)&epoch, sizeof(epoch));
|
|
bl.append((char*)&mon_epoch, sizeof(mon_epoch));
|
|
bl.append((char*)&ctime, sizeof(ctime));
|
|
::_encode(new_up, bl);
|
|
::_encode(new_down, bl);
|
|
::_encode(new_in, bl);
|
|
::_encode(new_out, bl);
|
|
::_encode(new_overload, bl);
|
|
}
|
|
void decode(bufferlist& bl, int& off) {
|
|
bl.copy(off, sizeof(epoch), (char*)&epoch);
|
|
off += sizeof(epoch);
|
|
bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
|
|
off += sizeof(mon_epoch);
|
|
bl.copy(off, sizeof(ctime), (char*)&ctime);
|
|
off += sizeof(ctime);
|
|
::_decode(new_up, bl, off);
|
|
::_decode(new_down, bl, off);
|
|
::_decode(new_in, bl, off);
|
|
::_decode(new_out, bl, off);
|
|
::_decode(new_overload, bl, off);
|
|
}
|
|
|
|
Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {}
|
|
};
|
|
|
|
private:
|
|
epoch_t epoch; // what epoch of the osd cluster descriptor is this
|
|
epoch_t mon_epoch; // monitor epoch (election iteration)
|
|
utime_t ctime; // epoch start time
|
|
int pg_bits; // placement group bits
|
|
int localized_pg_bits; // bits for localized pgs
|
|
|
|
set<int> osds; // all osds
|
|
set<int> down_osds; // list of down disks
|
|
set<int> out_osds; // list of unmapped disks
|
|
map<int,float> overload_osds;
|
|
map<int,entity_inst_t> osd_inst;
|
|
|
|
public:
|
|
Crush crush; // hierarchical map
|
|
|
|
friend class OSDMonitor;
|
|
friend class MDS;
|
|
|
|
public:
|
|
// A fresh map starts at epoch 0 / monitor epoch 0, with 5 placement-group
// bits and 3 localized-pg bits.
OSDMap()
  : epoch(0),
    mon_epoch(0),
    pg_bits(5),
    localized_pg_bits(3)
{}
|
|
|
|
// map info
|
|
epoch_t get_epoch() const { return epoch; }
|
|
void inc_epoch() { epoch++; }
|
|
|
|
int get_pg_bits() const { return pg_bits; }
|
|
void set_pg_bits(int b) { pg_bits = b; }
|
|
int get_localized_pg_bits() const { return localized_pg_bits; }
|
|
|
|
const utime_t& get_ctime() const { return ctime; }
|
|
|
|
bool is_mkfs() const { return epoch == 1; }
|
|
//void set_mkfs() { assert(epoch == 1); }
|
|
|
|
/***** cluster state *****/
|
|
int num_osds() { return osds.size(); }
|
|
void get_all_osds(set<int>& ls) { ls = osds; }
|
|
|
|
const set<int>& get_osds() { return osds; }
|
|
const set<int>& get_down_osds() { return down_osds; }
|
|
const set<int>& get_out_osds() { return out_osds; }
|
|
const map<int,float>& get_overload_osds() { return overload_osds; }
|
|
|
|
bool is_down(int osd) { return down_osds.count(osd); }
|
|
bool is_up(int osd) { return !is_down(osd); }
|
|
bool is_out(int osd) { return out_osds.count(osd); }
|
|
bool is_in(int osd) { return !is_out(osd); }
|
|
|
|
// Return the instance recorded for osd.  The caller must only ask for
// an osd that has an entry in osd_inst; this is checked by assert.
const entity_inst_t& get_inst(int osd)
{
  assert(osd_inst.find(osd) != osd_inst.end());
  return osd_inst[osd];
}
|
|
bool get_inst(int osd, entity_inst_t& inst) {
|
|
if (osd_inst.count(osd)) {
|
|
inst = osd_inst[osd];
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void mark_down(int o) { down_osds.insert(o); }
|
|
void mark_up(int o) { down_osds.erase(o); }
|
|
void mark_out(int o) { out_osds.insert(o); }
|
|
void mark_in(int o) { out_osds.erase(o); }
|
|
|
|
|
|
// Advance this map by one epoch using inc.
//
// inc must describe exactly the next epoch (asserted).  Each change is
// asserted to be a real transition: an osd coming up must currently be
// down and have no recorded instance, an osd going down must currently
// be up, and so on.
void apply_incremental(Incremental &inc)
{
  assert(inc.epoch == epoch+1);
  epoch++;
  mon_epoch = inc.mon_epoch;
  ctime = inc.ctime;

  typedef map<int,entity_inst_t>::iterator inst_iter;
  typedef list<int>::iterator id_iter;
  typedef map<int,float>::iterator load_iter;

  // osds coming up: leave the down set and record their instance
  for (inst_iter up = inc.new_up.begin(); up != inc.new_up.end(); up++) {
    const int osd = up->first;
    assert(down_osds.count(osd));
    down_osds.erase(osd);
    assert(osd_inst.count(osd) == 0);
    osd_inst[osd] = up->second;
    //cout << "epoch " << epoch << " up osd" << osd << endl;
  }

  // osds going down: enter the down set and forget their instance.
  // if an instance is still recorded, it must match the one in inc.
  for (inst_iter dn = inc.new_down.begin(); dn != inc.new_down.end(); dn++) {
    const int osd = dn->first;
    assert(down_osds.count(osd) == 0);
    down_osds.insert(osd);
    assert(osd_inst.count(osd) == 0 ||
           osd_inst[osd] == dn->second);
    osd_inst.erase(osd);
    //cout << "epoch " << epoch << " down osd" << osd << endl;
  }

  // osds mapped back in
  for (id_iter in = inc.new_in.begin(); in != inc.new_in.end(); in++) {
    assert(out_osds.count(*in));
    out_osds.erase(*in);
    //cout << "epoch " << epoch << " in osd" << *in << endl;
  }

  // osds mapped out
  for (id_iter out = inc.new_out.begin(); out != inc.new_out.end(); out++) {
    assert(out_osds.count(*out) == 0);
    out_osds.insert(*out);
    //cout << "epoch " << epoch << " out osd" << *out << endl;
  }

  // new or updated overload values
  for (load_iter ov = inc.new_overload.begin();
       ov != inc.new_overload.end();
       ov++) {
    overload_osds[ov->first] = ov->second;
  }

  // osds that are no longer overloaded
  for (id_iter old = inc.old_overload.begin();
       old != inc.old_overload.end();
       old++) {
    assert(overload_osds.count(*old));
    overload_osds.erase(*old);
  }
}
|
|
|
|
// serialize, unserialize
|
|
// Serialize the map by appending to blist.  The field order here must
// match decode().
// NOTE(review): localized_pg_bits is not written here - confirm that is
// intentional.
void encode(bufferlist& blist)
{
  // fixed-size fields, appended as the raw bytes of each member
  blist.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
  blist.append(reinterpret_cast<char*>(&mon_epoch), sizeof(mon_epoch));
  blist.append(reinterpret_cast<char*>(&ctime), sizeof(ctime));
  blist.append(reinterpret_cast<char*>(&pg_bits), sizeof(pg_bits));

  // osd sets and per-osd state
  _encode(osds, blist);
  _encode(down_osds, blist);
  _encode(out_osds, blist);
  _encode(overload_osds, blist);
  _encode(osd_inst, blist);

  // crush map goes last
  crush._encode(blist);
}
|
|
|
|
// Rebuild the map from blist, reading from offset 0.  The field order
// here must match encode().
// NOTE(review): localized_pg_bits is not read here and keeps whatever
// value this object already had.
void decode(bufferlist& blist)
{
  int off = 0;

  // fixed-size fields, copied straight into each member
  blist.copy(off, sizeof(epoch), reinterpret_cast<char*>(&epoch));
  off += sizeof(epoch);

  blist.copy(off, sizeof(mon_epoch), reinterpret_cast<char*>(&mon_epoch));
  off += sizeof(mon_epoch);

  blist.copy(off, sizeof(ctime), reinterpret_cast<char*>(&ctime));
  off += sizeof(ctime);

  blist.copy(off, sizeof(pg_bits), reinterpret_cast<char*>(&pg_bits));
  off += sizeof(pg_bits);

  // osd sets and per-osd state
  _decode(osds, blist, off);
  _decode(down_osds, blist, off);
  _decode(out_osds, blist, off);
  _decode(overload_osds, blist, off);
  _decode(osd_inst, blist, off);

  // crush map goes last
  crush._decode(blist, off);
}
|
|
|
|
|
|
|
|
|
|
/**** mapping facilities ****/
|
|
|
|
// oid -> pg
|
|
// oid -> pg
//
// Pick the placement seed (ps) for an object according to the layout
// policy and wrap it in a pg_t together with layout.num_rep.
//
//   layout.object_layout selects the policy; 0 means "use the
//   configured default" (g_conf.osd_object_layout).
//
// An unrecognized policy is a programming error and asserts (same as
// the default case in pg_to_osds).  ps is also zero-initialized so a
// build with asserts compiled out never reads an indeterminate value;
// previously ps was left uninitialized in that case.
pg_t object_to_pg(object_t oid, FileLayout& layout) {
  static crush::Hash H(777);

  int policy = layout.object_layout;
  if (policy == 0)
    policy = g_conf.osd_object_layout;

  ps_t ps = 0;

  switch (policy) {
  case OBJECT_LAYOUT_LINEAR:
    // bno + ino, truncated to PG_PS_BITS and then to pg_bits
    ps = (oid.bno + oid.ino) & PG_PS_MASK;
    ps &= ((1ULL<<pg_bits)-1ULL);
    break;

  case OBJECT_LAYOUT_HASHINO:
    // bno + hash(ino), truncated the same way
    ps = (oid.bno + H(oid.ino)) & PG_PS_MASK;
    ps &= ((1ULL<<pg_bits)-1ULL);
    break;

  case OBJECT_LAYOUT_HASH:
    // hash over a mix of bno and ino, truncated the same way
    ps = H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ) & PG_PS_MASK;
    ps &= ((1ULL<<pg_bits)-1ULL);
    break;

  case OBJECT_LAYOUT_STARTOSD:
    // the seed is the osd named by the layout; no pg_bits mask here.
    // NOTE(review): the old code also set a local type to
    // PG_TYPE_STARTOSD that was never read, and the pg_t below is built
    // with its second argument 0 - confirm whether layout.osd should be
    // passed there (compare ps_osd_nrep_to_pg).
    ps = layout.osd;
    break;

  default:
    assert(0);
  }

  //cout << "pg " << hex << pg << dec << endl;
  return pg_t(ps, 0, layout.num_rep);
}
|
|
|
|
// (ps, nrep) -> pg
|
|
// Build the pg for placement seed ps with nrep replicas.  The second
// and fourth pg_t constructor arguments are both 0.
pg_t ps_nrep_to_pg(ps_t ps, int nrep)
{
  pg_t result(ps, 0, nrep, 0);
  return result;
}
|
|
// Build the pg for placement seed ps with nrep replicas, tied to a
// specific osd.  The osd is passed to pg_t as osd+1; pg_to_osds reads
// it back as pg.u.fields.preferred-1 and treats values <= 0 as "none".
pg_t ps_osd_nrep_to_pg(ps_t ps, int osd, int nrep)
{
  pg_t result(ps, osd+1, nrep, 0);
  return result;
}
|
|
|
|
// pg -> nrep
|
|
// Replica count stored in the pg.
int pg_to_nrep(pg_t pg)
{
  const int nrep = pg.u.fields.nrep;
  return nrep;
}
|
|
|
|
// pg -> ps
|
|
// Placement seed stored in the pg.
int pg_to_ps(pg_t pg)
{
  const int seed = pg.u.fields.ps;
  return seed;
}
|
|
|
|
// pg -> (osd list)
|
|
// Map a pg to its list of osds according to g_conf.osd_pg_layout.
//
//   pg    placement group to map
//   osds  output; results are appended (the parameter shadows the
//         member of the same name).  The non-crush code below indexes
//         it from 0, so it presumably expects an empty vector on entry.
//
// Returns the resulting size of osds.
int pg_to_osds(pg_t pg,
               vector<int>& osds)
{
  // NOTE(review): the seed is held in a pg_t although pg_to_ps()
  // returns int; left as-is.
  pg_t seed = pg_to_ps(pg);
  int num_rep = pg_to_nrep(pg);
  assert(num_rep > 0);

  // map to osds[]
  switch (g_conf.osd_pg_layout) {
  case PG_LAYOUT_CRUSH:
    {
      // offer the preferred osd to crush only while it is not out
      int forcefeed = -1;
      if (pg.u.fields.preferred > 0 &&
          out_osds.count(pg.u.fields.preferred-1) == 0) {
        forcefeed = pg.u.fields.preferred-1;
      }
      crush.do_rule(crush.rules[num_rep],   // FIXME rule thing.
                    seed,
                    osds,
                    out_osds, overload_osds,
                    forcefeed);
    }
    break;

  case PG_LAYOUT_LINEAR:
    // consecutive osds, starting at seed*num_rep
    for (int rep = 0; rep < num_rep; ++rep) {
      osds.push_back( (rep + seed*num_rep) % g_conf.num_osd );
    }
    break;

  case PG_LAYOUT_HYBRID:
    {
      // consecutive osds, starting at hash(seed)
      static crush::Hash H(777);
      int base = H(seed);
      for (int rep = 0; rep < num_rep; ++rep) {
        osds.push_back( (base+rep) % g_conf.num_osd );
      }
    }
    break;

  case PG_LAYOUT_HASH:
    {
      // independently hashed osd per replica; rehash with the next
      // attempt number while the candidate is already in the list
      static crush::Hash H(777);
      for (int rep = 0; rep < num_rep; ++rep) {
        int attempt = 1;
        int candidate = 0;
        while (attempt++) {
          candidate = H(rep, seed, attempt) % g_conf.num_osd;

          bool taken = false;
          for (int prev = 0; prev < rep; ++prev) {
            if (osds[prev] == candidate) {
              taken = true;
              break;
            }
          }
          if (!taken) break;
        }
        osds.push_back(candidate);
      }
    }
    break;

  default:
    assert(0);
  }

  // for the non-crush layouts, force the preferred osd into slot 0
  if (pg.u.fields.preferred > 0 &&
      g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) {
    int want = pg.u.fields.preferred-1;

    if (osds.empty()) {
      osds.push_back(want);
    } else {
      assert(num_rep > 0);
      // if it is already in the list, move the current head into its
      // slot; either way the preferred osd then overwrites slot 0
      for (int rep = 1; rep < num_rep; ++rep) {
        if (osds[rep] == want) {
          osds[rep] = osds[0];
        }
      }
      osds[0] = want;
    }

    // a preferred osd that is marked out is dropped again
    if (is_out(want)) {
      osds.erase(osds.begin());
    }
  }

  return osds.size();
}
|
|
|
|
// pg -> (up osd list)
|
|
// Map a pg to the osds from pg_to_osds() that are not marked down.
//
//   pg    placement group to map
//   osds  output; cleared first, then filled in mapping order
//
// Returns the number of osds written.
int pg_to_acting_osds(pg_t pg,
                      vector<int>& osds)
{
  // full mapping, regardless of up/down state
  vector<int> mapped;
  pg_to_osds(pg, mapped);

  osds.clear();
  for (unsigned idx = 0; idx < mapped.size(); ++idx) {
    if (!is_down(mapped[idx])) {
      osds.push_back( mapped[idx] );
    }
  }
  return osds.size();
}
|
|
|
|
|
|
|
|
// pg -> primary osd
|
|
// First osd of the pg_to_osds() mapping, or -1 if the mapping is empty.
int get_pg_primary(pg_t pg)
{
  vector<int> mapped;
  int count = pg_to_osds(pg, mapped);
  if (count == 0)
    return -1;  // we fail!
  return mapped[0];
}
|
|
|
|
// pg -> acting primary osd
|
|
// First osd of the pg_to_acting_osds() list, or -1 if that list is
// empty.
int get_pg_acting_primary(pg_t pg)
{
  vector<int> acting;
  int count = pg_to_acting_osds(pg, acting);
  if (count <= 0)
    return -1;  // we fail!
  return acting[0];
}
|
|
// Last osd of the pg_to_acting_osds() list, or -1 if that list is
// empty.
int get_pg_acting_tail(pg_t pg)
{
  vector<int> acting;
  int count = pg_to_acting_osds(pg, acting);
  if (count <= 0)
    return -1;  // we fail!
  return acting.back();
}
|
|
|
|
|
|
/* what replica # is a given osd? 0 primary, -1 for none. */
|
|
// Position of osd within the first nrep entries of acting.
//
//   osd     osd id to look for
//   acting  osd list to search
//   nrep    how many leading entries to consider; 0 means all of them
//
// Returns the index of the first match (0 is the primary), or -1 if
// osd is not among the entries searched.
//
// nrep is clamped to acting.size(): a caller-supplied count larger than
// the list used to read past the end of the vector.
int calc_pg_rank(int osd, vector<int>& acting, int nrep=0) {
  const int have = acting.size();
  if (!nrep || nrep > have)
    nrep = have;

  for (int i = 0; i < nrep; ++i)
    if (acting[i] == osd) return i;
  return -1;
}
|
|
int calc_pg_role(int osd, vector<int>& acting, int nrep=0) {
|
|
if (!nrep) nrep = acting.size();
|
|
int rank = calc_pg_rank(osd, acting, nrep);
|
|
|
|
if (rank < 0) return PG_ROLE_STRAY;
|
|
else if (rank == 0) return PG_ROLE_HEAD;
|
|
else if (rank == 1) return PG_ROLE_ACKER;
|
|
else return PG_ROLE_MIDDLE;
|
|
}
|
|
|
|
// Role (PG_ROLE_*) of osd in the pg_to_osds() mapping of pg.
int get_pg_role(pg_t pg, int osd)
{
  vector<int> mapped;
  const int count = pg_to_osds(pg, mapped);
  return calc_pg_role(osd, mapped, count);
}
|
|
|
|
/* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
|
|
// Rank of osd in the pg_to_acting_osds() list of pg.
int get_pg_acting_rank(pg_t pg, int osd)
{
  vector<int> acting;
  const int count = pg_to_acting_osds(pg, acting);
  return calc_pg_rank(osd, acting, count);
}
|
|
/* role is -1 (stray), 0 (head/primary), 1 (acker), 2 (middle) */
|
|
// Role (PG_ROLE_*) of osd in the pg_to_acting_osds() list of pg.
int get_pg_acting_role(pg_t pg, int osd)
{
  vector<int> acting;
  const int count = pg_to_acting_osds(pg, acting);
  return calc_pg_role(osd, acting, count);
}
|
|
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
#endif
|