osd: maintain up_epoch AND boot_epoch; revise OSDSuperblock accordingly

In order to make the superblock clean interval meaningful after we
are marked down and then up again (over the life of a single
cosd process instance), we track both boot_epoch and up_epoch,
and keep [boot_epoch,clean_thru] in the superblock.

This avoids seeing crashed pgs when an osd is wrongly marked down
and the osd marks itself up again.
This commit is contained in:
Sage Weil 2009-05-08 14:04:03 -07:00
parent 18851ce0f5
commit 026166ff53
3 changed files with 33 additions and 26 deletions

View File

@ -223,7 +223,7 @@ OSD::OSD(int id, Messenger *m, Messenger *hbm, MonMap *mm, const char *dev, cons
logclient(messenger, monmap),
whoami(id),
dev_path(dev), journal_path(jdev),
state(STATE_BOOTING), boot_epoch(0),
state(STATE_BOOTING), boot_epoch(0), up_epoch(0),
op_tp("OSD::op_tp", g_conf.osd_maxthreads),
recovery_tp("OSD::recovery_tp", 1),
disk_tp("OSD::disk_tp", 2),
@ -467,8 +467,8 @@ int OSD::shutdown()
// note unmount epoch
dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
superblock.epoch_mounted = boot_epoch;
superblock.epoch_unmounted = osdmap->get_epoch();
superblock.mounted = boot_epoch;
superblock.clean_thru = osdmap->get_epoch();
ObjectStore::Transaction t;
write_superblock(t);
store->apply_transaction(t);
@ -1915,16 +1915,17 @@ void OSD::handle_osd_map(MOSDMap *m)
dout(0) << "map says i am down. switching to boot state." << dendl;
//shutdown();
// note in the superblock that we were clean up until this point.
superblock.epoch_mounted = boot_epoch;
superblock.epoch_unmounted = osdmap->get_epoch();
state = STATE_BOOTING;
boot_epoch = 0;
up_epoch = 0;
reset_heartbeat_peers();
}
// note in the superblock that we were clean thru the prior epoch
if (boot_epoch && boot_epoch >= superblock.mounted) {
superblock.mounted = boot_epoch;
superblock.clean_thru = osdmap->get_epoch();
}
// superblock and commit
write_superblock(t);
@ -1957,11 +1958,15 @@ void OSD::advance_map(ObjectStore::Transaction& t, interval_set<snapid_t>& remov
<< " removed_snaps " << removed_snaps
<< dendl;
if (!boot_epoch &&
if (!up_epoch &&
osdmap->is_up(whoami) &&
osdmap->get_inst(whoami) == messenger->get_myinst()) {
boot_epoch = osdmap->get_epoch();
dout(10) << "my boot_epoch is " << boot_epoch << dendl;
up_epoch = osdmap->get_epoch();
dout(10) << "up_epoch is " << up_epoch << dendl;
if (!boot_epoch) {
boot_epoch = osdmap->get_epoch();
dout(10) << "boot_epoch is " << boot_epoch << dendl;
}
}
// scan pg creations
@ -2334,8 +2339,8 @@ bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
return false;
}
if (epoch < boot_epoch) {
dout(7) << "from pre-boot epoch " << epoch << " < " << boot_epoch << dendl;
if (epoch < up_epoch) {
dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
delete m;
return false;
}
@ -3535,8 +3540,8 @@ void OSD::handle_op(MOSDOp *op)
void OSD::handle_sub_op(MOSDSubOp *op)
{
dout(10) << "handle_sub_op " << *op << " epoch " << op->map_epoch << dendl;
if (op->map_epoch < boot_epoch) {
dout(3) << "replica op from before boot" << dendl;
if (op->map_epoch < up_epoch) {
dout(3) << "replica op from before up" << dendl;
delete op;
return;
}
@ -3583,8 +3588,8 @@ void OSD::handle_sub_op(MOSDSubOp *op)
}
void OSD::handle_sub_op_reply(MOSDSubOpReply *op)
{
if (op->get_map_epoch() < boot_epoch) {
dout(3) << "replica op reply from before boot" << dendl;
if (op->get_map_epoch() < up_epoch) {
dout(3) << "replica op reply from before up" << dendl;
delete op;
return;
}

View File

@ -144,7 +144,8 @@ public:
private:
int state;
epoch_t boot_epoch;
epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started)
epoch_t up_epoch; // _most_recent_ epoch we were marked up
public:
bool is_booting() { return state == STATE_BOOTING; }

View File

@ -630,13 +630,14 @@ public:
epoch_t oldest_map, newest_map; // oldest/newest maps we have.
double weight;
epoch_t epoch_unmounted; // last epoch i cleanly unmounted
epoch_t epoch_mounted; // ...and the epoch i originally mounted it
// last interval over which i mounted and was then active
epoch_t mounted; // last epoch i mounted
epoch_t clean_thru; // epoch i was active and clean thru
OSDSuperblock() :
whoami(-1),
current_epoch(0), oldest_map(0), newest_map(0), weight(0),
epoch_unmounted(0), epoch_mounted(0) {
mounted(0), clean_thru(0) {
memset(&fsid, 0, sizeof(fsid));
}
@ -648,8 +649,8 @@ public:
::encode(oldest_map, bl);
::encode(newest_map, bl);
::encode(weight, bl);
::encode(epoch_unmounted, bl);
::encode(epoch_mounted, bl);
::encode(clean_thru, bl);
::encode(mounted, bl);
}
void decode(bufferlist::iterator &bl) {
::decode(magic, bl);
@ -659,8 +660,8 @@ public:
::decode(oldest_map, bl);
::decode(newest_map, bl);
::decode(weight, bl);
::decode(epoch_unmounted, bl);
::decode(epoch_mounted, bl);
::decode(clean_thru, bl);
::decode(mounted, bl);
}
};
WRITE_CLASS_ENCODER(OSDSuperblock)
@ -671,7 +672,7 @@ inline ostream& operator<<(ostream& out, OSDSuperblock& sb)
<< " osd" << sb.whoami
<< " e" << sb.current_epoch
<< " [" << sb.oldest_map << "," << sb.newest_map << "]"
<< " lci=[" << sb.epoch_mounted << "," << sb.epoch_unmounted << "]"
<< " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
<< ")";
}