osd: Adjust failure reporting.

When a failure report is sent to the mon, the failed OSD is added to
failure_pending. If the OSD gets a heartbeat from an OSD in failure_pending,
it sends an MOSDFailure message repealing the previous failure report.
If an OSD is marked as failed but a message hasn't been sent yet, it's simply
removed from the failure queue (failure_queue).
This commit is contained in:
Greg Farnum 2010-08-05 09:48:41 -07:00
parent 83e58257a9
commit d979e48d93
2 changed files with 18 additions and 0 deletions

View File

@ -1334,6 +1334,12 @@ void OSD::handle_osd_ping(MOSDPing *m)
}
heartbeat_from_stamp[from] = g_clock.now(); // don't let _my_ lag interfere.
// remove from failure lists if needed
if (failure_pending.count(from)) {
send_still_alive(from);
failure_pending.erase(from);
}
failure_queue.erase(from);
} else {
dout(10) << " ignoring " << m->get_source_inst() << dendl;
}
@ -1376,6 +1382,7 @@ void OSD::heartbeat_check()
<< " since " << heartbeat_from_stamp[p->first]
<< " (cutoff " << grace << ")" << dendl;
queue_failure(p->first);
}
}
}
@ -1620,9 +1627,18 @@ void OSD::send_failures()
int osd = *failure_queue.begin();
monc->send_mon_message(new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd), osdmap->get_epoch()));
failure_queue.erase(osd);
failure_pending.insert(osd);
}
}
void OSD::send_still_alive(int osd)
{
MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd),
osdmap->get_epoch());
m->is_failed = false;
monc->send_mon_message(m);
}
void OSD::send_pg_stats()
{
assert(osd_lock.is_locked());

View File

@ -546,10 +546,12 @@ protected:
set<int> failure_queue;
set<int> failure_pending;
// Add osd `n` to failure_queue. send_failures() drains that set, sending an
// MOSDFailure for each entry and moving it to failure_pending.
void queue_failure(int n)
{
  failure_queue.insert(n);
}
void send_failures();
void send_still_alive(int osd);
// -- pg stats --
Mutex pg_stat_queue_lock;