Merge pull request #6550 from cernceph/wip-deepscrub

osd: randomize deep scrubbing

Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2015-11-20 06:56:04 -05:00
commit bc7d677185
4 changed files with 41 additions and 9 deletions

View File

@ -686,6 +686,7 @@ OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
OPTION(osd_scrub_auto_repair, OPT_BOOL, false) // whether to auto-repair inconsistencies upon deep-scrubbing
OPTION(osd_scrub_auto_repair_num_errors, OPT_U32, 5) // only auto-repair when the number of errors is below this threshold
OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week; PG::sched_scrub() treats a deep scrub as due once this much time has passed since the last deep scrub stamp
// Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
// are deep).  PG::sched_scrub() tests (rand() % 100) < ratio * 100, so the
// effective granularity is about one percent.
OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT, 0.15)
OPTION(osd_deep_scrub_stride, OPT_INT, 524288) // NOTE(review): presumably a size in bytes (524288 = 512 * 1024) -- confirm the unit
OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60) // objects must be this old (seconds) before we update the whole-object digest on scrub
OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)

View File

@ -1809,6 +1809,15 @@ int OSD::init()
dout(2) << "boot" << dendl;
// Seed daily_loadavg with the current 15-minute load average.
// getloadavg() is asked for three samples; per getloadavg(3) these are the
// 1, 5 and 15 minute averages in that order, so index 2 is the 15-minute one.
double loadavgs[3];
if (getloadavg(loadavgs, 3) == 3) {
daily_loadavg = loadavgs[2];
} else {
// NOTE(review): the explicit "\n" ahead of dendl is probably redundant -- confirm that dendl already terminates the log entry.
derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
// Load average unavailable: start the running average from a fixed value.
daily_loadavg = 1.0;
}
// read superblock
r = read_superblock();
if (r < 0) {
@ -3862,8 +3871,12 @@ void OSD::heartbeat()
// get CPU load avg
double loadavgs[1];
// NOTE(review): the next line repeats the getloadavg() test that appears two
// lines further down; it looks like the pre-change line of this diff hunk shown
// without its removal marker -- confirm against the real file.
if (getloadavg(loadavgs, 1) == 1)
// Number of samples the running average below is spread over: 86400 (the
// number of seconds in a day) divided by osd_heartbeat_interval -- presumably
// the number of heartbeat() calls per day, assuming the interval is in seconds.
// NOTE(review): there is no guard here.  An interval of 0 divides by zero on
// this line, and (if the option is an integer) an interval above 86400 makes
// n_samples 0, which divides by zero in the daily_loadavg update below.
int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
if (getloadavg(loadavgs, 1) == 1) {
// Publish the single sample requested from getloadavg(), scaled by 100.
logger->set(l_osd_loadavg, 100 * loadavgs[0]);
// Running average: the new sample contributes 1/n_samples of the result and
// the previous daily_loadavg contributes the remaining (n_samples - 1)/n_samples.
daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
}
dout(30) << "heartbeat checking stats" << dendl;
@ -6065,23 +6078,35 @@ bool OSD::scrub_time_permit(utime_t now)
/**
 * Decide whether the current system load permits starting a scrub.
 *
 * Returns false when getloadavg() cannot supply the requested samples.
 * Otherwise returns true when loadavgs[0] (the 1-minute average, per
 * getloadavg(3)) is below osd_scrub_load_threshold, or when it is below both
 * daily_loadavg and loadavgs[2] (the 15-minute average), i.e. the load is
 * under the daily average and currently decreasing.  Returns false in every
 * other case.  Each outcome is logged through dout.
 *
 * NOTE(review): this listing appears to interleave the pre-change and
 * post-change lines of a diff hunk without +/- markers: loadavgs is declared
 * twice, getloadavg() is tested twice, and the old ">= max ... = no, load too
 * high" branch sits directly before the new "< max" branch.  Read literally,
 * the braces do not balance.  Reconcile against the real file before relying
 * on the exact statement order shown here.
 */
bool OSD::scrub_load_below_threshold()
{
// NOTE(review): one-sample declaration and check -- likely the pre-change lines.
double loadavgs[1];
if (getloadavg(loadavgs, 1) != 1) {
// Three samples are requested so that the 15-minute average (index 2) is available below.
double loadavgs[3];
if (getloadavg(loadavgs, 3) != 3) {
dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
return false;
}
// NOTE(review): threshold-exceeded early return -- likely the pre-change branch.
if (loadavgs[0] >= cct->_conf->osd_scrub_load_threshold) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " >= max " << cct->_conf->osd_scrub_load_threshold
<< " = no, load too high" << dendl;
return false;
} else {
// allow scrub if below configured threshold
if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " < max " << cct->_conf->osd_scrub_load_threshold
<< " = yes" << dendl;
return true;
}
// allow scrub if below daily avg and currently decreasing
// ("decreasing" is judged by the 1-minute average being under the 15-minute one)
if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " < daily_loadavg " << daily_loadavg
<< " and < 15m avg " << loadavgs[2]
<< " = yes" << dendl;
return true;
}
// Neither condition held: log all three reference values and refuse.
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " >= max " << cct->_conf->osd_scrub_load_threshold
<< " and ( >= daily_loadavg " << daily_loadavg
<< " or >= 15m avg " << loadavgs[2]
<< ") = no" << dendl;
return false;
}
void OSD::sched_scrub()

View File

@ -1505,6 +1505,7 @@ private:
Messenger *hb_front_server_messenger;
Messenger *hb_back_server_messenger;
utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state
double daily_loadavg; ///< running average of the system load: seeded in OSD::init(), updated in OSD::heartbeat(), read by scrub_load_below_threshold()
void _add_heartbeat_peer(int p);
void _remove_heartbeat_peer(int p);

View File

@ -3246,6 +3246,11 @@ bool PG::sched_scrub()
// A deep scrub is due by time once osd_deep_scrub_interval has elapsed since
// the last deep scrub stamp recorded in info.history.
bool time_for_deep = (ceph_clock_now(cct) >=
info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);
// Random promotion: rand() % 100 yields 0..99, so this is true for roughly
// osd_deep_scrub_randomize_ratio of calls (in steps of about one percent).
bool deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
// Logged before the two flags are merged, so time_for_deep printed here is
// still the purely time-based value.
dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
// From here on time_for_deep means "deep scrub wanted", whether by age or by coin flip.
time_for_deep = (time_for_deep || deep_coin_flip);
//NODEEP_SCRUB so ignore time initiated deep-scrub
if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))