Merge pull request #4737 from kylinstorage/wip-temp-based-object-eviction

osd: improve temperature calculation for cache tier agent

Reviewed-by: Sage Weil
This commit is contained in:
Sage Weil 2015-11-13 15:39:54 -05:00
commit 8d3082df78
11 changed files with 151 additions and 82 deletions

View File

@ -884,7 +884,7 @@ Only for tiered pools::
ceph osd pool get <poolname> hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|
target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|
cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|
min_read_recency_for_promote
min_read_recency_for_promote|hit_set_grade_decay_rate|hit_set_search_last_n
Only for erasure coded pools::
@ -934,7 +934,8 @@ Usage::
target_max_bytes|target_max_objects|cache_target_dirty_ratio|
cache_target_dirty_high_ratio|
cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|
min_read_recency_for_promote|write_fadvise_dontneed
min_read_recency_for_promote|write_fadvise_dontneed|hit_set_grade_decay_rate|
hit_set_search_last_n
<val> {--yes-i-really-mean-it}
Subcommand ``set-quota`` sets object or byte limit on pool.

View File

@ -441,6 +441,20 @@ You may set values for the following keys:
.. _cache_min_flush_age:
``hit_set_grade_decay_rate``
:Description: Temperature decay rate between two successive hit_sets
:Type: Integer
:Valid Range: 0 - 100
:Default: ``20``
``hit_set_search_last_n``
:Description: Count at most N appearances in hit_sets for temperature calculation
:Type: Integer
:Valid Range: 0 - hit_set_count
:Default: ``1``
``cache_min_flush_age``
:Description: The time (in seconds) before the cache tiering agent will flush

View File

@ -597,6 +597,8 @@ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write)
OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT, 20)
OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT, 1)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_max_advance, OPT_INT, 150) // make this < cache_size!

View File

@ -674,11 +674,11 @@ COMMAND("osd pool rename " \
"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
COMMAND("osd pool get " \
"name=pool,type=CephPoolname " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read", \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n", \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_promote|fast_read " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")

View File

@ -2885,7 +2885,8 @@ namespace {
CACHE_TARGET_FULL_RATIO,
CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ};
MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N};
std::set<osd_pool_get_choices>
subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@ -3359,16 +3360,18 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
("erasure_code_profile", ERASURE_CODE_PROFILE)
("min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE)
("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE)
("fast_read", FAST_READ);
("fast_read", FAST_READ)
("hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE)
("hit_set_search_last_n", HIT_SET_SEARCH_LAST_N);
typedef std::set<osd_pool_get_choices> choices_set_t;
const choices_set_t ONLY_TIER_CHOICES = boost::assign::list_of
(HIT_SET_TYPE)(HIT_SET_PERIOD)(HIT_SET_COUNT)(HIT_SET_FPP)
(TARGET_MAX_OBJECTS)(TARGET_MAX_BYTES)(CACHE_TARGET_FULL_RATIO)
(CACHE_TARGET_DIRTY_RATIO)(CACHE_TARGET_DIRTY_HIGH_RATIO)(CACHE_MIN_FLUSH_AGE)
(CACHE_MIN_EVICT_AGE)(MIN_READ_RECENCY_FOR_PROMOTE);
(CACHE_TARGET_DIRTY_RATIO)(CACHE_TARGET_DIRTY_HIGH_RATIO)
(CACHE_MIN_FLUSH_AGE)(CACHE_MIN_EVICT_AGE)(MIN_READ_RECENCY_FOR_PROMOTE)
(HIT_SET_GRADE_DECAY_RATE)(HIT_SET_SEARCH_LAST_N);
const choices_set_t ONLY_ERASURE_CHOICES = boost::assign::list_of
(ERASURE_CODE_PROFILE);
@ -3530,6 +3533,14 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
case FAST_READ:
f->dump_int("fast_read", p->fast_read);
break;
case HIT_SET_GRADE_DECAY_RATE:
f->dump_int("hit_set_grade_decay_rate",
p->hit_set_grade_decay_rate);
break;
case HIT_SET_SEARCH_LAST_N:
f->dump_int("hit_set_search_last_n",
p->hit_set_search_last_n);
break;
}
f->close_section();
f->flush(rdata);
@ -3620,6 +3631,14 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
ss << "min_read_recency_for_promote: " <<
p->min_read_recency_for_promote << "\n";
break;
case HIT_SET_GRADE_DECAY_RATE:
ss << "hit_set_grade_decay_rate: " <<
p->hit_set_grade_decay_rate << "\n";
break;
case HIT_SET_SEARCH_LAST_N:
ss << "hit_set_search_last_n: " <<
p->hit_set_search_last_n << "\n";
break;
case HASHPSPOOL:
case NODELETE:
case NOPGCHANGE:
@ -4734,8 +4753,8 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
var == "target_max_objects" || var == "target_max_bytes" ||
var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
var == "cache_target_dirty_high_ratio" ||
var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n")) {
return -EACCES;
}
@ -4921,7 +4940,6 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
}
p.hit_set_period = n;
} else if (var == "hit_set_count") {
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
return -EINVAL;
@ -5013,6 +5031,26 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
return -EINVAL;
}
p.min_read_recency_for_promote = n;
} else if (var == "hit_set_grade_decay_rate") {
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
return -EINVAL;
}
if (n > 100 || n < 0) {
ss << "value out of range,valid range is 0 - 100";
return -EINVAL;
}
p.hit_set_grade_decay_rate = n;
} else if (var == "hit_set_search_last_n") {
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
return -EINVAL;
}
if (n > p.hit_set_count || n < 0) {
ss << "value out of range,valid range is 0 - hit_set_count";
return -EINVAL;
}
p.hit_set_search_last_n = n;
} else if (var == "min_write_recency_for_promote") {
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
@ -7167,6 +7205,8 @@ done:
ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
ntp->hit_set_params = hsp;
ntp->target_max_bytes = size;
ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";

View File

@ -11524,7 +11524,6 @@ bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota)
if (++agent_state->hist_age > g_conf->osd_agent_hist_halflife) {
dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
agent_state->hist_age = 0;
agent_state->atime_hist.decay();
agent_state->temp_hist.decay();
}
@ -11731,42 +11730,15 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
// is this object old and/or cold enough?
int atime = -1, temp = 0;
int temp = 0;
uint64_t temp_upper = 0, temp_lower = 0;
if (hit_set)
agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
uint64_t atime_upper = 0, atime_lower = 0;
if (atime < 0 && obc->obs.oi.mtime != utime_t()) {
if (obc->obs.oi.local_mtime != utime_t()) {
atime = ceph_clock_now(NULL).sec() - obc->obs.oi.local_mtime;
} else {
atime = ceph_clock_now(NULL).sec() - obc->obs.oi.mtime;
}
}
if (atime < 0) {
if (hit_set) {
atime = pool.info.hit_set_period * pool.info.hit_set_count; // "infinite"
} else {
atime_upper = 1000000;
}
}
if (atime >= 0) {
agent_state->atime_hist.add(atime);
agent_state->atime_hist.get_position_micro(atime, &atime_lower,
&atime_upper);
}
unsigned temp_upper = 0, temp_lower = 0;
/*
// FIXME: bound atime based on creation time?
agent_state->temp_hist.add(atime);
agent_estimate_temp(soid, &temp);
agent_state->temp_hist.add(temp);
agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
*/
dout(20) << __func__
<< " atime " << atime
<< " pos " << atime_lower << "-" << atime_upper
<< ", temp " << temp
<< " temp " << temp
<< " pos " << temp_lower << "-" << temp_upper
<< ", evict_effort " << agent_state->evict_effort
<< dendl;
@ -11779,9 +11751,7 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
delete f;
*_dout << dendl;
// FIXME: ignore temperature for now.
if (1000000 - atime_upper >= agent_state->evict_effort)
if (1000000 - temp_upper >= agent_state->evict_effort)
return false;
}
@ -12065,32 +12035,21 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
return requeued;
}
void ReplicatedPG::agent_estimate_atime_temp(const hobject_t& oid,
int *atime, int *temp)
void ReplicatedPG::agent_estimate_temp(const hobject_t& oid, int *temp)
{
assert(hit_set);
*atime = -1;
if (temp)
*temp = 0;
if (hit_set->contains(oid)) {
*atime = 0;
if (temp)
++(*temp);
else
return;
}
time_t now = ceph_clock_now(NULL).sec();
assert(temp);
*temp = 0;
if (hit_set->contains(oid))
*temp = 1000000;
unsigned i = 0;
int last_n = pool.info.hit_set_search_last_n;
for (map<time_t,HitSetRef>::reverse_iterator p =
agent_state->hit_set_map.rbegin();
p != agent_state->hit_set_map.rend();
++p) {
agent_state->hit_set_map.rbegin(); last_n > 0 &&
p != agent_state->hit_set_map.rend(); ++p, ++i) {
if (p->second->contains(oid)) {
if (*atime < 0)
*atime = now - p->first;
if (temp)
++(*temp);
else
return;
*temp += pool.info.get_grade(i);
--last_n;
}
}
}

View File

@ -941,10 +941,8 @@ protected:
/// estimate object temperature
///
/// @param oid [in] object name
/// @param atime [out] seconds since last access (lower bound)
/// @param temperature [out] relative temperature (# hitset bins we appear in)
void agent_estimate_atime_temp(const hobject_t& oid,
int *atime, int *temperature);
/// @param temperature [out] relative temperature (reflects both access recency and access frequency)
void agent_estimate_temp(const hobject_t& oid, int *temperature);
/// stop the agent
void agent_stop();

View File

@ -23,7 +23,6 @@ struct TierAgentState {
bool delaying;
/// histogram of ages we've encountered
pow2_hist_t atime_hist;
pow2_hist_t temp_hist;
int hist_age;
@ -109,9 +108,6 @@ struct TierAgentState {
f->dump_string("evict_mode", get_evict_mode_name());
f->dump_unsigned("evict_effort", evict_effort);
f->dump_stream("position") << position;
f->open_object_section("atime_hist");
atime_hist.dump(f);
f->close_section();
f->open_object_section("temp_hist");
temp_hist.dump(f);
f->close_section();

View File

@ -889,7 +889,6 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
o.back()->name = "foo";
}
// -- pg_pool_t --
void pg_pool_t::dump(Formatter *f) const
@ -947,6 +946,12 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
f->open_array_section("grade_table");
for (unsigned i = 0; i < hit_set_count; ++i)
f->dump_unsigned("value", get_grade(i));
f->close_section();
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
f->dump_bool("fast_read", fast_read);
@ -1259,7 +1264,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
ENCODE_START(22, 5, bl);
ENCODE_START(23, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@ -1305,12 +1310,14 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(min_write_recency_for_promote, bl);
::encode(use_gmt_hitset, bl);
::encode(fast_read, bl);
::encode(hit_set_grade_decay_rate, bl);
::encode(hit_set_search_last_n, bl);
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
DECODE_START_LEGACY_COMPAT_LEN(22, 5, 5, bl);
DECODE_START_LEGACY_COMPAT_LEN(23, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
@ -1442,8 +1449,16 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
} else {
fast_read = false;
}
if (struct_v >= 23) {
::decode(hit_set_grade_decay_rate, bl);
::decode(hit_set_search_last_n, bl);
} else {
hit_set_grade_decay_rate = 0;
hit_set_search_last_n = 1;
}
DECODE_FINISH(bl);
calc_pg_masks();
calc_grade_table();
}
void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
@ -1489,6 +1504,9 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.hit_set_count = 8;
a.min_read_recency_for_promote = 1;
a.min_write_recency_for_promote = 1;
a.hit_set_grade_decay_rate = 50;
a.hit_set_search_last_n = 1;
a.calc_grade_table();
a.set_stripe_width(12345);
a.target_max_bytes = 1238132132;
a.target_max_objects = 1232132;
@ -1542,7 +1560,9 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
out << " hit_set " << p.hit_set_params
<< " " << p.hit_set_period << "s"
<< " x" << p.hit_set_count;
<< " x" << p.hit_set_count << " decay_rate "
<< p.hit_set_grade_decay_rate
<< " search_last_n " << p.hit_set_search_last_n;
}
if (p.min_read_recency_for_promote)
out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;

View File

@ -1104,6 +1104,9 @@ public:
hit_set_params = HitSet::Params();
hit_set_period = 0;
hit_set_count = 0;
hit_set_grade_decay_rate = 0;
hit_set_search_last_n = 0;
grade_table.resize(0);
}
uint64_t target_max_bytes; ///< tiering: target max pool size
@ -1122,6 +1125,10 @@ public:
bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
uint32_t hit_set_grade_decay_rate; ///< percentage (0 - 100) by which the temperature grade decays
///< from one hit_set to the next older one; the current hit_set
///< carries the highest grade
uint32_t hit_set_search_last_n; ///< accumulate at most N hit_set appearances for temperature
uint32_t stripe_width; ///< erasure coded stripe size in bytes
@ -1129,6 +1136,24 @@ public:
///< user does not specify any expected value
bool fast_read; ///< whether turn on fast read on the pool or not
private:
vector<uint32_t> grade_table;
public:
/// Look up the temperature grade stored for hit_set index @p i.
///
/// @param i index into the grade table
/// @return the stored grade, or 0 when @p i is past the end of the table
uint32_t get_grade(unsigned i) const {
  return (i < grade_table.size()) ? grade_table[i] : 0;
}
/// Rebuild grade_table from hit_set_count and hit_set_grade_decay_rate.
///
/// The table gets one entry per hit_set. Starting from 1000000, each entry is
/// the previous value scaled by (1 - hit_set_grade_decay_rate / 100), so entry
/// 0 is already decayed once and later entries shrink geometrically.
void calc_grade_table() {
  // The rate is a percentage. A value above 100 (e.g. from a decoded map that
  // was never range-checked) would make the factor negative, and converting a
  // negative double to unsigned is undefined behaviour -- clamp it instead.
  const uint32_t rate =
    hit_set_grade_decay_rate > 100 ? 100 : hit_set_grade_decay_rate;
  // Loop-invariant fraction of the grade that survives each step.
  const double keep = 1 - (rate / 100.0);
  unsigned v = 1000000;
  grade_table.resize(hit_set_count);
  for (unsigned i = 0; i < hit_set_count; i++) {
    v = v * keep;
    grade_table[i] = v;
  }
}
pg_pool_t()
: flags(0), type(0), size(0), min_size(0),
crush_ruleset(0), object_hash(0),
@ -1154,6 +1179,8 @@ public:
use_gmt_hitset(true),
min_read_recency_for_promote(0),
min_write_recency_for_promote(0),
hit_set_grade_decay_rate(0),
hit_set_search_last_n(0),
stripe_width(0),
expected_num_objects(0),
fast_read(false)

View File

@ -2406,6 +2406,12 @@ TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
inbl, NULL, NULL));
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "hit_set_grade_decay_rate", 20),
inbl, NULL, NULL));
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "hit_set_search_last_n", 1),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
@ -4759,6 +4765,12 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
inbl, NULL, NULL));
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "hit_set_grade_decay_rate", 20),
inbl, NULL, NULL));
ASSERT_EQ(0, cluster.mon_command(
set_pool_str(cache_pool_name, "hit_set_search_last_n", 1),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();