Merge branch 'wip-cache-second'

Reviewed-by: Samuel Just <sam.just@inktank.com>
This commit is contained in:
Sage Weil 2014-08-01 15:37:33 -07:00
commit 79d1aff182
8 changed files with 264 additions and 12 deletions

View File

@ -452,6 +452,7 @@ OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!

View File

@ -571,11 +571,11 @@ COMMAND("osd pool rename " \
"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
COMMAND("osd pool get " \
"name=pool,type=CephPoolname " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote", \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")

View File

@ -2571,6 +2571,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
} else if (var == "erasure_code_profile") {
f->dump_string("erasure_code_profile", p->erasure_code_profile);
} else if (var == "min_read_recency_for_promote") {
f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote);
}
f->close_section();
@ -2620,6 +2622,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
ss << "cache_min_evict_age: " << p->cache_min_evict_age;
} else if (var == "erasure_code_profile") {
ss << "erasure_code_profile: " << p->erasure_code_profile;
} else if (var == "min_read_recency_for_promote") {
ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote;
}
rdata.append(ss);
@ -3735,6 +3739,12 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
return -EINVAL;
}
p.cache_min_evict_age = n;
} else if (var == "min_read_recency_for_promote") {
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
return -EINVAL;
}
p.min_read_recency_for_promote = n;
} else {
ss << "unrecognized variable '" << var << "'";
return -EINVAL;
@ -5606,6 +5616,7 @@ done:
ntp->cache_mode = mode;
ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
ntp->hit_set_params = hsp;
ntp->target_max_bytes = size;
ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";

View File

@ -1342,7 +1342,10 @@ void ReplicatedPG::do_op(OpRequestRef& op)
}
}
bool in_hit_set = false;
if (hit_set) {
if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
in_hit_set = true;
hit_set->insert(oid);
if (hit_set->is_full() ||
hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
@ -1355,7 +1358,7 @@ void ReplicatedPG::do_op(OpRequestRef& op)
}
if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set))
return;
if (r) {
@ -1550,7 +1553,8 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
bool write_ordered,
ObjectContextRef obc,
int r, const hobject_t& missing_oid,
bool must_promote)
bool must_promote,
bool in_hit_set)
{
if (obc)
dout(25) << __func__ << " " << obc->obs.oi << " "
@ -1595,7 +1599,43 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
if (!must_promote && can_skip_promote(op, obc)) {
return false;
}
promote_object(op, obc, missing_oid);
if (op->may_write() || must_promote || !hit_set) {
promote_object(op, obc, missing_oid);
} else {
switch (pool.info.min_read_recency_for_promote) {
case 0:
promote_object(op, obc, missing_oid);
break;
case 1:
// Check if in the current hit set
if (in_hit_set) {
promote_object(op, obc, missing_oid);
} else {
do_cache_redirect(op, obc);
}
break;
default:
if (in_hit_set) {
promote_object(op, obc, missing_oid);
} else {
// Check if in other hit sets
map<time_t,HitSetRef>::iterator itor;
bool in_other_hit_sets = false;
for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); itor++) {
if (itor->second->contains(missing_oid)) {
in_other_hit_sets = true;
break;
}
}
if (in_other_hit_sets) {
promote_object(op, obc, missing_oid);
} else {
do_cache_redirect(op, obc);
}
}
break;
}
}
return true;
case pg_pool_t::CACHEMODE_FORWARD:
@ -10865,8 +10905,10 @@ void ReplicatedPG::hit_set_persist()
info.hit_set.current_info.end = now;
dout(20) << __func__ << " archive " << oid << dendl;
if (agent_state)
if (agent_state) {
agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set);
hit_set_in_memory_trim();
}
// hold a ref until it is flushed to disk
hit_set_flushing[info.hit_set.current_info.begin] = hit_set;
@ -11002,8 +11044,6 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
repop->ctx->op_t->remove(oid);
repop->ctx->log.back().mod_desc.mark_unrollbackable();
}
if (agent_state)
agent_state->remove_oldest_hit_set();
updated_hit_set_hist.history.pop_front();
ObjectContextRef obc = get_object_context(oid, false);
@ -11014,6 +11054,19 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
}
}
/**
 * Drop the oldest in-memory HitSets held by agent_state until at most the
 * number needed for read-promotion decisions remains.
 *
 * The retention limit is min_read_recency_for_promote - 1 (or 0 when that
 * setting is 0), and is never allowed to exceed the pool's hit_set_count.
 *
 * agent_state is dereferenced unconditionally, so this must only be called
 * while agent_state is set.
 */
void ReplicatedPG::hit_set_in_memory_trim()
{
  const unsigned count_cap = pool.info.hit_set_count;

  // Guard the subtraction: a recency of 0 means nothing has to be retained.
  unsigned keep = 0;
  if (pool.info.min_read_recency_for_promote > 0) {
    keep = pool.info.min_read_recency_for_promote - 1;
  }

  // Never retain more sets than the pool is configured to keep at all.
  if (keep > count_cap) {
    keep = count_cap;
  }

  // Evict from the oldest end until we are within the limit.
  while (agent_state->hit_set_map.size() > keep) {
    agent_state->remove_oldest_hit_set();
  }
}
// =======================================
// cache agent
@ -11206,6 +11259,9 @@ bool ReplicatedPG::agent_work(int start_max)
else
agent_state->position = next;
// Discard old in memory HitSets
hit_set_in_memory_trim();
if (need_delay) {
assert(agent_state->delaying == false);
agent_delay();
@ -11220,7 +11276,6 @@ bool ReplicatedPG::agent_work(int start_max)
void ReplicatedPG::agent_load_hit_sets()
{
if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
agent_state->discard_hit_sets();
return;
}

View File

@ -799,6 +799,7 @@ protected:
void hit_set_persist(); ///< persist hit info
bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
void hit_set_in_memory_trim(); ///< discard old in memory HitSets
hobject_t get_hit_set_current_object(utime_t stamp);
hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
@ -1053,7 +1054,8 @@ protected:
bool write_ordered,
ObjectContextRef obc, int r,
const hobject_t& missing_oid,
bool must_promote);
bool must_promote,
bool in_hit_set = false);
/**
* This helper function tells the client to redirect their request elsewhere.
*/

View File

@ -809,6 +809,7 @@ void pg_pool_t::dump(Formatter *f) const
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
}
@ -1109,7 +1110,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
ENCODE_START(15, 5, bl);
ENCODE_START(16, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@ -1149,12 +1150,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(cache_min_evict_age, bl);
::encode(erasure_code_profile, bl);
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
DECODE_START_LEGACY_COMPAT_LEN(15, 5, 5, bl);
DECODE_START_LEGACY_COMPAT_LEN(16, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
@ -1256,6 +1258,12 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
} else {
last_force_op_resend = 0;
}
if (struct_v >= 16) {
::decode(min_read_recency_for_promote, bl);
} else {
pg_pool_t def;
min_read_recency_for_promote = def.min_read_recency_for_promote;
}
DECODE_FINISH(bl);
calc_pg_masks();
}
@ -1301,6 +1309,7 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
a.hit_set_period = 3600;
a.hit_set_count = 8;
a.min_read_recency_for_promote = 1;
a.set_stripe_width(12345);
a.target_max_bytes = 1238132132;
a.target_max_objects = 1232132;
@ -1353,6 +1362,8 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
<< " " << p.hit_set_period << "s"
<< " x" << p.hit_set_count;
}
if (p.min_read_recency_for_promote)
out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
out << " stripe_width " << p.get_stripe_width();
return out;
}

View File

@ -973,6 +973,7 @@ public:
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote
uint32_t stripe_width; ///< erasure coded stripe size in bytes
@ -997,6 +998,7 @@ public:
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
min_read_recency_for_promote(0),
stripe_width(0)
{ }

View File

@ -2200,6 +2200,91 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
inbl, NULL, NULL));
}
// Checks that, with min_read_recency_for_promote set to 1 on a writeback
// cache tier, the first read of an object does not leave it in the cache
// pool, while repeated reads eventually do.
TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
  // Seed the base pool with a single object named "foo".
  {
    bufferlist payload;
    payload.append("hi there");
    ObjectWriteOperation write_op;
    write_op.write_full(payload);
    ASSERT_EQ(0, ioctx.operate("foo", &write_op));
  }

  // Attach the cache pool as a writeback tier in front of the base pool.
  bufferlist inbl;
  const std::string add_tier_cmd =
    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
    "\", \"tierpool\": \"" + cache_pool_name +
    "\", \"force_nonempty\": \"--force-nonempty\" }";
  ASSERT_EQ(0, cluster.mon_command(add_tier_cmd, inbl, NULL, NULL));

  const std::string set_overlay_cmd =
    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
    "\", \"overlaypool\": \"" + cache_pool_name + "\"}";
  ASSERT_EQ(0, cluster.mon_command(set_overlay_cmd, inbl, NULL, NULL));

  const std::string cache_mode_cmd =
    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
    "\", \"mode\": \"writeback\"}";
  ASSERT_EQ(0, cluster.mon_command(cache_mode_cmd, inbl, NULL, NULL));

  // Turn on HitSet tracking for the cache pool and require the object to be
  // seen in one recent HitSet before a read promotes it.
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_count", 2),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_period", 600),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
    inbl, NULL, NULL));

  // Let the map changes above settle before issuing I/O.
  cluster.wait_for_latest_osdmap();

  // First read: expected to be served without promoting the object.
  {
    bufferlist first_byte;
    ASSERT_EQ(1, ioctx.read("foo", first_byte, 1, 0));
  }

  // The cache pool must still be empty at this point.
  {
    ObjectIterator cached = cache_ioctx.objects_begin();
    ASSERT_TRUE(cached == cache_ioctx.objects_end());
  }

  // Keep reading until the object appears in the cache pool, then check that
  // it is the only object there.
  // NOTE(review): this loop has no upper bound; if promotion never happens
  // the test hangs instead of failing.
  for (;;) {
    bufferlist next_byte;
    ASSERT_EQ(1, ioctx.read("foo", next_byte, 1, 0));

    ObjectIterator cached = cache_ioctx.objects_begin();
    if (cached == cache_ioctx.objects_end()) {
      // Not promoted yet; back off briefly and retry.
      sleep(1);
      continue;
    }

    ASSERT_TRUE(cached->first == string("foo"));
    ++cached;
    ASSERT_TRUE(cached == cache_ioctx.objects_end());
    break;
  }

  // Detach the tier again so later tests start from a clean pool pair.
  const std::string remove_overlay_cmd =
    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
    "\"}";
  ASSERT_EQ(0, cluster.mon_command(remove_overlay_cmd, inbl, NULL, NULL));

  const std::string remove_tier_cmd =
    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
    "\", \"tierpool\": \"" + cache_pool_name + "\"}";
  ASSERT_EQ(0, cluster.mon_command(remove_tier_cmd, inbl, NULL, NULL));

  // Wait for the teardown to propagate before the next test runs.
  cluster.wait_for_latest_osdmap();
}
class LibRadosTwoPoolsECPP : public RadosTestECPP
{
public:
@ -4197,6 +4282,91 @@ TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
inbl, NULL, NULL));
}
// Same scenario as the replicated-pool variant, run against the
// LibRadosTwoPoolsECPP fixture: with min_read_recency_for_promote set to 1
// on a writeback cache tier, the first read must not leave the object in the
// cache pool, while repeated reads eventually do.
TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
  // Seed the base pool with a single object named "foo".
  {
    bufferlist payload;
    payload.append("hi there");
    ObjectWriteOperation write_op;
    write_op.write_full(payload);
    ASSERT_EQ(0, ioctx.operate("foo", &write_op));
  }

  // Attach the cache pool as a writeback tier in front of the base pool.
  bufferlist inbl;
  const std::string add_tier_cmd =
    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
    "\", \"tierpool\": \"" + cache_pool_name +
    "\", \"force_nonempty\": \"--force-nonempty\" }";
  ASSERT_EQ(0, cluster.mon_command(add_tier_cmd, inbl, NULL, NULL));

  const std::string set_overlay_cmd =
    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
    "\", \"overlaypool\": \"" + cache_pool_name + "\"}";
  ASSERT_EQ(0, cluster.mon_command(set_overlay_cmd, inbl, NULL, NULL));

  const std::string cache_mode_cmd =
    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
    "\", \"mode\": \"writeback\"}";
  ASSERT_EQ(0, cluster.mon_command(cache_mode_cmd, inbl, NULL, NULL));

  // Turn on HitSet tracking for the cache pool and require the object to be
  // seen in one recent HitSet before a read promotes it.
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_count", 2),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_period", 600),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
    inbl, NULL, NULL));
  ASSERT_EQ(0, cluster.mon_command(
    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
    inbl, NULL, NULL));

  // Let the map changes above settle before issuing I/O.
  cluster.wait_for_latest_osdmap();

  // First read: expected to be served without promoting the object.
  {
    bufferlist first_byte;
    ASSERT_EQ(1, ioctx.read("foo", first_byte, 1, 0));
  }

  // The cache pool must still be empty at this point.
  {
    ObjectIterator cached = cache_ioctx.objects_begin();
    ASSERT_TRUE(cached == cache_ioctx.objects_end());
  }

  // Keep reading until the object appears in the cache pool, then check that
  // it is the only object there.
  // NOTE(review): this loop has no upper bound; if promotion never happens
  // the test hangs instead of failing.
  for (;;) {
    bufferlist next_byte;
    ASSERT_EQ(1, ioctx.read("foo", next_byte, 1, 0));

    ObjectIterator cached = cache_ioctx.objects_begin();
    if (cached == cache_ioctx.objects_end()) {
      // Not promoted yet; back off briefly and retry.
      sleep(1);
      continue;
    }

    ASSERT_TRUE(cached->first == string("foo"));
    ++cached;
    ASSERT_TRUE(cached == cache_ioctx.objects_end());
    break;
  }

  // Detach the tier again so later tests start from a clean pool pair.
  const std::string remove_overlay_cmd =
    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
    "\"}";
  ASSERT_EQ(0, cluster.mon_command(remove_overlay_cmd, inbl, NULL, NULL));

  const std::string remove_tier_cmd =
    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
    "\", \"tierpool\": \"" + cache_pool_name + "\"}";
  ASSERT_EQ(0, cluster.mon_command(remove_tier_cmd, inbl, NULL, NULL));

  // Wait for the teardown to propagate before the next test runs.
  cluster.wait_for_latest_osdmap();
}
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);