osd: trim pg logs based on a per-osd budget

Set the default budget based on the current defaults: 3000 entries per PG, and a
rule-of-thumb target of 100 PGs per OSD.  Set the per-PG trim target
by dividing the overall value by the number of PGs on the OSD.

Increase the max pg log length as well, so that if the OSD has fewer than
100 PGs, those PGs will get more entries.  Reduce the minimum so that it is
smaller than the max.  Use the min/max config options to bracket what is
allocated to a single PG.

Signed-off-by: Sage Weil <sage@redhat.com>
Sage Weil, 2020-01-16 11:22:34 -06:00 (committed by Kefu Chai)
parent 8aa8b41fa5, commit 0db140c15c
10 changed files with 68 additions and 4 deletions
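For illustration, the budgeting rule described in the commit message reduces
to one integer division bracketed by the min/max options. Below is a minimal
standalone sketch of that arithmetic using the defaults this commit sets; the
free function per_pg_log_target is a hypothetical illustration, not code from
the commit:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical helper mirroring the per-PG budget logic this commit adds:
// spread the OSD-wide entry budget evenly across PGs, then bracket the
// result with the per-PG min/max options.
uint64_t per_pg_log_target(uint64_t num_pgs,
                           uint64_t per_osd_budget, // osd_target_pg_log_entries_per_osd = 3000 * 100
                           uint64_t min_entries,    // osd_min_pg_log_entries = 250
                           uint64_t max_entries)    // osd_max_pg_log_entries = 10000
{
  if (num_pgs == 0 || per_osd_budget == 0) {
    return min_entries;  // nothing to divide; fall back to the per-PG minimum
  }
  return std::clamp(per_osd_budget / num_pgs, min_entries, max_entries);
}

int main() {
  for (uint64_t pgs : {20, 100, 5000}) {
    // 20 PGs -> 15000, clamped down to 10000; 100 PGs -> 3000 (the old
    // per-PG default); 5000 PGs -> 60, clamped up to 250.
    std::cout << pgs << " PGs -> "
              << per_pg_log_target(pgs, 3000 * 100, 250, 10000)
              << " entries per PG\n";
  }
}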

@@ -721,6 +721,7 @@ OPTION(osd_kill_backfill_at, OPT_INT)
// Bounds how infrequently a new map epoch will be persisted for a pg
OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size!
+OPTION(osd_target_pg_log_entries_per_osd, OPT_U32)
OPTION(osd_min_pg_log_entries, OPT_U32) // number of entries to keep in the pg log when trimming it
OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim
OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs

@@ -3302,15 +3302,21 @@ std::vector<Option> get_global_options() {
    .set_default(40)
    .set_description(""),

+   Option("osd_target_pg_log_entries_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+   .set_default(3000 * 100)
+   .set_description("target number of PG entries total on an OSD")
+   .add_see_also("osd_max_pg_log_entries")
+   .add_see_also("osd_min_pg_log_entries"),
+
    Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-   .set_default(3000)
+   .set_default(250)
    .set_description("minimum number of entries to maintain in the PG log")
    .add_service("osd")
    .add_see_also("osd_max_pg_log_entries")
    .add_see_also("osd_pg_log_dups_tracked"),

    Option("osd_max_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-   .set_default(3000)
+   .set_default(10000)
    .set_description("maximum number of entries to maintain in the PG log when degraded before we trim")
    .add_service("osd")
    .add_see_also("osd_min_pg_log_entries")

@@ -194,6 +194,30 @@ void PG::recheck_readable()
  }
}

+unsigned PG::get_target_pg_log_entries() const
+{
+  const unsigned num_pgs = shard_services.get_pg_num();
+  const unsigned target =
+    local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd");
+  const unsigned min_pg_log_entries =
+    local_conf().get_val<uint64_t>("osd_min_pg_log_entries");
+  if (num_pgs > 0 && target > 0) {
+    // target an even spread of our budgeted log entries across all
+    // PGs. note that while we only get to control the entry count
+    // for primary PGs, we'll normally be responsible for a mix of
+    // primary and replica PGs (for the same pool(s) even), so this
+    // will work out.
+    const unsigned max_pg_log_entries =
+      local_conf().get_val<uint64_t>("osd_max_pg_log_entries");
+    return std::clamp(target / num_pgs,
+                      min_pg_log_entries,
+                      max_pg_log_entries);
+  } else {
+    // fall back to a per-pg value.
+    return min_pg_log_entries;
+  }
+}
+
void PG::on_activate(interval_set<snapid_t>)
{
  projected_last_update = peering_state.get_info().last_update;

@@ -240,6 +240,8 @@ public:
                            ceph::timespan delay) final;
  void recheck_readable() final;
+  unsigned get_target_pg_log_entries() const final;
+
  void on_pool_change() final {
    // Not needed yet
  }

@@ -9400,6 +9400,26 @@ bool OSDService::_recover_now(uint64_t *available_pushes)
  return true;
}

+unsigned OSDService::get_target_pg_log_entries() const
+{
+  auto num_pgs = osd->get_num_pgs();
+  auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
+  if (num_pgs > 0 && target > 0) {
+    // target an even spread of our budgeted log entries across all
+    // PGs. note that while we only get to control the entry count
+    // for primary PGs, we'll normally be responsible for a mix of
+    // primary and replica PGs (for the same pool(s) even), so this
+    // will work out.
+    return std::max<unsigned>(
+      std::min<unsigned>(target / num_pgs,
+                         cct->_conf->osd_max_pg_log_entries),
+      cct->_conf->osd_min_pg_log_entries);
+  } else {
+    // fall back to a per-pg value.
+    return cct->_conf->osd_min_pg_log_entries;
+  }
+}
+
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
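A side note on the two implementations above (my comparison, not part of the
commit): the crimson version uses std::clamp, while this classic-OSD version
spells the same bound as nested std::max/std::min. For any sane configuration
(min <= max) the two agree; if the minimum were ever configured above the
maximum, std::clamp would be undefined behaviour, whereas the nested form
quietly resolves in favour of the minimum:

#include <algorithm>
#include <cassert>

int main() {
  unsigned v = 1500, lo = 250, hi = 10000;
  // nested form, as in OSDService::get_target_pg_log_entries above
  unsigned nested = std::max(std::min(v, hi), lo);
  assert(nested == std::clamp(v, lo, hi));  // identical whenever lo <= hi

  // with lo > hi, std::clamp(v, lo, hi) would be undefined behaviour;
  // the nested form instead returns lo:
  assert(std::max(std::min(v, 10u), 250u) == 250u);
}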

@@ -662,6 +662,9 @@ public:
        return awaiting.second.get() == pg;
      });
  }

+  unsigned get_target_pg_log_entries() const;
+
  // delayed pg activation
  void queue_for_recovery(PG *pg) {
    std::lock_guard l(recovery_lock);

@@ -849,6 +849,11 @@ void PG::publish_stats_to_osd()
  }
}

+unsigned PG::get_target_pg_log_entries() const
+{
+  return osd->get_target_pg_log_entries();
+}
+
void PG::clear_publish_stats()
{
  dout(15) << "clear_stats" << dendl;

@@ -400,6 +400,7 @@ public:
  uint64_t get_snap_trimq_size() const override {
    return snap_trimq.size();
  }
+  unsigned get_target_pg_log_entries() const override;
  void clear_publish_stats() override;
  void clear_primary_state() override;

@@ -4050,7 +4050,7 @@ void PeeringState::calc_trim_to()
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
-    target = cct->_conf->osd_max_pg_log_entries;
+    target = pl->get_target_pg_log_entries();
  }

  eversion_t limit = std::min(

@@ -4092,7 +4092,7 @@ void PeeringState::calc_trim_to_aggressive()
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
-    target = cct->_conf->osd_max_pg_log_entries;
+    target = pl->get_target_pg_log_entries();
  }
  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
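The net effect of the two one-line changes above: calc_trim_to() and
calc_trim_to_aggressive() previously trimmed toward the flat
osd_max_pg_log_entries, and now trim toward the budget-derived per-PG target.
A much-simplified sketch of how a target bounds trimming (illustration only;
the real code trims by eversion_t and is further capped by
min_last_complete_ondisk, or can_rollback_to in the aggressive variant):

#include <cstddef>
#include <deque>
#include <string>

// Illustration only: drop the oldest entries until the log is no longer
// than the per-PG target returned by get_target_pg_log_entries().
void trim_log_to_target(std::deque<std::string>& log, std::size_t target)
{
  while (log.size() > target) {
    log.pop_front();  // oldest entry first
  }
}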

@@ -281,6 +281,8 @@ public:
  virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0;
  virtual void recheck_readable() = 0;
+  virtual unsigned get_target_pg_log_entries() const = 0;
+
  // ============ Flush state ==================
  /**
   * try_flush_or_schedule_async()