mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
osd/scrub: PGs that are scrubbing now update stats periodically
Added periodic calls to all PGs for which the OSD is the primary, asking for a scrub statistics update. This allows operator queries (e.g. 'pg dump pgs') to present up-to-date scrub duration, "scrub is blocked" duration, etc. Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
This commit is contained in:
parent
9a498550d1
commit
c50c20fd19
@ -457,6 +457,23 @@ options:
|
||||
long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
|
||||
default: 120
|
||||
with_legacy: true
|
||||
# timely updates to the 'pg dump' output, esp. re scrub scheduling
|
||||
- name: osd_stats_update_period_scrubbing
|
||||
type: int
|
||||
level: advanced
|
||||
desc: Stats update period (seconds) when scrubbing
|
||||
long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
|
||||
stats (inc. scrub/block duration) every this many seconds.
|
||||
default: 15
|
||||
with_legacy: false
|
||||
- name: osd_stats_update_period_not_scrubbing
|
||||
type: int
|
||||
level: advanced
|
||||
desc: Stats update period (seconds) when not scrubbing
|
||||
long_desc: A PG for which this OSD is the primary publishes its
|
||||
stats (inc. scrub/block duration) every this many seconds.
|
||||
default: 120
|
||||
with_legacy: false
|
||||
# where rados plugins are stored
|
||||
- name: osd_class_dir
|
||||
type: str
|
||||
|
@ -7536,6 +7536,8 @@ MPGStats* OSD::collect_pg_stats()
|
||||
min_last_epoch_clean = get_osdmap_epoch();
|
||||
min_last_epoch_clean_pgs.clear();
|
||||
|
||||
auto now_is = ceph::coarse_real_clock::now();
|
||||
|
||||
std::set<int64_t> pool_set;
|
||||
vector<PGRef> pgs;
|
||||
_get_pgs(&pgs);
|
||||
@ -7545,7 +7547,7 @@ MPGStats* OSD::collect_pg_stats()
|
||||
if (!pg->is_primary()) {
|
||||
continue;
|
||||
}
|
||||
pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
|
||||
pg->with_pg_stats(now_is, [&](const pg_stat_t& s, epoch_t lec) {
|
||||
m->pg_stat[pg->pg_id.pgid] = s;
|
||||
min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
|
||||
min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
|
||||
|
@ -2677,8 +2677,18 @@ void PG::dump_missing(Formatter *f)
|
||||
}
|
||||
}
|
||||
|
||||
void PG::with_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)>&& f)
|
||||
void PG::with_pg_stats(ceph::coarse_real_clock::time_point now_is,
|
||||
std::function<void(const pg_stat_t&, epoch_t lec)>&& f)
|
||||
{
|
||||
dout(30) << __func__ << dendl;
|
||||
// possibly update the scrub state & timers
|
||||
lock();
|
||||
if (m_scrubber) {
|
||||
m_scrubber->update_scrub_stats(now_is);
|
||||
}
|
||||
unlock();
|
||||
|
||||
// now - the actual publishing
|
||||
std::lock_guard l{pg_stats_publish_lock};
|
||||
if (pg_stats_publish) {
|
||||
f(*pg_stats_publish, pg_stats_publish->get_effective_last_epoch_clean());
|
||||
|
@ -699,7 +699,8 @@ public:
|
||||
void dump_pgstate_history(ceph::Formatter *f);
|
||||
void dump_missing(ceph::Formatter *f);
|
||||
|
||||
void with_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)>&& f);
|
||||
void with_pg_stats(ceph::coarse_real_clock::time_point now_is,
|
||||
std::function<void(const pg_stat_t&, epoch_t lec)>&& f);
|
||||
void with_heartbeat_peers(std::function<void(int)>&& f);
|
||||
|
||||
void shutdown();
|
||||
|
@ -724,7 +724,7 @@ ScrubQueue::ScrubQContainer ScrubQueue::list_registered_jobs() const
|
||||
}
|
||||
|
||||
// ////////////////////////////////////////////////////////////////////////// //
|
||||
// ScrubJob - scrub resource management
|
||||
// ScrubQueue - scrub resource management
|
||||
|
||||
bool ScrubQueue::can_inc_scrubs() const
|
||||
{
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#include "common/ceph_time.h"
|
||||
#include "common/errno.h"
|
||||
#include "messages/MOSDOp.h"
|
||||
#include "messages/MOSDRepScrub.h"
|
||||
@ -18,6 +19,7 @@
|
||||
#include "messages/MOSDScrubReserve.h"
|
||||
#include "osd/OSD.h"
|
||||
#include "osd/PG.h"
|
||||
#include "include/utime_fmt.h"
|
||||
#include "osd/osd_types_fmt.h"
|
||||
|
||||
#include "ScrubStore.h"
|
||||
@ -532,6 +534,7 @@ void PgScrubber::update_scrub_job(const requested_scrub_t& request_flags)
|
||||
}
|
||||
|
||||
if (is_primary() && m_scrub_job) {
|
||||
ceph_assert(m_pg->is_locked());
|
||||
auto suggested = m_osds->get_scrub_services().determine_scrub_time(
|
||||
request_flags,
|
||||
m_pg->info,
|
||||
@ -777,14 +780,16 @@ Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
|
||||
int grace = get_pg_cct()->_conf->osd_blocked_scrub_grace_period;
|
||||
if (grace == 0) {
|
||||
// we will not be sending any alarms re the blocked object
|
||||
dout(20)
|
||||
dout(10)
|
||||
<< __func__
|
||||
<< ": blocked-alarm disabled ('osd_blocked_scrub_grace_period' set to 0)"
|
||||
<< dendl;
|
||||
return nullptr;
|
||||
}
|
||||
ceph::timespan grace_period{m_debug_blockrange ? 4s : seconds{grace}};
|
||||
dout(30) << __func__ << ": timeout:" << grace_period.count() << dendl;
|
||||
dout(20) << fmt::format(": timeout:{}",
|
||||
std::chrono::duration_cast<seconds>(grace_period))
|
||||
<< dendl;
|
||||
return std::make_unique<blocked_range_t>(m_osds,
|
||||
grace_period,
|
||||
*this,
|
||||
@ -1747,6 +1752,7 @@ void PgScrubber::set_scrub_blocked(utime_t since)
|
||||
// we are called from a time-triggered lambda,
|
||||
// thus - not under PG-lock
|
||||
PGRef pg = m_osds->osd->lookup_lock_pg(m_pg_id);
|
||||
ceph_assert(pg); // 'this' here should not exist if the PG was removed
|
||||
m_osds->get_scrub_services().mark_pg_scrub_blocked(m_pg_id);
|
||||
m_scrub_job->blocked_since = since;
|
||||
m_scrub_job->blocked = true;
|
||||
@ -2386,9 +2392,9 @@ int PgScrubber::asok_debug(std::string_view cmd,
|
||||
dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl;
|
||||
|
||||
if (cmd == "block") {
|
||||
// set a flag that will cause the next 'select_range' to report a blocked
|
||||
// 'm_debug_blockrange' causes the next 'select_range' to report a blocked
|
||||
// object
|
||||
m_debug_blockrange = 1;
|
||||
m_debug_blockrange = 10; // >1, so that will trigger fast state reports
|
||||
|
||||
} else if (cmd == "unblock") {
|
||||
// send an 'unblock' event, as if a blocked range was freed
|
||||
@ -2405,7 +2411,7 @@ int PgScrubber::asok_debug(std::string_view cmd,
|
||||
if (cmd == "set") {
|
||||
// set a flag that will cause the next 'select_range' to report a
|
||||
// blocked object
|
||||
m_debug_blockrange = 1;
|
||||
m_debug_blockrange = 10; // >1, so that will trigger fast state reports
|
||||
} else {
|
||||
// send an 'unblock' event, as if a blocked range was freed
|
||||
m_debug_blockrange = 0;
|
||||
@ -2416,6 +2422,56 @@ int PgScrubber::asok_debug(std::string_view cmd,
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: under PG lock
|
||||
*/
|
||||
void PgScrubber::update_scrub_stats(ceph::coarse_real_clock::time_point now_is)
|
||||
{
|
||||
using clock = ceph::coarse_real_clock;
|
||||
using namespace std::chrono;
|
||||
|
||||
const seconds period_active = seconds(m_pg->get_cct()->_conf.get_val<int64_t>(
|
||||
"osd_stats_update_period_scrubbing"));
|
||||
if (!period_active.count()) {
|
||||
// a way for the operator to disable these stats updates
|
||||
return;
|
||||
}
|
||||
const seconds period_inactive =
|
||||
seconds(m_pg->get_cct()->_conf.get_val<int64_t>(
|
||||
"osd_stats_update_period_not_scrubbing") +
|
||||
m_pg_id.pgid.m_seed % 30);
|
||||
|
||||
// determine the required update period, based on our current state
|
||||
auto period{period_inactive};
|
||||
if (m_active) {
|
||||
period = m_debug_blockrange ? 2s : period_active;
|
||||
}
|
||||
|
||||
/// \todo use the date library (either the one included in Arrow or directly)
|
||||
/// to get the formatting of the time_points.
|
||||
|
||||
if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
|
||||
// will only create the debug strings if required
|
||||
char buf[50];
|
||||
auto printable_last = fmt::localtime(clock::to_time_t(m_last_stat_upd));
|
||||
strftime(buf, sizeof(buf), "%Y-%m-%dT%T", &printable_last);
|
||||
dout(20) << fmt::format("{}: period: {}/{}-> {} last:{}",
|
||||
__func__,
|
||||
period_active,
|
||||
period_inactive,
|
||||
period,
|
||||
buf)
|
||||
<< dendl;
|
||||
}
|
||||
|
||||
if (now_is - m_last_stat_upd > period) {
|
||||
m_pg->publish_stats_to_osd();
|
||||
m_last_stat_upd = now_is;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ///////////////////// preemption_data_t //////////////////////////////////
|
||||
|
||||
PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
|
||||
|
@ -441,6 +441,8 @@ class PgScrubber : public ScrubPgIF,
|
||||
return false;
|
||||
}
|
||||
|
||||
void update_scrub_stats(ceph::coarse_real_clock::time_point now_is) final;
|
||||
|
||||
int asok_debug(std::string_view cmd,
|
||||
std::string param,
|
||||
Formatter* f,
|
||||
@ -882,6 +884,10 @@ class PgScrubber : public ScrubPgIF,
|
||||
void persist_scrub_results(inconsistent_objs_t&& all_errors);
|
||||
void apply_snap_mapper_fixes(const std::vector<snap_mapper_fix_t>& fix_list);
|
||||
|
||||
// our latest periodic 'publish_stats_to_osd()'. Required frequency depends on
|
||||
// scrub state.
|
||||
ceph::coarse_real_clock::time_point m_last_stat_upd{};
|
||||
|
||||
// ------------ members used if we are a replica
|
||||
|
||||
epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
|
||||
|
@ -315,6 +315,13 @@ struct ScrubPgIF {
|
||||
virtual bool get_store_errors(const scrub_ls_arg_t& arg,
|
||||
scrub_ls_result_t& res_inout) const = 0;
|
||||
|
||||
/**
|
||||
* force a periodic 'publish_stats_to_osd()' call, to update scrub-related
|
||||
* counters and statistics.
|
||||
*/
|
||||
virtual void update_scrub_stats(
|
||||
ceph::coarse_real_clock::time_point now_is) = 0;
|
||||
|
||||
// --------------- reservations -----------------------------------
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user