Merge pull request #22797 from dzafman/wip-19753

osd: Deny reservation if expected backfill size would put us over bac…

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Reviewed-by: Neha Ojha <nojha@redhat.com>
David Zafman 2019-01-18 07:42:00 -08:00 committed by GitHub
commit 99ddd3666b
18 changed files with 1738 additions and 77 deletions
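
The gist of the change: a backfill reservation request now carries the primary's num_bytes for the PG alongside the replica's last-known num_bytes, and the replica denies the reservation if the projected net growth would push it past the backfillfull ratio. A condensed sketch of the replica-side decision, simplified from the RepNotRecovering::react(const RequestBackfillPrio&) hunk in the PG.cc diff below (locking, EC adjustment, and failure injection omitted; skip_full_check stands in for the osd_debug_skip_full_check_in_backfill_reservation option):

int64_t pending = std::max<int64_t>(0, evt.primary_num_bytes - evt.local_num_bytes);
if (!skip_full_check &&
    pg->osd->tentative_backfill_full(pg, pending, cur_stat)) {
  post_event(RejectRemoteReservation());  // projected usage crosses backfillfull
} else {
  // Record the reservation so heartbeat statfs accounts for the pending data.
  pg->set_reserved_num_bytes(evt.primary_num_bytes, evt.local_num_bytes);
  // ...grant the reservation as before...
}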

View File

@ -1137,6 +1137,35 @@ function test_get_not_primary() {
#######################################################################
function _objectstore_tool_nodown() {
local dir=$1
shift
local id=$1
shift
local osd_data=$dir/$id
local journal_args
if [ "$objectstore_type" == "filestore" ]; then
journal_args=" --journal-path $osd_data/journal"
fi
ceph-objectstore-tool \
--data-path $osd_data \
$journal_args \
"$@" || return 1
}
function _objectstore_tool_nowait() {
local dir=$1
shift
local id=$1
shift
kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
_objectstore_tool_nodown $dir $id "$@" || return 1
activate_osd $dir $id $ceph_osd_args >&2 || return 1
}
##
# Run ceph-objectstore-tool against the OSD **id** using the data path
# **dir**. The OSD is killed with TERM prior to running
@ -1158,21 +1187,8 @@ function objectstore_tool() {
shift
local id=$1
shift
local osd_data=$dir/$id
local osd_type=$(cat $osd_data/type)
kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
local journal_args
if [ "$objectstore_type" == "filestore" ]; then
journal_args=" --journal-path $osd_data/journal"
fi
ceph-objectstore-tool \
--data-path $osd_data \
$journal_args \
"$@" || return 1
activate_osd $dir $id $ceph_osd_args >&2 || return 1
_objectstore_tool_nowait $dir $id "$@" || return 1
wait_for_clean >&2
}

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
roles:
- - mon.a
  - mgr.x
  - osd.0
  - osd.1
  - osd.2
- - osd.3
  - osd.4
  - osd.5
  - client.0
openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
override:
  ceph:
    conf:
      mon:
        osd default pool size: 3
        osd min pg log entries: 5
        osd max pg log entries: 10
tasks:
- install:
- ceph:
    log-whitelist:
    - but it is still running
    - missing primary copy of
    - objects unfound and apparently lost
    - overall HEALTH_
    - \(OSDMAP_FLAGS\)
    - \(SLOW_OPS\)
    - \(PG_
    - \(OBJECT_MISPLACED\)
    - \(OSD_
    - \(OBJECT_
    - \(TOO_FEW_PGS\)
    - \(POOL_BACKFILLFULL\)
- thrashosds:
    op_delay: 30
    clean_interval: 120
    chance_down: .75
    min_live: 5
    min_in: 5
    chance_test_backfill_full: .5
- radosbench:
    clients: [client.0]
    time: 1800
    type: rand
    objectsize: 1048576

View File

@ -1547,3 +1547,4 @@ OPTION(rgw_sts_entry, OPT_STR)
OPTION(rgw_sts_key, OPT_STR)
OPTION(rgw_s3_auth_use_sts, OPT_BOOL) // should we try to use sts for s3?
OPTION(rgw_sts_max_session_duration, OPT_U64) // Max duration in seconds for which the session token is valid.
OPTION(fake_statfs_for_testing, OPT_INT) // Set a value for kb and compute kb_used from total of num_bytes

View File

@ -7840,12 +7840,17 @@ std::vector<Option> get_mds_client_options() {
Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
.set_description("CephFS file system name to mount")
.set_long_description("Use this with ceph-fuse, or with any process "
"that uses libcephfs. Programs using libcephfs may also pass "
"the filesystem name into mount(), which will override this setting. "
"If no filesystem name is given in mount() or this setting, the default "
"filesystem will be mounted (usually the first created)."),
Option("fake_statfs_for_testing", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description("Set a value for kb and compute kb_used from total of num_bytes"),
});
}

View File

@ -22,7 +22,7 @@ class MBackfillReserve : public MessageInstance<MBackfillReserve, MOSDPeeringOp>
public:
friend factory;
private:
static constexpr int HEAD_VERSION = 4;
static constexpr int HEAD_VERSION = 5;
static constexpr int COMPAT_VERSION = 4;
public:
spg_t pgid;
@ -38,6 +38,8 @@ public:
};
uint32_t type;
uint32_t priority;
int64_t primary_num_bytes;
int64_t shard_num_bytes;
spg_t get_spg() const {
return pgid;
@ -55,7 +57,7 @@ public:
return new PGPeeringEvent(
query_epoch,
query_epoch,
RequestBackfillPrio(priority));
RequestBackfillPrio(priority, primary_num_bytes, shard_num_bytes));
case GRANT:
return new PGPeeringEvent(
query_epoch,
@ -93,13 +95,17 @@ public:
MBackfillReserve()
: MessageInstance(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
query_epoch(0), type(-1), priority(-1) {}
query_epoch(0), type(-1), priority(-1), primary_num_bytes(0),
shard_num_bytes(0) {}
MBackfillReserve(int type,
spg_t pgid,
epoch_t query_epoch, unsigned prio = -1)
epoch_t query_epoch, unsigned prio = -1,
int64_t primary_num_bytes = 0,
int64_t shard_num_bytes = 0)
: MessageInstance(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
pgid(pgid), query_epoch(query_epoch),
type(type), priority(prio) {}
type(type), priority(prio), primary_num_bytes(primary_num_bytes),
shard_num_bytes(shard_num_bytes) {}
std::string_view get_type_name() const override {
return "MBackfillReserve";
@ -137,6 +143,13 @@ public:
decode(type, p);
decode(priority, p);
decode(pgid.shard, p);
if (header.version >= 5) {
decode(primary_num_bytes, p);
decode(shard_num_bytes, p);
} else {
primary_num_bytes = 0;
shard_num_bytes = 0;
}
}
void encode_payload(uint64_t features) override {
@ -159,6 +172,8 @@ public:
encode(type, payload);
encode(priority, payload);
encode(pgid.shard, payload);
encode(primary_num_bytes, payload);
encode(shard_num_bytes, payload);
}
};

View File

@ -329,6 +329,15 @@ void ECBackend::handle_recovery_push(
ceph_assert(op.data.length() == 0);
}
if (get_parent()->pg_is_remote_backfilling()) {
get_parent()->pg_add_local_num_bytes(op.data.length());
get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
dout(10) << __func__ << " " << op.soid
<< " add new actual data by " << op.data.length()
<< " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
<< dendl;
}
if (op.before_progress.first) {
ceph_assert(op.attrset.count(string("_")));
m->t.setattrs(
@ -365,6 +374,20 @@ void ECBackend::handle_recovery_push(
ObjectContextRef(),
false,
&m->t);
if (get_parent()->pg_is_remote_backfilling()) {
struct stat st;
int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
get_parent()->whoami_shard().shard), &st);
if (r == 0) {
get_parent()->pg_sub_local_num_bytes(st.st_size);
// XXX: This can be way overestimated for small objects
get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
dout(10) << __func__ << " " << op.soid
<< " sub actual data by " << st.st_size
<< " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
<< dendl;
}
}
}
}
m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
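
For erasure-coded PGs, a pushed chunk carries only this shard's share of the object, so the handler above tracks two quantities: local on-disk bytes from op.data.length(), and logical object bytes recovered by multiplying by the data chunk count. A worked example, assuming a 2+1 EC pool:

const int     k           = 2;        // get_ec_data_chunk_count() for 2+1
const int64_t chunk_bytes = 2 << 20;  // op.data.length(): one 2 MiB shard chunk
                                      // of a 4 MiB object
get_parent()->pg_add_local_num_bytes(chunk_bytes);  // on-disk bytes: +2 MiB
get_parent()->pg_add_num_bytes(chunk_bytes * k);    // logical bytes: +4 MiB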

View File

@ -601,6 +601,13 @@ public:
return new ECRecPred(ec_impl);
}
int get_ec_data_chunk_count() const override {
return ec_impl->get_data_chunk_count();
}
int get_ec_stripe_chunk_size() const override {
return sinfo.get_chunk_size();
}
/**
* ECReadPred
*

View File

@ -269,7 +269,7 @@ OSDService::OSDService(OSD *osd) :
stat_lock("OSDService::stat_lock"),
full_status_lock("OSDService::full_status_lock"),
cur_state(NONE),
cur_ratio(0),
cur_ratio(0), physical_ratio(0),
epoch_lock("OSDService::epoch_lock"),
boot_epoch(0), up_epoch(0), bind_epoch(0),
is_stopping_lock("OSDService::is_stopping_lock")
@ -684,20 +684,15 @@ float OSDService::get_failsafe_full_ratio()
return full_ratio;
}
void OSDService::check_full_status(float ratio)
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
std::lock_guard l(full_status_lock);
cur_ratio = ratio;
// The OSDMap ratios take precedence. So if the failsafe is .95 and
// the admin sets the cluster full to .96, the failsafe moves up to .96
// too. (Not that having failsafe == full is ideal, but it's better than
// dropping writes before the cluster appears full.)
OSDMapRef osdmap = get_osdmap();
if (!osdmap || osdmap->get_epoch() == 0) {
cur_state = NONE;
return;
return NONE;
}
float nearfull_ratio = osdmap->get_nearfull_ratio();
float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
@ -721,27 +716,34 @@ void OSDService::check_full_status(float ratio)
nearfull_ratio = failsafe_ratio;
}
if (injectfull_state > NONE && injectfull) {
inject = "(Injected)";
return injectfull_state;
} else if (pratio > failsafe_ratio) {
return FAILSAFE;
} else if (ratio > full_ratio) {
return FULL;
} else if (ratio > backfillfull_ratio) {
return BACKFILLFULL;
} else if (ratio > nearfull_ratio) {
return NEARFULL;
}
return NONE;
}
void OSDService::check_full_status(float ratio, float pratio)
{
std::lock_guard l(full_status_lock);
cur_ratio = ratio;
physical_ratio = pratio;
string inject;
s_names new_state;
if (injectfull_state > NONE && injectfull) {
new_state = injectfull_state;
inject = "(Injected)";
} else if (ratio > failsafe_ratio) {
new_state = FAILSAFE;
} else if (ratio > full_ratio) {
new_state = FULL;
} else if (ratio > backfillfull_ratio) {
new_state = BACKFILLFULL;
} else if (ratio > nearfull_ratio) {
new_state = NEARFULL;
} else {
new_state = NONE;
}
new_state = recalc_full_state(ratio, pratio, inject);
dout(20) << __func__ << " cur ratio " << ratio
<< ". nearfull_ratio " << nearfull_ratio
<< ". backfillfull_ratio " << backfillfull_ratio
<< ", full_ratio " << full_ratio
<< ", failsafe_ratio " << failsafe_ratio
<< ", physical ratio " << pratio
<< ", new state " << get_full_state_name(new_state)
<< " " << inject
<< dendl;
@ -784,10 +786,8 @@ bool OSDService::need_fullness_update()
return want != cur;
}
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
std::lock_guard l(full_status_lock);
if (injectfull && injectfull_state >= type) {
// injectfull is either a count of the number of times to return failsafe full
// or if -1 then always return full
@ -798,12 +798,45 @@ bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
<< dendl;
return true;
}
return false;
}
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
std::lock_guard l(full_status_lock);
if (_check_inject_full(dpp, type))
return true;
if (cur_state >= type)
ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio << dendl;
ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
<< " physical " << physical_ratio << dendl;
return cur_state >= type;
}
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
{
std::lock_guard l(full_status_lock);
if (_check_inject_full(dpp, type)) {
return true;
}
}
float pratio;
float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
string notused;
s_names tentative_state = recalc_full_state(ratio, pratio, notused);
if (tentative_state >= type)
ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
return tentative_state >= type;
}
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
return _check_full(dpp, FAILSAFE);
@ -814,6 +847,11 @@ bool OSDService::check_full(DoutPrefixProvider *dpp) const
return _check_full(dpp, FULL);
}
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
return _check_full(dpp, BACKFILLFULL);
@ -861,12 +899,38 @@ void OSDService::set_statfs(const struct store_statfs_t &stbuf)
uint64_t avail = stbuf.available;
uint64_t used = stbuf.get_used_raw();
// For testing, fake statfs values so it doesn't matter if all
// OSDs are using the same partition.
if (cct->_conf->fake_statfs_for_testing) {
uint64_t total_num_bytes = 0;
vector<PGRef> pgs;
osd->_get_pgs(&pgs);
for (auto p : pgs) {
total_num_bytes += p->get_stats_num_bytes();
}
bytes = cct->_conf->fake_statfs_for_testing;
if (total_num_bytes < bytes)
avail = bytes - total_num_bytes;
else
avail = 0;
dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
<< " adjust available " << avail
<< dendl;
used = bytes - avail;
}
osd->logger->set(l_osd_stat_bytes, bytes);
osd->logger->set(l_osd_stat_bytes_used, used);
osd->logger->set(l_osd_stat_bytes_avail, avail);
std::lock_guard l(stat_lock);
osd_stat.statfs = stbuf;
if (cct->_conf->fake_statfs_for_testing) {
osd_stat.statfs.total = bytes;
osd_stat.statfs.available = avail;
// For testing we don't want used to go negative, so clear reserved
osd_stat.statfs.internally_reserved = 0;
}
}
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
@ -879,6 +943,34 @@ osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
return osd_stat;
}
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
uint64_t adjust_used)
{
*pratio =
((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
if (adjust_used) {
dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
if (new_stat.statfs.available > adjust_used)
new_stat.statfs.available -= adjust_used;
else
new_stat.statfs.available = 0;
dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
}
// Check all pgs and adjust kb_used to include all pending backfill data
int backfill_adjusted = 0;
vector<PGRef> pgs;
osd->_get_pgs(&pgs);
for (auto p : pgs) {
backfill_adjusted += p->pg_stat_adjust(&new_stat);
}
if (backfill_adjusted) {
dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
}
return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
OSDMapRef osdmap = get_osdmap();
@ -5053,9 +5145,10 @@ void OSD::heartbeat()
dout(5) << __func__ << " " << new_stat << dendl;
ceph_assert(new_stat.statfs.total);
float ratio =
((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
service.check_full_status(ratio);
float pratio;
float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
service.check_full_status(ratio, pratio);
utime_t now = ceph_clock_now();
utime_t deadline = now;
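
The ladder in recalc_full_state() orders the thresholds nearfull < backfillfull < full < failsafe, and only the failsafe comparison uses the raw physical ratio, presumably so the pending-backfill adjustment can never mask a genuine disk emergency. A standalone sketch with assumed values matching the usual defaults (.85/.90/.95/.97):

enum s_names { NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE };

s_names recalc(float ratio, float pratio) {
  if (pratio > 0.97f) return FAILSAFE;   // physical ratio, never adjusted
  if (ratio  > 0.95f) return FULL;
  if (ratio  > 0.90f) return BACKFILLFULL;
  if (ratio  > 0.85f) return NEARFULL;
  return NONE;
}

// An OSD 80% physically used, with pending backfill worth 15% of capacity,
// has a tentative ratio of 0.95: recalc(0.95f, 0.80f) == BACKFILLFULL,
// so tentative_backfill_full() returns true and the reservation is denied.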

View File

@ -905,6 +905,7 @@ public:
void set_statfs(const struct store_statfs_t &stbuf);
osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);
++seq;
@ -946,15 +947,19 @@ private:
else
return INVALID;
}
double cur_ratio; ///< current utilization
double cur_ratio, physical_ratio; ///< current utilization
mutable int64_t injectfull = 0;
s_names injectfull_state = NONE;
float get_failsafe_full_ratio();
bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const;
bool _check_full(DoutPrefixProvider *dpp, s_names type) const;
public:
void check_full_status(float ratio);
void check_full_status(float ratio, float pratio);
s_names recalc_full_state(float ratio, float pratio, string &inject);
bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t);
bool check_failsafe_full(DoutPrefixProvider *dpp) const;
bool check_full(DoutPrefixProvider *dpp) const;
bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t);
bool check_backfill_full(DoutPrefixProvider *dpp) const;
bool check_nearfull(DoutPrefixProvider *dpp) const;
bool is_failsafe_full() const;

View File

@ -1074,6 +1074,7 @@ void PG::clear_primary_state()
peer_log_requested.clear();
peer_missing_requested.clear();
peer_info.clear();
peer_bytes.clear();
peer_missing.clear();
need_up_thru = false;
peer_last_complete_ondisk.clear();
@ -2004,7 +2005,9 @@ void PG::activate(ObjectStore::Transaction& t,
pi.last_interval_started = info.last_interval_started;
pi.history = info.history;
pi.hit_set = info.hit_set;
pi.stats.stats.clear();
// Save num_bytes for reservation request
peer_bytes[peer] = pi.stats.stats.sum.num_bytes;
pi.stats.stats.clear();
// initialize peer with our purged_snaps.
pi.purged_snaps = info.purged_snaps;
@ -4469,8 +4472,25 @@ void PG::handle_scrub_reserve_release(OpRequestRef op)
clear_scrub_reserved();
}
// We can zero the value of primary num_bytes with just an atomic store.
// However, setting it above zero reserves space for backfill and requires
// the OSDService::stat_lock, which protects all OSD usage
void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
ceph_assert(osd->stat_lock.is_locked_by_me());
primary_num_bytes.store(primary);
local_num_bytes.store(local);
return;
}
void PG::clear_reserved_num_bytes() {
primary_num_bytes.store(0);
local_num_bytes.store(0);
return;
}
void PG::reject_reservation()
{
clear_reserved_num_bytes();
osd->send_message_osd_cluster(
primary.osd,
new MBackfillReserve(
@ -6836,6 +6856,41 @@ void PG::_delete_some(ObjectStore::Transaction *t)
}
}
// Compute pending backfill data
static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
{
lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
<< " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
return std::max((int64_t)0, bf_bytes - local_bytes);
}
int PG::pg_stat_adjust(osd_stat_t *ns)
{
osd_stat_t &new_stat = *ns;
if (is_primary()) {
return 0;
}
// Adjust the kb_used by adding pending backfill data
uint64_t reserved_num_bytes = get_reserved_num_bytes();
// For now we don't consider projected space gains here.
// I suggest we have an optional two-pass backfill that frees up
// space in a first pass. This could be triggered when at nearfull
// or close to backfillfull.
if (reserved_num_bytes > 0) {
// TODO: Handle compression by adjusting by the PG's average
// compression percentage.
dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
<< " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
if (new_stat.statfs.available > reserved_num_bytes)
new_stat.statfs.available -= reserved_num_bytes;
else
new_stat.statfs.available = 0;
dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
return 1;
}
return 0;
}
/*------------ Recovery State Machine----------------*/
@ -7387,6 +7442,8 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
{
PG *pg = context< RecoveryMachine >().pg;
int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
//The primary never backfills itself
ceph_assert(*backfill_osd_it != pg->pg_whoami);
@ -7398,11 +7455,14 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
MBackfillReserve::REQUEST,
spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
pg->get_osdmap_epoch(),
pg->get_backfill_priority()),
pg->get_backfill_priority(),
num_bytes,
pg->peer_bytes[*backfill_osd_it]),
con.get());
}
++backfill_osd_it;
} else {
pg->peer_bytes.clear();
post_event(AllBackfillsReserved());
}
return discard_event();
@ -7592,6 +7652,7 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(
const RemoteReservationCanceled &evt)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
return transit<RepNotRecovering>();
}
@ -7616,18 +7677,64 @@ boost::statechart::result
PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
{
PG *pg = context< RecoveryMachine >().pg;
// Use tentative_backfill_full() to make sure enough
// space is available to handle the target bytes from the primary.
// TODO: If we passed num_objects from the primary we could account for
// an estimate of the metadata overhead.
// TODO: If we had compressed_allocated and compressed_original from the primary
// we could compute the compression ratio and adjust accordingly.
// XXX: There is no way to get omap overhead, and it would only apply
// to whatever (possibly different) partition is storing the database.
// update_osd_stat() from heartbeat will do this on a new
// statfs using pg->primary_num_bytes.
uint64_t pending_adjustment = 0;
int64_t primary_num_bytes = evt.primary_num_bytes;
int64_t local_num_bytes = evt.local_num_bytes;
if (primary_num_bytes) {
// For an erasure-coded pool, overestimate by a full stripe per object
// because we don't know how each object rounded to the nearest stripe
if (pg->pool.info.is_erasure()) {
primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
}
pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
<< " local " << (local_num_bytes >> 10) << "KiB"
<< " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
<< dendl;
}
// This lock protects not only the OSDService stats but also the setting of the PG's primary_num_bytes.
// That's why we don't immediately unlock.
Mutex::Locker l(pg->osd->stat_lock);
osd_stat_t cur_stat = pg->osd->osd_stat;
if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
<< dendl;
post_event(RejectRemoteReservation());
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
pg->osd->check_backfill_full(pg)) {
pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
<< dendl;
post_event(RejectRemoteReservation());
} else {
Context *preempt = nullptr;
// Don't reserve space if we skipped the reservation check; this is used
// to test the other backfill-full check AND in case a corruption
// of num_bytes requires ignoring that value and trying the
// backfill anyway.
if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
else
pg->clear_reserved_num_bytes();
// Use un-ec-adjusted bytes for stats.
pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
// older peers will interpret preemption as TOOFULL
preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
@ -7685,28 +7792,14 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
{
PG *pg = context< RecoveryMachine >().pg;
if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
<< "failure injection" << dendl;
post_event(RejectRemoteReservation());
return discard_event();
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
pg->osd->check_backfill_full(pg)) {
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: backfill full"
<< dendl;
post_event(RejectRemoteReservation());
return discard_event();
} else {
pg->osd->send_message_osd_cluster(
pg->osd->send_message_osd_cluster(
pg->primary.osd,
new MBackfillReserve(
MBackfillReserve::GRANT,
spg_t(pg->info.pgid.pgid, pg->primary.shard),
pg->get_osdmap_epoch()),
pg->get_osdmap_epoch());
return transit<RepRecovering>();
}
return transit<RepRecovering>();
}
boost::statechart::result
@ -7724,6 +7817,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(
const RemoteReservationRejected &evt)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
return transit<RepNotRecovering>();
}
@ -7733,6 +7827,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(
const RemoteReservationCanceled &evt)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
return transit<RepNotRecovering>();
}
@ -7741,6 +7836,7 @@ boost::statechart::result
PG::RecoveryState::RepWaitBackfillReserved::react(const RecoveryDone&)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
return transit<RepNotRecovering>();
}
@ -7757,6 +7853,7 @@ boost::statechart::result
PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->send_message_osd_cluster(
pg->primary.osd,
new MRecoveryReserve(
@ -7771,6 +7868,7 @@ boost::statechart::result
PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->send_message_osd_cluster(
pg->primary.osd,
new MBackfillReserve(
@ -7785,6 +7883,7 @@ boost::statechart::result
PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->send_message_osd_cluster(
pg->primary.osd,
new MBackfillReserve(
@ -7799,6 +7898,7 @@ void PG::RecoveryState::RepRecovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
@ -8625,6 +8725,7 @@ void PG::RecoveryState::ReplicaActive::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
pg->clear_reserved_num_bytes();
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
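
One detail worth noting in the RepNotRecovering::react() hunk above: the primary reports logical bytes, so for EC the replica divides by the data chunk count to get its per-shard on-disk share, then adds one stripe chunk per object as a deliberate overestimate (per-object rounding is unknown at this point). Illustrative numbers, assuming a 4+2 pool with a 64 KiB stripe chunk:

const int     k          = 4;              // get_ec_data_chunk_count()
const int64_t chunk_size = 64 << 10;       // get_ec_stripe_chunk_size()
const int64_t objects    = 1000;           // info.stats.stats.sum.num_objects
int64_t primary_num_bytes = 4LL << 30;     // 4 GiB logical, from the primary
primary_num_bytes /= k;                    // 1 GiB actually stored on this shard
primary_num_bytes += chunk_size * objects; // plus ~62.5 MiB of rounding slack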

View File

@ -1084,6 +1084,7 @@ protected:
bool need_up_thru;
set<pg_shard_t> stray_set; // non-acting osds that have PG data.
map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info
set<pg_shard_t> peer_purged; // peers purged
map<pg_shard_t, pg_missing_t> peer_missing;
set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
@ -1202,10 +1203,99 @@ protected:
set<pg_shard_t> backfill_targets, async_recovery_targets;
// The primary's num_bytes and local num_bytes for this pg, only valid
// during backfill for non-primary shards.
// Both of these are adjusted for EC to reflect the on-disk bytes
std::atomic<int64_t> primary_num_bytes = 0;
std::atomic<int64_t> local_num_bytes = 0;
public:
bool is_backfill_targets(pg_shard_t osd) {
return backfill_targets.count(osd);
}
// Space reserved for backfill is primary_num_bytes - local_num_bytes
// We don't care that the difference itself isn't atomic
uint64_t get_reserved_num_bytes() {
int64_t primary = primary_num_bytes.load();
int64_t local = local_num_bytes.load();
if (primary > local)
return primary - local;
else
return 0;
}
bool is_remote_backfilling() {
return primary_num_bytes.load() > 0;
}
void set_reserved_num_bytes(int64_t primary, int64_t local);
void clear_reserved_num_bytes();
// If num_bytes are inconsistent and local_num_bytes goes negative
// it's ok, because it would then be ignored.
// The value of num_bytes could be negative,
// but we don't let local_num_bytes go negative.
void add_local_num_bytes(int64_t num_bytes) {
if (num_bytes) {
int64_t prev = local_num_bytes.fetch_add(num_bytes);
ceph_assert(prev >= 0);
if (num_bytes < 0 && prev < -num_bytes) {
local_num_bytes.store(0);
}
}
}
void sub_local_num_bytes(int64_t num_bytes) {
ceph_assert(num_bytes >= 0);
if (num_bytes) {
if (local_num_bytes.fetch_sub(num_bytes) < num_bytes) {
local_num_bytes.store(0);
}
}
}
// The value of num_bytes could be negative,
// but we don't let info.stats.stats.sum.num_bytes go negative.
void add_num_bytes(int64_t num_bytes) {
ceph_assert(_lock.is_locked_by_me());
if (num_bytes) {
info.stats.stats.sum.num_bytes += num_bytes;
if (info.stats.stats.sum.num_bytes < 0) {
info.stats.stats.sum.num_bytes = 0;
}
}
}
void sub_num_bytes(int64_t num_bytes) {
ceph_assert(_lock.is_locked_by_me());
ceph_assert(num_bytes >= 0);
if (num_bytes) {
info.stats.stats.sum.num_bytes -= num_bytes;
if (info.stats.stats.sum.num_bytes < 0) {
info.stats.stats.sum.num_bytes = 0;
}
}
}
// Only used in testing so not worried about needing the PG lock here
int64_t get_stats_num_bytes() {
Mutex::Locker l(_lock);
int num_bytes = info.stats.stats.sum.num_bytes;
if (pool.info.is_erasure()) {
num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
// Round up each object by a stripe
num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
}
int64_t lnb = local_num_bytes.load();
if (lnb && lnb != num_bytes) {
lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
<< lnb << " vs stats "
<< info.stats.stats.sum.num_bytes << " / chunk "
<< get_pgbackend()->get_ec_data_chunk_count()
<< dendl;
}
return num_bytes;
}
protected:
/*
@ -1848,6 +1938,7 @@ protected:
};
public:
int pg_stat_adjust(osd_stat_t *new_stat);
protected:
struct AdvMap : boost::statechart::event< AdvMap > {
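
The add/sub helpers above use a clamp-at-zero pattern rather than full atomicity: fetch_sub() returns the prior value, so when a subtraction would drive the counter negative the code stores 0 instead, accepting the small race window because (as the comments note) a negative count would be ignored anyway. The core of the pattern as a self-contained sketch:

#include <atomic>
#include <cstdint>

std::atomic<int64_t> local_num_bytes{0};

void sub_local_num_bytes(int64_t n) {  // n >= 0
  // fetch_sub returns the value before subtraction; clamp on undershoot.
  if (n && local_num_bytes.fetch_sub(n) < n)
    local_num_bytes.store(0);
}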

View File

@ -295,6 +295,11 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
virtual bool pg_is_remote_backfilling() = 0;
virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
virtual ~Listener() {}
};
@ -406,6 +411,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
virtual int get_ec_data_chunk_count() const { return 0; };
virtual int get_ec_stripe_chunk_size() const { return 0; };
virtual void dump_recovery_info(Formatter *f) const = 0;

View File

@ -134,11 +134,15 @@ struct MTrim : boost::statechart::event<MTrim> {
struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
unsigned priority;
explicit RequestBackfillPrio(unsigned prio) :
int64_t primary_num_bytes;
int64_t local_num_bytes;
explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) :
boost::statechart::event< RequestBackfillPrio >(),
priority(prio) {}
priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {}
void print(std::ostream *out) const {
*out << "RequestBackfillPrio: priority " << priority;
*out << "RequestBackfillPrio: priority " << priority
<< " primary bytes " << primary_num_bytes
<< " local bytes " << local_num_bytes;
}
};

View File

@ -4310,7 +4310,19 @@ void PrimaryLogPG::do_backfill(OpRequestRef op)
ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
info.set_last_backfill(m->last_backfill);
info.stats = m->stats;
// During backfill submit_push_data() tracks num_bytes, which is needed in case
// backfill stops and starts again. We want to know how many bytes this
// PG is consuming on disk in order to compute how much new data the
// backfill will add, and to reject the reservation if it won't fit.
if (m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS) {
dout(0) << __func__ << " primary " << m->stats.stats.sum.num_bytes << " local " << info.stats.stats.sum.num_bytes << dendl;
int64_t bytes = info.stats.stats.sum.num_bytes;
info.stats = m->stats;
info.stats.stats.sum.num_bytes = bytes;
} else {
dout(0) << __func__ << " final " << m->stats.stats.sum.num_bytes << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
info.stats = m->stats;
}
ObjectStore::Transaction t;
dirty_info = true;
@ -4341,6 +4353,38 @@ void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
ObjectStore::Transaction t;
for (auto& p : m->ls) {
if (is_remote_backfilling()) {
struct stat st;
int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
pg_whoami.shard) , &st);
if (r == 0) {
sub_local_num_bytes(st.st_size);
int64_t usersize;
if (pool.info.is_erasure()) {
bufferlist bv;
int r = osd->store->getattr(
ch,
ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
OI_ATTR,
bv);
if (r >= 0) {
object_info_t oi(bv);
usersize = oi.size * pgbackend->get_ec_data_chunk_count();
} else {
dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " can't get object info" << dendl;
usersize = 0;
}
} else {
usersize = st.st_size;
}
sub_num_bytes(usersize);
dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " sub actual data by " << st.st_size
<< " sub num_bytes by " << usersize
<< dendl;
}
}
remove_snap_mapped_object(t, p.first);
}
int r = osd->store->queue_transaction(ch, std::move(t), NULL);
@ -8566,6 +8610,7 @@ void PrimaryLogPG::apply_stats(
const object_stat_sum_t &delta_stats) {
info.stats.stats.add(delta_stats);
info.stats.stats.floor(0);
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
i != backfill_targets.end();
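
The do_backfill() change above splits the stats update in two: while backfill is in progress the shard keeps its locally tracked num_bytes (maintained by submit_push_data() and the removal path) and takes every other stat from the primary; the final message overwrites the stats wholesale. A simplified sketch of that merge, using an assumed stripped-down stat type:

struct StatSum { int64_t num_bytes = 0; /* other sums elided */ };

void merge_backfill_stats(StatSum& info, const StatSum& from_primary,
                          bool in_progress) {
  if (in_progress) {
    int64_t mine = info.num_bytes;  // tracked locally during backfill
    info = from_primary;
    info.num_bytes = mine;          // preserve the local byte count
  } else {
    info = from_primary;            // final update: primary's view wins
  }
}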

View File

@ -402,6 +402,22 @@ public:
release_object_locks(manager);
}
bool pg_is_remote_backfilling() override {
return is_remote_backfilling();
}
void pg_add_local_num_bytes(int64_t num_bytes) override {
add_local_num_bytes(num_bytes);
}
void pg_sub_local_num_bytes(int64_t num_bytes) override {
sub_local_num_bytes(num_bytes);
}
void pg_add_num_bytes(int64_t num_bytes) override {
add_num_bytes(num_bytes);
}
void pg_sub_num_bytes(int64_t num_bytes) override {
sub_num_bytes(num_bytes);
}
void pgb_set_object_snap_mapping(
const hobject_t &soid,
const set<snapid_t> &snaps,

View File

@ -1531,6 +1531,24 @@ void ReplicatedBackend::submit_push_data(
oi.expected_object_size,
oi.expected_write_size,
oi.alloc_hint_flags);
if (get_parent()->pg_is_remote_backfilling()) {
struct stat st;
uint64_t size = 0;
int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
if (r == 0) {
size = st.st_size;
}
// Don't need to do anything if object is still the same size
if (size != recovery_info.oi.size) {
get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
dout(10) << __func__ << " " << recovery_info.soid
<< " backfill size " << recovery_info.oi.size
<< " previous size " << size
<< " net size " << recovery_info.oi.size - size
<< dendl;
}
}
}
uint64_t off = 0;
uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;

View File

@ -2195,6 +2195,9 @@ struct store_statfs_t
uint64_t kb() const {
return total >> 10;
}
uint64_t kb_used() const {
return (total - available - internally_reserved) >> 10;
}
uint64_t kb_used_raw() const {
return get_used_raw() >> 10;
}
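
The new kb_used() accessor derives usage from total, available, and the internal reservation (the OSD.cc hunk earlier clears internally_reserved when faking statfs precisely so this subtraction cannot go negative). A quick check with assumed round numbers:

// 100 GiB total, 40 GiB available, nothing internally reserved:
uint64_t total = 100ULL << 30, available = 40ULL << 30, reserved = 0;
uint64_t kb_used = (total - available - reserved) >> 10;  // 62914560 KiB = 60 GiB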