mirror of
https://github.com/ceph/ceph
synced 2025-03-25 11:48:05 +00:00
Merge pull request #22797 from dzafman/wip-19753
osd: Deny reservation if expected backfill size would put us over bac… Reviewed-by: Josh Durgin <jdurgin@redhat.com> Reviewed-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
commit
99ddd3666b
@ -1137,6 +1137,35 @@ function test_get_not_primary() {
|
||||
|
||||
#######################################################################
|
||||
|
||||
function _objectstore_tool_nodown() {
|
||||
local dir=$1
|
||||
shift
|
||||
local id=$1
|
||||
shift
|
||||
local osd_data=$dir/$id
|
||||
|
||||
local journal_args
|
||||
if [ "$objectstore_type" == "filestore" ]; then
|
||||
journal_args=" --journal-path $osd_data/journal"
|
||||
fi
|
||||
ceph-objectstore-tool \
|
||||
--data-path $osd_data \
|
||||
$journal_args \
|
||||
"$@" || return 1
|
||||
}
|
||||
|
||||
function _objectstore_tool_nowait() {
|
||||
local dir=$1
|
||||
shift
|
||||
local id=$1
|
||||
shift
|
||||
|
||||
kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
|
||||
|
||||
_objectstore_tool_nodown $dir $id "$@" || return 1
|
||||
activate_osd $dir $id $ceph_osd_args >&2 || return 1
|
||||
}
|
||||
|
||||
##
|
||||
# Run ceph-objectstore-tool against the OSD **id** using the data path
|
||||
# **dir**. The OSD is killed with TERM prior to running
|
||||
@ -1158,21 +1187,8 @@ function objectstore_tool() {
|
||||
shift
|
||||
local id=$1
|
||||
shift
|
||||
local osd_data=$dir/$id
|
||||
|
||||
local osd_type=$(cat $osd_data/type)
|
||||
|
||||
kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
|
||||
|
||||
local journal_args
|
||||
if [ "$objectstore_type" == "filestore" ]; then
|
||||
journal_args=" --journal-path $osd_data/journal"
|
||||
fi
|
||||
ceph-objectstore-tool \
|
||||
--data-path $osd_data \
|
||||
$journal_args \
|
||||
"$@" || return 1
|
||||
activate_osd $dir $id $ceph_osd_args >&2 || return 1
|
||||
_objectstore_tool_nowait $dir $id "$@" || return 1
|
||||
wait_for_clean >&2
|
||||
}
|
||||
|
||||
|
1162
qa/standalone/osd/osd-backfill-space.sh
Executable file
1162
qa/standalone/osd/osd-backfill-space.sh
Executable file
File diff suppressed because it is too large
Load Diff
49
qa/suites/rados/singleton/all/thrash-backfill-full.yaml
Normal file
49
qa/suites/rados/singleton/all/thrash-backfill-full.yaml
Normal file
@ -0,0 +1,49 @@
|
||||
roles:
|
||||
- - mon.a
|
||||
- mgr.x
|
||||
- osd.0
|
||||
- osd.1
|
||||
- osd.2
|
||||
- - osd.3
|
||||
- osd.4
|
||||
- osd.5
|
||||
- client.0
|
||||
openstack:
|
||||
- volumes: # attached to each instance
|
||||
count: 3
|
||||
size: 10 # GB
|
||||
override:
|
||||
ceph:
|
||||
conf:
|
||||
mon:
|
||||
osd default pool size: 3
|
||||
osd min pg log entries: 5
|
||||
osd max pg log entries: 10
|
||||
tasks:
|
||||
- install:
|
||||
- ceph:
|
||||
log-whitelist:
|
||||
- but it is still running
|
||||
- missing primary copy of
|
||||
- objects unfound and apparently lost
|
||||
- overall HEALTH_
|
||||
- \(OSDMAP_FLAGS\)
|
||||
- \(SLOW_OPS\)
|
||||
- \(PG_
|
||||
- \(OBJECT_MISPLACED\)
|
||||
- \(OSD_
|
||||
- \(OBJECT_
|
||||
- \(TOO_FEW_PGS\)
|
||||
- \(POOL_BACKFILLFULL\)
|
||||
- thrashosds:
|
||||
op_delay: 30
|
||||
clean_interval: 120
|
||||
chance_down: .75
|
||||
min_live: 5
|
||||
min_in: 5
|
||||
chance_test_backfill_full: .5
|
||||
- radosbench:
|
||||
clients: [client.0]
|
||||
time: 1800
|
||||
type: rand
|
||||
objectsize: 1048576
|
@ -1547,3 +1547,4 @@ OPTION(rgw_sts_entry, OPT_STR)
|
||||
OPTION(rgw_sts_key, OPT_STR)
|
||||
OPTION(rgw_s3_auth_use_sts, OPT_BOOL) // should we try to use sts for s3?
|
||||
OPTION(rgw_sts_max_session_duration, OPT_U64) // Max duration in seconds for which the session token is valid.
|
||||
OPTION(fake_statfs_for_testing, OPT_INT) // Set a value for kb and compute kb_used from total of num_bytes
|
||||
|
@ -7840,12 +7840,17 @@ std::vector<Option> get_mds_client_options() {
|
||||
|
||||
Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
|
||||
.set_default("")
|
||||
|
||||
.set_description("CephFS file system name to mount")
|
||||
.set_long_description("Use this with ceph-fuse, or with any process "
|
||||
"that uses libcephfs. Programs using libcephfs may also pass "
|
||||
"the filesystem name into mount(), which will override this setting. "
|
||||
"If no filesystem name is given in mount() or this setting, the default "
|
||||
"filesystem will be mounted (usually the first created)."),
|
||||
|
||||
Option("fake_statfs_for_testing", Option::TYPE_INT, Option::LEVEL_DEV)
|
||||
.set_default(0)
|
||||
.set_description("Set a value for kb and compute kb_used from total of num_bytes"),
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,7 @@ class MBackfillReserve : public MessageInstance<MBackfillReserve, MOSDPeeringOp>
|
||||
public:
|
||||
friend factory;
|
||||
private:
|
||||
static constexpr int HEAD_VERSION = 4;
|
||||
static constexpr int HEAD_VERSION = 5;
|
||||
static constexpr int COMPAT_VERSION = 4;
|
||||
public:
|
||||
spg_t pgid;
|
||||
@ -38,6 +38,8 @@ public:
|
||||
};
|
||||
uint32_t type;
|
||||
uint32_t priority;
|
||||
int64_t primary_num_bytes;
|
||||
int64_t shard_num_bytes;
|
||||
|
||||
spg_t get_spg() const {
|
||||
return pgid;
|
||||
@ -55,7 +57,7 @@ public:
|
||||
return new PGPeeringEvent(
|
||||
query_epoch,
|
||||
query_epoch,
|
||||
RequestBackfillPrio(priority));
|
||||
RequestBackfillPrio(priority, primary_num_bytes, shard_num_bytes));
|
||||
case GRANT:
|
||||
return new PGPeeringEvent(
|
||||
query_epoch,
|
||||
@ -93,13 +95,17 @@ public:
|
||||
|
||||
MBackfillReserve()
|
||||
: MessageInstance(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
|
||||
query_epoch(0), type(-1), priority(-1) {}
|
||||
query_epoch(0), type(-1), priority(-1), primary_num_bytes(0),
|
||||
shard_num_bytes(0) {}
|
||||
MBackfillReserve(int type,
|
||||
spg_t pgid,
|
||||
epoch_t query_epoch, unsigned prio = -1)
|
||||
epoch_t query_epoch, unsigned prio = -1,
|
||||
int64_t primary_num_bytes = 0,
|
||||
int64_t shard_num_bytes = 0)
|
||||
: MessageInstance(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
|
||||
pgid(pgid), query_epoch(query_epoch),
|
||||
type(type), priority(prio) {}
|
||||
type(type), priority(prio), primary_num_bytes(primary_num_bytes),
|
||||
shard_num_bytes(shard_num_bytes) {}
|
||||
|
||||
std::string_view get_type_name() const override {
|
||||
return "MBackfillReserve";
|
||||
@ -137,6 +143,13 @@ public:
|
||||
decode(type, p);
|
||||
decode(priority, p);
|
||||
decode(pgid.shard, p);
|
||||
if (header.version >= 5) {
|
||||
decode(primary_num_bytes, p);
|
||||
decode(shard_num_bytes, p);
|
||||
} else {
|
||||
primary_num_bytes = 0;
|
||||
shard_num_bytes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void encode_payload(uint64_t features) override {
|
||||
@ -159,6 +172,8 @@ public:
|
||||
encode(type, payload);
|
||||
encode(priority, payload);
|
||||
encode(pgid.shard, payload);
|
||||
encode(primary_num_bytes, payload);
|
||||
encode(shard_num_bytes, payload);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -329,6 +329,15 @@ void ECBackend::handle_recovery_push(
|
||||
ceph_assert(op.data.length() == 0);
|
||||
}
|
||||
|
||||
if (get_parent()->pg_is_remote_backfilling()) {
|
||||
get_parent()->pg_add_local_num_bytes(op.data.length());
|
||||
get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
|
||||
dout(10) << __func__ << " " << op.soid
|
||||
<< " add new actual data by " << op.data.length()
|
||||
<< " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
|
||||
<< dendl;
|
||||
}
|
||||
|
||||
if (op.before_progress.first) {
|
||||
ceph_assert(op.attrset.count(string("_")));
|
||||
m->t.setattrs(
|
||||
@ -365,6 +374,20 @@ void ECBackend::handle_recovery_push(
|
||||
ObjectContextRef(),
|
||||
false,
|
||||
&m->t);
|
||||
if (get_parent()->pg_is_remote_backfilling()) {
|
||||
struct stat st;
|
||||
int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
|
||||
get_parent()->whoami_shard().shard), &st);
|
||||
if (r == 0) {
|
||||
get_parent()->pg_sub_local_num_bytes(st.st_size);
|
||||
// XXX: This can be way overestimated for small objects
|
||||
get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
|
||||
dout(10) << __func__ << " " << op.soid
|
||||
<< " sub actual data by " << st.st_size
|
||||
<< " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
|
||||
|
@ -601,6 +601,13 @@ public:
|
||||
return new ECRecPred(ec_impl);
|
||||
}
|
||||
|
||||
int get_ec_data_chunk_count() const override {
|
||||
return ec_impl->get_data_chunk_count();
|
||||
}
|
||||
int get_ec_stripe_chunk_size() const override {
|
||||
return sinfo.get_chunk_size();
|
||||
}
|
||||
|
||||
/**
|
||||
* ECReadPred
|
||||
*
|
||||
|
159
src/osd/OSD.cc
159
src/osd/OSD.cc
@ -269,7 +269,7 @@ OSDService::OSDService(OSD *osd) :
|
||||
stat_lock("OSDService::stat_lock"),
|
||||
full_status_lock("OSDService::full_status_lock"),
|
||||
cur_state(NONE),
|
||||
cur_ratio(0),
|
||||
cur_ratio(0), physical_ratio(0),
|
||||
epoch_lock("OSDService::epoch_lock"),
|
||||
boot_epoch(0), up_epoch(0), bind_epoch(0),
|
||||
is_stopping_lock("OSDService::is_stopping_lock")
|
||||
@ -684,20 +684,15 @@ float OSDService::get_failsafe_full_ratio()
|
||||
return full_ratio;
|
||||
}
|
||||
|
||||
void OSDService::check_full_status(float ratio)
|
||||
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
|
||||
{
|
||||
std::lock_guard l(full_status_lock);
|
||||
|
||||
cur_ratio = ratio;
|
||||
|
||||
// The OSDMap ratios take precendence. So if the failsafe is .95 and
|
||||
// the admin sets the cluster full to .96, the failsafe moves up to .96
|
||||
// too. (Not that having failsafe == full is ideal, but it's better than
|
||||
// dropping writes before the clusters appears full.)
|
||||
OSDMapRef osdmap = get_osdmap();
|
||||
if (!osdmap || osdmap->get_epoch() == 0) {
|
||||
cur_state = NONE;
|
||||
return;
|
||||
return NONE;
|
||||
}
|
||||
float nearfull_ratio = osdmap->get_nearfull_ratio();
|
||||
float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
|
||||
@ -721,27 +716,34 @@ void OSDService::check_full_status(float ratio)
|
||||
nearfull_ratio = failsafe_ratio;
|
||||
}
|
||||
|
||||
if (injectfull_state > NONE && injectfull) {
|
||||
inject = "(Injected)";
|
||||
return injectfull_state;
|
||||
} else if (pratio > failsafe_ratio) {
|
||||
return FAILSAFE;
|
||||
} else if (ratio > full_ratio) {
|
||||
return FULL;
|
||||
} else if (ratio > backfillfull_ratio) {
|
||||
return BACKFILLFULL;
|
||||
} else if (ratio > nearfull_ratio) {
|
||||
return NEARFULL;
|
||||
}
|
||||
return NONE;
|
||||
}
|
||||
|
||||
void OSDService::check_full_status(float ratio, float pratio)
|
||||
{
|
||||
std::lock_guard l(full_status_lock);
|
||||
|
||||
cur_ratio = ratio;
|
||||
physical_ratio = pratio;
|
||||
|
||||
string inject;
|
||||
s_names new_state;
|
||||
if (injectfull_state > NONE && injectfull) {
|
||||
new_state = injectfull_state;
|
||||
inject = "(Injected)";
|
||||
} else if (ratio > failsafe_ratio) {
|
||||
new_state = FAILSAFE;
|
||||
} else if (ratio > full_ratio) {
|
||||
new_state = FULL;
|
||||
} else if (ratio > backfillfull_ratio) {
|
||||
new_state = BACKFILLFULL;
|
||||
} else if (ratio > nearfull_ratio) {
|
||||
new_state = NEARFULL;
|
||||
} else {
|
||||
new_state = NONE;
|
||||
}
|
||||
new_state = recalc_full_state(ratio, pratio, inject);
|
||||
|
||||
dout(20) << __func__ << " cur ratio " << ratio
|
||||
<< ". nearfull_ratio " << nearfull_ratio
|
||||
<< ". backfillfull_ratio " << backfillfull_ratio
|
||||
<< ", full_ratio " << full_ratio
|
||||
<< ", failsafe_ratio " << failsafe_ratio
|
||||
<< ", physical ratio " << pratio
|
||||
<< ", new state " << get_full_state_name(new_state)
|
||||
<< " " << inject
|
||||
<< dendl;
|
||||
@ -784,10 +786,8 @@ bool OSDService::need_fullness_update()
|
||||
return want != cur;
|
||||
}
|
||||
|
||||
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
|
||||
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
|
||||
{
|
||||
std::lock_guard l(full_status_lock);
|
||||
|
||||
if (injectfull && injectfull_state >= type) {
|
||||
// injectfull is either a count of the number of times to return failsafe full
|
||||
// or if -1 then always return full
|
||||
@ -798,12 +798,45 @@ bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
|
||||
<< dendl;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
|
||||
{
|
||||
std::lock_guard l(full_status_lock);
|
||||
|
||||
if (_check_inject_full(dpp, type))
|
||||
return true;
|
||||
|
||||
if (cur_state >= type)
|
||||
ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio << dendl;
|
||||
ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
|
||||
<< " physical " << physical_ratio << dendl;
|
||||
|
||||
return cur_state >= type;
|
||||
}
|
||||
|
||||
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
|
||||
{
|
||||
ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
|
||||
{
|
||||
std::lock_guard l(full_status_lock);
|
||||
if (_check_inject_full(dpp, type)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
float pratio;
|
||||
float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
|
||||
|
||||
string notused;
|
||||
s_names tentative_state = recalc_full_state(ratio, pratio, notused);
|
||||
|
||||
if (tentative_state >= type)
|
||||
ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
|
||||
|
||||
return tentative_state >= type;
|
||||
}
|
||||
|
||||
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
|
||||
{
|
||||
return _check_full(dpp, FAILSAFE);
|
||||
@ -814,6 +847,11 @@ bool OSDService::check_full(DoutPrefixProvider *dpp) const
|
||||
return _check_full(dpp, FULL);
|
||||
}
|
||||
|
||||
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
|
||||
{
|
||||
return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
|
||||
}
|
||||
|
||||
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
|
||||
{
|
||||
return _check_full(dpp, BACKFILLFULL);
|
||||
@ -861,12 +899,38 @@ void OSDService::set_statfs(const struct store_statfs_t &stbuf)
|
||||
uint64_t avail = stbuf.available;
|
||||
uint64_t used = stbuf.get_used_raw();
|
||||
|
||||
// For testing fake statfs values so it doesn't matter if all
|
||||
// OSDs are using the same partition.
|
||||
if (cct->_conf->fake_statfs_for_testing) {
|
||||
uint64_t total_num_bytes = 0;
|
||||
vector<PGRef> pgs;
|
||||
osd->_get_pgs(&pgs);
|
||||
for (auto p : pgs) {
|
||||
total_num_bytes += p->get_stats_num_bytes();
|
||||
}
|
||||
bytes = cct->_conf->fake_statfs_for_testing;
|
||||
if (total_num_bytes < bytes)
|
||||
avail = bytes - total_num_bytes;
|
||||
else
|
||||
avail = 0;
|
||||
dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
|
||||
<< " adjust available " << avail
|
||||
<< dendl;
|
||||
used = bytes - avail;
|
||||
}
|
||||
|
||||
osd->logger->set(l_osd_stat_bytes, bytes);
|
||||
osd->logger->set(l_osd_stat_bytes_used, used);
|
||||
osd->logger->set(l_osd_stat_bytes_avail, avail);
|
||||
|
||||
std::lock_guard l(stat_lock);
|
||||
osd_stat.statfs = stbuf;
|
||||
if (cct->_conf->fake_statfs_for_testing) {
|
||||
osd_stat.statfs.total = bytes;
|
||||
osd_stat.statfs.available = avail;
|
||||
// For testing don't want used to go negative, so clear reserved
|
||||
osd_stat.statfs.internally_reserved = 0;
|
||||
}
|
||||
}
|
||||
|
||||
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
|
||||
@ -879,6 +943,34 @@ osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
|
||||
return osd_stat;
|
||||
}
|
||||
|
||||
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
|
||||
uint64_t adjust_used)
|
||||
{
|
||||
*pratio =
|
||||
((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
|
||||
|
||||
if (adjust_used) {
|
||||
dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
|
||||
if (new_stat.statfs.available > adjust_used)
|
||||
new_stat.statfs.available -= adjust_used;
|
||||
else
|
||||
new_stat.statfs.available = 0;
|
||||
dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
|
||||
}
|
||||
|
||||
// Check all pgs and adjust kb_used to include all pending backfill data
|
||||
int backfill_adjusted = 0;
|
||||
vector<PGRef> pgs;
|
||||
osd->_get_pgs(&pgs);
|
||||
for (auto p : pgs) {
|
||||
backfill_adjusted += p->pg_stat_adjust(&new_stat);
|
||||
}
|
||||
if (backfill_adjusted) {
|
||||
dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
|
||||
}
|
||||
return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
|
||||
}
|
||||
|
||||
bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
|
||||
{
|
||||
OSDMapRef osdmap = get_osdmap();
|
||||
@ -5053,9 +5145,10 @@ void OSD::heartbeat()
|
||||
dout(5) << __func__ << " " << new_stat << dendl;
|
||||
ceph_assert(new_stat.statfs.total);
|
||||
|
||||
float ratio =
|
||||
((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
|
||||
service.check_full_status(ratio);
|
||||
float pratio;
|
||||
float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
|
||||
|
||||
service.check_full_status(ratio, pratio);
|
||||
|
||||
utime_t now = ceph_clock_now();
|
||||
utime_t deadline = now;
|
||||
|
@ -905,6 +905,7 @@ public:
|
||||
|
||||
void set_statfs(const struct store_statfs_t &stbuf);
|
||||
osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
|
||||
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
|
||||
osd_stat_t get_osd_stat() {
|
||||
std::lock_guard l(stat_lock);
|
||||
++seq;
|
||||
@ -946,15 +947,19 @@ private:
|
||||
else
|
||||
return INVALID;
|
||||
}
|
||||
double cur_ratio; ///< current utilization
|
||||
double cur_ratio, physical_ratio; ///< current utilization
|
||||
mutable int64_t injectfull = 0;
|
||||
s_names injectfull_state = NONE;
|
||||
float get_failsafe_full_ratio();
|
||||
bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const;
|
||||
bool _check_full(DoutPrefixProvider *dpp, s_names type) const;
|
||||
public:
|
||||
void check_full_status(float ratio);
|
||||
void check_full_status(float ratio, float pratio);
|
||||
s_names recalc_full_state(float ratio, float pratio, string &inject);
|
||||
bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t);
|
||||
bool check_failsafe_full(DoutPrefixProvider *dpp) const;
|
||||
bool check_full(DoutPrefixProvider *dpp) const;
|
||||
bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t);
|
||||
bool check_backfill_full(DoutPrefixProvider *dpp) const;
|
||||
bool check_nearfull(DoutPrefixProvider *dpp) const;
|
||||
bool is_failsafe_full() const;
|
||||
|
139
src/osd/PG.cc
139
src/osd/PG.cc
@ -1074,6 +1074,7 @@ void PG::clear_primary_state()
|
||||
peer_log_requested.clear();
|
||||
peer_missing_requested.clear();
|
||||
peer_info.clear();
|
||||
peer_bytes.clear();
|
||||
peer_missing.clear();
|
||||
need_up_thru = false;
|
||||
peer_last_complete_ondisk.clear();
|
||||
@ -2004,7 +2005,9 @@ void PG::activate(ObjectStore::Transaction& t,
|
||||
pi.last_interval_started = info.last_interval_started;
|
||||
pi.history = info.history;
|
||||
pi.hit_set = info.hit_set;
|
||||
pi.stats.stats.clear();
|
||||
// Save num_bytes for reservation request
|
||||
peer_bytes[peer] = pi.stats.stats.sum.num_bytes;
|
||||
pi.stats.stats.clear();
|
||||
|
||||
// initialize peer with our purged_snaps.
|
||||
pi.purged_snaps = info.purged_snaps;
|
||||
@ -4469,8 +4472,25 @@ void PG::handle_scrub_reserve_release(OpRequestRef op)
|
||||
clear_scrub_reserved();
|
||||
}
|
||||
|
||||
// We can zero the value of primary num_bytes as just an atomic.
|
||||
// However, setting above zero reserves space for backfill and requires
|
||||
// the OSDService::stat_lock which protects all OSD usage
|
||||
void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
|
||||
ceph_assert(osd->stat_lock.is_locked_by_me());
|
||||
primary_num_bytes.store(primary);
|
||||
local_num_bytes.store(local);
|
||||
return;
|
||||
}
|
||||
|
||||
void PG::clear_reserved_num_bytes() {
|
||||
primary_num_bytes.store(0);
|
||||
local_num_bytes.store(0);
|
||||
return;
|
||||
}
|
||||
|
||||
void PG::reject_reservation()
|
||||
{
|
||||
clear_reserved_num_bytes();
|
||||
osd->send_message_osd_cluster(
|
||||
primary.osd,
|
||||
new MBackfillReserve(
|
||||
@ -6836,6 +6856,41 @@ void PG::_delete_some(ObjectStore::Transaction *t)
|
||||
}
|
||||
}
|
||||
|
||||
// Compute pending backfill data
|
||||
static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
|
||||
{
|
||||
lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
|
||||
<< " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
|
||||
return std::max((int64_t)0, bf_bytes - local_bytes);
|
||||
}
|
||||
|
||||
int PG::pg_stat_adjust(osd_stat_t *ns)
|
||||
{
|
||||
osd_stat_t &new_stat = *ns;
|
||||
if (is_primary()) {
|
||||
return 0;
|
||||
}
|
||||
// Adjust the kb_used by adding pending backfill data
|
||||
uint64_t reserved_num_bytes = get_reserved_num_bytes();
|
||||
|
||||
// For now we don't consider projected space gains here
|
||||
// I suggest we have an optional 2 pass backfill that frees up
|
||||
// space in a first pass. This could be triggered when at nearfull
|
||||
// or near to backfillfull.
|
||||
if (reserved_num_bytes > 0) {
|
||||
// TODO: Handle compression by adjusting by the PGs average
|
||||
// compression precentage.
|
||||
dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
|
||||
<< " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
|
||||
if (new_stat.statfs.available > reserved_num_bytes)
|
||||
new_stat.statfs.available -= reserved_num_bytes;
|
||||
else
|
||||
new_stat.statfs.available = 0;
|
||||
dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*------------ Recovery State Machine----------------*/
|
||||
@ -7387,6 +7442,8 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
|
||||
int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
|
||||
ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
|
||||
if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
|
||||
//The primary never backfills itself
|
||||
ceph_assert(*backfill_osd_it != pg->pg_whoami);
|
||||
@ -7398,11 +7455,14 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
|
||||
MBackfillReserve::REQUEST,
|
||||
spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
|
||||
pg->get_osdmap_epoch(),
|
||||
pg->get_backfill_priority()),
|
||||
pg->get_backfill_priority(),
|
||||
num_bytes,
|
||||
pg->peer_bytes[*backfill_osd_it]),
|
||||
con.get());
|
||||
}
|
||||
++backfill_osd_it;
|
||||
} else {
|
||||
pg->peer_bytes.clear();
|
||||
post_event(AllBackfillsReserved());
|
||||
}
|
||||
return discard_event();
|
||||
@ -7592,6 +7652,7 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(
|
||||
const RemoteReservationCanceled &evt)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
return transit<RepNotRecovering>();
|
||||
}
|
||||
@ -7616,18 +7677,64 @@ boost::statechart::result
|
||||
PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
// Use tentative_bacfill_full() to make sure enough
|
||||
// space is available to handle target bytes from primary.
|
||||
|
||||
// TODO: If we passed num_objects from primary we could account for
|
||||
// an estimate of the metadata overhead.
|
||||
|
||||
// TODO: If we had compressed_allocated and compressed_original from primary
|
||||
// we could compute compression ratio and adjust accordingly.
|
||||
|
||||
// XXX: There is no way to get omap overhead and this would only apply
|
||||
// to whatever possibly different partition that is storing the database.
|
||||
|
||||
// update_osd_stat() from heartbeat will do this on a new
|
||||
// statfs using pg->primary_num_bytes.
|
||||
uint64_t pending_adjustment = 0;
|
||||
int64_t primary_num_bytes = evt.primary_num_bytes;
|
||||
int64_t local_num_bytes = evt.local_num_bytes;
|
||||
if (primary_num_bytes) {
|
||||
// For erasure coded pool overestimate by a full stripe per object
|
||||
// because we don't know how each objected rounded to the nearest stripe
|
||||
if (pg->pool.info.is_erasure()) {
|
||||
primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
|
||||
primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
|
||||
local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
|
||||
local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
|
||||
}
|
||||
pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
|
||||
ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
|
||||
<< " local " << (local_num_bytes >> 10) << "KiB"
|
||||
<< " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
|
||||
<< dendl;
|
||||
}
|
||||
// This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
|
||||
// That's why we don't immediately unlock
|
||||
Mutex::Locker l(pg->osd->stat_lock);
|
||||
osd_stat_t cur_stat = pg->osd->osd_stat;
|
||||
if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
|
||||
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
|
||||
ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
|
||||
<< dendl;
|
||||
post_event(RejectRemoteReservation());
|
||||
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
|
||||
pg->osd->check_backfill_full(pg)) {
|
||||
pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
|
||||
ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
|
||||
<< dendl;
|
||||
post_event(RejectRemoteReservation());
|
||||
} else {
|
||||
Context *preempt = nullptr;
|
||||
// Don't reserve space if skipped reservation check, this is used
|
||||
// to test the other backfill full check AND in case a corruption
|
||||
// of num_bytes requires ignoring that value and trying the
|
||||
// backfill anyway.
|
||||
if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
|
||||
pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
|
||||
else
|
||||
pg->clear_reserved_num_bytes();
|
||||
// Use un-ec-adjusted bytes for stats.
|
||||
pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
|
||||
if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
|
||||
// older peers will interpret preemption as TOOFULL
|
||||
preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
|
||||
@ -7685,28 +7792,14 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
|
||||
if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
|
||||
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
|
||||
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
|
||||
<< "failure injection" << dendl;
|
||||
post_event(RejectRemoteReservation());
|
||||
return discard_event();
|
||||
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
|
||||
pg->osd->check_backfill_full(pg)) {
|
||||
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: backfill full"
|
||||
<< dendl;
|
||||
post_event(RejectRemoteReservation());
|
||||
return discard_event();
|
||||
} else {
|
||||
pg->osd->send_message_osd_cluster(
|
||||
pg->osd->send_message_osd_cluster(
|
||||
pg->primary.osd,
|
||||
new MBackfillReserve(
|
||||
MBackfillReserve::GRANT,
|
||||
spg_t(pg->info.pgid.pgid, pg->primary.shard),
|
||||
pg->get_osdmap_epoch()),
|
||||
pg->get_osdmap_epoch());
|
||||
return transit<RepRecovering>();
|
||||
}
|
||||
return transit<RepRecovering>();
|
||||
}
|
||||
|
||||
boost::statechart::result
|
||||
@ -7724,6 +7817,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(
|
||||
const RemoteReservationRejected &evt)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
return transit<RepNotRecovering>();
|
||||
}
|
||||
@ -7733,6 +7827,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(
|
||||
const RemoteReservationCanceled &evt)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
return transit<RepNotRecovering>();
|
||||
}
|
||||
@ -7741,6 +7836,7 @@ boost::statechart::result
|
||||
PG::RecoveryState::RepWaitBackfillReserved::react(const RecoveryDone&)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
return transit<RepNotRecovering>();
|
||||
}
|
||||
@ -7757,6 +7853,7 @@ boost::statechart::result
|
||||
PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->send_message_osd_cluster(
|
||||
pg->primary.osd,
|
||||
new MRecoveryReserve(
|
||||
@ -7771,6 +7868,7 @@ boost::statechart::result
|
||||
PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->send_message_osd_cluster(
|
||||
pg->primary.osd,
|
||||
new MBackfillReserve(
|
||||
@ -7785,6 +7883,7 @@ boost::statechart::result
|
||||
PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
|
||||
{
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->send_message_osd_cluster(
|
||||
pg->primary.osd,
|
||||
new MBackfillReserve(
|
||||
@ -7799,6 +7898,7 @@ void PG::RecoveryState::RepRecovering::exit()
|
||||
{
|
||||
context< RecoveryMachine >().log_exit(state_name, enter_time);
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
utime_t dur = ceph_clock_now() - enter_time;
|
||||
pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
|
||||
@ -8625,6 +8725,7 @@ void PG::RecoveryState::ReplicaActive::exit()
|
||||
{
|
||||
context< RecoveryMachine >().log_exit(state_name, enter_time);
|
||||
PG *pg = context< RecoveryMachine >().pg;
|
||||
pg->clear_reserved_num_bytes();
|
||||
pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
|
||||
utime_t dur = ceph_clock_now() - enter_time;
|
||||
pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
|
||||
|
91
src/osd/PG.h
91
src/osd/PG.h
@ -1084,6 +1084,7 @@ protected:
|
||||
bool need_up_thru;
|
||||
set<pg_shard_t> stray_set; // non-acting osds that have PG data.
|
||||
map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
|
||||
map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info
|
||||
set<pg_shard_t> peer_purged; // peers purged
|
||||
map<pg_shard_t, pg_missing_t> peer_missing;
|
||||
set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
|
||||
@ -1202,10 +1203,99 @@ protected:
|
||||
|
||||
set<pg_shard_t> backfill_targets, async_recovery_targets;
|
||||
|
||||
// The primary's num_bytes and local num_bytes for this pg, only valid
|
||||
// during backfill for non-primary shards.
|
||||
// Both of these are adjusted for EC to reflect the on-disk bytes
|
||||
std::atomic<int64_t> primary_num_bytes = 0;
|
||||
std::atomic<int64_t> local_num_bytes = 0;
|
||||
|
||||
public:
|
||||
bool is_backfill_targets(pg_shard_t osd) {
|
||||
return backfill_targets.count(osd);
|
||||
}
|
||||
|
||||
// Space reserved for backfill is primary_num_bytes - local_num_bytes
|
||||
// Don't care that difference itself isn't atomic
|
||||
uint64_t get_reserved_num_bytes() {
|
||||
int64_t primary = primary_num_bytes.load();
|
||||
int64_t local = local_num_bytes.load();
|
||||
if (primary > local)
|
||||
return primary - local;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool is_remote_backfilling() {
|
||||
return primary_num_bytes.load() > 0;
|
||||
}
|
||||
|
||||
void set_reserved_num_bytes(int64_t primary, int64_t local);
|
||||
void clear_reserved_num_bytes();
|
||||
|
||||
// If num_bytes are inconsistent and local_num- goes negative
|
||||
// it's ok, because it would then be ignored.
|
||||
|
||||
// The value of num_bytes could be negative,
|
||||
// but we don't let local_num_bytes go negative.
|
||||
void add_local_num_bytes(int64_t num_bytes) {
|
||||
if (num_bytes) {
|
||||
int64_t prev = local_num_bytes.fetch_add(num_bytes);
|
||||
ceph_assert(prev >= 0);
|
||||
if (num_bytes < 0 && prev < -num_bytes) {
|
||||
local_num_bytes.store(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
void sub_local_num_bytes(int64_t num_bytes) {
|
||||
ceph_assert(num_bytes >= 0);
|
||||
if (num_bytes) {
|
||||
if (local_num_bytes.fetch_sub(num_bytes) < num_bytes) {
|
||||
local_num_bytes.store(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
// The value of num_bytes could be negative,
|
||||
// but we don't let info.stats.stats.sum.num_bytes go negative.
|
||||
void add_num_bytes(int64_t num_bytes) {
|
||||
ceph_assert(_lock.is_locked_by_me());
|
||||
if (num_bytes) {
|
||||
info.stats.stats.sum.num_bytes += num_bytes;
|
||||
if (info.stats.stats.sum.num_bytes < 0) {
|
||||
info.stats.stats.sum.num_bytes = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
void sub_num_bytes(int64_t num_bytes) {
|
||||
ceph_assert(_lock.is_locked_by_me());
|
||||
ceph_assert(num_bytes >= 0);
|
||||
if (num_bytes) {
|
||||
info.stats.stats.sum.num_bytes -= num_bytes;
|
||||
if (info.stats.stats.sum.num_bytes < 0) {
|
||||
info.stats.stats.sum.num_bytes = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only used in testing so not worried about needing the PG lock here
|
||||
int64_t get_stats_num_bytes() {
|
||||
Mutex::Locker l(_lock);
|
||||
int num_bytes = info.stats.stats.sum.num_bytes;
|
||||
if (pool.info.is_erasure()) {
|
||||
num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
|
||||
// Round up each object by a stripe
|
||||
num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
|
||||
}
|
||||
int64_t lnb = local_num_bytes.load();
|
||||
if (lnb && lnb != num_bytes) {
|
||||
lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
|
||||
<< lnb << " vs stats "
|
||||
<< info.stats.stats.sum.num_bytes << " / chunk "
|
||||
<< get_pgbackend()->get_ec_data_chunk_count()
|
||||
<< dendl;
|
||||
}
|
||||
return num_bytes;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
/*
|
||||
@ -1848,6 +1938,7 @@ protected:
|
||||
};
|
||||
|
||||
public:
|
||||
int pg_stat_adjust(osd_stat_t *new_stat);
|
||||
protected:
|
||||
|
||||
struct AdvMap : boost::statechart::event< AdvMap > {
|
||||
|
@ -295,6 +295,11 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
|
||||
|
||||
virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
|
||||
|
||||
virtual bool pg_is_remote_backfilling() = 0;
|
||||
virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
|
||||
virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
|
||||
virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
|
||||
virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
|
||||
virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
|
||||
virtual ~Listener() {}
|
||||
};
|
||||
@ -406,6 +411,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
|
||||
|
||||
virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
|
||||
virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
|
||||
virtual int get_ec_data_chunk_count() const { return 0; };
|
||||
virtual int get_ec_stripe_chunk_size() const { return 0; };
|
||||
|
||||
virtual void dump_recovery_info(Formatter *f) const = 0;
|
||||
|
||||
|
@ -134,11 +134,15 @@ struct MTrim : boost::statechart::event<MTrim> {
|
||||
|
||||
struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
|
||||
unsigned priority;
|
||||
explicit RequestBackfillPrio(unsigned prio) :
|
||||
int64_t primary_num_bytes;
|
||||
int64_t local_num_bytes;
|
||||
explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) :
|
||||
boost::statechart::event< RequestBackfillPrio >(),
|
||||
priority(prio) {}
|
||||
priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {}
|
||||
void print(std::ostream *out) const {
|
||||
*out << "RequestBackfillPrio: priority " << priority;
|
||||
*out << "RequestBackfillPrio: priority " << priority
|
||||
<< " primary bytes " << primary_num_bytes
|
||||
<< " local bytes " << local_num_bytes;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -4310,7 +4310,19 @@ void PrimaryLogPG::do_backfill(OpRequestRef op)
|
||||
ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
|
||||
|
||||
info.set_last_backfill(m->last_backfill);
|
||||
info.stats = m->stats;
|
||||
// During backfill submit_push_data() tracks num_bytes which is needed in case
|
||||
// backfill stops and starts again. We want to know how many bytes this
|
||||
// pg is consuming on the disk in order to compute amount of new data
|
||||
// reserved to hold backfill if it won't fit.
|
||||
if (m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS) {
|
||||
dout(0) << __func__ << " primary " << m->stats.stats.sum.num_bytes << " local " << info.stats.stats.sum.num_bytes << dendl;
|
||||
int64_t bytes = info.stats.stats.sum.num_bytes;
|
||||
info.stats = m->stats;
|
||||
info.stats.stats.sum.num_bytes = bytes;
|
||||
} else {
|
||||
dout(0) << __func__ << " final " << m->stats.stats.sum.num_bytes << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
|
||||
info.stats = m->stats;
|
||||
}
|
||||
|
||||
ObjectStore::Transaction t;
|
||||
dirty_info = true;
|
||||
@ -4341,6 +4353,38 @@ void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
|
||||
|
||||
ObjectStore::Transaction t;
|
||||
for (auto& p : m->ls) {
|
||||
if (is_remote_backfilling()) {
|
||||
struct stat st;
|
||||
int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
|
||||
pg_whoami.shard) , &st);
|
||||
if (r == 0) {
|
||||
sub_local_num_bytes(st.st_size);
|
||||
int64_t usersize;
|
||||
if (pool.info.is_erasure()) {
|
||||
bufferlist bv;
|
||||
int r = osd->store->getattr(
|
||||
ch,
|
||||
ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
|
||||
OI_ATTR,
|
||||
bv);
|
||||
if (r >= 0) {
|
||||
object_info_t oi(bv);
|
||||
usersize = oi.size * pgbackend->get_ec_data_chunk_count();
|
||||
} else {
|
||||
dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
|
||||
<< " can't get object info" << dendl;
|
||||
usersize = 0;
|
||||
}
|
||||
} else {
|
||||
usersize = st.st_size;
|
||||
}
|
||||
sub_num_bytes(usersize);
|
||||
dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
|
||||
<< " sub actual data by " << st.st_size
|
||||
<< " sub num_bytes by " << usersize
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
remove_snap_mapped_object(t, p.first);
|
||||
}
|
||||
int r = osd->store->queue_transaction(ch, std::move(t), NULL);
|
||||
@ -8566,6 +8610,7 @@ void PrimaryLogPG::apply_stats(
|
||||
const object_stat_sum_t &delta_stats) {
|
||||
|
||||
info.stats.stats.add(delta_stats);
|
||||
info.stats.stats.floor(0);
|
||||
|
||||
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
|
||||
i != backfill_targets.end();
|
||||
|
@ -402,6 +402,22 @@ public:
|
||||
release_object_locks(manager);
|
||||
}
|
||||
|
||||
bool pg_is_remote_backfilling() override {
|
||||
return is_remote_backfilling();
|
||||
}
|
||||
void pg_add_local_num_bytes(int64_t num_bytes) override {
|
||||
add_local_num_bytes(num_bytes);
|
||||
}
|
||||
void pg_sub_local_num_bytes(int64_t num_bytes) override {
|
||||
sub_local_num_bytes(num_bytes);
|
||||
}
|
||||
void pg_add_num_bytes(int64_t num_bytes) override {
|
||||
add_num_bytes(num_bytes);
|
||||
}
|
||||
void pg_sub_num_bytes(int64_t num_bytes) override {
|
||||
sub_num_bytes(num_bytes);
|
||||
}
|
||||
|
||||
void pgb_set_object_snap_mapping(
|
||||
const hobject_t &soid,
|
||||
const set<snapid_t> &snaps,
|
||||
|
@ -1531,6 +1531,24 @@ void ReplicatedBackend::submit_push_data(
|
||||
oi.expected_object_size,
|
||||
oi.expected_write_size,
|
||||
oi.alloc_hint_flags);
|
||||
if (get_parent()->pg_is_remote_backfilling()) {
|
||||
struct stat st;
|
||||
uint64_t size = 0;
|
||||
int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
|
||||
if (r == 0) {
|
||||
size = st.st_size;
|
||||
}
|
||||
// Don't need to do anything if object is still the same size
|
||||
if (size != recovery_info.oi.size) {
|
||||
get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
|
||||
get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
|
||||
dout(10) << __func__ << " " << recovery_info.soid
|
||||
<< " backfill size " << recovery_info.oi.size
|
||||
<< " previous size " << size
|
||||
<< " net size " << recovery_info.oi.size - size
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
uint64_t off = 0;
|
||||
uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
|
||||
|
@ -2195,6 +2195,9 @@ struct store_statfs_t
|
||||
uint64_t kb() const {
|
||||
return total >> 10;
|
||||
}
|
||||
uint64_t kb_used() const {
|
||||
return (total - available - internally_reserved) >> 10;
|
||||
}
|
||||
uint64_t kb_used_raw() const {
|
||||
return get_used_raw() >> 10;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user