Merge pull request #52711 from cbodley/wip-62250

rgw: retry metadata cache notifications with INVALIDATE_OBJ

Reviewed-by: Adam Emerson <aemerson@redhat.com>
This commit is contained in:
Casey Bodley 2023-08-03 09:04:05 -04:00 committed by GitHub
commit b6383e9e0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 18 deletions

View File

@ -3258,7 +3258,7 @@ options:
is very heavily loaded. Beware that increasing this value may cause some operations
to take longer in exceptional cases and thus may, rarely, cause clients to time
out.
default: 3
default: 10
tags:
- error recovery
services:

View File

@ -1351,13 +1351,7 @@ int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
*/
int RGWRados::init_begin(const DoutPrefixProvider *dpp)
{
int ret;
inject_notify_timeout_probability =
cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
ret = init_svc(false, dpp);
int ret = init_svc(false, dpp);
if (ret < 0) {
ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
return ret;

View File

@ -390,11 +390,6 @@ class RGWRados
librados::IoCtx root_pool_ctx; // .rgw
double inject_notify_timeout_probability{0.0};
unsigned max_notify_retries{0};
friend class RGWWatcher;
ceph::mutex bucket_id_lock{ceph::make_mutex("rados_bucket_id")};
// This field represents the number of bucket index object shards

View File

@ -278,6 +278,10 @@ int RGWSI_Notify::do_start(optional_yield y, const DoutPrefixProvider *dpp)
return r;
}
inject_notify_timeout_probability =
cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
control_pool = zone_svc->get_zone_params().control_pool;
int ret = init_watch(dpp, y);
@ -396,19 +400,69 @@ int RGWSI_Notify::distribute(const DoutPrefixProvider *dpp, const string& key,
return 0;
}
namespace librados {
// Stream a notify timeout entry as "<notifier_id>:<cookie>" so that it can be
// written to log output.
static std::ostream& operator<<(std::ostream& out, const notify_timeout_t& t)
{
  out << t.notifier_id;
  out << ':';
  out << t.cookie;
  return out;
}
} // namespace librados
using timeout_vector = std::vector<librados::notify_timeout_t>;

// Extract the timeout entries from an encoded notify response.
//
// Layout consumed from `bl`, in order:
//   uint32 ack count, then per ack: a (uint64, uint64) id pair followed by a
//     uint32 length and that many payload bytes;
//   uint32 timeout count, then per timeout: a (uint64, uint64) id pair.
//
// Acks are read only to advance past them; nothing from them is kept. Each
// timeout pair becomes one notify_timeout_t, built from (first, second) in
// that order. Decode errors are not handled here and propagate as exceptions
// (the callers in this file catch buffer::error).
static timeout_vector decode_timeouts(const bufferlist& bl)
{
  using ceph::decode;
  auto it = bl.begin();

  // Walk over the ack section without retaining any of it.
  uint32_t acks_left;
  decode(acks_left, it);
  while (acks_left-- > 0) {
    std::pair<uint64_t, uint64_t> ack_id;
    decode(ack_id, it);
    // The ack payload is length-prefixed; jump over its bytes.
    uint32_t payload_len;
    decode(payload_len, it);
    it += payload_len;
  }

  // What follows is the timeout section, which is what the caller wants.
  uint32_t timeouts_left;
  decode(timeouts_left, it);

  timeout_vector result;
  while (timeouts_left-- > 0) {
    std::pair<uint64_t, uint64_t> timeout_id;
    decode(timeout_id, it);
    result.push_back(
        librados::notify_timeout_t{timeout_id.first, timeout_id.second});
  }
  return result;
}
int RGWSI_Notify::robust_notify(const DoutPrefixProvider *dpp,
RGWSI_RADOS::Obj& notify_obj,
const RGWCacheNotifyInfo& cni,
optional_yield y)
{
bufferlist bl;
bufferlist bl, rbl;
encode(cni, bl);
// First, try to send, without being fancy about it.
auto r = notify_obj.notify(dpp, bl, 0, nullptr, y);
auto r = notify_obj.notify(dpp, bl, 0, &rbl, y);
if (r < 0) {
timeout_vector timeouts;
try {
timeouts = decode_timeouts(rbl);
} catch (const buffer::error& e) {
ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: "
<< e.what() << dendl;
}
ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
<< " Watchers " << timeouts << " did not respond."
<< " Notify failed on object " << cni.obj << ": "
<< cpp_strerror(-r) << dendl;
}
@ -427,10 +481,19 @@ int RGWSI_Notify::robust_notify(const DoutPrefixProvider *dpp,
ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
<< " Invalidating obj=" << info.obj << " tries="
<< tries << dendl;
r = notify_obj.notify(dpp, bl, 0, nullptr, y);
r = notify_obj.notify(dpp, retrybl, 0, &rbl, y);
if (r < 0) {
timeout_vector timeouts;
try {
timeouts = decode_timeouts(rbl);
} catch (const buffer::error& e) {
ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: "
<< e.what() << dendl;
}
ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
<< " invalidation attempt " << tries << " failed: "
<< " Watchers " << timeouts << " did not respond."
<< " Invalidation attempt " << tries << " failed: "
<< cpp_strerror(-r) << dendl;
}
}

View File

@ -42,7 +42,7 @@ private:
bool enabled{false};
double inject_notify_timeout_probability{0};
static constexpr unsigned max_notify_retries = 10;
uint64_t max_notify_retries = 10;
std::string get_control_oid(int i);
RGWSI_RADOS::Obj pick_control_obj(const std::string& key);